epstein-files 1.1.3__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,40 +3,39 @@ import json
3
3
  import pickle
4
4
  import re
5
5
  from collections import defaultdict
6
+ from copy import copy
6
7
  from dataclasses import dataclass, field
7
8
  from datetime import datetime
8
9
  from pathlib import Path
9
- from typing import Sequence, Type
10
-
11
- from rich.padding import Padding
12
- from rich.table import Table
13
- from rich.text import Text
10
+ from typing import Sequence, Type, cast
14
11
 
15
12
  from epstein_files.documents.document import Document
16
- from epstein_files.documents.email import DETECT_EMAIL_REGEX, USELESS_EMAILERS, Email
17
- from epstein_files.documents.emails.email_header import AUTHOR
13
+ from epstein_files.documents.email import DETECT_EMAIL_REGEX, Email
18
14
  from epstein_files.documents.json_file import JsonFile
19
15
  from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
20
16
  from epstein_files.documents.other_file import OtherFile
17
+ from epstein_files.person import Person
21
18
  from epstein_files.util.constant.strings import *
22
19
  from epstein_files.util.constants import *
23
- from epstein_files.util.data import days_between, dict_sets_to_lists, json_safe, listify
20
+ from epstein_files.util.data import flatten, json_safe, listify, uniquify
24
21
  from epstein_files.util.doc_cfg import EmailCfg, Metadata
25
22
  from epstein_files.util.env import DOCS_DIR, args, logger
26
23
  from epstein_files.util.file_helper import file_size_str
27
- from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames, get_info_for_name, get_style_for_name
28
- from epstein_files.util.rich import (NA_TXT, add_cols_to_table, build_table, console, highlighter,
29
- print_author_panel, print_centered, print_subtitle_panel)
24
+ from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames
25
+ from epstein_files.util.rich import NA_TXT, add_cols_to_table, build_table, console, print_centered
30
26
  from epstein_files.util.search_result import SearchResult
31
27
  from epstein_files.util.timer import Timer
32
28
 
33
- EXCLUDED_EMAILERS = USELESS_EMAILERS + [JEFFREY_EPSTEIN]
34
- DEVICE_SIGNATURE_SUBTITLE = f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown"
35
- DEVICE_SIGNATURE = 'Device Signature'
36
- DEVICE_SIGNATURE_PADDING = (1, 0)
29
+ DUPLICATE_PROPS_TO_COPY = ['author', 'recipients', 'timestamp']
37
30
  PICKLED_PATH = Path("the_epstein_files.pkl.gz")
38
31
  SLOW_FILE_SECONDS = 1.0
39
32
 
33
+ EMAILS_WITH_UNINTERESTING_CCS = [
34
+ '025329', # Krassner
35
+ '024923', # Krassner
36
+ '033568', # Krassner
37
+ ]
38
+
40
39
 
41
40
  @dataclass
42
41
  class EpsteinFiles:
@@ -46,19 +45,13 @@ class EpsteinFiles:
46
45
  json_files: list[JsonFile] = field(default_factory=list)
47
46
  other_files: list[OtherFile] = field(default_factory=list)
48
47
  timer: Timer = field(default_factory=lambda: Timer())
49
-
50
- # Analytics / calculations
51
- email_author_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
52
- email_authors_to_device_signatures: dict[str, set] = field(default_factory=lambda: defaultdict(set))
53
- email_device_signatures_to_authors: dict[str, set] = field(default_factory=lambda: defaultdict(set))
54
- email_recipient_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
55
- unknown_recipient_email_ids: set[str] = field(default_factory=set)
48
+ uninteresting_ccs: list[Name] = field(init=False)
56
49
 
57
50
  def __post_init__(self):
58
51
  """Iterate through files and build appropriate objects."""
59
52
  self.all_files = sorted([f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')])
60
53
  documents = []
61
- file_type_count = defaultdict(int)
54
+ file_type_count = defaultdict(int) # Hack used by --skip-other-files option
62
55
 
63
56
  # Read through and classify all the files
64
57
  for file_arg in self.all_files:
@@ -84,14 +77,15 @@ class EpsteinFiles:
84
77
  self.imessage_logs = Document.sort_by_timestamp([d for d in documents if isinstance(d, MessengerLog)])
85
78
  self.other_files = Document.sort_by_timestamp([d for d in documents if isinstance(d, (JsonFile, OtherFile))])
86
79
  self.json_files = [doc for doc in self.other_files if isinstance(doc, JsonFile)]
87
- self._tally_email_data()
80
+ self._set_uninteresting_ccs()
81
+ self._copy_duplicate_email_properties()
88
82
 
89
83
  @classmethod
90
84
  def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
91
85
  """Alternate constructor that reads/writes a pickled version of the data ('timer' arg is for logging)."""
92
86
  timer = timer or Timer()
93
87
 
94
- if PICKLED_PATH.exists() and not args.overwrite_pickle:
88
+ if PICKLED_PATH.exists() and not args.overwrite_pickle and not args.skip_other_files:
95
89
  with gzip.open(PICKLED_PATH, 'rb') as file:
96
90
  epstein_files = pickle.load(file)
97
91
  epstein_files.timer = timer
@@ -115,17 +109,7 @@ class EpsteinFiles:
115
109
  def all_documents(self) -> Sequence[Document]:
116
110
  return self.imessage_logs + self.emails + self.other_files
117
111
 
118
- def all_emailers(self, include_useless: bool = False) -> list[str | None]:
119
- """Returns all emailers except Epstein and EXCLUDED_EMAILERS, sorted from least frequent to most."""
120
- names = [a for a in self.email_author_counts.keys()] + [r for r in self.email_recipient_counts.keys()]
121
- names = names if include_useless else [e for e in names if e not in EXCLUDED_EMAILERS]
122
- return sorted(list(set(names)), key=lambda e: self.email_author_counts[e] + self.email_recipient_counts[e])
123
-
124
- def docs_matching(
125
- self,
126
- pattern: re.Pattern | str,
127
- names: list[str | None] | None = None
128
- ) -> list[SearchResult]:
112
+ def docs_matching(self, pattern: re.Pattern | str, names: list[Name] | None = None) -> list[SearchResult]:
129
113
  """Find documents whose text matches a pattern (file_type and names args limit the documents searched)."""
130
114
  results: list[SearchResult] = []
131
115
 
@@ -140,14 +124,39 @@ class EpsteinFiles:
140
124
 
141
125
  return results
142
126
 
143
- def earliest_email_at(self, author: str | None) -> datetime:
144
- return self.emails_for(author)[0].timestamp
127
+ def earliest_email_at(self, name: Name) -> datetime:
128
+ return self.emails_for(name)[0].timestamp
129
+
130
+ def last_email_at(self, name: Name) -> datetime:
131
+ return self.emails_for(name)[-1].timestamp
132
+
133
+ def email_author_counts(self) -> dict[Name, int]:
134
+ return {
135
+ person.name: len(person.unique_emails_by())
136
+ for person in self.emailers() if len(person.unique_emails_by()) > 0
137
+ }
138
+
139
+ def email_authors_to_device_signatures(self) -> dict[str, set[str]]:
140
+ signatures = defaultdict(set)
141
+
142
+ for email in [e for e in self.non_duplicate_emails() if e.sent_from_device]:
143
+ signatures[email.author_or_unknown()].add(email.sent_from_device)
144
+
145
+ return signatures
145
146
 
146
- def last_email_at(self, author: str | None) -> datetime:
147
- return self.emails_for(author)[-1].timestamp
147
+ def email_device_signatures_to_authors(self) -> dict[str, set[str]]:
148
+ signatures = defaultdict(set)
148
149
 
149
- def email_conversation_length_in_days(self, author: str | None) -> int:
150
- return days_between(self.earliest_email_at(author), self.last_email_at(author))
150
+ for email in [e for e in self.non_duplicate_emails() if e.sent_from_device]:
151
+ signatures[email.sent_from_device].add(email.author_or_unknown())
152
+
153
+ return signatures
154
+
155
+ def email_recipient_counts(self) -> dict[Name, int]:
156
+ return {
157
+ person.name: len(person.unique_emails_to())
158
+ for person in self.emailers() if len(person.unique_emails_to()) > 0
159
+ }
151
160
 
152
161
  def email_signature_substitution_counts(self) -> dict[str, int]:
153
162
  """Return the number of times an email signature was replaced with "<...snipped...>" for each author."""
@@ -159,30 +168,41 @@ class EpsteinFiles:
159
168
 
160
169
  return substitution_counts
161
170
 
162
- def email_unknown_recipient_file_ids(self) -> list[str]:
163
- return sorted(list(self.unknown_recipient_email_ids))
171
+ def emailers(self) -> list[Person]:
172
+ """All the people who sent or received an email."""
173
+ authors = [email.author for email in self.emails]
174
+ recipients = flatten([email.recipients for email in self.emails])
175
+ return self.person_objs(uniquify(authors + recipients))
164
176
 
165
- def emails_by(self, author: str | None) -> list[Email]:
177
+ def emails_by(self, author: Name) -> list[Email]:
166
178
  return Document.sort_by_timestamp([e for e in self.emails if e.author == author])
167
179
 
168
- def emails_for(self, author: str | None) -> list[Email]:
180
+ def emails_for(self, name: Name) -> list[Email]:
169
181
  """Returns emails to or from a given 'author' sorted chronologically."""
170
- emails = self.emails_by(author) + self.emails_to(author)
182
+ emails = self.emails_by(name) + self.emails_to(name)
171
183
 
172
184
  if len(emails) == 0:
173
- raise RuntimeError(f"No emails found for '{author}'")
185
+ raise RuntimeError(f"No emails found for '{name}'")
174
186
 
175
187
  return Document.sort_by_timestamp(Document.uniquify(emails))
176
188
 
177
- def emails_to(self, author: str | None) -> list[Email]:
178
- if author is None:
189
+ def emails_to(self, name: Name) -> list[Email]:
190
+ if name is None:
179
191
  emails = [e for e in self.emails if len(e.recipients) == 0 or None in e.recipients]
180
192
  else:
181
- emails = [e for e in self.emails if author in e.recipients]
193
+ emails = [e for e in self.emails if name in e.recipients]
182
194
 
183
195
  return Document.sort_by_timestamp(emails)
184
196
 
185
- def get_documents_by_id(self, file_ids: str | list[str]) -> list[Document]:
197
+ def email_for_id(self, file_id: str) -> Email:
198
+ docs = self.for_ids([file_id])
199
+
200
+ if docs and isinstance(docs[0], Email):
201
+ return docs[0]
202
+ else:
203
+ raise ValueError(f"No email found for {file_id}")
204
+
205
+ def for_ids(self, file_ids: str | list[str]) -> list[Document]:
186
206
  file_ids = listify(file_ids)
187
207
  docs = [doc for doc in self.all_documents() if doc.file_id in file_ids]
188
208
 
@@ -191,6 +211,9 @@ class EpsteinFiles:
191
211
 
192
212
  return docs
193
213
 
214
+ def imessage_logs_for(self, name: Name) -> list[MessengerLog]:
215
+ return [log for log in self.imessage_logs if name == log.author]
216
+
194
217
  def json_metadata(self) -> str:
195
218
  """Create a JSON string containing metadata for all the files."""
196
219
  metadata = {
@@ -201,7 +224,7 @@ class EpsteinFiles:
201
224
  OtherFile.__name__: _sorted_metadata(self.non_json_other_files()),
202
225
  },
203
226
  'people': {
204
- name: highlighted_group.get_info(name)
227
+ name: highlighted_group.info_for(name, include_category=True)
205
228
  for highlighted_group in HIGHLIGHTED_NAMES
206
229
  if isinstance(highlighted_group, HighlightedNames)
207
230
  for name, description in highlighted_group.emailers.items()
@@ -217,6 +240,19 @@ class EpsteinFiles:
217
240
  def non_json_other_files(self) -> list[OtherFile]:
218
241
  return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]
219
242
 
243
+ def person_objs(self, names: list[Name]) -> list[Person]:
244
+ """Construct Person objects for a list of names."""
245
+ return [
246
+ Person(
247
+ name=name,
248
+ emails=self.emails_for(name),
249
+ imessage_logs=self.imessage_logs_for(name),
250
+ is_uninteresting_cc=name in self.uninteresting_emailers(),
251
+ other_files=[f for f in self.other_files if name and name == f.author]
252
+ )
253
+ for name in names
254
+ ]
255
+
220
256
  def print_files_summary(self) -> None:
221
257
  table = build_table('File Overview')
222
258
  add_cols_to_table(table, ['File Type', 'Count', 'Author Known', 'Author Unknown', 'Duplicates'])
@@ -240,61 +276,44 @@ class EpsteinFiles:
240
276
  print_centered(table)
241
277
  console.line()
242
278
 
243
- def print_emails_for(self, _author: str | None) -> list[Email]:
244
- """Print complete emails to or from a particular 'author'. Returns the Emails that were printed."""
245
- emails = self.emails_for(_author)
246
- num_days = self.email_conversation_length_in_days(_author)
247
- unique_emails = [email for email in emails if not email.is_duplicate()]
248
- start_date = emails[0].timestamp.date()
249
- author = _author or UNKNOWN
250
-
251
- print_author_panel(
252
- f"Found {len(unique_emails)} emails to/from {author} starting {start_date} covering {num_days:,} days",
253
- get_style_for_name(author),
254
- get_info_for_name(author)
255
- )
256
-
257
- self.print_emails_table_for(_author)
258
- last_printed_email_was_duplicate = False
259
-
260
- for email in emails:
261
- if email.is_duplicate():
262
- console.print(Padding(email.duplicate_file_txt().append('...'), (0, 0, 0, 4)))
263
- last_printed_email_was_duplicate = True
264
- else:
265
- if last_printed_email_was_duplicate:
266
- console.line()
267
-
268
- console.print(email)
269
- last_printed_email_was_duplicate = False
270
-
271
- return emails
272
-
273
- def print_emails_table_for(self, author: str | None) -> None:
274
- emails = [email for email in self.emails_for(author) if not email.is_duplicate()] # Remove dupes
275
- print_centered(Email.build_emails_table(emails, author))
276
- console.line()
279
+ def unknown_recipient_ids(self) -> list[str]:
280
+ """IDs of emails whose recipient is not known."""
281
+ return sorted([e.file_id for e in self.emails if None in e.recipients or not e.recipients])
282
+
283
+ def uninteresting_emailers(self) -> list[Name]:
284
+ if '_uninteresting_emailers' not in vars(self):
285
+ self._uninteresting_emailers = sorted(uniquify(UNINTERESTING_EMAILERS + self.uninteresting_ccs))
286
+
287
+ return self._uninteresting_emailers
288
+
289
+ def _copy_duplicate_email_properties(self) -> None:
290
+ """Ensure dupe emails have the properties of the emails they duplicate to capture any repairs, config etc."""
291
+ for email in self.emails:
292
+ if not email.is_duplicate():
293
+ continue
294
+
295
+ original = self.email_for_id(email.duplicate_of_id())
277
296
 
278
- def print_email_device_info(self) -> None:
279
- print_subtitle_panel(DEVICE_SIGNATURE_SUBTITLE, padding=(2, 0, 0, 0), centered=True)
280
- console.print(_build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))
281
- console.print(_build_signature_table(self.email_authors_to_device_signatures, (AUTHOR, DEVICE_SIGNATURE)))
297
+ for field_name in DUPLICATE_PROPS_TO_COPY:
298
+ original_prop = getattr(original, field_name)
299
+ duplicate_prop = getattr(email, field_name)
282
300
 
283
- def _tally_email_data(self) -> None:
284
- """Tally up summary info about Email objects."""
285
- for email in self.non_duplicate_emails():
286
- self.email_author_counts[email.author] += 1
301
+ if original_prop != duplicate_prop:
302
+ email.warn(f"Replacing {field_name} {duplicate_prop} with {original_prop} from duplicated '{original.file_id}'")
303
+ setattr(email, field_name, original_prop)
287
304
 
288
- if len(email.recipients) == 0:
289
- self.unknown_recipient_email_ids.add(email.file_id)
290
- self.email_recipient_counts[None] += 1
291
- else:
292
- for recipient in email.recipients:
293
- self.email_recipient_counts[recipient] += 1
305
+ # Re-sort in case any timestamps were updated
306
+ self.emails = Document.sort_by_timestamp(self.emails)
294
307
 
295
- if email.sent_from_device:
296
- self.email_authors_to_device_signatures[email.author_or_unknown()].add(email.sent_from_device)
297
- self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())
308
+ def _set_uninteresting_ccs(self) -> None:
309
+ ross_gow_email = self.email_for_id('014797_1')
310
+ self.uninteresting_ccs = copy(cast(list[Name], ross_gow_email.header.bcc))
311
+
312
+ for id in EMAILS_WITH_UNINTERESTING_CCS:
313
+ self.uninteresting_ccs += self.email_for_id(id).recipients
314
+
315
+ self.uninteresting_ccs = sorted(uniquify(self.uninteresting_ccs))
316
+ logger.info(f"Extracted uninteresting_ccs: {self.uninteresting_ccs}")
298
317
 
299
318
 
300
319
  def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
@@ -324,21 +343,6 @@ def document_cls(doc: Document) -> Type[Document]:
324
343
  return OtherFile
325
344
 
326
345
 
327
- def _build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
328
- title = 'Signatures Used By Authors' if cols[0] == AUTHOR else 'Authors Seen Using Signatures'
329
- table = build_table(title, header_style="bold reverse", show_lines=True)
330
-
331
- for i, col in enumerate(cols):
332
- table.add_column(col.title() + ('s' if i == 1 else ''))
333
-
334
- new_dict = dict_sets_to_lists(keyed_sets)
335
-
336
- for k in sorted(new_dict.keys()):
337
- table.add_row(highlighter(k or UNKNOWN), highlighter(join_char.join(sorted(new_dict[k]))))
338
-
339
- return Padding(table, DEVICE_SIGNATURE_PADDING)
340
-
341
-
342
346
  def _sorted_metadata(docs: Sequence[Document]) -> list[Metadata]:
343
347
  docs_sorted_by_id = sorted(docs, key=lambda d: d.file_id)
344
348
  return [json_safe(d.metadata()) for d in docs_sorted_by_id]