epstein-files 1.1.5__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,39 +3,39 @@ import json
3
3
  import pickle
4
4
  import re
5
5
  from collections import defaultdict
6
+ from copy import copy
6
7
  from dataclasses import dataclass, field
7
8
  from datetime import datetime
8
9
  from pathlib import Path
9
- from typing import Sequence, Type
10
-
11
- from rich.padding import Padding
12
- from rich.table import Table
13
- from rich.text import Text
10
+ from typing import Sequence, Type, cast
14
11
 
15
12
  from epstein_files.documents.document import Document
16
- from epstein_files.documents.email import DETECT_EMAIL_REGEX, USELESS_EMAILERS, Email
17
- from epstein_files.documents.emails.email_header import AUTHOR
13
+ from epstein_files.documents.email import DETECT_EMAIL_REGEX, Email
18
14
  from epstein_files.documents.json_file import JsonFile
19
15
  from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
20
16
  from epstein_files.documents.other_file import OtherFile
17
+ from epstein_files.person import Person
21
18
  from epstein_files.util.constant.strings import *
22
19
  from epstein_files.util.constants import *
23
- from epstein_files.util.data import days_between, dict_sets_to_lists, json_safe, listify
20
+ from epstein_files.util.data import flatten, json_safe, listify, uniquify
24
21
  from epstein_files.util.doc_cfg import EmailCfg, Metadata
25
22
  from epstein_files.util.env import DOCS_DIR, args, logger
26
23
  from epstein_files.util.file_helper import file_size_str
27
- from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames, get_info_for_name, get_style_for_name
28
- from epstein_files.util.rich import (NA_TXT, add_cols_to_table, build_table, console, highlighter,
29
- print_author_panel, print_centered, print_subtitle_panel)
24
+ from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames
25
+ from epstein_files.util.rich import NA_TXT, add_cols_to_table, build_table, console, print_centered
30
26
  from epstein_files.util.search_result import SearchResult
31
27
  from epstein_files.util.timer import Timer
32
28
 
33
- DEVICE_SIGNATURE_SUBTITLE = f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown"
34
- DEVICE_SIGNATURE = 'Device Signature'
35
- DEVICE_SIGNATURE_PADDING = (1, 0)
29
+ DUPLICATE_PROPS_TO_COPY = ['author', 'recipients', 'timestamp']
36
30
  PICKLED_PATH = Path("the_epstein_files.pkl.gz")
37
31
  SLOW_FILE_SECONDS = 1.0
38
32
 
33
+ EMAILS_WITH_UNINTERESTING_CCS = [
34
+ '025329', # Krassner
35
+ '024923', # Krassner
36
+ '033568', # Krassner
37
+ ]
38
+
39
39
 
40
40
  @dataclass
41
41
  class EpsteinFiles:
@@ -45,19 +45,13 @@ class EpsteinFiles:
45
45
  json_files: list[JsonFile] = field(default_factory=list)
46
46
  other_files: list[OtherFile] = field(default_factory=list)
47
47
  timer: Timer = field(default_factory=lambda: Timer())
48
-
49
- # Analytics / calculations
50
- email_author_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
51
- email_authors_to_device_signatures: dict[str, set] = field(default_factory=lambda: defaultdict(set))
52
- email_device_signatures_to_authors: dict[str, set] = field(default_factory=lambda: defaultdict(set))
53
- email_recipient_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
54
- unknown_recipient_email_ids: set[str] = field(default_factory=set)
48
+ uninteresting_ccs: list[Name] = field(init=False)
55
49
 
56
50
  def __post_init__(self):
57
51
  """Iterate through files and build appropriate objects."""
58
52
  self.all_files = sorted([f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')])
59
53
  documents = []
60
- file_type_count = defaultdict(int)
54
+ file_type_count = defaultdict(int) # Hack used by --skip-other-files option
61
55
 
62
56
  # Read through and classify all the files
63
57
  for file_arg in self.all_files:
@@ -83,14 +77,15 @@ class EpsteinFiles:
83
77
  self.imessage_logs = Document.sort_by_timestamp([d for d in documents if isinstance(d, MessengerLog)])
84
78
  self.other_files = Document.sort_by_timestamp([d for d in documents if isinstance(d, (JsonFile, OtherFile))])
85
79
  self.json_files = [doc for doc in self.other_files if isinstance(doc, JsonFile)]
86
- self._tally_email_data()
80
+ self._set_uninteresting_ccs()
81
+ self._copy_duplicate_email_properties()
87
82
 
88
83
  @classmethod
89
84
  def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
90
85
  """Alternate constructor that reads/writes a pickled version of the data ('timer' arg is for logging)."""
91
86
  timer = timer or Timer()
92
87
 
93
- if PICKLED_PATH.exists() and not args.overwrite_pickle:
88
+ if PICKLED_PATH.exists() and not args.overwrite_pickle and not args.skip_other_files:
94
89
  with gzip.open(PICKLED_PATH, 'rb') as file:
95
90
  epstein_files = pickle.load(file)
96
91
  epstein_files.timer = timer
@@ -114,17 +109,7 @@ class EpsteinFiles:
114
109
  def all_documents(self) -> Sequence[Document]:
115
110
  return self.imessage_logs + self.emails + self.other_files
116
111
 
117
- def all_emailers(self, include_useless: bool = False) -> list[str | None]:
118
- """Returns all emailers USELESS_EMAILERS, sorted from least frequent to most."""
119
- names = [a for a in self.email_author_counts.keys()] + [r for r in self.email_recipient_counts.keys()]
120
- names = names if include_useless else [e for e in names if e not in USELESS_EMAILERS]
121
- return sorted(list(set(names)), key=lambda e: self.email_author_counts[e] + self.email_recipient_counts[e])
122
-
123
- def docs_matching(
124
- self,
125
- pattern: re.Pattern | str,
126
- names: list[str | None] | None = None
127
- ) -> list[SearchResult]:
112
+ def docs_matching(self, pattern: re.Pattern | str, names: list[Name] | None = None) -> list[SearchResult]:
128
113
  """Find documents whose text matches a pattern (file_type and names args limit the documents searched)."""
129
114
  results: list[SearchResult] = []
130
115
 
@@ -139,14 +124,39 @@ class EpsteinFiles:
139
124
 
140
125
  return results
141
126
 
142
- def earliest_email_at(self, author: str | None) -> datetime:
143
- return self.emails_for(author)[0].timestamp
127
+ def earliest_email_at(self, name: Name) -> datetime:
128
+ return self.emails_for(name)[0].timestamp
129
+
130
+ def last_email_at(self, name: Name) -> datetime:
131
+ return self.emails_for(name)[-1].timestamp
132
+
133
+ def email_author_counts(self) -> dict[Name, int]:
134
+ return {
135
+ person.name: len(person.unique_emails_by())
136
+ for person in self.emailers() if len(person.unique_emails_by()) > 0
137
+ }
138
+
139
+ def email_authors_to_device_signatures(self) -> dict[str, set[str]]:
140
+ signatures = defaultdict(set)
141
+
142
+ for email in [e for e in self.non_duplicate_emails() if e.sent_from_device]:
143
+ signatures[email.author_or_unknown()].add(email.sent_from_device)
144
144
 
145
- def last_email_at(self, author: str | None) -> datetime:
146
- return self.emails_for(author)[-1].timestamp
145
+ return signatures
147
146
 
148
- def email_conversation_length_in_days(self, author: str | None) -> int:
149
- return days_between(self.earliest_email_at(author), self.last_email_at(author))
147
+ def email_device_signatures_to_authors(self) -> dict[str, set[str]]:
148
+ signatures = defaultdict(set)
149
+
150
+ for email in [e for e in self.non_duplicate_emails() if e.sent_from_device]:
151
+ signatures[email.sent_from_device].add(email.author_or_unknown())
152
+
153
+ return signatures
154
+
155
+ def email_recipient_counts(self) -> dict[Name, int]:
156
+ return {
157
+ person.name: len(person.unique_emails_to())
158
+ for person in self.emailers() if len(person.unique_emails_to()) > 0
159
+ }
150
160
 
151
161
  def email_signature_substitution_counts(self) -> dict[str, int]:
152
162
  """Return the number of times an email signature was replaced with "<...snipped...>" for each author."""
@@ -158,32 +168,40 @@ class EpsteinFiles:
158
168
 
159
169
  return substitution_counts
160
170
 
161
- def email_unknown_recipient_file_ids(self) -> list[str]:
162
- return sorted(list(self.unknown_recipient_email_ids))
171
+ def emailers(self) -> list[Person]:
172
+ """All the people who sent or received an email."""
173
+ authors = [email.author for email in self.emails]
174
+ recipients = flatten([email.recipients for email in self.emails])
175
+ return self.person_objs(uniquify(authors + recipients))
163
176
 
164
- def emails_by(self, author: str | None) -> list[Email]:
177
+ def emails_by(self, author: Name) -> list[Email]:
165
178
  return Document.sort_by_timestamp([e for e in self.emails if e.author == author])
166
179
 
167
- def emails_for(self, author: str | None) -> list[Email]:
180
+ def emails_for(self, name: Name) -> list[Email]:
168
181
  """Returns emails to or from a given 'author' sorted chronologically."""
169
- if author == JEFFREY_EPSTEIN:
170
- emails = [e for e in self.emails_by(JEFFREY_EPSTEIN) if e.is_note_to_self()]
171
- else:
172
- emails = self.emails_by(author) + self.emails_to(author)
182
+ emails = self.emails_by(name) + self.emails_to(name)
173
183
 
174
184
  if len(emails) == 0:
175
- raise RuntimeError(f"No emails found for '{author}'")
185
+ raise RuntimeError(f"No emails found for '{name}'")
176
186
 
177
187
  return Document.sort_by_timestamp(Document.uniquify(emails))
178
188
 
179
- def emails_to(self, author: str | None) -> list[Email]:
180
- if author is None:
189
+ def emails_to(self, name: Name) -> list[Email]:
190
+ if name is None:
181
191
  emails = [e for e in self.emails if len(e.recipients) == 0 or None in e.recipients]
182
192
  else:
183
- emails = [e for e in self.emails if author in e.recipients]
193
+ emails = [e for e in self.emails if name in e.recipients]
184
194
 
185
195
  return Document.sort_by_timestamp(emails)
186
196
 
197
+ def email_for_id(self, file_id: str) -> Email:
198
+ docs = self.for_ids([file_id])
199
+
200
+ if docs and isinstance(docs[0], Email):
201
+ return docs[0]
202
+ else:
203
+ raise ValueError(f"No email found for {file_id}")
204
+
187
205
  def for_ids(self, file_ids: str | list[str]) -> list[Document]:
188
206
  file_ids = listify(file_ids)
189
207
  docs = [doc for doc in self.all_documents() if doc.file_id in file_ids]
@@ -193,6 +211,9 @@ class EpsteinFiles:
193
211
 
194
212
  return docs
195
213
 
214
+ def imessage_logs_for(self, name: Name) -> list[MessengerLog]:
215
+ return [log for log in self.imessage_logs if name == log.author]
216
+
196
217
  def json_metadata(self) -> str:
197
218
  """Create a JSON string containing metadata for all the files."""
198
219
  metadata = {
@@ -203,7 +224,7 @@ class EpsteinFiles:
203
224
  OtherFile.__name__: _sorted_metadata(self.non_json_other_files()),
204
225
  },
205
226
  'people': {
206
- name: highlighted_group.get_info(name)
227
+ name: highlighted_group.info_for(name, include_category=True)
207
228
  for highlighted_group in HIGHLIGHTED_NAMES
208
229
  if isinstance(highlighted_group, HighlightedNames)
209
230
  for name, description in highlighted_group.emailers.items()
@@ -219,6 +240,19 @@ class EpsteinFiles:
219
240
  def non_json_other_files(self) -> list[OtherFile]:
220
241
  return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]
221
242
 
243
+ def person_objs(self, names: list[Name]) -> list[Person]:
244
+ """Construct Person objects for a list of names."""
245
+ return [
246
+ Person(
247
+ name=name,
248
+ emails=self.emails_for(name),
249
+ imessage_logs=self.imessage_logs_for(name),
250
+ is_uninteresting_cc=name in self.uninteresting_emailers(),
251
+ other_files=[f for f in self.other_files if name and name == f.author]
252
+ )
253
+ for name in names
254
+ ]
255
+
222
256
  def print_files_summary(self) -> None:
223
257
  table = build_table('File Overview')
224
258
  add_cols_to_table(table, ['File Type', 'Count', 'Author Known', 'Author Unknown', 'Duplicates'])
@@ -242,61 +276,44 @@ class EpsteinFiles:
242
276
  print_centered(table)
243
277
  console.line()
244
278
 
245
- def print_emails_for(self, _author: str | None) -> list[Email]:
246
- """Print complete emails to or from a particular 'author'. Returns the Emails that were printed."""
247
- emails = self.emails_for(_author)
248
- num_days = self.email_conversation_length_in_days(_author)
249
- unique_emails = [email for email in emails if not email.is_duplicate()]
250
- start_date = emails[0].timestamp.date()
251
- author = _author or UNKNOWN
252
- title = f"Found {len(unique_emails)} emails"
253
-
254
- if author == JEFFREY_EPSTEIN:
255
- title += f" sent by {JEFFREY_EPSTEIN} to himself"
256
- else:
257
- title += f" to/from {author} starting {start_date} covering {num_days:,} days"
279
+ def unknown_recipient_ids(self) -> list[str]:
280
+ """IDs of emails whose recipient is not known."""
281
+ return sorted([e.file_id for e in self.emails if None in e.recipients or not e.recipients])
282
+
283
+ def uninteresting_emailers(self) -> list[Name]:
284
+ if '_uninteresting_emailers' not in vars(self):
285
+ self._uninteresting_emailers = sorted(uniquify(UNINTERESTING_EMAILERS + self.uninteresting_ccs))
258
286
 
259
- print_author_panel(title, get_info_for_name(author), get_style_for_name(author))
260
- self.print_emails_table_for(_author)
261
- last_printed_email_was_duplicate = False
287
+ return self._uninteresting_emailers
262
288
 
263
- for email in emails:
264
- if email.is_duplicate():
265
- console.print(Padding(email.duplicate_file_txt().append('...'), (0, 0, 0, 4)))
266
- last_printed_email_was_duplicate = True
267
- else:
268
- if last_printed_email_was_duplicate:
269
- console.line()
289
+ def _copy_duplicate_email_properties(self) -> None:
290
+ """Ensure dupe emails have the properties of the emails they duplicate to capture any repairs, config etc."""
291
+ for email in self.emails:
292
+ if not email.is_duplicate():
293
+ continue
270
294
 
271
- console.print(email)
272
- last_printed_email_was_duplicate = False
295
+ original = self.email_for_id(email.duplicate_of_id())
273
296
 
274
- return emails
297
+ for field_name in DUPLICATE_PROPS_TO_COPY:
298
+ original_prop = getattr(original, field_name)
299
+ duplicate_prop = getattr(email, field_name)
275
300
 
276
- def print_emails_table_for(self, author: str | None) -> None:
277
- emails = [email for email in self.emails_for(author) if not email.is_duplicate()] # Remove dupes
278
- print_centered(Padding(Email.build_emails_table(emails, author), (0, 5, 1, 5)))
301
+ if original_prop != duplicate_prop:
302
+ email.warn(f"Replacing {field_name} {duplicate_prop} with {original_prop} from duplicated '{original.file_id}'")
303
+ setattr(email, field_name, original_prop)
279
304
 
280
- def print_email_device_info(self) -> None:
281
- print_subtitle_panel(DEVICE_SIGNATURE_SUBTITLE)
282
- console.print(_build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))
283
- console.print(_build_signature_table(self.email_authors_to_device_signatures, (AUTHOR, DEVICE_SIGNATURE)))
305
+ # Resort in case any timestamp were updated
306
+ self.emails = Document.sort_by_timestamp(self.emails)
284
307
 
285
- def _tally_email_data(self) -> None:
286
- """Tally up summary info about Email objects."""
287
- for email in self.non_duplicate_emails():
288
- self.email_author_counts[email.author] += 1
308
+ def _set_uninteresting_ccs(self) -> None:
309
+ ross_gow_email = self.email_for_id('014797_1')
310
+ self.uninteresting_ccs = copy(cast(list[Name], ross_gow_email.header.bcc))
289
311
 
290
- if len(email.recipients) == 0:
291
- self.unknown_recipient_email_ids.add(email.file_id)
292
- self.email_recipient_counts[None] += 1
293
- else:
294
- for recipient in email.recipients:
295
- self.email_recipient_counts[recipient] += 1
312
+ for id in EMAILS_WITH_UNINTERESTING_CCS:
313
+ self.uninteresting_ccs += self.email_for_id(id).recipients
296
314
 
297
- if email.sent_from_device:
298
- self.email_authors_to_device_signatures[email.author_or_unknown()].add(email.sent_from_device)
299
- self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())
315
+ self.uninteresting_ccs = sorted(uniquify(self.uninteresting_ccs))
316
+ logger.info(f"Extracted uninteresting_ccs: {self.uninteresting_ccs}")
300
317
 
301
318
 
302
319
  def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
@@ -326,21 +343,6 @@ def document_cls(doc: Document) -> Type[Document]:
326
343
  return OtherFile
327
344
 
328
345
 
329
- def _build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
330
- title = 'Signatures Used By Authors' if cols[0] == AUTHOR else 'Authors Seen Using Signatures'
331
- table = build_table(title, header_style="bold reverse", show_lines=True)
332
-
333
- for i, col in enumerate(cols):
334
- table.add_column(col.title() + ('s' if i == 1 else ''))
335
-
336
- new_dict = dict_sets_to_lists(keyed_sets)
337
-
338
- for k in sorted(new_dict.keys()):
339
- table.add_row(highlighter(k or UNKNOWN), highlighter(join_char.join(sorted(new_dict[k]))))
340
-
341
- return Padding(table, DEVICE_SIGNATURE_PADDING)
342
-
343
-
344
346
  def _sorted_metadata(docs: Sequence[Document]) -> list[Metadata]:
345
347
  docs_sorted_by_id = sorted(docs, key=lambda d: d.file_id)
346
348
  return [json_safe(d.metadata()) for d in docs_sorted_by_id]