epstein-files 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. epstein_files/__init__.py +75 -135
  2. epstein_files/documents/communication.py +9 -9
  3. epstein_files/documents/document.py +115 -87
  4. epstein_files/documents/email.py +154 -85
  5. epstein_files/documents/emails/email_header.py +7 -6
  6. epstein_files/documents/imessage/text_message.py +3 -2
  7. epstein_files/documents/json_file.py +17 -0
  8. epstein_files/documents/messenger_log.py +62 -3
  9. epstein_files/documents/other_file.py +165 -17
  10. epstein_files/epstein_files.py +128 -169
  11. epstein_files/util/constant/names.py +8 -1
  12. epstein_files/util/constant/output_files.py +29 -0
  13. epstein_files/util/constant/strings.py +27 -0
  14. epstein_files/util/constant/urls.py +25 -9
  15. epstein_files/util/constants.py +1018 -1045
  16. epstein_files/util/data.py +20 -55
  17. epstein_files/util/{file_cfg.py → doc_cfg.py} +121 -43
  18. epstein_files/util/env.py +19 -20
  19. epstein_files/util/file_helper.py +38 -21
  20. epstein_files/util/highlighted_group.py +229 -177
  21. epstein_files/util/logging.py +63 -0
  22. epstein_files/util/output.py +180 -0
  23. epstein_files/util/rich.py +29 -17
  24. epstein_files/util/search_result.py +14 -6
  25. epstein_files/util/timer.py +24 -0
  26. epstein_files/util/word_count.py +2 -1
  27. {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/METADATA +20 -4
  28. epstein_files-1.0.2.dist-info/RECORD +33 -0
  29. epstein_files-1.0.2.dist-info/entry_points.txt +7 -0
  30. epstein_files-1.0.0.dist-info/RECORD +0 -28
  31. {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/LICENSE +0 -0
  32. {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/WHEEL +0 -0
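
For orientation before the hunks below, here is a minimal sketch (not taken from the package's documentation) of how the 1.0.2 surface visible in this diff might be exercised. The module path for EpsteinFiles, the presence of a local document corpus, and the CLI argument defaults are all assumptions.

    # Illustrative sketch only -- inferred from the hunks below, not from package docs.
    # Assumes EpsteinFiles is importable from epstein_files.epstein_files and that the
    # corpus in DOCS_DIR (plus the package's CLI argument defaults) is available locally.
    from epstein_files.epstein_files import EpsteinFiles

    # New in 1.0.2: get_files() accepts use_pickled to load the gzipped pickle cache
    # at PICKLED_PATH instead of re-parsing every file (see the get_files() hunk below).
    files = EpsteinFiles.get_files(use_pickled=True)

    print(f"{len(files.emails):,} emails / {len(files.imessage_logs)} iMessage logs")
    print(files.json_metadata())  # also new in 1.0.2: JSON dump of per-document metadata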
@@ -1,14 +1,14 @@
  import gzip
+ import json
  import pickle
  import re
  from collections import defaultdict
  from dataclasses import dataclass, field
  from datetime import datetime
  from pathlib import Path
- from typing import Literal, Sequence
+ from typing import Sequence, Type

  from rich.align import Align
- from rich.console import Group
  from rich.padding import Padding
  from rich.table import Table
  from rich.text import Text
@@ -19,24 +19,26 @@ from epstein_files.documents.emails.email_header import AUTHOR
  from epstein_files.documents.json_file import JsonFile
  from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
  from epstein_files.documents.other_file import OtherFile
+ from epstein_files.util.constant.output_files import PICKLED_PATH
  from epstein_files.util.constant.strings import *
  from epstein_files.util.constant.urls import (EPSTEIN_WEB, JMAIL, epsteinify_name_url, epstein_web_person_url,
      search_jmail_url, search_twitter_url)
  from epstein_files.util.constants import *
- from epstein_files.util.data import Timer, dict_sets_to_lists, iso_timestamp, sort_dict
+ from epstein_files.util.data import dict_sets_to_lists, json_safe, sort_dict
+ from epstein_files.util.doc_cfg import EmailCfg
  from epstein_files.util.env import args, logger
- from epstein_files.util.file_cfg import MessageCfg
- from epstein_files.util.file_helper import DOCS_DIR, FILENAME_LENGTH, PICKLED_PATH, file_size_str
+ from epstein_files.util.file_helper import DOCS_DIR, file_size_str
  from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
- from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, QUESTION_MARK_TXT, add_cols_to_table, console,
-     highlighter, link_text_obj, link_markup, print_author_header, print_centered, print_other_site_link, print_panel,
+ from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, add_cols_to_table, console, highlighter,
+     link_text_obj, link_markup, print_author_header, print_centered, print_other_site_link, print_panel,
      print_section_header, vertically_pad)
  from epstein_files.util.search_result import SearchResult
+ from epstein_files.util.timer import Timer

  DEVICE_SIGNATURE = 'Device Signature'
- FIRST_FEW_LINES = 'First Few Lines'
  DEVICE_SIGNATURE_PADDING = (1, 0)
  NOT_INCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]
+ SLOW_FILE_SECONDS = 1.0

  INVALID_FOR_EPSTEIN_WEB = JUNK_EMAILERS + KRASSNER_RECIPIENTS + [
      'ACT for America',
@@ -53,70 +55,55 @@ class EpsteinFiles:
      imessage_logs: list[MessengerLog] = field(default_factory=list)
      json_files: list[JsonFile] = field(default_factory=list)
      other_files: list[OtherFile] = field(default_factory=list)
+     timer: Timer = field(default_factory=lambda: Timer())

      # Analytics / calculations
      email_author_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
      email_authors_to_device_signatures: dict[str, set] = field(default_factory=lambda: defaultdict(set))
      email_device_signatures_to_authors: dict[str, set] = field(default_factory=lambda: defaultdict(set))
      email_recipient_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
-     _email_unknown_recipient_file_ids: set[str] = field(default_factory=set)
+     unknown_recipient_email_ids: set[str] = field(default_factory=set)

      def __post_init__(self):
+         """Iterate through files and build appropriate objects."""
          self.all_files = [f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')]
+         documents = []

          # Read through and classify all the files
          for file_arg in self.all_files:
-             logger.info(f"Scanning '{file_arg.name}'...")
+             doc_timer = Timer(decimals=4)
              document = Document(file_arg)

              if document.length == 0:
-                 logger.info(f"Skipping empty file {document.description().plain}")
-             elif document.text[0] == '{':
-                 # Handle JSON files
-                 self.json_files.append(JsonFile(file_arg, text=document.text))
-                 logger.info(self.json_files[-1].description().plain)
-             elif MSG_REGEX.search(document.text):
-                 # Handle iMessage log files
-                 self.imessage_logs.append(MessengerLog(file_arg, text=document.text))
-                 logger.info(self.imessage_logs[-1].description().plain)
-             elif DETECT_EMAIL_REGEX.match(document.text) or isinstance(document.config, MessageCfg):
-                 # Handle emails
-                 email = Email(file_arg, text=document.text)
-                 logger.info(email.description().plain)
-                 self.emails.append(email)
-                 self.email_author_counts[email.author] += 1
-
-                 if len(email.recipients) == 0:
-                     self._email_unknown_recipient_file_ids.add(email.file_id)
-                     self.email_recipient_counts[None] += 1
-                 else:
-                     for recipient in email.recipients:
-                         self.email_recipient_counts[recipient] += 1
-
-                 if email.sent_from_device:
-                     self.email_authors_to_device_signatures[email.author_or_unknown()].add(email.sent_from_device)
-                     self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())
-             else:
-                 # Handle OtherFiles
-                 self.other_files.append(OtherFile(file_arg, text=document.text))
-                 logger.info(self.other_files[-1].description().plain)
+                 logger.warning(f"Skipping empty file: {document}")
+                 continue
+
+             cls = document_cls(document)
+             documents.append(cls(file_arg, text=document.text))
+             logger.info(str(documents[-1]))

-         self.emails = Document.sort_by_timestamp(self.emails)
-         self.imessage_logs = Document.sort_by_timestamp(self.imessage_logs)
-         self.other_files = Document.sort_by_timestamp(self.other_files + self.json_files)
+             if doc_timer.seconds_since_start() > SLOW_FILE_SECONDS:
+                 doc_timer.print_at_checkpoint(f"Slow file: {documents[-1]} processed")
+
+         self.emails = Document.sort_by_timestamp([d for d in documents if isinstance(d, Email)])
+         self.imessage_logs = Document.sort_by_timestamp([d for d in documents if isinstance(d, MessengerLog)])
+         self.other_files = Document.sort_by_timestamp([d for d in documents if isinstance(d, (JsonFile, OtherFile))])
+         self.json_files = [doc for doc in self.other_files if isinstance(doc, JsonFile)]
+         self._tally_email_data()

      @classmethod
-     def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
+     def get_files(cls, timer: Timer | None = None, use_pickled: bool = False) -> 'EpsteinFiles':
          """Alternate constructor that reads/writes a pickled version of the data ('timer' arg is for logging)."""
          timer = timer or Timer()

-         if (args.pickled and PICKLED_PATH.exists()) and not args.overwrite_pickle:
+         if ((args.pickled or use_pickled) and PICKLED_PATH.exists()) and not args.overwrite_pickle:
              with gzip.open(PICKLED_PATH, 'rb') as file:
                  epstein_files = pickle.load(file)
              timer.print_at_checkpoint(f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})")
+             epstein_files.timer = timer
              return epstein_files

-         epstein_files = EpsteinFiles()
+         epstein_files = EpsteinFiles(timer=timer)

          if args.overwrite_pickle or not PICKLED_PATH.exists():
              with gzip.open(PICKLED_PATH, 'wb') as file:
@@ -141,18 +128,17 @@ class EpsteinFiles:
      def docs_matching(
          self,
          pattern: re.Pattern | str,
-         file_type: Literal['all', 'other'] = 'all',
          names: list[str | None] | None = None
      ) -> list[SearchResult]:
          """Find documents whose text matches a pattern (file_type and names args limit the documents searched)."""
          results: list[SearchResult] = []

-         for doc in (self.all_documents() if file_type == 'all' else self.other_files):
-             lines = doc.lines_matching_txt(pattern)
-
-             if names and ((not isinstance(doc, (Email, MessengerLog))) or doc.author not in names):
+         for doc in self.all_documents():
+             if names and doc.author not in names:
                  continue

+             lines = doc.matching_lines(pattern)
+
              if len(lines) > 0:
                  results.append(SearchResult(doc, lines))

@@ -178,7 +164,7 @@ class EpsteinFiles:
          return substitution_counts

      def email_unknown_recipient_file_ids(self) -> list[str]:
-         return sorted(list(self._email_unknown_recipient_file_ids))
+         return sorted(list(self.unknown_recipient_email_ids))

      def emails_by(self, author: str | None) -> list[Email]:
          return [e for e in self.emails if e.author == author]
@@ -198,48 +184,52 @@ class EpsteinFiles:
          else:
              return [e for e in self.emails if author in e.recipients]

-     def imessage_logs_for(self, author: str | None | list[str | None]) -> Sequence[MessengerLog]:
-         if author in [EVERYONE, JEFFREY_EPSTEIN]:
-             return self.imessage_logs
+     def get_documents_by_id(self, file_ids: list[str]) -> list[Document]:
+         docs = [doc for doc in self.all_documents() if doc.file_id in file_ids]
+
+         if len(docs) != len(file_ids):
+             logger.warning(f"{len(file_ids)} file IDs provided but only {len(docs)} Epstein files found!")

-         authors = author if isinstance(author, list) else [author]
-         return [log for log in self.imessage_logs if log.author in authors]
+         return docs
+
+     def imessage_logs_for(self, author: str | None | list[str | None]) -> Sequence[MessengerLog]:
+         return MessengerLog.logs_for(author, self.imessage_logs)

      def identified_imessage_log_count(self) -> int:
          return len([log for log in self.imessage_logs if log.author])

-     def imessage_sender_counts(self) -> dict[str | None, int]:
-         sender_counts: dict[str | None, int] = defaultdict(int)
+     def json_metadata(self) -> str:
+         metadata = {
+             EMAIL_CLASS: [json_safe(d.metadata()) for d in self.emails],
+             JSON_FILE_CLASS: [json_safe(d.metadata()) for d in self.json_files],
+             MESSENGER_LOG_CLASS: [json_safe(d.metadata()) for d in self.imessage_logs],
+             OTHER_FILE_CLASS: [json_safe(d.metadata()) for d in self.other_files if not isinstance(d, JsonFile)],
+         }

-         for message_log in self.imessage_logs:
-             for message in message_log.messages():
-                 sender_counts[message.author] += 1
+         return json.dumps(metadata, indent=4, sort_keys=True)

-         return sender_counts
+     def non_json_other_files(self) -> list[OtherFile]:
+         return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]

      def print_files_summary(self) -> None:
-         dupes = defaultdict(int)
-
-         for doc in self.all_documents():
-             if doc.is_duplicate:
-                 dupes[doc.document_type()] += 1
-
-         table = Table()
+         table = Table(title='Summary of Document Types')
          add_cols_to_table(table, ['File Type', 'Files', 'Author Known', 'Author Unknown', 'Duplicates'])

-         def add_row(label: str, docs: list, known: int | None = None, dupes: int | None = None):
+         def add_row(label: str, docs: list):
+             known = None if isinstance(docs[0], JsonFile) else len([d for d in docs if d.author])
+
              table.add_row(
                  label,
                  f"{len(docs):,}",
-                 f"{known:,}" if known else NA_TXT,
-                 f"{len(docs) - known:,}" if known else NA_TXT,
-                 f"{dupes:,}" if dupes else NA_TXT,
+                 f"{known:,}" if known is not None else NA_TXT,
+                 f"{len(docs) - known:,}" if known is not None else NA_TXT,
+                 f"{len([d for d in docs if d.is_duplicate])}",
              )

-         add_row('iMessage Logs', self.imessage_logs, self.identified_imessage_log_count())
-         add_row('Emails', self.emails, len([e for e in self.emails if e.author]), dupes[EMAIL_CLASS])
-         add_row('JSON Data', self.json_files, dupes=0)
-         add_row('Other', self.other_files, dupes=dupes[OTHER_FILE_CLASS])
+         add_row('iMessage Logs', self.imessage_logs)
+         add_row('Emails', self.emails)
+         add_row('JSON Data', self.json_files)
+         add_row('Other', self.non_json_other_files())
          console.print(Align.center(table))
          console.line()

@@ -247,10 +237,11 @@ class EpsteinFiles:
          """Print complete emails to or from a particular 'author'. Returns the Emails that were printed."""
          conversation_length = self.email_conversation_length_in_days(_author)
          emails = self.emails_for(_author)
+         unique_emails = [email for email in emails if not email.is_duplicate]
          author = _author or UNKNOWN

          print_author_header(
-             f"Found {len(emails)} {author} emails starting {emails[0].timestamp.date()} over {conversation_length:,} days",
+             f"Found {len(unique_emails)} {author} emails starting {emails[0].timestamp.date()} over {conversation_length:,} days",
              get_style_for_name(author),
              get_info_for_name(author)
          )
@@ -271,28 +262,9 @@ class EpsteinFiles:

          return emails

-     def print_emails_table_for(self, _author: str | None) -> None:
-         emails = [email for email in self.emails_for(_author) if not email.is_duplicate] # Remove dupes
-         author = _author or UNKNOWN
-
-         table = Table(
-             title=f"Emails to/from {author} starting {emails[0].timestamp.date()}",
-             border_style=get_style_for_name(author, allow_bold=False),
-             header_style="bold"
-         )
-
-         table.add_column('From', justify='left')
-         table.add_column('Timestamp', justify='center')
-         table.add_column('Subject', justify='left', style='honeydew2', min_width=60)
-
-         for email in emails:
-             table.add_row(
-                 email.author_txt,
-                 email.epstein_media_link(link_txt=email.timestamp_without_seconds()),
-                 highlighter(email.subject())
-             )
-
-         console.print(Align.center(table), '\n')
+     def print_emails_table_for(self, author: str | None) -> None:
+         emails = [email for email in self.emails_for(author) if not email.is_duplicate] # Remove dupes
+         console.print(Align.center(Email.build_table(emails, author)), '\n')

      def print_email_device_info(self) -> None:
          print_panel(f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown", padding=(4, 0, 0, 0), centered=True)
@@ -300,13 +272,13 @@ class EpsteinFiles:
          console.print(build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))

      def print_emailer_counts_table(self) -> None:
-         footer = f"Identified authors of {self.attributed_email_count()} emails out of {len(self.emails)} potential email files."
+         footer = f"Identified authors of {self.attributed_email_count():,} emails out of {len(self.emails):,}."
          counts_table = Table(title=f"Email Counts", caption=footer, header_style="bold")
          add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_WEB, 'Twitter'])

          emailer_counts = {
-             e: self.email_author_counts[e] + self.email_recipient_counts[e]
-             for e in self.all_emailers(True)
+             emailer: self.email_author_counts[emailer] + self.email_recipient_counts[emailer]
+             for emailer in self.all_emailers(True)
          }

          for p, count in sort_dict(emailer_counts):
@@ -326,76 +298,50 @@ class EpsteinFiles:

      def print_imessage_summary(self) -> None:
          """Print summary table and stats for text messages."""
-         counts_table = Table(title="Text Message Counts By Author", header_style="bold")
-         counts_table.add_column(AUTHOR.title(), justify='left', style="steel_blue bold", width=30)
-         counts_table.add_column('Files', justify='right', style='white')
-         counts_table.add_column("Msgs", justify='right')
-         counts_table.add_column('First Sent At', justify='center', highlight=True, width=21)
-         counts_table.add_column('Last Sent At', justify='center', style='wheat4', width=21)
-         counts_table.add_column('Days', justify='right', style='dim')
-
-         for name, count in sort_dict(self.imessage_sender_counts()):
-             logs = self.imessage_logs_for(name)
-             first_at = logs[0].first_message_at(name)
-             last_at = logs[-1].first_message_at(name)
-
-             counts_table.add_row(
-                 Text(name or UNKNOWN,
-                 get_style_for_name(name)),
-                 str(len(logs)),
-                 f"{count:,}",
-                 iso_timestamp(first_at),
-                 iso_timestamp(last_at),
-                 str((last_at - first_at).days + 1),
-             )
-
-         console.print(counts_table)
+         console.print(MessengerLog.summary_table(self.imessage_logs))
          text_summary_msg = f"\nDeanonymized {self.identified_imessage_log_count()} of "
-         text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files)} files."
+         text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files):,} files."
          console.print(text_summary_msg)
          imessage_msg_count = sum([len(log.messages()) for log in self.imessage_logs])
-         console.print(f"Found {imessage_msg_count} total text messages in {len(self.imessage_logs)} conversations.")
-         console.print(f"(Last deploy found 4668 messages in 77 conversations)", style='dim')
+         console.print(f"Found {imessage_msg_count} text messages in {len(self.imessage_logs)} iMessage log files.")

      def print_other_files_table(self) -> list[OtherFile]:
-         """Returns the OtherFiles that were interesting enough to print."""
+         """Returns the OtherFile objects that were interesting enough to print."""
          interesting_files = [doc for doc in self.other_files if args.all_other_files or doc.is_interesting()]
          header_pfx = '' if args.all_other_files else 'Selected '
          print_section_header(f"{FIRST_FEW_LINES} of {len(interesting_files)} {header_pfx}Files That Are Neither Emails Nor Text Msgs")

          if not args.all_other_files:
-             print_centered(f"(the other site is uncurated and has all {len(self.other_files)} unclassifiable files and all {len(self.emails):,} emails)", style='dim')
+             print_centered(f"(the other site is uncurated and has all {len(self.other_files)} unclassifiable files and {len(self.emails):,} emails)", style='dim')
              print_other_site_link(False)
              console.line(2)

-         table = Table(header_style='bold', show_lines=True)
-         table.add_column('File', justify='center', width=FILENAME_LENGTH)
-         table.add_column('Date', justify='center')
-         table.add_column('Length', justify='center')
-         table.add_column(FIRST_FEW_LINES, justify='left', style='pale_turquoise4')
+         console.print(OtherFile.build_table(interesting_files))
+         skipped_file_count = len(self.other_files) - len(interesting_files)

-         for doc in interesting_files:
-             link_and_info = [doc.raw_document_link_txt(), *doc.hints()]
-             date_str = doc.date_str()
+         if skipped_file_count > 0:
+             logger.warning(f"Skipped {skipped_file_count} uninteresting files...")

-             if doc.is_duplicate:
-                 preview_text = doc.duplicate_file_txt()
-                 row_style = ' dim'
-             else:
-                 preview_text = doc.highlighted_preview_text()
-                 row_style = ''
+         return interesting_files

-             table.add_row(
-                 Group(*link_and_info),
-                 Text(date_str, style=TIMESTAMP_DIM) if date_str else QUESTION_MARK_TXT,
-                 doc.file_size_str(),
-                 preview_text,
-                 style=row_style
-             )
+     def _tally_email_data(self) -> None:
+         """Tally up summary info about Email objects."""
+         for email in self.emails:
+             if email.is_duplicate:
+                 continue

-         console.print(table)
-         logger.warning(f"Skipped {len(self.other_files) - len(interesting_files)} uninteresting files...")
-         return interesting_files
+             self.email_author_counts[email.author] += 1
+
+             if len(email.recipients) == 0:
+                 self.unknown_recipient_email_ids.add(email.file_id)
+                 self.email_recipient_counts[None] += 1
+             else:
+                 for recipient in email.recipients:
+                     self.email_recipient_counts[recipient] += 1
+
+             if email.sent_from_device:
+                 self.email_authors_to_device_signatures[email.author_or_unknown()].add(email.sent_from_device)
+                 self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())


  def build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
@@ -413,18 +359,6 @@ def build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str]
      return Padding(table, DEVICE_SIGNATURE_PADDING)


- def is_ok_for_epstein_web(name: str | None) -> bool:
-     """Return True if it's likely that EpsteinWeb has a page for this name."""
-     if name is None or ' ' not in name:
-         return False
-     elif '@' in name or '/' in name or '??' in name:
-         return False
-     elif name in INVALID_FOR_EPSTEIN_WEB:
-         return False
-
-     return True
-
-
  def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
      counts: dict[str | None, int] = defaultdict(int)

@@ -435,3 +369,28 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
              counts[None] += 1

      return counts
+
+
+ def document_cls(document: Document) -> Type[Document]:
+     search_area = document.text[0:5000] # Limit search area to avoid pointless scans of huge files
+
+     if document.text[0] == '{':
+         return JsonFile
+     elif isinstance(document.config, EmailCfg) or DETECT_EMAIL_REGEX.match(search_area):
+         return Email
+     elif MSG_REGEX.search(search_area):
+         return MessengerLog
+     else:
+         return OtherFile
+
+
+ def is_ok_for_epstein_web(name: str | None) -> bool:
+     """Return True if it's likely that EpsteinWeb has a page for this name."""
+     if name is None or ' ' not in name:
+         return False
+     elif '@' in name or '/' in name or '??' in name:
+         return False
+     elif name in INVALID_FOR_EPSTEIN_WEB:
+         return False
+
+     return True
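
The new document_cls() dispatcher above replaces the 1.0.0 if/elif chain in __post_init__ and also reorders the checks (JSON, then email, then iMessage transcript, then fallback). The standalone sketch below mirrors that ordering with hypothetical stand-in regexes, since the package's DETECT_EMAIL_REGEX and MSG_REGEX are defined elsewhere and the document.config / EmailCfg branch is omitted.

    # Standalone sketch of the classification order used by document_cls() above.
    # EMAIL_REGEX_STANDIN and MSG_REGEX_STANDIN are hypothetical stand-ins for the
    # package's DETECT_EMAIL_REGEX and MSG_REGEX patterns.
    import re

    EMAIL_REGEX_STANDIN = re.compile(r'^(From|To|Subject):', re.MULTILINE)
    MSG_REGEX_STANDIN = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}', re.MULTILINE)

    def classify(text: str) -> str:
        search_area = text[0:5000]  # same cap applied above to avoid scanning huge files

        if text[0] == '{':
            return 'JsonFile'
        elif EMAIL_REGEX_STANDIN.match(search_area):
            return 'Email'
        elif MSG_REGEX_STANDIN.search(search_area):
            return 'MessengerLog'
        return 'OtherFile'

    assert classify('{"doc_id": 1}') == 'JsonFile'
    assert classify('From: someone@example.com\nSubject: hi') == 'Email'
    assert classify('2019-07-08 14:02 someone: hello') == 'MessengerLog'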
@@ -1,6 +1,5 @@
  from epstein_files.util.constant.strings import QUESTION_MARKS, remove_question_marks

-
  UNKNOWN = '(unknown)'

  # Texting Names
@@ -170,6 +169,7 @@ ZUBAIR_KHAN = 'Zubair Khan'

  # No communications but name is in the files
  BILL_GATES = 'Bill Gates'
+ DONALD_TRUMP = 'Donald Trump'
  ELON_MUSK = 'Elon Musk'
  HENRY_HOLT = 'Henry Holt' # Actually a company?
  IVANKA = 'Ivanka'
@@ -184,15 +184,22 @@ TULSI_GABBARD = 'Tulsi Gabbard'
  VIRGINIA_GIUFFRE = 'Virginia Giuffre'

  # Organizations
+ BOFA = 'BofA'
  CNN = 'CNN'
  DEUTSCHE_BANK = 'Deutsche Bank'
+ ELECTRON_CAPITAL_PARTNERS = 'Electron Capital Partners'
  GOLDMAN_SACHS = 'Goldman Sachs'
+ GOLDMAN_INVESTMENT_MGMT = f'{GOLDMAN_SACHS} Investment Management Division'
  HARVARD = 'Harvard'
  INSIGHTS_POD = f"InsightsPod" # Zubair bots
+ NEXT_MANAGEMENT = 'Next Management LLC'
  JP_MORGAN = 'JP Morgan'
  OSBORNE_LLP = f"{IAN_OSBORNE} & Partners LLP" # Ian Osborne's PR firm
+ TRUMP_ORG = 'Trump Organization'
+ UBS = 'UBS'

  # Locations
+ PALM_BEACH = 'Palm Beach'
  VIRGIN_ISLANDS = 'Virgin Islands'

  # First and last names that should be made part of a highlighting regex for emailers
@@ -0,0 +1,29 @@
+ from pathlib import Path
+
+ PICKLED_PATH = Path("the_epstein_files.pkl.gz")
+
+ EPSTEIN_FILES_NOV_2025 = 'epstein_files_nov_2025'
+ URLS_ENV = '.urls.env'
+
+ HTML_DIR = Path('docs')
+ ALL_EMAILS_PATH = HTML_DIR.joinpath(f'all_emails_{EPSTEIN_FILES_NOV_2025}.html')
+ JSON_METADATA_PATH = HTML_DIR.joinpath(f'file_metadata_{EPSTEIN_FILES_NOV_2025}.json')
+ TEXT_MSGS_HTML_PATH = HTML_DIR.joinpath('index.html')
+ WORD_COUNT_HTML_PATH = HTML_DIR.joinpath(f'communication_word_count_{EPSTEIN_FILES_NOV_2025}.html')
+ # EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
+
+ BUILD_ARTIFACTS = [
+     ALL_EMAILS_PATH,
+     # EPSTEIN_WORD_COUNT_HTML_PATH,
+     JSON_METADATA_PATH,
+     TEXT_MSGS_HTML_PATH,
+     WORD_COUNT_HTML_PATH,
+ ]
+
+
+ def make_clean() -> None:
+     """Delete all build artifacts."""
+     for build_file in BUILD_ARTIFACTS:
+         if build_file.exists():
+             print(f"Removing build file '{build_file}'...")
+             build_file.unlink()
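
The new output_files module above centralizes the build artifact paths. Assuming the installed package exposes the module at the path shown in the file listing, its make_clean() helper could be invoked directly, for example:

    # Deletes whichever of the BUILD_ARTIFACTS defined above currently exist.
    from epstein_files.util.constant.output_files import make_clean

    make_clean()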
@@ -9,6 +9,27 @@ JSON_FILE_CLASS = 'JsonFile'
  MESSENGER_LOG_CLASS = 'MessengerLog'
  OTHER_FILE_CLASS = 'OtherFile'

+ # categories
+ ACADEMIA = 'academia'
+ ARTS = 'arts'
+ ARTICLE = 'article'
+ BOOK = 'book'
+ BUSINESS = 'business'
+ CONFERENCE = 'conference'
+ ENTERTAINER = 'entertainer'
+ FINANCE = 'finance'
+ FLIGHT_LOGS = 'flight logs'
+ JOURNALIST = 'journalist'
+ JUNK = 'junk'
+ LEGAL = 'legal'
+ LOBBYIST = 'lobbyist'
+ POLITICS = 'politics'
+ PROPERTY = 'property'
+ PUBLICIST = 'publicist'
+ REPUTATION = 'reputation'
+ SOCIAL = 'social'
+ SPEECH = 'speech'
+
  # Publications
  BBC = 'BBC'
  BLOOMBERG = 'Bloomberg'
@@ -36,11 +57,17 @@ TIMESTAMP_DIM = f"turquoise4 dim"
  AUTHOR = 'author'
  DEFAULT = 'default'
  EVERYONE = 'everyone'
+ FIRST_FEW_LINES = 'First Few Lines'
  HOUSE_OVERSIGHT_PREFIX = 'HOUSE_OVERSIGHT_'
+ JSON = 'json'
  NA = 'n/a'
  REDACTED = '<REDACTED>'
  URL_SIGNIFIERS = ['gclid', 'htm', 'ref=', 'utm']
  QUESTION_MARKS = '(???)'
+
+ # Regexes
+ FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}(\d{{6}}(_\d{{1,2}})?)")
+ FILE_NAME_REGEX = re.compile(fr"{FILE_STEM_REGEX.pattern}(\.txt(\.json)?)?")
  QUESTION_MARKS_REGEX = re.compile(fr' {re.escape(QUESTION_MARKS)}$')

@@ -5,8 +5,9 @@ from typing import Literal
  from inflection import parameterize
  from rich.text import Text

+ from epstein_files.util.constant.output_files import *
  from epstein_files.util.constant.strings import EMAIL, TEXT_MESSAGE, SiteType
- from epstein_files.util.file_helper import coerce_file_stem, filename_for_id
+ from epstein_files.util.file_helper import coerce_file_stem

  # Style stuff
  ARCHIVE_LINK_COLOR = 'slate_blue3'
@@ -20,8 +21,29 @@ EPSTEIN_WEB = 'EpsteinWeb'
  EPSTEINIFY = 'epsteinify'
  JMAIL = 'Jmail'

- # URLs
- ATTRIBUTIONS_URL = 'https://github.com/michelcrypt4d4mus/epstein_text_messages/blob/master/epstein_files/util/constants.py'
+
+ # Deployment URLS
+ # NOTE: don't rename these variables without changing deploy.sh!
+ GH_PAGES_BASE_URL = 'https://michelcrypt4d4mus.github.io'
+ TEXT_MSGS_URL = f"{GH_PAGES_BASE_URL}/epstein_text_messages"
+ ALL_EMAILS_URL = f'{TEXT_MSGS_URL}/{ALL_EMAILS_PATH.name}'
+ JSON_METADATA_URL = f'{TEXT_MSGS_URL}/{JSON_METADATA_PATH.name}'
+ WORD_COUNT_URL = f'{TEXT_MSGS_URL}/{WORD_COUNT_HTML_PATH.name}'
+
+ SITE_URLS: dict[SiteType, str] = {
+     EMAIL: ALL_EMAILS_URL,
+     TEXT_MESSAGE: TEXT_MSGS_URL,
+ }
+
+ GH_PROJECT_URL = 'https://github.com/michelcrypt4d4mus/epstein_text_messages'
+ GH_MASTER_URL = f"{GH_PROJECT_URL}/blob/master"
+ ATTRIBUTIONS_URL = f'{GH_MASTER_URL}/epstein_files/util/constants.py'
+ EXTRACTS_BASE_URL = f'{GH_MASTER_URL}/emails_extracted_from_legal_filings'
+
+ extracted_file_url = lambda f: f"{EXTRACTS_BASE_URL}/{f}"
+
+
+ # External URLs
  COFFEEZILLA_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=061ce61c9e70bdfd'
  COURIER_NEWSROOM_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=092314e384a58618'
  EPSTEINIFY_URL = 'https://epsteinify.com'
@@ -31,12 +53,6 @@ JMAIL_URL = 'https://jmail.world'
  OVERSIGHT_REPUBLICANS_PRESSER_URL = 'https://oversight.house.gov/release/oversight-committee-releases-additional-epstein-estate-documents/'
  RAW_OVERSIGHT_DOCS_GOOGLE_DRIVE_URL = 'https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_'
  SUBSTACK_URL = 'https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great'
- WORD_COUNT_URL = 'https://michelcrypt4d4mus.github.io/epstein_text_messages/epstein_emails_word_count.html'
-
- SITE_URLS: dict[SiteType, str] = {
-     EMAIL: 'https://michelcrypt4d4mus.github.io/epstein_emails_house_oversight/',
-     TEXT_MESSAGE: 'https://michelcrypt4d4mus.github.io/epstein_text_messages/',
- }

  DOC_LINK_BASE_URLS: dict[ExternalSite, str] = {
      EPSTEIN_MEDIA: f"{EPSTEIN_MEDIA_URL}/files",