epstein-files 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in the public registry. It is provided for informational purposes only.
@@ -1,14 +1,14 @@
  import gzip
+ import json
  import pickle
  import re
  from collections import defaultdict
  from dataclasses import dataclass, field
  from datetime import datetime
  from pathlib import Path
- from typing import Literal, Sequence
+ from typing import Sequence, Type

  from rich.align import Align
- from rich.console import Group
  from rich.padding import Padding
  from rich.table import Table
  from rich.text import Text
@@ -23,20 +23,21 @@ from epstein_files.util.constant.strings import *
  from epstein_files.util.constant.urls import (EPSTEIN_WEB, JMAIL, epsteinify_name_url, epstein_web_person_url,
      search_jmail_url, search_twitter_url)
  from epstein_files.util.constants import *
- from epstein_files.util.data import Timer, dict_sets_to_lists, iso_timestamp, sort_dict
+ from epstein_files.util.data import dict_sets_to_lists, json_safe, sort_dict
+ from epstein_files.util.doc_cfg import EmailCfg
  from epstein_files.util.env import args, logger
- from epstein_files.util.file_cfg import MessageCfg
- from epstein_files.util.file_helper import DOCS_DIR, FILENAME_LENGTH, PICKLED_PATH, file_size_str
+ from epstein_files.util.file_helper import DOCS_DIR, PICKLED_PATH, file_size_str
  from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
- from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, QUESTION_MARK_TXT, add_cols_to_table, console,
-     highlighter, link_text_obj, link_markup, print_author_header, print_centered, print_other_site_link, print_panel,
+ from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, add_cols_to_table, console, highlighter,
+     link_text_obj, link_markup, print_author_header, print_centered, print_other_site_link, print_panel,
      print_section_header, vertically_pad)
  from epstein_files.util.search_result import SearchResult
+ from epstein_files.util.timer import Timer

  DEVICE_SIGNATURE = 'Device Signature'
- FIRST_FEW_LINES = 'First Few Lines'
  DEVICE_SIGNATURE_PADDING = (1, 0)
  NOT_INCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]
+ SLOW_FILE_SECONDS = 0.4

  INVALID_FOR_EPSTEIN_WEB = JUNK_EMAILERS + KRASSNER_RECIPIENTS + [
      'ACT for America',
@@ -59,51 +60,34 @@ class EpsteinFiles:
      email_authors_to_device_signatures: dict[str, set] = field(default_factory=lambda: defaultdict(set))
      email_device_signatures_to_authors: dict[str, set] = field(default_factory=lambda: defaultdict(set))
      email_recipient_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
-     _email_unknown_recipient_file_ids: set[str] = field(default_factory=set)
+     unknown_recipient_email_ids: set[str] = field(default_factory=set)

      def __post_init__(self):
+         """Iterate through files and build appropriate objects."""
          self.all_files = [f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')]
+         documents = []

          # Read through and classify all the files
          for file_arg in self.all_files:
-             logger.info(f"Scanning '{file_arg.name}'...")
+             doc_timer = Timer(decimals=4)
              document = Document(file_arg)

              if document.length == 0:
-                 logger.info(f"Skipping empty file {document.description().plain}")
-             elif document.text[0] == '{':
-                 # Handle JSON files
-                 self.json_files.append(JsonFile(file_arg, text=document.text))
-                 logger.info(self.json_files[-1].description().plain)
-             elif MSG_REGEX.search(document.text):
-                 # Handle iMessage log files
-                 self.imessage_logs.append(MessengerLog(file_arg, text=document.text))
-                 logger.info(self.imessage_logs[-1].description().plain)
-             elif DETECT_EMAIL_REGEX.match(document.text) or isinstance(document.config, MessageCfg):
-                 # Handle emails
-                 email = Email(file_arg, text=document.text)
-                 logger.info(email.description().plain)
-                 self.emails.append(email)
-                 self.email_author_counts[email.author] += 1
-
-                 if len(email.recipients) == 0:
-                     self._email_unknown_recipient_file_ids.add(email.file_id)
-                     self.email_recipient_counts[None] += 1
-                 else:
-                     for recipient in email.recipients:
-                         self.email_recipient_counts[recipient] += 1
-
-                 if email.sent_from_device:
-                     self.email_authors_to_device_signatures[email.author_or_unknown()].add(email.sent_from_device)
-                     self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())
-             else:
-                 # Handle OtherFiles
-                 self.other_files.append(OtherFile(file_arg, text=document.text))
-                 logger.info(self.other_files[-1].description().plain)
+                 logger.warning(f"Skipping empty file: {document}")
+                 continue
+
+             cls = document_cls(document)
+             documents.append(cls(file_arg, text=document.text))
+             logger.info(str(documents[-1]))

-         self.emails = Document.sort_by_timestamp(self.emails)
-         self.imessage_logs = Document.sort_by_timestamp(self.imessage_logs)
-         self.other_files = Document.sort_by_timestamp(self.other_files + self.json_files)
+             if doc_timer.seconds_since_start() > SLOW_FILE_SECONDS:
+                 doc_timer.print_at_checkpoint(f"Slow file: {documents[-1]} processed")
+
+         self.emails = Document.sort_by_timestamp([d for d in documents if isinstance(d, Email)])
+         self.imessage_logs = Document.sort_by_timestamp([d for d in documents if isinstance(d, MessengerLog)])
+         self.other_files = Document.sort_by_timestamp([d for d in documents if isinstance(d, (JsonFile, OtherFile))])
+         self.json_files = [doc for doc in self.other_files if isinstance(doc, JsonFile)]
+         self._tally_email_data()

      @classmethod
      def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
@@ -141,18 +125,17 @@ class EpsteinFiles:
      def docs_matching(
          self,
          pattern: re.Pattern | str,
-         file_type: Literal['all', 'other'] = 'all',
          names: list[str | None] | None = None
      ) -> list[SearchResult]:
          """Find documents whose text matches a pattern (file_type and names args limit the documents searched)."""
          results: list[SearchResult] = []

-         for doc in (self.all_documents() if file_type == 'all' else self.other_files):
-             lines = doc.lines_matching_txt(pattern)
-
-             if names and ((not isinstance(doc, (Email, MessengerLog))) or doc.author not in names):
+         for doc in self.all_documents():
+             if names and doc.author not in names:
                  continue

+             lines = doc.matching_lines(pattern)
+
              if len(lines) > 0:
                  results.append(SearchResult(doc, lines))

@@ -178,7 +161,7 @@ class EpsteinFiles:
          return substitution_counts

      def email_unknown_recipient_file_ids(self) -> list[str]:
-         return sorted(list(self._email_unknown_recipient_file_ids))
+         return sorted(list(self.unknown_recipient_email_ids))

      def emails_by(self, author: str | None) -> list[Email]:
          return [e for e in self.emails if e.author == author]
@@ -198,33 +181,38 @@ class EpsteinFiles:
          else:
              return [e for e in self.emails if author in e.recipients]

-     def imessage_logs_for(self, author: str | None | list[str | None]) -> Sequence[MessengerLog]:
-         if author in [EVERYONE, JEFFREY_EPSTEIN]:
-             return self.imessage_logs
+     def get_documents_by_id(self, file_ids: list[str]) -> list[Document]:
+         docs = [doc for doc in self.all_documents() if doc.file_id in file_ids]
+
+         if len(docs) != len(file_ids):
+             logger.warning(f"{len(file_ids)} file IDs provided but only {len(docs)} Epstein files found!")
+
+         return docs

-         authors = author if isinstance(author, list) else [author]
-         return [log for log in self.imessage_logs if log.author in authors]
+     def imessage_logs_for(self, author: str | None | list[str | None]) -> Sequence[MessengerLog]:
+         return MessengerLog.logs_for(author, self.imessage_logs)

      def identified_imessage_log_count(self) -> int:
          return len([log for log in self.imessage_logs if log.author])

-     def imessage_sender_counts(self) -> dict[str | None, int]:
-         sender_counts: dict[str | None, int] = defaultdict(int)
-
-         for message_log in self.imessage_logs:
-             for message in message_log.messages():
-                 sender_counts[message.author] += 1
+     def json_metadata(self) -> str:
+         metadata = {
+             EMAIL_CLASS: [json_safe(doc.metadata()) for doc in self.emails],
+             MESSENGER_LOG_CLASS: [json_safe(doc.metadata()) for doc in self.imessage_logs],
+             OTHER_FILE_CLASS: [json_safe(doc.metadata()) for doc in self.other_files],
+         }

-         return sender_counts
+         return json.dumps(metadata, indent=4, sort_keys=True)

      def print_files_summary(self) -> None:
+         other_files = [doc for doc in self.other_files if not isinstance(doc, JsonFile)]
          dupes = defaultdict(int)

          for doc in self.all_documents():
              if doc.is_duplicate:
-                 dupes[doc.document_type()] += 1
+                 dupes[doc.class_name()] += 1

-         table = Table()
+         table = Table(title='Summary of Document Types')
          add_cols_to_table(table, ['File Type', 'Files', 'Author Known', 'Author Unknown', 'Duplicates'])

          def add_row(label: str, docs: list, known: int | None = None, dupes: int | None = None):
@@ -239,7 +227,7 @@ class EpsteinFiles:
          add_row('iMessage Logs', self.imessage_logs, self.identified_imessage_log_count())
          add_row('Emails', self.emails, len([e for e in self.emails if e.author]), dupes[EMAIL_CLASS])
          add_row('JSON Data', self.json_files, dupes=0)
-         add_row('Other', self.other_files, dupes=dupes[OTHER_FILE_CLASS])
+         add_row('Other', other_files, dupes=dupes[OTHER_FILE_CLASS])
          console.print(Align.center(table))
          console.line()

@@ -247,10 +235,11 @@ class EpsteinFiles:
          """Print complete emails to or from a particular 'author'. Returns the Emails that were printed."""
          conversation_length = self.email_conversation_length_in_days(_author)
          emails = self.emails_for(_author)
+         unique_emails = [email for email in emails if not email.is_duplicate]
          author = _author or UNKNOWN

          print_author_header(
-             f"Found {len(emails)} {author} emails starting {emails[0].timestamp.date()} over {conversation_length:,} days",
+             f"Found {len(unique_emails)} {author} emails starting {emails[0].timestamp.date()} over {conversation_length:,} days",
              get_style_for_name(author),
              get_info_for_name(author)
          )
@@ -271,28 +260,9 @@ class EpsteinFiles:

          return emails

-     def print_emails_table_for(self, _author: str | None) -> None:
-         emails = [email for email in self.emails_for(_author) if not email.is_duplicate] # Remove dupes
-         author = _author or UNKNOWN
-
-         table = Table(
-             title=f"Emails to/from {author} starting {emails[0].timestamp.date()}",
-             border_style=get_style_for_name(author, allow_bold=False),
-             header_style="bold"
-         )
-
-         table.add_column('From', justify='left')
-         table.add_column('Timestamp', justify='center')
-         table.add_column('Subject', justify='left', style='honeydew2', min_width=60)
-
-         for email in emails:
-             table.add_row(
-                 email.author_txt,
-                 email.epstein_media_link(link_txt=email.timestamp_without_seconds()),
-                 highlighter(email.subject())
-             )
-
-         console.print(Align.center(table), '\n')
+     def print_emails_table_for(self, author: str | None) -> None:
+         emails = [email for email in self.emails_for(author) if not email.is_duplicate] # Remove dupes
+         console.print(Align.center(Email.build_table(emails, author)), '\n')

      def print_email_device_info(self) -> None:
          print_panel(f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown", padding=(4, 0, 0, 0), centered=True)
@@ -300,13 +270,13 @@ class EpsteinFiles:
          console.print(build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))

      def print_emailer_counts_table(self) -> None:
-         footer = f"Identified authors of {self.attributed_email_count()} emails out of {len(self.emails)} potential email files."
+         footer = f"Identified authors of {self.attributed_email_count():,} emails out of {len(self.emails):,}."
          counts_table = Table(title=f"Email Counts", caption=footer, header_style="bold")
          add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_WEB, 'Twitter'])

          emailer_counts = {
-             e: self.email_author_counts[e] + self.email_recipient_counts[e]
-             for e in self.all_emailers(True)
+             emailer: self.email_author_counts[emailer] + self.email_recipient_counts[emailer]
+             for emailer in self.all_emailers(True)
          }

          for p, count in sort_dict(emailer_counts):
@@ -326,76 +296,50 @@ class EpsteinFiles:

      def print_imessage_summary(self) -> None:
          """Print summary table and stats for text messages."""
-         counts_table = Table(title="Text Message Counts By Author", header_style="bold")
-         counts_table.add_column(AUTHOR.title(), justify='left', style="steel_blue bold", width=30)
-         counts_table.add_column('Files', justify='right', style='white')
-         counts_table.add_column("Msgs", justify='right')
-         counts_table.add_column('First Sent At', justify='center', highlight=True, width=21)
-         counts_table.add_column('Last Sent At', justify='center', style='wheat4', width=21)
-         counts_table.add_column('Days', justify='right', style='dim')
-
-         for name, count in sort_dict(self.imessage_sender_counts()):
-             logs = self.imessage_logs_for(name)
-             first_at = logs[0].first_message_at(name)
-             last_at = logs[-1].first_message_at(name)
-
-             counts_table.add_row(
-                 Text(name or UNKNOWN,
-                      get_style_for_name(name)),
-                 str(len(logs)),
-                 f"{count:,}",
-                 iso_timestamp(first_at),
-                 iso_timestamp(last_at),
-                 str((last_at - first_at).days + 1),
-             )
-
-         console.print(counts_table)
+         console.print(MessengerLog.summary_table(self.imessage_logs))
          text_summary_msg = f"\nDeanonymized {self.identified_imessage_log_count()} of "
-         text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files)} files."
+         text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files):,} files."
          console.print(text_summary_msg)
          imessage_msg_count = sum([len(log.messages()) for log in self.imessage_logs])
-         console.print(f"Found {imessage_msg_count} total text messages in {len(self.imessage_logs)} conversations.")
-         console.print(f"(Last deploy found 4668 messages in 77 conversations)", style='dim')
+         console.print(f"Found {imessage_msg_count} text messages in {len(self.imessage_logs)} iMessage log files.")

      def print_other_files_table(self) -> list[OtherFile]:
-         """Returns the OtherFiles that were interesting enough to print."""
+         """Returns the OtherFile objects that were interesting enough to print."""
          interesting_files = [doc for doc in self.other_files if args.all_other_files or doc.is_interesting()]
          header_pfx = '' if args.all_other_files else 'Selected '
          print_section_header(f"{FIRST_FEW_LINES} of {len(interesting_files)} {header_pfx}Files That Are Neither Emails Nor Text Msgs")

          if not args.all_other_files:
-             print_centered(f"(the other site is uncurated and has all {len(self.other_files)} unclassifiable files and all {len(self.emails):,} emails)", style='dim')
+             print_centered(f"(the other site is uncurated and has all {len(self.other_files)} unclassifiable files and {len(self.emails):,} emails)", style='dim')
              print_other_site_link(False)
              console.line(2)

-         table = Table(header_style='bold', show_lines=True)
-         table.add_column('File', justify='center', width=FILENAME_LENGTH)
-         table.add_column('Date', justify='center')
-         table.add_column('Length', justify='center')
-         table.add_column(FIRST_FEW_LINES, justify='left', style='pale_turquoise4')
+         console.print(OtherFile.build_table(interesting_files))
+         skipped_file_count = len(self.other_files) - len(interesting_files)

-         for doc in interesting_files:
-             link_and_info = [doc.raw_document_link_txt(), *doc.hints()]
-             date_str = doc.date_str()
+         if skipped_file_count > 0:
+             logger.warning(f"Skipped {skipped_file_count} uninteresting files...")

-             if doc.is_duplicate:
-                 preview_text = doc.duplicate_file_txt()
-                 row_style = ' dim'
-             else:
-                 preview_text = doc.highlighted_preview_text()
-                 row_style = ''
+         return interesting_files

-             table.add_row(
-                 Group(*link_and_info),
-                 Text(date_str, style=TIMESTAMP_DIM) if date_str else QUESTION_MARK_TXT,
-                 doc.file_size_str(),
-                 preview_text,
-                 style=row_style
-             )
+     def _tally_email_data(self) -> None:
+         """Tally up summary info about Email objects."""
+         for email in self.emails:
+             if email.is_duplicate:
+                 continue

-         console.print(table)
-         logger.warning(f"Skipped {len(self.other_files) - len(interesting_files)} uninteresting files...")
-         return interesting_files
+             self.email_author_counts[email.author] += 1
+
+             if len(email.recipients) == 0:
+                 self.unknown_recipient_email_ids.add(email.file_id)
+                 self.email_recipient_counts[None] += 1
+             else:
+                 for recipient in email.recipients:
+                     self.email_recipient_counts[recipient] += 1
+
+             if email.sent_from_device:
+                 self.email_authors_to_device_signatures[email.author_or_unknown()].add(email.sent_from_device)
+                 self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())


  def build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
@@ -413,6 +357,19 @@ def build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str]
      return Padding(table, DEVICE_SIGNATURE_PADDING)


+ def document_cls(document: Document) -> Type[Document]:
+     search_area = document.text[0:5000] # Limit search area to avoid pointless scans of huge files
+
+     if document.text[0] == '{':
+         return JsonFile
+     elif isinstance(document.config, EmailCfg) or DETECT_EMAIL_REGEX.match(search_area):
+         return Email
+     elif MSG_REGEX.search(search_area):
+         return MessengerLog
+     else:
+         return OtherFile
+
+
  def is_ok_for_epstein_web(name: str | None) -> bool:
      """Return True if it's likely that EpsteinWeb has a page for this name."""
      if name is None or ' ' not in name:
@@ -184,15 +184,21 @@ TULSI_GABBARD = 'Tulsi Gabbard'
  VIRGINIA_GIUFFRE = 'Virginia Giuffre'

  # Organizations
+ BOFA = 'BofA'
  CNN = 'CNN'
  DEUTSCHE_BANK = 'Deutsche Bank'
+ ELECTRON_CAPITAL_PARTNERS = 'Electron Capital Partners'
  GOLDMAN_SACHS = 'Goldman Sachs'
+ GOLDMAN_INVESTMENT_MGMT = f'{GOLDMAN_SACHS} Investment Management Division'
  HARVARD = 'Harvard'
  INSIGHTS_POD = f"InsightsPod" # Zubair bots
+ NEXT_MANAGEMENT = 'Next Management LLC'
  JP_MORGAN = 'JP Morgan'
  OSBORNE_LLP = f"{IAN_OSBORNE} & Partners LLP" # Ian Osborne's PR firm
+ UBS = 'UBS'

  # Locations
+ PALM_BEACH = 'Palm Beach'
  VIRGIN_ISLANDS = 'Virgin Islands'

  # First and last names that should be made part of a highlighting regex for emailers
@@ -9,6 +9,27 @@ JSON_FILE_CLASS = 'JsonFile'
  MESSENGER_LOG_CLASS = 'MessengerLog'
  OTHER_FILE_CLASS = 'OtherFile'

+ # categories
+ ACADEMIA = 'academia'
+ ARTS = 'arts'
+ ARTICLE = 'article'
+ BOOK = 'book'
+ BUSINESS = 'business'
+ CONFERENCE = 'conference'
+ ENTERTAINER = 'entertainer'
+ FINANCE = 'finance'
+ FLIGHT_LOGS = 'flight logs'
+ JOURNALIST = 'journalist'
+ JUNK = 'junk'
+ LEGAL = 'legal'
+ LOBBYIST = 'lobbyist'
+ POLITICS = 'politics'
+ PROPERTY = 'property'
+ PUBLICIST = 'publicist'
+ REPUTATION = 'reputation'
+ SOCIAL = 'social'
+ SPEECH = 'speech'
+
  # Publications
  BBC = 'BBC'
  BLOOMBERG = 'Bloomberg'
@@ -36,11 +57,17 @@ TIMESTAMP_DIM = f"turquoise4 dim"
  AUTHOR = 'author'
  DEFAULT = 'default'
  EVERYONE = 'everyone'
+ FIRST_FEW_LINES = 'First Few Lines'
  HOUSE_OVERSIGHT_PREFIX = 'HOUSE_OVERSIGHT_'
+ JSON = 'json'
  NA = 'n/a'
  REDACTED = '<REDACTED>'
  URL_SIGNIFIERS = ['gclid', 'htm', 'ref=', 'utm']
  QUESTION_MARKS = '(???)'
+
+ # Regexes
+ FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}(\d{{6}}(_\d{{1,2}})?)")
+ FILE_NAME_REGEX = re.compile(fr"{FILE_STEM_REGEX.pattern}(\.txt(\.json)?)?")
  QUESTION_MARKS_REGEX = re.compile(fr' {re.escape(QUESTION_MARKS)}$')


@@ -6,7 +6,7 @@ from inflection import parameterize
  from rich.text import Text

  from epstein_files.util.constant.strings import EMAIL, TEXT_MESSAGE, SiteType
- from epstein_files.util.file_helper import coerce_file_stem, filename_for_id
+ from epstein_files.util.file_helper import JSON_METADATA_PATH, WORD_COUNT_HTML_PATH, coerce_file_stem

  # Style stuff
  ARCHIVE_LINK_COLOR = 'slate_blue3'
@@ -20,8 +20,27 @@ EPSTEIN_WEB = 'EpsteinWeb'
  EPSTEINIFY = 'epsteinify'
  JMAIL = 'Jmail'

- # URLs
- ATTRIBUTIONS_URL = 'https://github.com/michelcrypt4d4mus/epstein_text_messages/blob/master/epstein_files/util/constants.py'
+
+ # Cryptadamus URLs
+ GH_PAGES_BASE_URL = 'https://michelcrypt4d4mus.github.io'
+ TEXT_MSGS_BASE_URL = f"{GH_PAGES_BASE_URL}/epstein_text_messages"
+ JSON_METADATA_URL = f'{TEXT_MSGS_BASE_URL}/{JSON_METADATA_PATH.name}'
+ WORD_COUNT_URL = f'{TEXT_MSGS_BASE_URL}/{WORD_COUNT_HTML_PATH.name}'
+
+ SITE_URLS: dict[SiteType, str] = {
+     EMAIL: f'{GH_PAGES_BASE_URL}/epstein_emails_house_oversight/', # TODO should just be same repo
+     TEXT_MESSAGE: TEXT_MSGS_BASE_URL,
+ }
+
+ GH_PROJECT_URL = 'https://github.com/michelcrypt4d4mus/epstein_text_messages'
+ GH_MASTER_URL = f"{GH_PROJECT_URL}/blob/master"
+ ATTRIBUTIONS_URL = f'{GH_MASTER_URL}/epstein_files/util/constants.py'
+ EXTRACTS_BASE_URL = f'{GH_MASTER_URL}/emails_extracted_from_legal_filings'
+
+ extracted_file_url = lambda f: f"{EXTRACTS_BASE_URL}/{f}"
+
+
+ # External URLs
  COFFEEZILLA_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=061ce61c9e70bdfd'
  COURIER_NEWSROOM_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=092314e384a58618'
  EPSTEINIFY_URL = 'https://epsteinify.com'
@@ -31,12 +50,6 @@ JMAIL_URL = 'https://jmail.world'
  OVERSIGHT_REPUBLICANS_PRESSER_URL = 'https://oversight.house.gov/release/oversight-committee-releases-additional-epstein-estate-documents/'
  RAW_OVERSIGHT_DOCS_GOOGLE_DRIVE_URL = 'https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_'
  SUBSTACK_URL = 'https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great'
- WORD_COUNT_URL = 'https://michelcrypt4d4mus.github.io/epstein_text_messages/epstein_emails_word_count.html'
-
- SITE_URLS: dict[SiteType, str] = {
-     EMAIL: 'https://michelcrypt4d4mus.github.io/epstein_emails_house_oversight/',
-     TEXT_MESSAGE: 'https://michelcrypt4d4mus.github.io/epstein_text_messages/',
- }

  DOC_LINK_BASE_URLS: dict[ExternalSite, str] = {
      EPSTEIN_MEDIA: f"{EPSTEIN_MEDIA_URL}/files",