epstein-files 1.0.4__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
epstein_files/__init__.py CHANGED
@@ -10,11 +10,12 @@ from sys import exit
 
  from dotenv import load_dotenv
  load_dotenv()
-
  from rich.markup import escape
  from rich.padding import Padding
  from rich.panel import Panel
+ from rich.text import Text

+ from epstein_files.count_words import write_word_counts_html
  from epstein_files.epstein_files import EpsteinFiles, document_cls
  from epstein_files.documents.document import INFO_PADDING, Document
  from epstein_files.documents.email import Email
@@ -24,22 +25,25 @@ from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, TEXT_MSGS_
  from epstein_files.util.env import args, specified_names
  from epstein_files.util.file_helper import coerce_file_path, extract_file_id
  from epstein_files.util.logging import logger
- from epstein_files.util.output import print_emails, print_json_metadata, print_json_stats, print_text_messages, write_urls
+ from epstein_files.util.output import print_emails, print_json_files, print_json_metadata, print_json_stats, print_text_messages, write_urls
  from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
  from epstein_files.util.timer import Timer

+ timer = Timer()
+ epstein_files = EpsteinFiles.get_files(timer)
+

  def generate_html() -> None:
      if args.make_clean:
          make_clean()
+         write_urls()
          exit()
-
-     timer = Timer()
-     epstein_files = EpsteinFiles.get_files(timer)
-
-     if args.json_metadata:
+     elif args.json_metadata:
          print_json_metadata(epstein_files)
          exit()
+     elif args.output_json_files:
+         print_json_files(epstein_files)
+         exit()

      print_header(epstein_files)

@@ -75,7 +79,7 @@ def epstein_diff():
  def epstein_search():
      """Search the cleaned up text of the files."""
      _assert_positional_args()
-     epstein_files = EpsteinFiles.get_files(use_pickled=True)
+     epstein_files = EpsteinFiles.get_files()

      for search_term in args.positional_args:
          temp_highlighter = build_highlighter(search_term)
@@ -103,32 +107,27 @@ def epstein_show():
      """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
      _assert_positional_args()
      ids = [extract_file_id(arg) for arg in args.positional_args]
+     raw_docs = [Document(coerce_file_path(id)) for id in ids]
+     docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
      console.line()

-     if args.pickled:
-         epstein_files = EpsteinFiles.get_files(use_pickled=True)
-         docs = epstein_files.get_documents_by_id(ids)
-     else:
-         raw_docs = [Document(coerce_file_path(id)) for id in ids]
-         docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
-
      for doc in docs:
-         console.line()
-         console.print(doc)
+         if isinstance(doc, Email):
+             doc.truncation_allowed = False
+
+         console.print('\n', doc, '\n')

          if args.raw:
-             console.line()
-             console.print(Panel(f"*** {doc.url_slug} RAW ***", expand=False, style=doc._border_style()))
-             console.print(escape(doc.raw_text()))
+             console.print(Panel(Text("RAW: ").append(doc.summary()), expand=False, style=doc._border_style()))
+             console.print(escape(doc.raw_text()), '\n')

          if isinstance(doc, Email):
-             console.line()
-             console.print(Panel(f"*** {doc.url_slug} actual_text ***", expand=False, style=doc._border_style()))
-             console.print(escape(doc._actual_text()))
+             console.print(Panel(Text("actual_text: ").append(doc.summary()), expand=False, style=doc._border_style()))
+             console.print(escape(doc._actual_text()), '\n')


- def epstein_dump_urls() -> None:
-     write_urls()
+ def epstein_word_count() -> None:
+     write_word_counts_html()


  def _assert_positional_args():
@@ -0,0 +1,72 @@
+ # Count word usage in emails and texts
+ import re
+
+ from epstein_files.epstein_files import EpsteinFiles
+ from epstein_files.util.constant.common_words import COMMON_WORDS_LIST
+ from epstein_files.util.constant.output_files import WORD_COUNT_HTML_PATH
+ from epstein_files.util.env import args, specified_names
+ from epstein_files.util.logging import logger
+ from epstein_files.util.rich import (console, print_centered, print_color_key, print_page_title, print_panel,
+     print_starred_header, write_html)
+ from epstein_files.util.search_result import MatchedLine, SearchResult
+ from epstein_files.util.timer import Timer
+ from epstein_files.util.word_count import WordCount
+
+ HTML_REGEX = re.compile(r"^http|#yiv")
+
+
+ def write_word_counts_html() -> None:
+     timer = Timer()
+     epstein_files = EpsteinFiles.get_files(timer)
+     email_subjects: set[str] = set()
+     word_count = WordCount()
+
+     # Remove dupes, junk mail, and fwded articles from emails
+     emails = [
+         e for e in epstein_files.emails
+         if not (e.is_duplicate or e.is_junk_mail() or (e.config and e.config.is_fwded_article)) \
+             and (len(specified_names) == 0 or e.author in specified_names)
+     ]
+
+     for email in emails:
+         logger.info(f"Counting words in {email}\n [SUBJECT] {email.subject()}")
+         lines = email.actual_text.split('\n')
+
+         if email.subject() not in email_subjects and f'Re: {email.subject()}' not in email_subjects:
+             email_subjects.add(email.subject())
+             lines.append(email.subject())
+
+         for i, line in enumerate(lines):
+             if HTML_REGEX.search(line):
+                 continue
+
+             for word in line.split():
+                 word_count.tally_word(word, SearchResult(email, [MatchedLine(line, i)]))
+
+     # Add in iMessage conversation words
+     imessage_logs = epstein_files.imessage_logs_for(specified_names) if specified_names else epstein_files.imessage_logs
+
+     for imessage_log in imessage_logs:
+         logger.info(f"Counting words in {imessage_log}")
+
+         for msg in imessage_log.messages():
+             if len(specified_names) > 0 and msg.author not in specified_names:
+                 continue
+             elif HTML_REGEX.search(line):
+                 continue
+
+             for word in msg.text.split():
+                 word_count.tally_word(word, SearchResult(imessage_log, [msg.text]))
+
+     print_page_title(expand=False)
+     print_starred_header(f"Most Common Words in {len(emails):,} Emails and {len(imessage_logs)} iMessage Logs")
+     print_centered(f"(excluding {len(COMMON_WORDS_LIST)} particularly common words at bottom)", style='dim')
+     console.line()
+     print_color_key()
+     console.line()
+     console.print(word_count)
+     console.line(2)
+     print_panel(f"{len(COMMON_WORDS_LIST):,} Excluded Words", centered=True)
+     console.print(', '.join(COMMON_WORDS_LIST), highlight=False)
+     write_html(WORD_COUNT_HTML_PATH)
+     timer.print_at_checkpoint(f"Finished counting words")
@@ -85,10 +85,9 @@ class Document:
 
          if self.is_local_extract_file():
              self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
-             cfg_type = type(self.config).__name__ if self.config else None

              # Coerce FileConfig for court docs etc. to MessageCfg for email files extracted from that document
-             if self.class_name() == EMAIL_CLASS and self.config and cfg_type != EmailCfg.__name__:
+             if self.class_name() == EMAIL_CLASS and self.config and not isinstance(self.config, EmailCfg):
                  self.config = EmailCfg.from_doc_cfg(self.config)
          else:
              self.url_slug = self.file_path.stem
@@ -26,7 +26,7 @@ from epstein_files.util.logging import logger
  from epstein_files.util.rich import *

  BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
- BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
+ BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
  DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
  LINK_LINE_REGEX = re.compile(f"^(> )?htt")
  QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
@@ -245,12 +245,10 @@ TRUNCATE_TERMS = [
  ]

  # Some Paul Krassner emails have a ton of CCed parties we don't care about
- KRASSNER_RECIPIENTS = uniquify(flatten(ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']))
+ KRASSNER_RECIPIENTS = uniquify(flatten([ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']]))

  # No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
- USELESS_EMAILERS = IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS + \
-     KRASSNER_RECIPIENTS + \
-     FLIGHT_IN_2012_PEOPLE + [
+ USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIPIENTS + [
      'Alan Rogers',  # Random CC
      'Andrew Friendly',  # Presumably some relation of Kelly Friendly
      'BS Stern',  # A random fwd of email we have
@@ -322,11 +320,18 @@ class Email(Communication):
      def __post_init__(self):
          super().__post_init__()

-         if self.config and self.config.recipients:
-             self.recipients = cast(list[str | None], self.config.recipients)
-         else:
-             for recipient in self.header.recipients():
-                 self.recipients.extend(self._get_names(recipient))
+         try:
+             if self.config and self.config.recipients:
+                 self.recipients = cast(list[str | None], self.config.recipients)
+             else:
+                 for recipient in self.header.recipients():
+                     self.recipients.extend(self._get_names(recipient))
+         except Exception as e:
+             console.print_exception()
+             console.line(2)
+             logger.fatal(f"Failed on {self.file_id}")
+             console.line(2)
+             raise e

          # Remove self CCs
          recipients = [r for r in self.recipients if r != self.author or self.file_id in SELF_EMAILS_FILE_IDS]
@@ -21,14 +21,11 @@ class JsonFile(OtherFile):
          if self.url_slug.endswith('.txt') or self.url_slug.endswith('.json'):
              self.url_slug = Path(self.url_slug).stem

-         self._set_computed_fields(text=self.formatted_json())
+         self._set_computed_fields(text=self.json_str())

      def category(self) -> str:
          return JSON

-     def formatted_json(self) -> str:
-         return json.dumps(self.json_data(), indent=4)
-
      def info_txt(self) -> Text | None:
          return Text(f"JSON file, possibly iMessage or similar app metadata", style='white dim italic')

@@ -38,3 +35,6 @@ class JsonFile(OtherFile):
      def json_data(self) -> object:
          with open(self.file_path, encoding='utf-8-sig') as f:
              return json.load(f)
+
+     def json_str(self) -> str:
+         return json.dumps(self.json_data(), indent=4)
@@ -15,6 +15,7 @@ from epstein_files.util.data import iso_timestamp, listify, sort_dict
  from epstein_files.util.doc_cfg import Metadata, TextCfg
  from epstein_files.util.highlighted_group import get_style_for_name
  from epstein_files.util.logging import logger
+ from epstein_files.util.rich import build_table

  CONFIRMED_MSG = 'Found confirmed counterparty'
  GUESSED_MSG = 'This is probably a conversation with'
@@ -111,7 +112,7 @@ class MessengerLog(Communication):
      @classmethod
      def summary_table(cls, imessage_logs: list['MessengerLog']) -> Table:
          """Build a table summarizing the text messages in 'imessage_logs'."""
-         counts_table = Table(title="Text Message Counts By Author", header_style="bold")
+         counts_table = build_table("Text Message Counts By Author")
          counts_table.add_column(AUTHOR.title(), justify='left', style="steel_blue bold", width=30)
          counts_table.add_column('Files', justify='right', style='white')
          counts_table.add_column("Msgs", justify='right')
@@ -20,7 +20,7 @@ from epstein_files.util.data import escape_single_quotes, remove_timezone, uniqu
  from epstein_files.util.file_helper import FILENAME_LENGTH
  from epstein_files.util.env import args
  from epstein_files.util.highlighted_group import get_style_for_category
- from epstein_files.util.rich import QUESTION_MARK_TXT, highlighter
+ from epstein_files.util.rich import QUESTION_MARK_TXT, build_table, highlighter
  from epstein_files.util.logging import logger

  MAX_DAYS_SPANNED_TO_BE_VALID = 10
@@ -233,7 +233,7 @@ class OtherFile(Document):
      @staticmethod
      def build_table(docs: list['OtherFile']) -> Table:
          """Build a table of OtherFile documents."""
-         table = Table(header_style='bold', show_lines=True)
+         table = build_table(None, show_lines=True)
          table.add_column('File', justify='center', width=FILENAME_LENGTH)
          table.add_column('Date', justify='center')
          table.add_column('Size', justify='center')
@@ -19,7 +19,6 @@ from epstein_files.documents.emails.email_header import AUTHOR
  from epstein_files.documents.json_file import JsonFile
  from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
  from epstein_files.documents.other_file import OtherFile
- from epstein_files.util.constant.output_files import PICKLED_PATH
  from epstein_files.util.constant.strings import *
  from epstein_files.util.constant.urls import (EPSTEIN_MEDIA, EPSTEIN_WEB, JMAIL, epstein_media_person_url,
      epsteinify_name_url, epstein_web_person_url, search_jmail_url, search_twitter_url)
@@ -29,15 +28,16 @@ from epstein_files.util.doc_cfg import EmailCfg, Metadata
  from epstein_files.util.env import args, logger
  from epstein_files.util.file_helper import DOCS_DIR, file_size_str
  from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
- from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, add_cols_to_table, console, highlighter,
-     link_text_obj, link_markup, print_author_header, print_centered, print_other_site_link, print_panel,
-     print_section_header, vertically_pad)
+ from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, TABLE_BORDER_STYLE, add_cols_to_table,
+     build_table, console, highlighter, link_text_obj, link_markup, print_author_header, print_centered,
+     print_other_site_link, print_panel, print_section_header, vertically_pad)
  from epstein_files.util.search_result import SearchResult
  from epstein_files.util.timer import Timer

+ EXCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]
+ PICKLED_PATH = Path("the_epstein_files.pkl.gz")
  DEVICE_SIGNATURE = 'Device Signature'
  DEVICE_SIGNATURE_PADDING = (1, 0)
- NOT_INCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]
  SLOW_FILE_SECONDS = 1.0

  INVALID_FOR_EPSTEIN_WEB = JUNK_EMAILERS + KRASSNER_RECIPIENTS + [
@@ -94,23 +94,23 @@ class EpsteinFiles:
          self._tally_email_data()

      @classmethod
-     def get_files(cls, timer: Timer | None = None, use_pickled: bool = False) -> 'EpsteinFiles':
+     def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
          """Alternate constructor that reads/writes a pickled version of the data ('timer' arg is for logging)."""
          timer = timer or Timer()

-         if ((args.pickled or use_pickled) and PICKLED_PATH.exists()) and not args.overwrite_pickle:
+         if PICKLED_PATH.exists() and not args.overwrite_pickle:
              with gzip.open(PICKLED_PATH, 'rb') as file:
                  epstein_files = pickle.load(file)
                  timer.print_at_checkpoint(f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})")
                  epstein_files.timer = timer
                  return epstein_files

+         logger.warning(f"Building new cache file, this will take a few minutes...")
          epstein_files = EpsteinFiles(timer=timer)

-         if args.overwrite_pickle or not PICKLED_PATH.exists():
-             with gzip.open(PICKLED_PATH, 'wb') as file:
-                 pickle.dump(epstein_files, file)
-                 logger.warning(f"Pickled data to '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})...")
+         with gzip.open(PICKLED_PATH, 'wb') as file:
+             pickle.dump(epstein_files, file)
+             logger.warning(f"Pickled data to '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})...")

          timer.print_at_checkpoint(f'Processed {len(epstein_files.all_files):,} documents')
          return epstein_files
@@ -119,9 +119,9 @@ class EpsteinFiles:
          return self.imessage_logs + self.emails + self.other_files

      def all_emailers(self, include_useless: bool = False) -> list[str | None]:
-         """Returns all emailers except Epstein and USELESS_EMAILERS, sorted from least frequent to most."""
+         """Returns all emailers except Epstein and EXCLUDED_EMAILERS, sorted from least frequent to most."""
          names = [a for a in self.email_author_counts.keys()] + [r for r in self.email_recipient_counts.keys()]
-         names = names if include_useless else [e for e in names if e is None or e.lower() not in NOT_INCLUDED_EMAILERS]
+         names = names if include_useless else [e for e in names if e is None or e.lower() not in EXCLUDED_EMAILERS]
          return sorted(list(set(names)), key=lambda e: self.email_author_counts[e] + self.email_recipient_counts[e])

      def attributed_email_count(self) -> int:
@@ -200,10 +200,10 @@ class EpsteinFiles:
      def json_metadata(self) -> str:
          """Create a JSON string containing metadata for all the files."""
          metadata = {
-             EMAIL_CLASS: _sorted_metadata(self.emails),
-             JSON_FILE_CLASS: _sorted_metadata(self.json_files),
-             MESSENGER_LOG_CLASS: _sorted_metadata(self.imessage_logs),
-             OTHER_FILE_CLASS: _sorted_metadata(self.non_json_other_files()),
+             Email.__name__: _sorted_metadata(self.emails),
+             JsonFile.__name__: _sorted_metadata(self.json_files),
+             MessengerLog.__name__: _sorted_metadata(self.imessage_logs),
+             OtherFile.__name__: _sorted_metadata(self.non_json_other_files()),
          }

          return json.dumps(metadata, indent=4, sort_keys=True)
@@ -212,7 +212,7 @@ class EpsteinFiles:
          return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]

      def print_files_summary(self) -> None:
-         table = Table(title='Summary of Document Types')
+         table = build_table('Summary of Document Types')
          add_cols_to_table(table, ['File Type', 'Files', 'Author Known', 'Author Unknown', 'Duplicates'])

          def add_row(label: str, docs: list):
@@ -268,12 +268,12 @@ class EpsteinFiles:
 
      def print_email_device_info(self) -> None:
          print_panel(f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown", padding=(4, 0, 0, 0), centered=True)
-         console.print(build_signature_table(self.email_authors_to_device_signatures, (AUTHOR, DEVICE_SIGNATURE)))
-         console.print(build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))
+         console.print(_build_signature_table(self.email_authors_to_device_signatures, (AUTHOR, DEVICE_SIGNATURE)))
+         console.print(_build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))

      def print_emailer_counts_table(self) -> None:
          footer = f"Identified authors of {self.attributed_email_count():,} emails out of {len(self.emails):,}."
-         counts_table = Table(title=f"Email Counts", caption=footer, header_style="bold")
+         counts_table = build_table("Email Counts", caption=footer)
          add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_MEDIA, EPSTEIN_WEB, 'Twitter'])

          emailer_counts = {
@@ -345,21 +345,6 @@ class EpsteinFiles:
              self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())


- def build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
-     title = 'Signatures Used By Authors' if cols[0] == AUTHOR else 'Authors Seen Using Signatures'
-     table = Table(header_style="bold reverse", show_lines=True, title=title)
-
-     for i, col in enumerate(cols):
-         table.add_column(col.title() + ('s' if i == 1 else ''))
-
-     new_dict = dict_sets_to_lists(keyed_sets)
-
-     for k in sorted(new_dict.keys()):
-         table.add_row(highlighter(k or UNKNOWN), highlighter(join_char.join(sorted(new_dict[k]))))
-
-     return Padding(table, DEVICE_SIGNATURE_PADDING)
-
-
  def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
      counts: dict[str | None, int] = defaultdict(int)
@@ -372,12 +357,12 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
      return counts


- def document_cls(document: Document) -> Type[Document]:
-     search_area = document.text[0:5000]  # Limit search area to avoid pointless scans of huge files
+ def document_cls(doc: Document) -> Type[Document]:
+     search_area = doc.text[0:5000]  # Limit search area to avoid pointless scans of huge files

-     if document.text[0] == '{':
+     if doc.text[0] == '{':
          return JsonFile
-     elif isinstance(document.config, EmailCfg) or DETECT_EMAIL_REGEX.match(search_area):
+     elif isinstance(doc.config, EmailCfg) or (DETECT_EMAIL_REGEX.match(search_area) and doc.config is None):
          return Email
      elif MSG_REGEX.search(search_area):
          return MessengerLog
@@ -397,6 +382,21 @@ def is_ok_for_epstein_web(name: str | None) -> bool:
      return True


+ def _build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
+     title = 'Signatures Used By Authors' if cols[0] == AUTHOR else 'Authors Seen Using Signatures'
+     table = build_table(title, header_style="bold reverse", show_lines=True)
+
+     for i, col in enumerate(cols):
+         table.add_column(col.title() + ('s' if i == 1 else ''))
+
+     new_dict = dict_sets_to_lists(keyed_sets)
+
+     for k in sorted(new_dict.keys()):
+         table.add_row(highlighter(k or UNKNOWN), highlighter(join_char.join(sorted(new_dict[k]))))
+     return Padding(table, DEVICE_SIGNATURE_PADDING)
+
+
+
  def _sorted_metadata(docs: Sequence[Document]) -> list[Metadata]:
      docs_sorted_by_id = sorted(docs, key=lambda d: d.file_id)
      return [json_safe(d.metadata()) for d in docs_sorted_by_id]
@@ -1,20 +1,36 @@
  from pathlib import Path

- PICKLED_PATH = Path("the_epstein_files.pkl.gz")
-
- EPSTEIN_FILES_NOV_2025 = 'epstein_files_nov_2025'
- URLS_ENV = '.urls.env'
+ from epstein_files.util.constant.strings import EMAIL, TEXT_MESSAGE, SiteType

+ # Files output by the code
  HTML_DIR = Path('docs')
+ EPSTEIN_FILES_NOV_2025 = 'epstein_files_nov_2025'
  ALL_EMAILS_PATH = HTML_DIR.joinpath(f'all_emails_{EPSTEIN_FILES_NOV_2025}.html')
+ JSON_FILES_JSON_PATH = HTML_DIR.joinpath(f'json_files_from_{EPSTEIN_FILES_NOV_2025}.json')
  JSON_METADATA_PATH = HTML_DIR.joinpath(f'file_metadata_{EPSTEIN_FILES_NOV_2025}.json')
  TEXT_MSGS_HTML_PATH = HTML_DIR.joinpath('index.html')
  WORD_COUNT_HTML_PATH = HTML_DIR.joinpath(f'communication_word_count_{EPSTEIN_FILES_NOV_2025}.html')
  # EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
+ URLS_ENV = '.urls.env'
+
+ # Deployment URLS
+ # NOTE: don't rename these variables without changing deploy.sh!
+ GH_PAGES_BASE_URL = 'https://michelcrypt4d4mus.github.io'
+ TEXT_MSGS_URL = f"{GH_PAGES_BASE_URL}/epstein_text_messages"
+ ALL_EMAILS_URL = f"{TEXT_MSGS_URL}/{ALL_EMAILS_PATH.name}"
+ JSON_FILES_URL = f"{TEXT_MSGS_URL}/{JSON_FILES_JSON_PATH.name}"
+ JSON_METADATA_URL = f"{TEXT_MSGS_URL}/{JSON_METADATA_PATH.name}"
+ WORD_COUNT_URL = f"{TEXT_MSGS_URL}/{WORD_COUNT_HTML_PATH.name}"
+
+ SITE_URLS: dict[SiteType, str] = {
+     EMAIL: ALL_EMAILS_URL,
+     TEXT_MESSAGE: TEXT_MSGS_URL,
+ }

  BUILD_ARTIFACTS = [
      ALL_EMAILS_PATH,
      # EPSTEIN_WORD_COUNT_HTML_PATH,
+     JSON_FILES_JSON_PATH,
      JSON_METADATA_PATH,
      TEXT_MSGS_HTML_PATH,
      WORD_COUNT_HTML_PATH,
@@ -2,13 +2,6 @@ import re
  from typing import Literal


- # Document subclass names (this sucks)
- DOCUMENT_CLASS = 'Document'
- EMAIL_CLASS = 'Email'
- JSON_FILE_CLASS = 'JsonFile'
- MESSENGER_LOG_CLASS = 'MessengerLog'
- OTHER_FILE_CLASS = 'OtherFile'
-
  # categories
  ACADEMIA = 'academia'
  ARTS = 'arts'
@@ -27,6 +20,7 @@ POLITICS = 'politics'
  PROPERTY = 'property'
  PUBLICIST = 'publicist'
  REPUTATION = 'reputation'
+ SKYPE_LOG= 'skype log'
  SOCIAL = 'social'
  SPEECH = 'speech'

@@ -55,7 +49,6 @@ TEXT_MESSAGE = 'text message'
  SiteType = Literal['email', 'text message']

  # Styles
- OTHER_SITE_LINK_STYLE = 'dark_goldenrod'
  TIMESTAMP_STYLE = 'turquoise4'
  TIMESTAMP_DIM = f"turquoise4 dim"

@@ -76,5 +69,12 @@ FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}(\d{{6}}(_\d{{1,2}})?)")
  FILE_NAME_REGEX = re.compile(fr"{FILE_STEM_REGEX.pattern}(\.txt(\.json)?)?")
  QUESTION_MARKS_REGEX = re.compile(fr' {re.escape(QUESTION_MARKS)}$')

+ # Document subclass names (this sucks)
+ DOCUMENT_CLASS = 'Document'
+ EMAIL_CLASS = 'Email'
+ JSON_FILE_CLASS = 'JsonFile'
+ MESSENGER_LOG_CLASS = 'MessengerLog'
+ OTHER_FILE_CLASS = 'OtherFile'
+

  remove_question_marks = lambda name: QUESTION_MARKS_REGEX.sub('', name)
@@ -6,7 +6,6 @@ from inflection import parameterize
  from rich.text import Text

  from epstein_files.util.constant.output_files import *
- from epstein_files.util.constant.strings import EMAIL, TEXT_MESSAGE, SiteType
  from epstein_files.util.file_helper import coerce_file_stem

  # Style stuff
@@ -15,26 +14,11 @@ TEXT_LINK = 'text_link'
 
  # External site names
  ExternalSite = Literal['epstein.media', 'epsteinify', 'EpsteinWeb']
-
  EPSTEIN_MEDIA = 'epstein.media'
  EPSTEIN_WEB = 'EpsteinWeb'
  EPSTEINIFY = 'epsteinify'
  JMAIL = 'Jmail'

-
- # Deployment URLS
- # NOTE: don't rename these variables without changing deploy.sh!
- GH_PAGES_BASE_URL = 'https://michelcrypt4d4mus.github.io'
- TEXT_MSGS_URL = f"{GH_PAGES_BASE_URL}/epstein_text_messages"
- ALL_EMAILS_URL = f'{TEXT_MSGS_URL}/{ALL_EMAILS_PATH.name}'
- JSON_METADATA_URL = f'{TEXT_MSGS_URL}/{JSON_METADATA_PATH.name}'
- WORD_COUNT_URL = f'{TEXT_MSGS_URL}/{WORD_COUNT_HTML_PATH.name}'
-
- SITE_URLS: dict[SiteType, str] = {
-     EMAIL: ALL_EMAILS_URL,
-     TEXT_MESSAGE: TEXT_MSGS_URL,
- }
-
  GH_PROJECT_URL = 'https://github.com/michelcrypt4d4mus/epstein_text_messages'
  GH_MASTER_URL = f"{GH_PROJECT_URL}/blob/master"
  ATTRIBUTIONS_URL = f'{GH_MASTER_URL}/epstein_files/util/constants.py'
@@ -46,14 +30,16 @@ extracted_file_url = lambda f: f"{EXTRACTS_BASE_URL}/{f}"
  # External URLs
  COFFEEZILLA_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=061ce61c9e70bdfd'
  COURIER_NEWSROOM_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=092314e384a58618'
- EPSTEINIFY_URL = 'https://epsteinify.com'
- EPSTEIN_MEDIA_URL = 'https://www.epstein.media'
- EPSTEIN_WEB_URL = 'https://epsteinweb.org'
- JMAIL_URL = 'https://jmail.world'
  OVERSIGHT_REPUBLICANS_PRESSER_URL = 'https://oversight.house.gov/release/oversight-committee-releases-additional-epstein-estate-documents/'
  RAW_OVERSIGHT_DOCS_GOOGLE_DRIVE_URL = 'https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_'
  SUBSTACK_URL = 'https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great'

+ # Document source sites
+ EPSTEINIFY_URL = 'https://epsteinify.com'
+ EPSTEIN_MEDIA_URL = 'https://epstein.media'
+ EPSTEIN_WEB_URL = 'https://epsteinweb.org'
+ JMAIL_URL = 'https://jmail.world'
+
  DOC_LINK_BASE_URLS: dict[ExternalSite, str] = {
      EPSTEIN_MEDIA: f"{EPSTEIN_MEDIA_URL}/files",
      EPSTEIN_WEB: f'{EPSTEIN_WEB_URL}/wp-content/uploads/epstein_evidence/images',
@@ -61,7 +47,6 @@ DOC_LINK_BASE_URLS: dict[ExternalSite, str] = {
  }


- # TODO: epsteinify.com seems to be down as of 2025-12-30, switched to epstein.web for links
  epsteinify_api_url = lambda file_id: f"{EPSTEINIFY_URL}/api/documents/HOUSE_OVERSIGHT_{file_id}"
  epsteinify_doc_link_markup = lambda filename_or_id, style = TEXT_LINK: external_doc_link_markup(EPSTEINIFY, filename_or_id, style)
  epsteinify_doc_link_txt = lambda filename_or_id, style = TEXT_LINK: Text.from_markup(external_doc_link_markup(filename_or_id, style))