epstein-files 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +75 -135
- epstein_files/documents/communication.py +9 -9
- epstein_files/documents/document.py +115 -87
- epstein_files/documents/email.py +154 -85
- epstein_files/documents/emails/email_header.py +7 -6
- epstein_files/documents/imessage/text_message.py +3 -2
- epstein_files/documents/json_file.py +17 -0
- epstein_files/documents/messenger_log.py +62 -3
- epstein_files/documents/other_file.py +165 -17
- epstein_files/epstein_files.py +128 -169
- epstein_files/util/constant/names.py +8 -1
- epstein_files/util/constant/output_files.py +29 -0
- epstein_files/util/constant/strings.py +27 -0
- epstein_files/util/constant/urls.py +25 -9
- epstein_files/util/constants.py +1018 -1045
- epstein_files/util/data.py +20 -55
- epstein_files/util/{file_cfg.py → doc_cfg.py} +121 -43
- epstein_files/util/env.py +19 -20
- epstein_files/util/file_helper.py +38 -21
- epstein_files/util/highlighted_group.py +229 -177
- epstein_files/util/logging.py +63 -0
- epstein_files/util/output.py +180 -0
- epstein_files/util/rich.py +29 -17
- epstein_files/util/search_result.py +14 -6
- epstein_files/util/timer.py +24 -0
- epstein_files/util/word_count.py +2 -1
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/METADATA +20 -4
- epstein_files-1.0.2.dist-info/RECORD +33 -0
- epstein_files-1.0.2.dist-info/entry_points.txt +7 -0
- epstein_files-1.0.0.dist-info/RECORD +0 -28
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/WHEEL +0 -0
epstein_files/epstein_files.py
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
import gzip
|
|
2
|
+
import json
|
|
2
3
|
import pickle
|
|
3
4
|
import re
|
|
4
5
|
from collections import defaultdict
|
|
5
6
|
from dataclasses import dataclass, field
|
|
6
7
|
from datetime import datetime
|
|
7
8
|
from pathlib import Path
|
|
8
|
-
from typing import
|
|
9
|
+
from typing import Sequence, Type
|
|
9
10
|
|
|
10
11
|
from rich.align import Align
|
|
11
|
-
from rich.console import Group
|
|
12
12
|
from rich.padding import Padding
|
|
13
13
|
from rich.table import Table
|
|
14
14
|
from rich.text import Text
|
|
@@ -19,24 +19,26 @@ from epstein_files.documents.emails.email_header import AUTHOR
|
|
|
19
19
|
from epstein_files.documents.json_file import JsonFile
|
|
20
20
|
from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
|
|
21
21
|
from epstein_files.documents.other_file import OtherFile
|
|
22
|
+
from epstein_files.util.constant.output_files import PICKLED_PATH
|
|
22
23
|
from epstein_files.util.constant.strings import *
|
|
23
24
|
from epstein_files.util.constant.urls import (EPSTEIN_WEB, JMAIL, epsteinify_name_url, epstein_web_person_url,
|
|
24
25
|
search_jmail_url, search_twitter_url)
|
|
25
26
|
from epstein_files.util.constants import *
|
|
26
|
-
from epstein_files.util.data import
|
|
27
|
+
from epstein_files.util.data import dict_sets_to_lists, json_safe, sort_dict
|
|
28
|
+
from epstein_files.util.doc_cfg import EmailCfg
|
|
27
29
|
from epstein_files.util.env import args, logger
|
|
28
|
-
from epstein_files.util.
|
|
29
|
-
from epstein_files.util.file_helper import DOCS_DIR, FILENAME_LENGTH, PICKLED_PATH, file_size_str
|
|
30
|
+
from epstein_files.util.file_helper import DOCS_DIR, file_size_str
|
|
30
31
|
from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
|
|
31
|
-
from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT,
|
|
32
|
-
|
|
32
|
+
from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, add_cols_to_table, console, highlighter,
|
|
33
|
+
link_text_obj, link_markup, print_author_header, print_centered, print_other_site_link, print_panel,
|
|
33
34
|
print_section_header, vertically_pad)
|
|
34
35
|
from epstein_files.util.search_result import SearchResult
|
|
36
|
+
from epstein_files.util.timer import Timer
|
|
35
37
|
|
|
36
38
|
DEVICE_SIGNATURE = 'Device Signature'
|
|
37
|
-
FIRST_FEW_LINES = 'First Few Lines'
|
|
38
39
|
DEVICE_SIGNATURE_PADDING = (1, 0)
|
|
39
40
|
NOT_INCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]
|
|
41
|
+
SLOW_FILE_SECONDS = 1.0
|
|
40
42
|
|
|
41
43
|
INVALID_FOR_EPSTEIN_WEB = JUNK_EMAILERS + KRASSNER_RECIPIENTS + [
|
|
42
44
|
'ACT for America',
|
|
@@ -53,70 +55,55 @@ class EpsteinFiles:
|
|
|
53
55
|
imessage_logs: list[MessengerLog] = field(default_factory=list)
|
|
54
56
|
json_files: list[JsonFile] = field(default_factory=list)
|
|
55
57
|
other_files: list[OtherFile] = field(default_factory=list)
|
|
58
|
+
timer: Timer = field(default_factory=lambda: Timer())
|
|
56
59
|
|
|
57
60
|
# Analytics / calculations
|
|
58
61
|
email_author_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
|
|
59
62
|
email_authors_to_device_signatures: dict[str, set] = field(default_factory=lambda: defaultdict(set))
|
|
60
63
|
email_device_signatures_to_authors: dict[str, set] = field(default_factory=lambda: defaultdict(set))
|
|
61
64
|
email_recipient_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
|
|
62
|
-
|
|
65
|
+
unknown_recipient_email_ids: set[str] = field(default_factory=set)
|
|
63
66
|
|
|
64
67
|
def __post_init__(self):
|
|
68
|
+
"""Iterate through files and build appropriate objects."""
|
|
65
69
|
self.all_files = [f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')]
|
|
70
|
+
documents = []
|
|
66
71
|
|
|
67
72
|
# Read through and classify all the files
|
|
68
73
|
for file_arg in self.all_files:
|
|
69
|
-
|
|
74
|
+
doc_timer = Timer(decimals=4)
|
|
70
75
|
document = Document(file_arg)
|
|
71
76
|
|
|
72
77
|
if document.length == 0:
|
|
73
|
-
logger.
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
# Handle iMessage log files
|
|
80
|
-
self.imessage_logs.append(MessengerLog(file_arg, text=document.text))
|
|
81
|
-
logger.info(self.imessage_logs[-1].description().plain)
|
|
82
|
-
elif DETECT_EMAIL_REGEX.match(document.text) or isinstance(document.config, MessageCfg):
|
|
83
|
-
# Handle emails
|
|
84
|
-
email = Email(file_arg, text=document.text)
|
|
85
|
-
logger.info(email.description().plain)
|
|
86
|
-
self.emails.append(email)
|
|
87
|
-
self.email_author_counts[email.author] += 1
|
|
88
|
-
|
|
89
|
-
if len(email.recipients) == 0:
|
|
90
|
-
self._email_unknown_recipient_file_ids.add(email.file_id)
|
|
91
|
-
self.email_recipient_counts[None] += 1
|
|
92
|
-
else:
|
|
93
|
-
for recipient in email.recipients:
|
|
94
|
-
self.email_recipient_counts[recipient] += 1
|
|
95
|
-
|
|
96
|
-
if email.sent_from_device:
|
|
97
|
-
self.email_authors_to_device_signatures[email.author_or_unknown()].add(email.sent_from_device)
|
|
98
|
-
self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())
|
|
99
|
-
else:
|
|
100
|
-
# Handle OtherFiles
|
|
101
|
-
self.other_files.append(OtherFile(file_arg, text=document.text))
|
|
102
|
-
logger.info(self.other_files[-1].description().plain)
|
|
78
|
+
logger.warning(f"Skipping empty file: {document}")
|
|
79
|
+
continue
|
|
80
|
+
|
|
81
|
+
cls = document_cls(document)
|
|
82
|
+
documents.append(cls(file_arg, text=document.text))
|
|
83
|
+
logger.info(str(documents[-1]))
|
|
103
84
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
85
|
+
if doc_timer.seconds_since_start() > SLOW_FILE_SECONDS:
|
|
86
|
+
doc_timer.print_at_checkpoint(f"Slow file: {documents[-1]} processed")
|
|
87
|
+
|
|
88
|
+
self.emails = Document.sort_by_timestamp([d for d in documents if isinstance(d, Email)])
|
|
89
|
+
self.imessage_logs = Document.sort_by_timestamp([d for d in documents if isinstance(d, MessengerLog)])
|
|
90
|
+
self.other_files = Document.sort_by_timestamp([d for d in documents if isinstance(d, (JsonFile, OtherFile))])
|
|
91
|
+
self.json_files = [doc for doc in self.other_files if isinstance(doc, JsonFile)]
|
|
92
|
+
self._tally_email_data()
|
|
107
93
|
|
|
108
94
|
@classmethod
|
|
109
|
-
def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
|
|
95
|
+
def get_files(cls, timer: Timer | None = None, use_pickled: bool = False) -> 'EpsteinFiles':
|
|
110
96
|
"""Alternate constructor that reads/writes a pickled version of the data ('timer' arg is for logging)."""
|
|
111
97
|
timer = timer or Timer()
|
|
112
98
|
|
|
113
|
-
if (args.pickled and PICKLED_PATH.exists()) and not args.overwrite_pickle:
|
|
99
|
+
if ((args.pickled or use_pickled) and PICKLED_PATH.exists()) and not args.overwrite_pickle:
|
|
114
100
|
with gzip.open(PICKLED_PATH, 'rb') as file:
|
|
115
101
|
epstein_files = pickle.load(file)
|
|
116
102
|
timer.print_at_checkpoint(f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})")
|
|
103
|
+
epstein_files.timer = timer
|
|
117
104
|
return epstein_files
|
|
118
105
|
|
|
119
|
-
epstein_files = EpsteinFiles()
|
|
106
|
+
epstein_files = EpsteinFiles(timer=timer)
|
|
120
107
|
|
|
121
108
|
if args.overwrite_pickle or not PICKLED_PATH.exists():
|
|
122
109
|
with gzip.open(PICKLED_PATH, 'wb') as file:
|
|
@@ -141,18 +128,17 @@ class EpsteinFiles:
|
|
|
141
128
|
def docs_matching(
|
|
142
129
|
self,
|
|
143
130
|
pattern: re.Pattern | str,
|
|
144
|
-
file_type: Literal['all', 'other'] = 'all',
|
|
145
131
|
names: list[str | None] | None = None
|
|
146
132
|
) -> list[SearchResult]:
|
|
147
133
|
"""Find documents whose text matches a pattern (file_type and names args limit the documents searched)."""
|
|
148
134
|
results: list[SearchResult] = []
|
|
149
135
|
|
|
150
|
-
for doc in
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
if names and ((not isinstance(doc, (Email, MessengerLog))) or doc.author not in names):
|
|
136
|
+
for doc in self.all_documents():
|
|
137
|
+
if names and doc.author not in names:
|
|
154
138
|
continue
|
|
155
139
|
|
|
140
|
+
lines = doc.matching_lines(pattern)
|
|
141
|
+
|
|
156
142
|
if len(lines) > 0:
|
|
157
143
|
results.append(SearchResult(doc, lines))
|
|
158
144
|
|
|
@@ -178,7 +164,7 @@ class EpsteinFiles:
|
|
|
178
164
|
return substitution_counts
|
|
179
165
|
|
|
180
166
|
def email_unknown_recipient_file_ids(self) -> list[str]:
|
|
181
|
-
return sorted(list(self.
|
|
167
|
+
return sorted(list(self.unknown_recipient_email_ids))
|
|
182
168
|
|
|
183
169
|
def emails_by(self, author: str | None) -> list[Email]:
|
|
184
170
|
return [e for e in self.emails if e.author == author]
|
|
@@ -198,48 +184,52 @@ class EpsteinFiles:
|
|
|
198
184
|
else:
|
|
199
185
|
return [e for e in self.emails if author in e.recipients]
|
|
200
186
|
|
|
201
|
-
def
|
|
202
|
-
if
|
|
203
|
-
|
|
187
|
+
def get_documents_by_id(self, file_ids: list[str]) -> list[Document]:
|
|
188
|
+
docs = [doc for doc in self.all_documents() if doc.file_id in file_ids]
|
|
189
|
+
|
|
190
|
+
if len(docs) != len(file_ids):
|
|
191
|
+
logger.warning(f"{len(file_ids)} file IDs provided but only {len(docs)} Epstein files found!")
|
|
204
192
|
|
|
205
|
-
|
|
206
|
-
|
|
193
|
+
return docs
|
|
194
|
+
|
|
195
|
+
def imessage_logs_for(self, author: str | None | list[str | None]) -> Sequence[MessengerLog]:
|
|
196
|
+
return MessengerLog.logs_for(author, self.imessage_logs)
|
|
207
197
|
|
|
208
198
|
def identified_imessage_log_count(self) -> int:
|
|
209
199
|
return len([log for log in self.imessage_logs if log.author])
|
|
210
200
|
|
|
211
|
-
def
|
|
212
|
-
|
|
201
|
+
def json_metadata(self) -> str:
|
|
202
|
+
metadata = {
|
|
203
|
+
EMAIL_CLASS: [json_safe(d.metadata()) for d in self.emails],
|
|
204
|
+
JSON_FILE_CLASS: [json_safe(d.metadata()) for d in self.json_files],
|
|
205
|
+
MESSENGER_LOG_CLASS: [json_safe(d.metadata()) for d in self.imessage_logs],
|
|
206
|
+
OTHER_FILE_CLASS: [json_safe(d.metadata()) for d in self.other_files if not isinstance(d, JsonFile)],
|
|
207
|
+
}
|
|
213
208
|
|
|
214
|
-
|
|
215
|
-
for message in message_log.messages():
|
|
216
|
-
sender_counts[message.author] += 1
|
|
209
|
+
return json.dumps(metadata, indent=4, sort_keys=True)
|
|
217
210
|
|
|
218
|
-
|
|
211
|
+
def non_json_other_files(self) -> list[OtherFile]:
|
|
212
|
+
return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]
|
|
219
213
|
|
|
220
214
|
def print_files_summary(self) -> None:
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
for doc in self.all_documents():
|
|
224
|
-
if doc.is_duplicate:
|
|
225
|
-
dupes[doc.document_type()] += 1
|
|
226
|
-
|
|
227
|
-
table = Table()
|
|
215
|
+
table = Table(title='Summary of Document Types')
|
|
228
216
|
add_cols_to_table(table, ['File Type', 'Files', 'Author Known', 'Author Unknown', 'Duplicates'])
|
|
229
217
|
|
|
230
|
-
def add_row(label: str, docs: list
|
|
218
|
+
def add_row(label: str, docs: list):
|
|
219
|
+
known = None if isinstance(docs[0], JsonFile) else len([d for d in docs if d.author])
|
|
220
|
+
|
|
231
221
|
table.add_row(
|
|
232
222
|
label,
|
|
233
223
|
f"{len(docs):,}",
|
|
234
|
-
f"{known:,}" if known else NA_TXT,
|
|
235
|
-
f"{len(docs) - known:,}" if known else NA_TXT,
|
|
236
|
-
f"{
|
|
224
|
+
f"{known:,}" if known is not None else NA_TXT,
|
|
225
|
+
f"{len(docs) - known:,}" if known is not None else NA_TXT,
|
|
226
|
+
f"{len([d for d in docs if d.is_duplicate])}",
|
|
237
227
|
)
|
|
238
228
|
|
|
239
|
-
add_row('iMessage Logs', self.imessage_logs
|
|
240
|
-
add_row('Emails', self.emails
|
|
241
|
-
add_row('JSON Data', self.json_files
|
|
242
|
-
add_row('Other', self.
|
|
229
|
+
add_row('iMessage Logs', self.imessage_logs)
|
|
230
|
+
add_row('Emails', self.emails)
|
|
231
|
+
add_row('JSON Data', self.json_files)
|
|
232
|
+
add_row('Other', self.non_json_other_files())
|
|
243
233
|
console.print(Align.center(table))
|
|
244
234
|
console.line()
|
|
245
235
|
|
|
@@ -247,10 +237,11 @@ class EpsteinFiles:
|
|
|
247
237
|
"""Print complete emails to or from a particular 'author'. Returns the Emails that were printed."""
|
|
248
238
|
conversation_length = self.email_conversation_length_in_days(_author)
|
|
249
239
|
emails = self.emails_for(_author)
|
|
240
|
+
unique_emails = [email for email in emails if not email.is_duplicate]
|
|
250
241
|
author = _author or UNKNOWN
|
|
251
242
|
|
|
252
243
|
print_author_header(
|
|
253
|
-
f"Found {len(
|
|
244
|
+
f"Found {len(unique_emails)} {author} emails starting {emails[0].timestamp.date()} over {conversation_length:,} days",
|
|
254
245
|
get_style_for_name(author),
|
|
255
246
|
get_info_for_name(author)
|
|
256
247
|
)
|
|
@@ -271,28 +262,9 @@ class EpsteinFiles:
|
|
|
271
262
|
|
|
272
263
|
return emails
|
|
273
264
|
|
|
274
|
-
def print_emails_table_for(self,
|
|
275
|
-
emails = [email for email in self.emails_for(
|
|
276
|
-
author
|
|
277
|
-
|
|
278
|
-
table = Table(
|
|
279
|
-
title=f"Emails to/from {author} starting {emails[0].timestamp.date()}",
|
|
280
|
-
border_style=get_style_for_name(author, allow_bold=False),
|
|
281
|
-
header_style="bold"
|
|
282
|
-
)
|
|
283
|
-
|
|
284
|
-
table.add_column('From', justify='left')
|
|
285
|
-
table.add_column('Timestamp', justify='center')
|
|
286
|
-
table.add_column('Subject', justify='left', style='honeydew2', min_width=60)
|
|
287
|
-
|
|
288
|
-
for email in emails:
|
|
289
|
-
table.add_row(
|
|
290
|
-
email.author_txt,
|
|
291
|
-
email.epstein_media_link(link_txt=email.timestamp_without_seconds()),
|
|
292
|
-
highlighter(email.subject())
|
|
293
|
-
)
|
|
294
|
-
|
|
295
|
-
console.print(Align.center(table), '\n')
|
|
265
|
+
def print_emails_table_for(self, author: str | None) -> None:
|
|
266
|
+
emails = [email for email in self.emails_for(author) if not email.is_duplicate] # Remove dupes
|
|
267
|
+
console.print(Align.center(Email.build_table(emails, author)), '\n')
|
|
296
268
|
|
|
297
269
|
def print_email_device_info(self) -> None:
|
|
298
270
|
print_panel(f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown", padding=(4, 0, 0, 0), centered=True)
|
|
@@ -300,13 +272,13 @@ class EpsteinFiles:
|
|
|
300
272
|
console.print(build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))
|
|
301
273
|
|
|
302
274
|
def print_emailer_counts_table(self) -> None:
|
|
303
|
-
footer = f"Identified authors of {self.attributed_email_count()} emails out of {len(self.emails)}
|
|
275
|
+
footer = f"Identified authors of {self.attributed_email_count():,} emails out of {len(self.emails):,}."
|
|
304
276
|
counts_table = Table(title=f"Email Counts", caption=footer, header_style="bold")
|
|
305
277
|
add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_WEB, 'Twitter'])
|
|
306
278
|
|
|
307
279
|
emailer_counts = {
|
|
308
|
-
|
|
309
|
-
for
|
|
280
|
+
emailer: self.email_author_counts[emailer] + self.email_recipient_counts[emailer]
|
|
281
|
+
for emailer in self.all_emailers(True)
|
|
310
282
|
}
|
|
311
283
|
|
|
312
284
|
for p, count in sort_dict(emailer_counts):
|
|
@@ -326,76 +298,50 @@ class EpsteinFiles:
|
|
|
326
298
|
|
|
327
299
|
def print_imessage_summary(self) -> None:
|
|
328
300
|
"""Print summary table and stats for text messages."""
|
|
329
|
-
|
|
330
|
-
counts_table.add_column(AUTHOR.title(), justify='left', style="steel_blue bold", width=30)
|
|
331
|
-
counts_table.add_column('Files', justify='right', style='white')
|
|
332
|
-
counts_table.add_column("Msgs", justify='right')
|
|
333
|
-
counts_table.add_column('First Sent At', justify='center', highlight=True, width=21)
|
|
334
|
-
counts_table.add_column('Last Sent At', justify='center', style='wheat4', width=21)
|
|
335
|
-
counts_table.add_column('Days', justify='right', style='dim')
|
|
336
|
-
|
|
337
|
-
for name, count in sort_dict(self.imessage_sender_counts()):
|
|
338
|
-
logs = self.imessage_logs_for(name)
|
|
339
|
-
first_at = logs[0].first_message_at(name)
|
|
340
|
-
last_at = logs[-1].first_message_at(name)
|
|
341
|
-
|
|
342
|
-
counts_table.add_row(
|
|
343
|
-
Text(name or UNKNOWN,
|
|
344
|
-
get_style_for_name(name)),
|
|
345
|
-
str(len(logs)),
|
|
346
|
-
f"{count:,}",
|
|
347
|
-
iso_timestamp(first_at),
|
|
348
|
-
iso_timestamp(last_at),
|
|
349
|
-
str((last_at - first_at).days + 1),
|
|
350
|
-
)
|
|
351
|
-
|
|
352
|
-
console.print(counts_table)
|
|
301
|
+
console.print(MessengerLog.summary_table(self.imessage_logs))
|
|
353
302
|
text_summary_msg = f"\nDeanonymized {self.identified_imessage_log_count()} of "
|
|
354
|
-
text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files)} files."
|
|
303
|
+
text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files):,} files."
|
|
355
304
|
console.print(text_summary_msg)
|
|
356
305
|
imessage_msg_count = sum([len(log.messages()) for log in self.imessage_logs])
|
|
357
|
-
console.print(f"Found {imessage_msg_count}
|
|
358
|
-
console.print(f"(Last deploy found 4668 messages in 77 conversations)", style='dim')
|
|
306
|
+
console.print(f"Found {imessage_msg_count} text messages in {len(self.imessage_logs)} iMessage log files.")
|
|
359
307
|
|
|
360
308
|
def print_other_files_table(self) -> list[OtherFile]:
|
|
361
|
-
"""Returns the
|
|
309
|
+
"""Returns the OtherFile objects that were interesting enough to print."""
|
|
362
310
|
interesting_files = [doc for doc in self.other_files if args.all_other_files or doc.is_interesting()]
|
|
363
311
|
header_pfx = '' if args.all_other_files else 'Selected '
|
|
364
312
|
print_section_header(f"{FIRST_FEW_LINES} of {len(interesting_files)} {header_pfx}Files That Are Neither Emails Nor Text Msgs")
|
|
365
313
|
|
|
366
314
|
if not args.all_other_files:
|
|
367
|
-
print_centered(f"(the other site is uncurated and has all {len(self.other_files)} unclassifiable files and
|
|
315
|
+
print_centered(f"(the other site is uncurated and has all {len(self.other_files)} unclassifiable files and {len(self.emails):,} emails)", style='dim')
|
|
368
316
|
print_other_site_link(False)
|
|
369
317
|
console.line(2)
|
|
370
318
|
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
table.add_column('Date', justify='center')
|
|
374
|
-
table.add_column('Length', justify='center')
|
|
375
|
-
table.add_column(FIRST_FEW_LINES, justify='left', style='pale_turquoise4')
|
|
319
|
+
console.print(OtherFile.build_table(interesting_files))
|
|
320
|
+
skipped_file_count = len(self.other_files) - len(interesting_files)
|
|
376
321
|
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
date_str = doc.date_str()
|
|
322
|
+
if skipped_file_count > 0:
|
|
323
|
+
logger.warning(f"Skipped {skipped_file_count} uninteresting files...")
|
|
380
324
|
|
|
381
|
-
|
|
382
|
-
preview_text = doc.duplicate_file_txt()
|
|
383
|
-
row_style = ' dim'
|
|
384
|
-
else:
|
|
385
|
-
preview_text = doc.highlighted_preview_text()
|
|
386
|
-
row_style = ''
|
|
325
|
+
return interesting_files
|
|
387
326
|
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
style=row_style
|
|
394
|
-
)
|
|
327
|
+
def _tally_email_data(self) -> None:
|
|
328
|
+
"""Tally up summary info about Email objects."""
|
|
329
|
+
for email in self.emails:
|
|
330
|
+
if email.is_duplicate:
|
|
331
|
+
continue
|
|
395
332
|
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
333
|
+
self.email_author_counts[email.author] += 1
|
|
334
|
+
|
|
335
|
+
if len(email.recipients) == 0:
|
|
336
|
+
self.unknown_recipient_email_ids.add(email.file_id)
|
|
337
|
+
self.email_recipient_counts[None] += 1
|
|
338
|
+
else:
|
|
339
|
+
for recipient in email.recipients:
|
|
340
|
+
self.email_recipient_counts[recipient] += 1
|
|
341
|
+
|
|
342
|
+
if email.sent_from_device:
|
|
343
|
+
self.email_authors_to_device_signatures[email.author_or_unknown()].add(email.sent_from_device)
|
|
344
|
+
self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())
|
|
399
345
|
|
|
400
346
|
|
|
401
347
|
def build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
|
|
@@ -413,18 +359,6 @@ def build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str]
|
|
|
413
359
|
return Padding(table, DEVICE_SIGNATURE_PADDING)
|
|
414
360
|
|
|
415
361
|
|
|
416
|
-
def is_ok_for_epstein_web(name: str | None) -> bool:
|
|
417
|
-
"""Return True if it's likely that EpsteinWeb has a page for this name."""
|
|
418
|
-
if name is None or ' ' not in name:
|
|
419
|
-
return False
|
|
420
|
-
elif '@' in name or '/' in name or '??' in name:
|
|
421
|
-
return False
|
|
422
|
-
elif name in INVALID_FOR_EPSTEIN_WEB:
|
|
423
|
-
return False
|
|
424
|
-
|
|
425
|
-
return True
|
|
426
|
-
|
|
427
|
-
|
|
428
362
|
def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
|
|
429
363
|
counts: dict[str | None, int] = defaultdict(int)
|
|
430
364
|
|
|
@@ -435,3 +369,28 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
|
|
|
435
369
|
counts[None] += 1
|
|
436
370
|
|
|
437
371
|
return counts
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def document_cls(document: Document) -> Type[Document]:
|
|
375
|
+
search_area = document.text[0:5000] # Limit search area to avoid pointless scans of huge files
|
|
376
|
+
|
|
377
|
+
if document.text[0] == '{':
|
|
378
|
+
return JsonFile
|
|
379
|
+
elif isinstance(document.config, EmailCfg) or DETECT_EMAIL_REGEX.match(search_area):
|
|
380
|
+
return Email
|
|
381
|
+
elif MSG_REGEX.search(search_area):
|
|
382
|
+
return MessengerLog
|
|
383
|
+
else:
|
|
384
|
+
return OtherFile
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def is_ok_for_epstein_web(name: str | None) -> bool:
|
|
388
|
+
"""Return True if it's likely that EpsteinWeb has a page for this name."""
|
|
389
|
+
if name is None or ' ' not in name:
|
|
390
|
+
return False
|
|
391
|
+
elif '@' in name or '/' in name or '??' in name:
|
|
392
|
+
return False
|
|
393
|
+
elif name in INVALID_FOR_EPSTEIN_WEB:
|
|
394
|
+
return False
|
|
395
|
+
|
|
396
|
+
return True
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from epstein_files.util.constant.strings import QUESTION_MARKS, remove_question_marks
|
|
2
2
|
|
|
3
|
-
|
|
4
3
|
UNKNOWN = '(unknown)'
|
|
5
4
|
|
|
6
5
|
# Texting Names
|
|
@@ -170,6 +169,7 @@ ZUBAIR_KHAN = 'Zubair Khan'
|
|
|
170
169
|
|
|
171
170
|
# No communications but name is in the files
|
|
172
171
|
BILL_GATES = 'Bill Gates'
|
|
172
|
+
DONALD_TRUMP = 'Donald Trump'
|
|
173
173
|
ELON_MUSK = 'Elon Musk'
|
|
174
174
|
HENRY_HOLT = 'Henry Holt' # Actually a company?
|
|
175
175
|
IVANKA = 'Ivanka'
|
|
@@ -184,15 +184,22 @@ TULSI_GABBARD = 'Tulsi Gabbard'
|
|
|
184
184
|
VIRGINIA_GIUFFRE = 'Virginia Giuffre'
|
|
185
185
|
|
|
186
186
|
# Organizations
|
|
187
|
+
BOFA = 'BofA'
|
|
187
188
|
CNN = 'CNN'
|
|
188
189
|
DEUTSCHE_BANK = 'Deutsche Bank'
|
|
190
|
+
ELECTRON_CAPITAL_PARTNERS = 'Electron Capital Partners'
|
|
189
191
|
GOLDMAN_SACHS = 'Goldman Sachs'
|
|
192
|
+
GOLDMAN_INVESTMENT_MGMT = f'{GOLDMAN_SACHS} Investment Management Division'
|
|
190
193
|
HARVARD = 'Harvard'
|
|
191
194
|
INSIGHTS_POD = f"InsightsPod" # Zubair bots
|
|
195
|
+
NEXT_MANAGEMENT = 'Next Management LLC'
|
|
192
196
|
JP_MORGAN = 'JP Morgan'
|
|
193
197
|
OSBORNE_LLP = f"{IAN_OSBORNE} & Partners LLP" # Ian Osborne's PR firm
|
|
198
|
+
TRUMP_ORG = 'Trump Organization'
|
|
199
|
+
UBS = 'UBS'
|
|
194
200
|
|
|
195
201
|
# Locations
|
|
202
|
+
PALM_BEACH = 'Palm Beach'
|
|
196
203
|
VIRGIN_ISLANDS = 'Virgin Islands'
|
|
197
204
|
|
|
198
205
|
# First and last names that should be made part of a highlighting regex for emailers
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
PICKLED_PATH = Path("the_epstein_files.pkl.gz")
|
|
4
|
+
|
|
5
|
+
EPSTEIN_FILES_NOV_2025 = 'epstein_files_nov_2025'
|
|
6
|
+
URLS_ENV = '.urls.env'
|
|
7
|
+
|
|
8
|
+
HTML_DIR = Path('docs')
|
|
9
|
+
ALL_EMAILS_PATH = HTML_DIR.joinpath(f'all_emails_{EPSTEIN_FILES_NOV_2025}.html')
|
|
10
|
+
JSON_METADATA_PATH = HTML_DIR.joinpath(f'file_metadata_{EPSTEIN_FILES_NOV_2025}.json')
|
|
11
|
+
TEXT_MSGS_HTML_PATH = HTML_DIR.joinpath('index.html')
|
|
12
|
+
WORD_COUNT_HTML_PATH = HTML_DIR.joinpath(f'communication_word_count_{EPSTEIN_FILES_NOV_2025}.html')
|
|
13
|
+
# EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
|
|
14
|
+
|
|
15
|
+
BUILD_ARTIFACTS = [
|
|
16
|
+
ALL_EMAILS_PATH,
|
|
17
|
+
# EPSTEIN_WORD_COUNT_HTML_PATH,
|
|
18
|
+
JSON_METADATA_PATH,
|
|
19
|
+
TEXT_MSGS_HTML_PATH,
|
|
20
|
+
WORD_COUNT_HTML_PATH,
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def make_clean() -> None:
|
|
25
|
+
"""Delete all build artifacts."""
|
|
26
|
+
for build_file in BUILD_ARTIFACTS:
|
|
27
|
+
if build_file.exists():
|
|
28
|
+
print(f"Removing build file '{build_file}'...")
|
|
29
|
+
build_file.unlink()
|
|
@@ -9,6 +9,27 @@ JSON_FILE_CLASS = 'JsonFile'
|
|
|
9
9
|
MESSENGER_LOG_CLASS = 'MessengerLog'
|
|
10
10
|
OTHER_FILE_CLASS = 'OtherFile'
|
|
11
11
|
|
|
12
|
+
# categories
|
|
13
|
+
ACADEMIA = 'academia'
|
|
14
|
+
ARTS = 'arts'
|
|
15
|
+
ARTICLE = 'article'
|
|
16
|
+
BOOK = 'book'
|
|
17
|
+
BUSINESS = 'business'
|
|
18
|
+
CONFERENCE = 'conference'
|
|
19
|
+
ENTERTAINER = 'entertainer'
|
|
20
|
+
FINANCE = 'finance'
|
|
21
|
+
FLIGHT_LOGS = 'flight logs'
|
|
22
|
+
JOURNALIST = 'journalist'
|
|
23
|
+
JUNK = 'junk'
|
|
24
|
+
LEGAL = 'legal'
|
|
25
|
+
LOBBYIST = 'lobbyist'
|
|
26
|
+
POLITICS = 'politics'
|
|
27
|
+
PROPERTY = 'property'
|
|
28
|
+
PUBLICIST = 'publicist'
|
|
29
|
+
REPUTATION = 'reputation'
|
|
30
|
+
SOCIAL = 'social'
|
|
31
|
+
SPEECH = 'speech'
|
|
32
|
+
|
|
12
33
|
# Publications
|
|
13
34
|
BBC = 'BBC'
|
|
14
35
|
BLOOMBERG = 'Bloomberg'
|
|
@@ -36,11 +57,17 @@ TIMESTAMP_DIM = f"turquoise4 dim"
|
|
|
36
57
|
AUTHOR = 'author'
|
|
37
58
|
DEFAULT = 'default'
|
|
38
59
|
EVERYONE = 'everyone'
|
|
60
|
+
FIRST_FEW_LINES = 'First Few Lines'
|
|
39
61
|
HOUSE_OVERSIGHT_PREFIX = 'HOUSE_OVERSIGHT_'
|
|
62
|
+
JSON = 'json'
|
|
40
63
|
NA = 'n/a'
|
|
41
64
|
REDACTED = '<REDACTED>'
|
|
42
65
|
URL_SIGNIFIERS = ['gclid', 'htm', 'ref=', 'utm']
|
|
43
66
|
QUESTION_MARKS = '(???)'
|
|
67
|
+
|
|
68
|
+
# Regexes
|
|
69
|
+
FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}(\d{{6}}(_\d{{1,2}})?)")
|
|
70
|
+
FILE_NAME_REGEX = re.compile(fr"{FILE_STEM_REGEX.pattern}(\.txt(\.json)?)?")
|
|
44
71
|
QUESTION_MARKS_REGEX = re.compile(fr' {re.escape(QUESTION_MARKS)}$')
|
|
45
72
|
|
|
46
73
|
|
|
@@ -5,8 +5,9 @@ from typing import Literal
|
|
|
5
5
|
from inflection import parameterize
|
|
6
6
|
from rich.text import Text
|
|
7
7
|
|
|
8
|
+
from epstein_files.util.constant.output_files import *
|
|
8
9
|
from epstein_files.util.constant.strings import EMAIL, TEXT_MESSAGE, SiteType
|
|
9
|
-
from epstein_files.util.file_helper import coerce_file_stem
|
|
10
|
+
from epstein_files.util.file_helper import coerce_file_stem
|
|
10
11
|
|
|
11
12
|
# Style stuff
|
|
12
13
|
ARCHIVE_LINK_COLOR = 'slate_blue3'
|
|
@@ -20,8 +21,29 @@ EPSTEIN_WEB = 'EpsteinWeb'
|
|
|
20
21
|
EPSTEINIFY = 'epsteinify'
|
|
21
22
|
JMAIL = 'Jmail'
|
|
22
23
|
|
|
23
|
-
|
|
24
|
-
|
|
24
|
+
|
|
25
|
+
# Deployment URLS
|
|
26
|
+
# NOTE: don't rename these variables without changing deploy.sh!
|
|
27
|
+
GH_PAGES_BASE_URL = 'https://michelcrypt4d4mus.github.io'
|
|
28
|
+
TEXT_MSGS_URL = f"{GH_PAGES_BASE_URL}/epstein_text_messages"
|
|
29
|
+
ALL_EMAILS_URL = f'{TEXT_MSGS_URL}/{ALL_EMAILS_PATH.name}'
|
|
30
|
+
JSON_METADATA_URL = f'{TEXT_MSGS_URL}/{JSON_METADATA_PATH.name}'
|
|
31
|
+
WORD_COUNT_URL = f'{TEXT_MSGS_URL}/{WORD_COUNT_HTML_PATH.name}'
|
|
32
|
+
|
|
33
|
+
SITE_URLS: dict[SiteType, str] = {
|
|
34
|
+
EMAIL: ALL_EMAILS_URL,
|
|
35
|
+
TEXT_MESSAGE: TEXT_MSGS_URL,
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
GH_PROJECT_URL = 'https://github.com/michelcrypt4d4mus/epstein_text_messages'
|
|
39
|
+
GH_MASTER_URL = f"{GH_PROJECT_URL}/blob/master"
|
|
40
|
+
ATTRIBUTIONS_URL = f'{GH_MASTER_URL}/epstein_files/util/constants.py'
|
|
41
|
+
EXTRACTS_BASE_URL = f'{GH_MASTER_URL}/emails_extracted_from_legal_filings'
|
|
42
|
+
|
|
43
|
+
extracted_file_url = lambda f: f"{EXTRACTS_BASE_URL}/{f}"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# External URLs
|
|
25
47
|
COFFEEZILLA_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=061ce61c9e70bdfd'
|
|
26
48
|
COURIER_NEWSROOM_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=092314e384a58618'
|
|
27
49
|
EPSTEINIFY_URL = 'https://epsteinify.com'
|
|
@@ -31,12 +53,6 @@ JMAIL_URL = 'https://jmail.world'
|
|
|
31
53
|
OVERSIGHT_REPUBLICANS_PRESSER_URL = 'https://oversight.house.gov/release/oversight-committee-releases-additional-epstein-estate-documents/'
|
|
32
54
|
RAW_OVERSIGHT_DOCS_GOOGLE_DRIVE_URL = 'https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_'
|
|
33
55
|
SUBSTACK_URL = 'https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great'
|
|
34
|
-
WORD_COUNT_URL = 'https://michelcrypt4d4mus.github.io/epstein_text_messages/epstein_emails_word_count.html'
|
|
35
|
-
|
|
36
|
-
SITE_URLS: dict[SiteType, str] = {
|
|
37
|
-
EMAIL: 'https://michelcrypt4d4mus.github.io/epstein_emails_house_oversight/',
|
|
38
|
-
TEXT_MESSAGE: 'https://michelcrypt4d4mus.github.io/epstein_text_messages/',
|
|
39
|
-
}
|
|
40
56
|
|
|
41
57
|
DOC_LINK_BASE_URLS: dict[ExternalSite, str] = {
|
|
42
58
|
EPSTEIN_MEDIA: f"{EPSTEIN_MEDIA_URL}/files",
|