epstein-files 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +59 -51
- epstein_files/documents/communication.py +9 -9
- epstein_files/documents/document.py +111 -87
- epstein_files/documents/email.py +154 -85
- epstein_files/documents/emails/email_header.py +7 -6
- epstein_files/documents/imessage/text_message.py +3 -2
- epstein_files/documents/json_file.py +17 -0
- epstein_files/documents/messenger_log.py +62 -3
- epstein_files/documents/other_file.py +165 -17
- epstein_files/epstein_files.py +100 -143
- epstein_files/util/constant/names.py +6 -0
- epstein_files/util/constant/strings.py +27 -0
- epstein_files/util/constant/urls.py +22 -9
- epstein_files/util/constants.py +968 -1015
- epstein_files/util/data.py +14 -28
- epstein_files/util/{file_cfg.py → doc_cfg.py} +120 -34
- epstein_files/util/env.py +16 -18
- epstein_files/util/file_helper.py +56 -17
- epstein_files/util/highlighted_group.py +227 -175
- epstein_files/util/logging.py +57 -0
- epstein_files/util/rich.py +18 -13
- epstein_files/util/search_result.py +14 -6
- epstein_files/util/timer.py +24 -0
- epstein_files/util/word_count.py +2 -1
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.1.dist-info}/METADATA +3 -2
- epstein_files-1.0.1.dist-info/RECORD +30 -0
- epstein_files-1.0.0.dist-info/RECORD +0 -28
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.1.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.1.dist-info}/WHEEL +0 -0
epstein_files/epstein_files.py
CHANGED
@@ -1,14 +1,14 @@
 import gzip
+import json
 import pickle
 import re
 from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
-from typing import
+from typing import Sequence, Type
 
 from rich.align import Align
-from rich.console import Group
 from rich.padding import Padding
 from rich.table import Table
 from rich.text import Text
@@ -23,20 +23,21 @@ from epstein_files.util.constant.strings import *
 from epstein_files.util.constant.urls import (EPSTEIN_WEB, JMAIL, epsteinify_name_url, epstein_web_person_url,
     search_jmail_url, search_twitter_url)
 from epstein_files.util.constants import *
-from epstein_files.util.data import
+from epstein_files.util.data import dict_sets_to_lists, json_safe, sort_dict
+from epstein_files.util.doc_cfg import EmailCfg
 from epstein_files.util.env import args, logger
-from epstein_files.util.
-from epstein_files.util.file_helper import DOCS_DIR, FILENAME_LENGTH, PICKLED_PATH, file_size_str
+from epstein_files.util.file_helper import DOCS_DIR, PICKLED_PATH, file_size_str
 from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
-from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT,
-
+from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, add_cols_to_table, console, highlighter,
+    link_text_obj, link_markup, print_author_header, print_centered, print_other_site_link, print_panel,
     print_section_header, vertically_pad)
 from epstein_files.util.search_result import SearchResult
+from epstein_files.util.timer import Timer
 
 DEVICE_SIGNATURE = 'Device Signature'
-FIRST_FEW_LINES = 'First Few Lines'
 DEVICE_SIGNATURE_PADDING = (1, 0)
 NOT_INCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]
+SLOW_FILE_SECONDS = 0.4
 
 INVALID_FOR_EPSTEIN_WEB = JUNK_EMAILERS + KRASSNER_RECIPIENTS + [
     'ACT for America',
@@ -59,51 +60,34 @@ class EpsteinFiles:
     email_authors_to_device_signatures: dict[str, set] = field(default_factory=lambda: defaultdict(set))
     email_device_signatures_to_authors: dict[str, set] = field(default_factory=lambda: defaultdict(set))
     email_recipient_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
-
+    unknown_recipient_email_ids: set[str] = field(default_factory=set)
 
     def __post_init__(self):
+        """Iterate through files and build appropriate objects."""
         self.all_files = [f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')]
+        documents = []
 
         # Read through and classify all the files
         for file_arg in self.all_files:
-
+            doc_timer = Timer(decimals=4)
             document = Document(file_arg)
 
             if document.length == 0:
-                logger.
-
-
-
-
-
-                # Handle iMessage log files
-                self.imessage_logs.append(MessengerLog(file_arg, text=document.text))
-                logger.info(self.imessage_logs[-1].description().plain)
-            elif DETECT_EMAIL_REGEX.match(document.text) or isinstance(document.config, MessageCfg):
-                # Handle emails
-                email = Email(file_arg, text=document.text)
-                logger.info(email.description().plain)
-                self.emails.append(email)
-                self.email_author_counts[email.author] += 1
-
-                if len(email.recipients) == 0:
-                    self._email_unknown_recipient_file_ids.add(email.file_id)
-                    self.email_recipient_counts[None] += 1
-                else:
-                    for recipient in email.recipients:
-                        self.email_recipient_counts[recipient] += 1
-
-                if email.sent_from_device:
-                    self.email_authors_to_device_signatures[email.author_or_unknown()].add(email.sent_from_device)
-                    self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())
-            else:
-                # Handle OtherFiles
-                self.other_files.append(OtherFile(file_arg, text=document.text))
-                logger.info(self.other_files[-1].description().plain)
+                logger.warning(f"Skipping empty file: {document}")
+                continue
+
+            cls = document_cls(document)
+            documents.append(cls(file_arg, text=document.text))
+            logger.info(str(documents[-1]))
 
-
-
-
+            if doc_timer.seconds_since_start() > SLOW_FILE_SECONDS:
+                doc_timer.print_at_checkpoint(f"Slow file: {documents[-1]} processed")
+
+        self.emails = Document.sort_by_timestamp([d for d in documents if isinstance(d, Email)])
+        self.imessage_logs = Document.sort_by_timestamp([d for d in documents if isinstance(d, MessengerLog)])
+        self.other_files = Document.sort_by_timestamp([d for d in documents if isinstance(d, (JsonFile, OtherFile))])
+        self.json_files = [doc for doc in self.other_files if isinstance(doc, JsonFile)]
+        self._tally_email_data()
 
     @classmethod
     def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
@@ -141,18 +125,17 @@
     def docs_matching(
         self,
         pattern: re.Pattern | str,
-        file_type: Literal['all', 'other'] = 'all',
         names: list[str | None] | None = None
     ) -> list[SearchResult]:
         """Find documents whose text matches a pattern (file_type and names args limit the documents searched)."""
         results: list[SearchResult] = []
 
-        for doc in
-
-
-            if names and ((not isinstance(doc, (Email, MessengerLog))) or doc.author not in names):
+        for doc in self.all_documents():
+            if names and doc.author not in names:
                 continue
 
+            lines = doc.matching_lines(pattern)
+
             if len(lines) > 0:
                 results.append(SearchResult(doc, lines))
 
@@ -178,7 +161,7 @@ class EpsteinFiles:
         return substitution_counts
 
     def email_unknown_recipient_file_ids(self) -> list[str]:
-        return sorted(list(self.
+        return sorted(list(self.unknown_recipient_email_ids))
 
     def emails_by(self, author: str | None) -> list[Email]:
         return [e for e in self.emails if e.author == author]
@@ -198,33 +181,38 @@ class EpsteinFiles:
         else:
             return [e for e in self.emails if author in e.recipients]
 
-    def
-    if
-
+    def get_documents_by_id(self, file_ids: list[str]) -> list[Document]:
+        docs = [doc for doc in self.all_documents() if doc.file_id in file_ids]
+
+        if len(docs) != len(file_ids):
+            logger.warning(f"{len(file_ids)} file IDs provided but only {len(docs)} Epstein files found!")
+
+        return docs
 
-
-        return
+    def imessage_logs_for(self, author: str | None | list[str | None]) -> Sequence[MessengerLog]:
+        return MessengerLog.logs_for(author, self.imessage_logs)
 
     def identified_imessage_log_count(self) -> int:
         return len([log for log in self.imessage_logs if log.author])
 
-    def
-
-
-
-    for
-
+    def json_metadata(self) -> str:
+        metadata = {
+            EMAIL_CLASS: [json_safe(doc.metadata()) for doc in self.emails],
+            MESSENGER_LOG_CLASS: [json_safe(doc.metadata()) for doc in self.imessage_logs],
+            OTHER_FILE_CLASS: [json_safe(doc.metadata()) for doc in self.other_files],
+        }
 
-        return
+        return json.dumps(metadata, indent=4, sort_keys=True)
 
     def print_files_summary(self) -> None:
+        other_files = [doc for doc in self.other_files if not isinstance(doc, JsonFile)]
         dupes = defaultdict(int)
 
         for doc in self.all_documents():
             if doc.is_duplicate:
-                dupes[doc.
+                dupes[doc.class_name()] += 1
 
-        table = Table()
+        table = Table(title='Summary of Document Types')
         add_cols_to_table(table, ['File Type', 'Files', 'Author Known', 'Author Unknown', 'Duplicates'])
 
         def add_row(label: str, docs: list, known: int | None = None, dupes: int | None = None):
@@ -239,7 +227,7 @@ class EpsteinFiles:
         add_row('iMessage Logs', self.imessage_logs, self.identified_imessage_log_count())
         add_row('Emails', self.emails, len([e for e in self.emails if e.author]), dupes[EMAIL_CLASS])
         add_row('JSON Data', self.json_files, dupes=0)
-        add_row('Other',
+        add_row('Other', other_files, dupes=dupes[OTHER_FILE_CLASS])
         console.print(Align.center(table))
         console.line()
 
@@ -247,10 +235,11 @@ class EpsteinFiles:
         """Print complete emails to or from a particular 'author'. Returns the Emails that were printed."""
         conversation_length = self.email_conversation_length_in_days(_author)
         emails = self.emails_for(_author)
+        unique_emails = [email for email in emails if not email.is_duplicate]
         author = _author or UNKNOWN
 
         print_author_header(
-            f"Found {len(
+            f"Found {len(unique_emails)} {author} emails starting {emails[0].timestamp.date()} over {conversation_length:,} days",
             get_style_for_name(author),
             get_info_for_name(author)
         )
@@ -271,28 +260,9 @@ class EpsteinFiles:
 
         return emails
 
-    def print_emails_table_for(self,
-        emails = [email for email in self.emails_for(
-        author
-
-        table = Table(
-            title=f"Emails to/from {author} starting {emails[0].timestamp.date()}",
-            border_style=get_style_for_name(author, allow_bold=False),
-            header_style="bold"
-        )
-
-        table.add_column('From', justify='left')
-        table.add_column('Timestamp', justify='center')
-        table.add_column('Subject', justify='left', style='honeydew2', min_width=60)
-
-        for email in emails:
-            table.add_row(
-                email.author_txt,
-                email.epstein_media_link(link_txt=email.timestamp_without_seconds()),
-                highlighter(email.subject())
-            )
-
-        console.print(Align.center(table), '\n')
+    def print_emails_table_for(self, author: str | None) -> None:
+        emails = [email for email in self.emails_for(author) if not email.is_duplicate]  # Remove dupes
+        console.print(Align.center(Email.build_table(emails, author)), '\n')
 
     def print_email_device_info(self) -> None:
         print_panel(f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown", padding=(4, 0, 0, 0), centered=True)
@@ -300,13 +270,13 @@ class EpsteinFiles:
         console.print(build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))
 
     def print_emailer_counts_table(self) -> None:
-        footer = f"Identified authors of {self.attributed_email_count()} emails out of {len(self.emails)}
+        footer = f"Identified authors of {self.attributed_email_count():,} emails out of {len(self.emails):,}."
         counts_table = Table(title=f"Email Counts", caption=footer, header_style="bold")
         add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_WEB, 'Twitter'])
 
         emailer_counts = {
-
-            for
+            emailer: self.email_author_counts[emailer] + self.email_recipient_counts[emailer]
+            for emailer in self.all_emailers(True)
         }
 
         for p, count in sort_dict(emailer_counts):
@@ -326,76 +296,50 @@ class EpsteinFiles:
 
     def print_imessage_summary(self) -> None:
         """Print summary table and stats for text messages."""
-
-        counts_table.add_column(AUTHOR.title(), justify='left', style="steel_blue bold", width=30)
-        counts_table.add_column('Files', justify='right', style='white')
-        counts_table.add_column("Msgs", justify='right')
-        counts_table.add_column('First Sent At', justify='center', highlight=True, width=21)
-        counts_table.add_column('Last Sent At', justify='center', style='wheat4', width=21)
-        counts_table.add_column('Days', justify='right', style='dim')
-
-        for name, count in sort_dict(self.imessage_sender_counts()):
-            logs = self.imessage_logs_for(name)
-            first_at = logs[0].first_message_at(name)
-            last_at = logs[-1].first_message_at(name)
-
-            counts_table.add_row(
-                Text(name or UNKNOWN,
-                get_style_for_name(name)),
-                str(len(logs)),
-                f"{count:,}",
-                iso_timestamp(first_at),
-                iso_timestamp(last_at),
-                str((last_at - first_at).days + 1),
-            )
-
-        console.print(counts_table)
+        console.print(MessengerLog.summary_table(self.imessage_logs))
         text_summary_msg = f"\nDeanonymized {self.identified_imessage_log_count()} of "
-        text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files)} files."
+        text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files):,} files."
         console.print(text_summary_msg)
         imessage_msg_count = sum([len(log.messages()) for log in self.imessage_logs])
-        console.print(f"Found {imessage_msg_count}
-        console.print(f"(Last deploy found 4668 messages in 77 conversations)", style='dim')
+        console.print(f"Found {imessage_msg_count} text messages in {len(self.imessage_logs)} iMessage log files.")
 
     def print_other_files_table(self) -> list[OtherFile]:
-        """Returns the
+        """Returns the OtherFile objects that were interesting enough to print."""
         interesting_files = [doc for doc in self.other_files if args.all_other_files or doc.is_interesting()]
         header_pfx = '' if args.all_other_files else 'Selected '
         print_section_header(f"{FIRST_FEW_LINES} of {len(interesting_files)} {header_pfx}Files That Are Neither Emails Nor Text Msgs")
 
         if not args.all_other_files:
-            print_centered(f"(the other site is uncurated and has all {len(self.other_files)} unclassifiable files and
+            print_centered(f"(the other site is uncurated and has all {len(self.other_files)} unclassifiable files and {len(self.emails):,} emails)", style='dim')
             print_other_site_link(False)
             console.line(2)
 
-
-
-        table.add_column('Date', justify='center')
-        table.add_column('Length', justify='center')
-        table.add_column(FIRST_FEW_LINES, justify='left', style='pale_turquoise4')
+        console.print(OtherFile.build_table(interesting_files))
+        skipped_file_count = len(self.other_files) - len(interesting_files)
 
-
-
-            date_str = doc.date_str()
+        if skipped_file_count > 0:
+            logger.warning(f"Skipped {skipped_file_count} uninteresting files...")
 
-
-                preview_text = doc.duplicate_file_txt()
-                row_style = ' dim'
-            else:
-                preview_text = doc.highlighted_preview_text()
-                row_style = ''
+        return interesting_files
 
-
-
-
-
-
-                style=row_style
-            )
+    def _tally_email_data(self) -> None:
+        """Tally up summary info about Email objects."""
+        for email in self.emails:
+            if email.is_duplicate:
+                continue
 
-
-
-
+            self.email_author_counts[email.author] += 1
+
+            if len(email.recipients) == 0:
+                self.unknown_recipient_email_ids.add(email.file_id)
+                self.email_recipient_counts[None] += 1
+            else:
+                for recipient in email.recipients:
+                    self.email_recipient_counts[recipient] += 1
+
+            if email.sent_from_device:
+                self.email_authors_to_device_signatures[email.author_or_unknown()].add(email.sent_from_device)
+                self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())
 
 
 def build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
@@ -413,6 +357,19 @@ def build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str]
     return Padding(table, DEVICE_SIGNATURE_PADDING)
 
 
+def document_cls(document: Document) -> Type[Document]:
+    search_area = document.text[0:5000]  # Limit search area to avoid pointless scans of huge files
+
+    if document.text[0] == '{':
+        return JsonFile
+    elif isinstance(document.config, EmailCfg) or DETECT_EMAIL_REGEX.match(search_area):
+        return Email
+    elif MSG_REGEX.search(search_area):
+        return MessengerLog
+    else:
+        return OtherFile
+
+
 def is_ok_for_epstein_web(name: str | None) -> bool:
     """Return True if it's likely that EpsteinWeb has a page for this name."""
    if name is None or ' ' not in name:
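
The 1.0.1 refactor above replaces the long if/elif classification chain that used to live in __post_init__ with the module-level document_cls() dispatcher: each file becomes a Document, document_cls() picks the subclass to re-instantiate it as, and the results are bucketed into emails, imessage_logs, other_files and json_files afterwards. Below is a minimal standalone sketch of that dispatch pattern only; the stand-in classes and regex patterns are illustrative assumptions (the real DETECT_EMAIL_REGEX and MSG_REGEX patterns are not shown in this diff, and the EmailCfg config check is omitted).

import re

# Illustrative stand-ins -- NOT the package's real patterns or classes
DETECT_EMAIL_REGEX = re.compile(r'^From:', re.MULTILINE)
MSG_REGEX = re.compile(r'^\[\d{4}-\d{2}-\d{2}', re.MULTILINE)

class Document:
    def __init__(self, text: str):
        self.text = text

class JsonFile(Document): pass
class Email(Document): pass
class MessengerLog(Document): pass
class OtherFile(Document): pass

def document_cls(document: Document) -> type:
    """Choose the Document subclass, scanning only the first 5000 chars of text."""
    search_area = document.text[0:5000]

    if document.text[0] == '{':          # JSON payloads start with a brace
        return JsonFile
    elif DETECT_EMAIL_REGEX.search(search_area):
        return Email
    elif MSG_REGEX.search(search_area):
        return MessengerLog
    else:
        return OtherFile

docs = [Document('{"id": 1}'), Document('From: someone@example.com\nhi'), Document('misc notes')]
print([document_cls(d).__name__ for d in docs])  # ['JsonFile', 'Email', 'OtherFile']

Centralizing the choice in one function is what lets __post_init__ shrink to a single loop and makes the per-type buckets trivial to build with isinstance() afterwards.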

epstein_files/util/constant/names.py
CHANGED
@@ -184,15 +184,21 @@ TULSI_GABBARD = 'Tulsi Gabbard'
 VIRGINIA_GIUFFRE = 'Virginia Giuffre'
 
 # Organizations
+BOFA = 'BofA'
 CNN = 'CNN'
 DEUTSCHE_BANK = 'Deutsche Bank'
+ELECTRON_CAPITAL_PARTNERS = 'Electron Capital Partners'
 GOLDMAN_SACHS = 'Goldman Sachs'
+GOLDMAN_INVESTMENT_MGMT = f'{GOLDMAN_SACHS} Investment Management Division'
 HARVARD = 'Harvard'
 INSIGHTS_POD = f"InsightsPod"  # Zubair bots
+NEXT_MANAGEMENT = 'Next Management LLC'
 JP_MORGAN = 'JP Morgan'
 OSBORNE_LLP = f"{IAN_OSBORNE} & Partners LLP"  # Ian Osborne's PR firm
+UBS = 'UBS'
 
 # Locations
+PALM_BEACH = 'Palm Beach'
 VIRGIN_ISLANDS = 'Virgin Islands'
 
 # First and last names that should be made part of a highlighting regex for emailers

epstein_files/util/constant/strings.py
CHANGED
@@ -9,6 +9,27 @@ JSON_FILE_CLASS = 'JsonFile'
 MESSENGER_LOG_CLASS = 'MessengerLog'
 OTHER_FILE_CLASS = 'OtherFile'
 
+# categories
+ACADEMIA = 'academia'
+ARTS = 'arts'
+ARTICLE = 'article'
+BOOK = 'book'
+BUSINESS = 'business'
+CONFERENCE = 'conference'
+ENTERTAINER = 'entertainer'
+FINANCE = 'finance'
+FLIGHT_LOGS = 'flight logs'
+JOURNALIST = 'journalist'
+JUNK = 'junk'
+LEGAL = 'legal'
+LOBBYIST = 'lobbyist'
+POLITICS = 'politics'
+PROPERTY = 'property'
+PUBLICIST = 'publicist'
+REPUTATION = 'reputation'
+SOCIAL = 'social'
+SPEECH = 'speech'
+
 # Publications
 BBC = 'BBC'
 BLOOMBERG = 'Bloomberg'
@@ -36,11 +57,17 @@ TIMESTAMP_DIM = f"turquoise4 dim"
 AUTHOR = 'author'
 DEFAULT = 'default'
 EVERYONE = 'everyone'
+FIRST_FEW_LINES = 'First Few Lines'
 HOUSE_OVERSIGHT_PREFIX = 'HOUSE_OVERSIGHT_'
+JSON = 'json'
 NA = 'n/a'
 REDACTED = '<REDACTED>'
 URL_SIGNIFIERS = ['gclid', 'htm', 'ref=', 'utm']
 QUESTION_MARKS = '(???)'
+
+# Regexes
+FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}(\d{{6}}(_\d{{1,2}})?)")
+FILE_NAME_REGEX = re.compile(fr"{FILE_STEM_REGEX.pattern}(\.txt(\.json)?)?")
 QUESTION_MARKS_REGEX = re.compile(fr' {re.escape(QUESTION_MARKS)}$')
 
 
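
The two regexes added above pull a House Oversight document ID out of a file stem or filename; FILE_NAME_REGEX simply allows an optional .txt / .txt.json suffix on top of FILE_STEM_REGEX. A quick illustration of what capture group 1 returns, using hypothetical filenames in the HOUSE_OVERSIGHT_* naming scheme:

import re

HOUSE_OVERSIGHT_PREFIX = 'HOUSE_OVERSIGHT_'
FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}(\d{{6}}(_\d{{1,2}})?)")
FILE_NAME_REGEX = re.compile(fr"{FILE_STEM_REGEX.pattern}(\.txt(\.json)?)?")

# Hypothetical filenames, for illustration only
for name in ('HOUSE_OVERSIGHT_012345.txt', 'HOUSE_OVERSIGHT_012345_2.txt.json', 'HOUSE_OVERSIGHT_012345'):
    match = FILE_NAME_REGEX.match(name)
    print(f"{name} -> {match.group(1) if match else None}")
# HOUSE_OVERSIGHT_012345.txt -> 012345
# HOUSE_OVERSIGHT_012345_2.txt.json -> 012345_2
# HOUSE_OVERSIGHT_012345 -> 012345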

epstein_files/util/constant/urls.py
CHANGED
@@ -6,7 +6,7 @@ from inflection import parameterize
 from rich.text import Text
 
 from epstein_files.util.constant.strings import EMAIL, TEXT_MESSAGE, SiteType
-from epstein_files.util.file_helper import
+from epstein_files.util.file_helper import JSON_METADATA_PATH, WORD_COUNT_HTML_PATH, coerce_file_stem
 
 # Style stuff
 ARCHIVE_LINK_COLOR = 'slate_blue3'
@@ -20,8 +20,27 @@ EPSTEIN_WEB = 'EpsteinWeb'
 EPSTEINIFY = 'epsteinify'
 JMAIL = 'Jmail'
 
-
-
+
+# Cryptadamus URLs
+GH_PAGES_BASE_URL = 'https://michelcrypt4d4mus.github.io'
+TEXT_MSGS_BASE_URL = f"{GH_PAGES_BASE_URL}/epstein_text_messages"
+JSON_METADATA_URL = f'{TEXT_MSGS_BASE_URL}/{JSON_METADATA_PATH.name}'
+WORD_COUNT_URL = f'{TEXT_MSGS_BASE_URL}/{WORD_COUNT_HTML_PATH.name}'
+
+SITE_URLS: dict[SiteType, str] = {
+    EMAIL: f'{GH_PAGES_BASE_URL}/epstein_emails_house_oversight/',  # TODO should just be same repo
+    TEXT_MESSAGE: TEXT_MSGS_BASE_URL,
+}
+
+GH_PROJECT_URL = 'https://github.com/michelcrypt4d4mus/epstein_text_messages'
+GH_MASTER_URL = f"{GH_PROJECT_URL}/blob/master"
+ATTRIBUTIONS_URL = f'{GH_MASTER_URL}/epstein_files/util/constants.py'
+EXTRACTS_BASE_URL = f'{GH_MASTER_URL}/emails_extracted_from_legal_filings'
+
+extracted_file_url = lambda f: f"{EXTRACTS_BASE_URL}/{f}"
+
+
+# External URLs
 COFFEEZILLA_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=061ce61c9e70bdfd'
 COURIER_NEWSROOM_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=092314e384a58618'
 EPSTEINIFY_URL = 'https://epsteinify.com'
@@ -31,12 +50,6 @@ JMAIL_URL = 'https://jmail.world'
 OVERSIGHT_REPUBLICANS_PRESSER_URL = 'https://oversight.house.gov/release/oversight-committee-releases-additional-epstein-estate-documents/'
 RAW_OVERSIGHT_DOCS_GOOGLE_DRIVE_URL = 'https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_'
 SUBSTACK_URL = 'https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great'
-WORD_COUNT_URL = 'https://michelcrypt4d4mus.github.io/epstein_text_messages/epstein_emails_word_count.html'
-
-SITE_URLS: dict[SiteType, str] = {
-    EMAIL: 'https://michelcrypt4d4mus.github.io/epstein_emails_house_oversight/',
-    TEXT_MESSAGE: 'https://michelcrypt4d4mus.github.io/epstein_text_messages/',
-}
 
 DOC_LINK_BASE_URLS: dict[ExternalSite, str] = {
     EPSTEIN_MEDIA: f"{EPSTEIN_MEDIA_URL}/files",