epstein-files 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +194 -0
- epstein_files/documents/communication.py +53 -0
- epstein_files/documents/document.py +357 -0
- epstein_files/documents/email.py +655 -0
- epstein_files/documents/emails/email_header.py +167 -0
- epstein_files/documents/imessage/text_message.py +93 -0
- epstein_files/documents/json_file.py +23 -0
- epstein_files/documents/messenger_log.py +73 -0
- epstein_files/documents/other_file.py +117 -0
- epstein_files/epstein_files.py +437 -0
- epstein_files/util/constant/common_words.py +94 -0
- epstein_files/util/constant/html.py +57 -0
- epstein_files/util/constant/names.py +261 -0
- epstein_files/util/constant/strings.py +47 -0
- epstein_files/util/constant/urls.py +103 -0
- epstein_files/util/constants.py +1552 -0
- epstein_files/util/data.py +131 -0
- epstein_files/util/env.py +80 -0
- epstein_files/util/file_cfg.py +172 -0
- epstein_files/util/file_helper.py +81 -0
- epstein_files/util/highlighted_group.py +620 -0
- epstein_files/util/rich.py +324 -0
- epstein_files/util/search_result.py +15 -0
- epstein_files/util/word_count.py +191 -0
- epstein_files-1.0.0.dist-info/LICENSE +674 -0
- epstein_files-1.0.0.dist-info/METADATA +60 -0
- epstein_files-1.0.0.dist-info/RECORD +28 -0
- epstein_files-1.0.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
import gzip
|
|
2
|
+
import pickle
|
|
3
|
+
import re
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Literal, Sequence
|
|
9
|
+
|
|
10
|
+
from rich.align import Align
|
|
11
|
+
from rich.console import Group
|
|
12
|
+
from rich.padding import Padding
|
|
13
|
+
from rich.table import Table
|
|
14
|
+
from rich.text import Text
|
|
15
|
+
|
|
16
|
+
from epstein_files.documents.document import Document
|
|
17
|
+
from epstein_files.documents.email import DETECT_EMAIL_REGEX, JUNK_EMAILERS, KRASSNER_RECIPIENTS, USELESS_EMAILERS, Email
|
|
18
|
+
from epstein_files.documents.emails.email_header import AUTHOR
|
|
19
|
+
from epstein_files.documents.json_file import JsonFile
|
|
20
|
+
from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
|
|
21
|
+
from epstein_files.documents.other_file import OtherFile
|
|
22
|
+
from epstein_files.util.constant.strings import *
|
|
23
|
+
from epstein_files.util.constant.urls import (EPSTEIN_WEB, JMAIL, epsteinify_name_url, epstein_web_person_url,
|
|
24
|
+
search_jmail_url, search_twitter_url)
|
|
25
|
+
from epstein_files.util.constants import *
|
|
26
|
+
from epstein_files.util.data import Timer, dict_sets_to_lists, iso_timestamp, sort_dict
|
|
27
|
+
from epstein_files.util.env import args, logger
|
|
28
|
+
from epstein_files.util.file_cfg import MessageCfg
|
|
29
|
+
from epstein_files.util.file_helper import DOCS_DIR, FILENAME_LENGTH, PICKLED_PATH, file_size_str
|
|
30
|
+
from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
|
|
31
|
+
from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, QUESTION_MARK_TXT, add_cols_to_table, console,
|
|
32
|
+
highlighter, link_text_obj, link_markup, print_author_header, print_centered, print_other_site_link, print_panel,
|
|
33
|
+
print_section_header, vertically_pad)
|
|
34
|
+
from epstein_files.util.search_result import SearchResult
|
|
35
|
+
|
|
36
|
+
# Column labels reused by several tables below.
DEVICE_SIGNATURE = 'Device Signature'
FIRST_FEW_LINES = 'First Few Lines'
# (vertical, horizontal) rich Padding applied around the device signature tables.
DEVICE_SIGNATURE_PADDING = (1, 0)
# Lowercased names excluded from all_emailers() unless include_useless=True.
NOT_INCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]

# Names for which EpsteinWeb almost certainly has no person page; checked by is_ok_for_epstein_web().
INVALID_FOR_EPSTEIN_WEB = JUNK_EMAILERS + KRASSNER_RECIPIENTS + [
    'ACT for America',
    'BS Stern',
    INTELLIGENCE_SQUARED,
    UNKNOWN,
]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class EpsteinFiles:
    """
    Container for the whole document corpus.  On construction it scans DOCS_DIR once,
    classifies each file as an Email, MessengerLog (iMessage), JsonFile, or OtherFile,
    and accumulates per-author / per-recipient email statistics.  Also provides the
    various print_*() methods that render summary tables to the rich console.
    """
    # Every non-hidden regular file found in DOCS_DIR (populated in __post_init__).
    all_files: list[Path] = field(init=False)
    emails: list[Email] = field(default_factory=list)
    imessage_logs: list[MessengerLog] = field(default_factory=list)
    json_files: list[JsonFile] = field(default_factory=list)
    other_files: list[OtherFile] = field(default_factory=list)

    # Analytics / calculations
    email_author_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
    email_authors_to_device_signatures: dict[str, set] = field(default_factory=lambda: defaultdict(set))
    email_device_signatures_to_authors: dict[str, set] = field(default_factory=lambda: defaultdict(set))
    email_recipient_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
    # File IDs of emails that had no identifiable recipient.
    _email_unknown_recipient_file_ids: set[str] = field(default_factory=set)

    def __post_init__(self):
        """Scan DOCS_DIR, classify every file, and build the email statistics."""
        self.all_files = [f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')]

        # Read through and classify all the files
        for file_arg in self.all_files:
            logger.info(f"Scanning '{file_arg.name}'...")
            document = Document(file_arg)

            if document.length == 0:
                logger.info(f"Skipping empty file {document.description().plain}")
            elif document.text[0] == '{':
                # Handle JSON files
                self.json_files.append(JsonFile(file_arg, text=document.text))
                logger.info(self.json_files[-1].description().plain)
            elif MSG_REGEX.search(document.text):
                # Handle iMessage log files
                self.imessage_logs.append(MessengerLog(file_arg, text=document.text))
                logger.info(self.imessage_logs[-1].description().plain)
            elif DETECT_EMAIL_REGEX.match(document.text) or isinstance(document.config, MessageCfg):
                # Handle emails
                email = Email(file_arg, text=document.text)
                logger.info(email.description().plain)
                self.emails.append(email)
                self.email_author_counts[email.author] += 1

                if len(email.recipients) == 0:
                    # No recipient could be extracted; counted under the None key.
                    self._email_unknown_recipient_file_ids.add(email.file_id)
                    self.email_recipient_counts[None] += 1
                else:
                    for recipient in email.recipients:
                        self.email_recipient_counts[recipient] += 1

                # Track "Sent from [DEVICE]" signatures in both directions (author -> devices, device -> authors).
                if email.sent_from_device:
                    self.email_authors_to_device_signatures[email.author_or_unknown()].add(email.sent_from_device)
                    self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())
            else:
                # Handle OtherFiles
                self.other_files.append(OtherFile(file_arg, text=document.text))
                logger.info(self.other_files[-1].description().plain)

        # NOTE: JSON files are folded into other_files here; json_files remains populated as well.
        self.emails = Document.sort_by_timestamp(self.emails)
        self.imessage_logs = Document.sort_by_timestamp(self.imessage_logs)
        self.other_files = Document.sort_by_timestamp(self.other_files + self.json_files)

    @classmethod
    def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
        """Alternate constructor that reads/writes a pickled version of the data ('timer' arg is for logging)."""
        timer = timer or Timer()

        # Fast path: load the gzipped pickle cache when allowed and present.
        if (args.pickled and PICKLED_PATH.exists()) and not args.overwrite_pickle:
            with gzip.open(PICKLED_PATH, 'rb') as file:
                epstein_files = pickle.load(file)

            timer.print_at_checkpoint(f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})")
            return epstein_files

        # Slow path: rescan all the files from disk.
        epstein_files = EpsteinFiles()

        # Refresh the pickle cache when requested or missing.
        if args.overwrite_pickle or not PICKLED_PATH.exists():
            with gzip.open(PICKLED_PATH, 'wb') as file:
                pickle.dump(epstein_files, file)

            logger.warning(f"Pickled data to '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})...")

        timer.print_at_checkpoint(f'Processed {len(epstein_files.all_files):,} documents')
        return epstein_files

    def all_documents(self) -> Sequence[Document]:
        """All classified documents (JSON files appear via other_files after __post_init__)."""
        return self.imessage_logs + self.emails + self.other_files

    def all_emailers(self, include_useless: bool = False) -> list[str | None]:
        """Returns all emailers except Epstein and USELESS_EMAILERS, sorted from least frequent to most."""
        names = [a for a in self.email_author_counts.keys()] + [r for r in self.email_recipient_counts.keys()]
        names = names if include_useless else [e for e in names if e is None or e.lower() not in NOT_INCLUDED_EMAILERS]
        return sorted(list(set(names)), key=lambda e: self.email_author_counts[e] + self.email_recipient_counts[e])

    def attributed_email_count(self) -> int:
        """Number of emails whose author was identified (author != UNKNOWN)."""
        return sum([i for author, i in self.email_author_counts.items() if author != UNKNOWN])

    def docs_matching(
        self,
        pattern: re.Pattern | str,
        file_type: Literal['all', 'other'] = 'all',
        names: list[str | None] | None = None
    ) -> list[SearchResult]:
        """Find documents whose text matches a pattern (file_type and names args limit the documents searched)."""
        results: list[SearchResult] = []

        for doc in (self.all_documents() if file_type == 'all' else self.other_files):
            # NOTE(review): the text search runs before the 'names' filter, so filtered-out
            # docs are still scanned; harmless but could be reordered for speed.
            lines = doc.lines_matching_txt(pattern)

            if names and ((not isinstance(doc, (Email, MessengerLog))) or doc.author not in names):
                continue

            if len(lines) > 0:
                results.append(SearchResult(doc, lines))

        return results

    def earliest_email_at(self, author: str | None) -> datetime:
        """Timestamp of the first email to/from 'author' (raises via emails_for() if none exist)."""
        return self.emails_for(author)[0].timestamp

    def last_email_at(self, author: str | None) -> datetime:
        """Timestamp of the last email to/from 'author'."""
        return self.emails_for(author)[-1].timestamp

    def email_conversation_length_in_days(self, author: str | None) -> int:
        """Inclusive day span between the first and last email to/from 'author' (minimum 1)."""
        return (self.last_email_at(author) - self.earliest_email_at(author)).days + 1

    def email_signature_substitution_counts(self) -> dict[str, int]:
        """Return the number of times an email signature was replaced with "<...snipped...>" for each author."""
        substitution_counts = defaultdict(int)

        for email in self.emails:
            for name, num_replaced in email.signature_substitution_counts.items():
                substitution_counts[name] += num_replaced

        return substitution_counts

    def email_unknown_recipient_file_ids(self) -> list[str]:
        """Sorted file IDs of emails that had no identifiable recipient."""
        return sorted(list(self._email_unknown_recipient_file_ids))

    def emails_by(self, author: str | None) -> list[Email]:
        """Emails written by 'author' (None matches emails with an unknown author)."""
        return [e for e in self.emails if e.author == author]

    def emails_for(self, author: str | None) -> list[Email]:
        """Returns emails to or from a given 'author' sorted chronologically."""
        emails = self.emails if author == EVERYONE else (self.emails_by(author) + self.emails_to(author))

        if len(emails) == 0:
            raise RuntimeError(f"No emails found for '{author}'")

        return Document.sort_by_timestamp(Document.uniquify(emails))

    def emails_to(self, author: str | None) -> list[Email]:
        """Emails received by 'author'; None matches emails with no/unknown recipients."""
        if author is None:
            return [e for e in self.emails if len(e.recipients) == 0 or None in e.recipients]
        else:
            return [e for e in self.emails if author in e.recipients]

    def imessage_logs_for(self, author: str | None | list[str | None]) -> Sequence[MessengerLog]:
        """iMessage logs for one author or a list of authors; EVERYONE/Epstein return all logs."""
        if author in [EVERYONE, JEFFREY_EPSTEIN]:
            return self.imessage_logs

        authors = author if isinstance(author, list) else [author]
        return [log for log in self.imessage_logs if log.author in authors]

    def identified_imessage_log_count(self) -> int:
        """Number of iMessage logs whose author has been deanonymized."""
        return len([log for log in self.imessage_logs if log.author])

    def imessage_sender_counts(self) -> dict[str | None, int]:
        """Count of individual text messages per sender across all iMessage logs."""
        sender_counts: dict[str | None, int] = defaultdict(int)

        for message_log in self.imessage_logs:
            for message in message_log.messages():
                sender_counts[message.author] += 1

        return sender_counts

    def print_files_summary(self) -> None:
        """Print the top-level table of file counts by type (with author-known/duplicate breakdowns)."""
        dupes = defaultdict(int)

        for doc in self.all_documents():
            if doc.is_duplicate:
                dupes[doc.document_type()] += 1

        table = Table()
        add_cols_to_table(table, ['File Type', 'Files', 'Author Known', 'Author Unknown', 'Duplicates'])

        # NOTE(review): the 'dupes' parameter shadows the outer 'dupes' dict; callers below
        # pass values pulled from the outer dict.
        def add_row(label: str, docs: list, known: int | None = None, dupes: int | None = None):
            table.add_row(
                label,
                f"{len(docs):,}",
                f"{known:,}" if known else NA_TXT,
                f"{len(docs) - known:,}" if known else NA_TXT,
                f"{dupes:,}" if dupes else NA_TXT,
            )

        add_row('iMessage Logs', self.imessage_logs, self.identified_imessage_log_count())
        add_row('Emails', self.emails, len([e for e in self.emails if e.author]), dupes[EMAIL_CLASS])
        add_row('JSON Data', self.json_files, dupes=0)
        add_row('Other', self.other_files, dupes=dupes[OTHER_FILE_CLASS])
        console.print(Align.center(table))
        console.line()

    def print_emails_for(self, _author: str | None) -> list[Email]:
        """Print complete emails to or from a particular 'author'. Returns the Emails that were printed."""
        conversation_length = self.email_conversation_length_in_days(_author)
        emails = self.emails_for(_author)
        author = _author or UNKNOWN

        print_author_header(
            f"Found {len(emails)} {author} emails starting {emails[0].timestamp.date()} over {conversation_length:,} days",
            get_style_for_name(author),
            get_info_for_name(author)
        )

        self.print_emails_table_for(_author)
        # Consecutive duplicates are collapsed to one-line notices; track state for spacing.
        last_printed_email_was_duplicate = False

        for email in emails:
            if email.is_duplicate:
                console.print(Padding(email.duplicate_file_txt().append('...'), (0, 0, 0, 4)))
                last_printed_email_was_duplicate = True
            else:
                if last_printed_email_was_duplicate:
                    console.line()

                console.print(email)
                last_printed_email_was_duplicate = False

        return emails

    def print_emails_table_for(self, _author: str | None) -> None:
        """Print a summary table (sender / timestamp / subject) of non-duplicate emails for 'author'."""
        emails = [email for email in self.emails_for(_author) if not email.is_duplicate]  # Remove dupes
        author = _author or UNKNOWN

        table = Table(
            title=f"Emails to/from {author} starting {emails[0].timestamp.date()}",
            border_style=get_style_for_name(author, allow_bold=False),
            header_style="bold"
        )

        table.add_column('From', justify='left')
        table.add_column('Timestamp', justify='center')
        table.add_column('Subject', justify='left', style='honeydew2', min_width=60)

        for email in emails:
            table.add_row(
                email.author_txt,
                email.epstein_media_link(link_txt=email.timestamp_without_seconds()),
                highlighter(email.subject())
            )

        console.print(Align.center(table), '\n')

    def print_email_device_info(self) -> None:
        """Print both device-signature tables (author -> signatures and signature -> authors)."""
        print_panel(f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown", padding=(4, 0, 0, 0), centered=True)
        console.print(build_signature_table(self.email_authors_to_device_signatures, (AUTHOR, DEVICE_SIGNATURE)))
        console.print(build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))

    def print_emailer_counts_table(self) -> None:
        """Print the per-person email count table with external lookup links (JMail/EpsteinWeb/Twitter)."""
        footer = f"Identified authors of {self.attributed_email_count()} emails out of {len(self.emails)} potential email files."
        counts_table = Table(title=f"Email Counts", caption=footer, header_style="bold")
        add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_WEB, 'Twitter'])

        emailer_counts = {
            e: self.email_author_counts[e] + self.email_recipient_counts[e]
            for e in self.all_emailers(True)
        }

        for p, count in sort_dict(emailer_counts):
            style = get_style_for_name(p, default_style=DEFAULT_NAME_STYLE)

            counts_table.add_row(
                Text.from_markup(link_markup(epsteinify_name_url(p or UNKNOWN), p or UNKNOWN, style)),
                str(count),
                str(self.email_author_counts[p]),
                str(self.email_recipient_counts[p]),
                '' if p is None else link_text_obj(search_jmail_url(p), JMAIL),
                '' if not is_ok_for_epstein_web(p) else link_text_obj(epstein_web_person_url(p), EPSTEIN_WEB.lower()),
                '' if p is None else link_text_obj(search_twitter_url(p), 'search X'),
            )

        console.print(vertically_pad(counts_table, 2))

    def print_imessage_summary(self) -> None:
        """Print summary table and stats for text messages."""
        counts_table = Table(title="Text Message Counts By Author", header_style="bold")
        counts_table.add_column(AUTHOR.title(), justify='left', style="steel_blue bold", width=30)
        counts_table.add_column('Files', justify='right', style='white')
        counts_table.add_column("Msgs", justify='right')
        counts_table.add_column('First Sent At', justify='center', highlight=True, width=21)
        counts_table.add_column('Last Sent At', justify='center', style='wheat4', width=21)
        counts_table.add_column('Days', justify='right', style='dim')

        for name, count in sort_dict(self.imessage_sender_counts()):
            logs = self.imessage_logs_for(name)
            # Logs are timestamp-sorted, so first/last logs bracket the conversation span.
            first_at = logs[0].first_message_at(name)
            last_at = logs[-1].first_message_at(name)

            counts_table.add_row(
                Text(name or UNKNOWN,
                     get_style_for_name(name)),
                str(len(logs)),
                f"{count:,}",
                iso_timestamp(first_at),
                iso_timestamp(last_at),
                str((last_at - first_at).days + 1),
            )

        console.print(counts_table)
        text_summary_msg = f"\nDeanonymized {self.identified_imessage_log_count()} of "
        text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files)} files."
        console.print(text_summary_msg)
        imessage_msg_count = sum([len(log.messages()) for log in self.imessage_logs])
        console.print(f"Found {imessage_msg_count} total text messages in {len(self.imessage_logs)} conversations.")
        console.print(f"(Last deploy found 4668 messages in 77 conversations)", style='dim')

    def print_other_files_table(self) -> list[OtherFile]:
        """Returns the OtherFiles that were interesting enough to print."""
        interesting_files = [doc for doc in self.other_files if args.all_other_files or doc.is_interesting()]
        header_pfx = '' if args.all_other_files else 'Selected '
        print_section_header(f"{FIRST_FEW_LINES} of {len(interesting_files)} {header_pfx}Files That Are Neither Emails Nor Text Msgs")

        if not args.all_other_files:
            print_centered(f"(the other site is uncurated and has all {len(self.other_files)} unclassifiable files and all {len(self.emails):,} emails)", style='dim')
            print_other_site_link(False)
            console.line(2)

        table = Table(header_style='bold', show_lines=True)
        table.add_column('File', justify='center', width=FILENAME_LENGTH)
        table.add_column('Date', justify='center')
        table.add_column('Length', justify='center')
        table.add_column(FIRST_FEW_LINES, justify='left', style='pale_turquoise4')

        for doc in interesting_files:
            link_and_info = [doc.raw_document_link_txt(), *doc.hints()]
            date_str = doc.date_str()

            # Duplicates are dimmed and show a pointer to the original instead of a preview.
            if doc.is_duplicate:
                preview_text = doc.duplicate_file_txt()
                row_style = ' dim'
            else:
                preview_text = doc.highlighted_preview_text()
                row_style = ''

            table.add_row(
                Group(*link_and_info),
                Text(date_str, style=TIMESTAMP_DIM) if date_str else QUESTION_MARK_TXT,
                doc.file_size_str(),
                preview_text,
                style=row_style
            )

        console.print(table)
        logger.warning(f"Skipped {len(self.other_files) - len(interesting_files)} uninteresting files...")
        return interesting_files
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
    """
    Build a padded two-column rich table mapping keys to their joined, sorted set members.
    The title depends on whether the first column is the author column.
    """
    if cols[0] == AUTHOR:
        heading = 'Signatures Used By Authors'
    else:
        heading = 'Authors Seen Using Signatures'

    signature_table = Table(header_style="bold reverse", show_lines=True, title=heading)

    # Title-case both headers; only the second (value) column gets pluralized.
    first_col, second_col = cols
    signature_table.add_column(first_col.title())
    signature_table.add_column(second_col.title() + 's')

    listified = dict_sets_to_lists(keyed_sets)

    for key in sorted(listified.keys()):
        joined_values = join_char.join(sorted(listified[key]))
        signature_table.add_row(highlighter(key or UNKNOWN), highlighter(joined_values))

    return Padding(signature_table, DEVICE_SIGNATURE_PADDING)
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def is_ok_for_epstein_web(name: str | None) -> bool:
|
|
417
|
+
"""Return True if it's likely that EpsteinWeb has a page for this name."""
|
|
418
|
+
if name is None or ' ' not in name:
|
|
419
|
+
return False
|
|
420
|
+
elif '@' in name or '/' in name or '??' in name:
|
|
421
|
+
return False
|
|
422
|
+
elif name in INVALID_FOR_EPSTEIN_WEB:
|
|
423
|
+
return False
|
|
424
|
+
|
|
425
|
+
return True
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
    """Tally documents into 'YYYY-MM' buckets; docs without a timestamp are counted under None."""
    month_counts: dict[str | None, int] = defaultdict(int)

    for document in docs:
        timestamp = document.timestamp
        # isoformat() is 'YYYY-MM-DD...'; the first 7 chars are the year-month bucket.
        bucket = timestamp.date().isoformat()[0:7] if timestamp else None
        month_counts[bucket] += 1

    return month_counts
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from epstein_files.util.env import args
|
|
2
|
+
|
|
3
|
+
# Removed: look, make, no, see, think, up, use, want
# https://www.gonaturalenglish.com/1000-most-common-words-in-the-english-language/
# Stop-word list: whitespace-separated tokens parsed out of the literal below.
MOST_COMMON_WORDS = """
a about after all also am an and any are as at
be because been being but by
came can can't cannot cant come could couldnt
day do doing dont did didnt
even
find first for from
get getting got give go going
had hadnt has hasnt have havent having he hed her here him his how
i if in into is isnt it its ive
just
know
like
man many me more my
new not now
of on one only or other our out
people pm
re
said say saying says she shed so some subject
take than that the their them then there these they theyd theyll theyre theyve thing this those through time to too two
very
was way we well went were werent weve
what whatever when whenever where wherever which whichever who whoever why
will with without wont would wouldnt wouldve
year you youd youll your youre youve
""".strip().split()

# Extra stop words: month/day names, email-header tokens, common French words,
# device names, honorifics, etc. seen frequently in this corpus.
OTHER_COMMON_WORDS = """
january february march april may june july august september october november december
jan feb mar apr jun jul aug sep sept oct nov dec
sunday monday tuesday wednesday thursday friday saturday
sun mon tue tues wed thu thur thurs fri sat
st nd rd th skrev

addthis attachments ave
bcc bst btn
cc ce cel
date de des div dont du
each ecrit edt el email en envoye epstein et
fa fax fb fw fwd
herself himself
id ii iii im iphone iPad BlackBerry
je jeffrey jr
kl
las le les let
mr mrs ms much
ne nonus nor
ou over
pdt pst
rss
sent ses si signature smtp snipped somers
te tel tenu tho though trimmed
via vous voye
was wasnt whether while wrote
""".strip().split()

# Dict used as a fast lowercase membership set (all values are True).
COMMON_WORDS = {line.lower(): True for line in (MOST_COMMON_WORDS + OTHER_COMMON_WORDS)}
COMMON_WORDS_LIST = sorted([word for word in COMMON_WORDS.keys()])

# Words ending in 's' (proper nouns, Latin singulars, etc.) that must NOT be
# stripped to a "singular" form during word counting.
UNSINGULARIZABLE_WORDS = """
abbas academia acosta aids alas algeria alice always andres angeles anus apparatus apropos arabia ares asia asus atlanta australia austria avia
bahamas bata beatles beta betts bias boies bonus brookings brussels
california campus candia cannes carlos caucus cbs cds census chaos chorus chris christmas clothes cms collins columbia com comms conchita consensus costa csis curves cvs cyprus
dallas data davis davos dawkins deborah dementia denis dennis des diabetes dis drougas
emirates emphasis encyclopedia ens eps eta
facs ferris focus folks forbes francis
gas gaydos georgia gittes gloria gmt gps gravitas
halitosis hamas harris has hiatus hillis his hivaids hopkins
impetus india indonesia ios ips irs isis isosceles
jacques jános jones josephus jules
kansas
las lens les lewis lhs lls los louis luis
madars malaysia maldives marcus maria massachusetts mbs media melania meta mets meyers mlpf&s mongolia moonves multimedia
nadia nafta natalie nautilus nas nigeria novartis nucleus nunes
olas orleans
pants paris parkes patricia pbs pennsylvania peres perhaps philadelphia physics pls plus potus pres prevus
rees reis-dennis reuters rodgers rogers russia
sachs sadis saks santa ses shia simmons slovakia sometimes soros stimulus surplus syria
tennis texas this thus trans tries tunisia
ups uterus
valeria vegas versus via victoria villafaria vinicius virginia vis
was whereas whoops wikipedia
yemen yes yikes
zakaria
""".strip().split()


# Dump the combined stop-word list when deep debugging is enabled.
if args.deep_debug:
    word_str = '\n'.join(COMMON_WORDS_LIST)
    print(f"common words:\n\n{word_str}")
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from rich.terminal_theme import TerminalTheme
|
|
2
|
+
|
|
3
|
+
from epstein_files.util.env import args
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
PAGE_TITLE = ' ∞ Michel de Cryptadamus ∞ '

# HTML scaffold handed to rich's HTML export.  '{stylesheet}', '{foreground}',
# '{background}', and '{code}' are substituted by rich ('{{'/'}}' are literal braces);
# the <title> segment is an f-string evaluated once at import time based on CLI args.
CONSOLE_HTML_FORMAT = """<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<link rel="icon" type="image/x-icon" href="https://media.universeodon.com/accounts/avatars/109/363/179/904/598/380/original/eecdc2393e75e8bf.jpg" />

<style>
{stylesheet}
body {{
color: {foreground};
background-color: {background};
}}
</style>
""" + f"<title>Epstein {'Emails' if args.all_emails else 'Text Messages'}</title>" + """
</head>
<body>
<pre style="font-family: Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace; white-space: pre-wrap; overflow-wrap: break-word;">
<code style="font-family: inherit; white-space: pre-wrap; overflow-wrap: break-word;">
{code}
</code>
</pre>
</body>
</html>
"""

# Swap black for white
# NOTE(review): TerminalTheme args appear to be (background, foreground, normal ANSI
# palette, bright ANSI palette) — confirm against the rich.terminal_theme docs.
HTML_TERMINAL_THEME = TerminalTheme(
    (0, 0, 0),
    (255, 255, 255),
    [
        (0, 0, 0),
        (128, 0, 0),
        (0, 128, 0),
        (128, 128, 0),
        (0, 0, 128),
        (128, 0, 128),
        (0, 128, 128),
        (192, 192, 192),
    ],
    [
        (128, 128, 128),
        (255, 0, 0),
        (0, 255, 0),
        (255, 255, 0),
        (0, 0, 255),
        (255, 0, 255),
        (0, 255, 255),
        (255, 255, 255),
    ],
)
|