epstein-files 1.1.3__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +15 -7
- epstein_files/documents/communication.py +3 -3
- epstein_files/documents/document.py +10 -3
- epstein_files/documents/email.py +105 -107
- epstein_files/documents/emails/email_header.py +4 -2
- epstein_files/documents/imessage/text_message.py +8 -12
- epstein_files/documents/messenger_log.py +8 -8
- epstein_files/epstein_files.py +123 -119
- epstein_files/person.py +350 -0
- epstein_files/util/constant/names.py +66 -50
- epstein_files/util/constant/output_files.py +1 -0
- epstein_files/util/constant/strings.py +3 -1
- epstein_files/util/constant/urls.py +14 -2
- epstein_files/util/constants.py +134 -26
- epstein_files/util/data.py +1 -12
- epstein_files/util/doc_cfg.py +30 -14
- epstein_files/util/env.py +3 -1
- epstein_files/util/file_helper.py +4 -1
- epstein_files/util/highlighted_group.py +228 -166
- epstein_files/util/output.py +108 -165
- epstein_files/util/rich.py +23 -45
- epstein_files/util/word_count.py +2 -3
- {epstein_files-1.1.3.dist-info → epstein_files-1.2.0.dist-info}/METADATA +2 -1
- epstein_files-1.2.0.dist-info/RECORD +34 -0
- epstein_files-1.1.3.dist-info/RECORD +0 -33
- {epstein_files-1.1.3.dist-info → epstein_files-1.2.0.dist-info}/LICENSE +0 -0
- {epstein_files-1.1.3.dist-info → epstein_files-1.2.0.dist-info}/WHEEL +0 -0
- {epstein_files-1.1.3.dist-info → epstein_files-1.2.0.dist-info}/entry_points.txt +0 -0
epstein_files/epstein_files.py
CHANGED
|
@@ -3,40 +3,39 @@ import json
|
|
|
3
3
|
import pickle
|
|
4
4
|
import re
|
|
5
5
|
from collections import defaultdict
|
|
6
|
+
from copy import copy
|
|
6
7
|
from dataclasses import dataclass, field
|
|
7
8
|
from datetime import datetime
|
|
8
9
|
from pathlib import Path
|
|
9
|
-
from typing import Sequence, Type
|
|
10
|
-
|
|
11
|
-
from rich.padding import Padding
|
|
12
|
-
from rich.table import Table
|
|
13
|
-
from rich.text import Text
|
|
10
|
+
from typing import Sequence, Type, cast
|
|
14
11
|
|
|
15
12
|
from epstein_files.documents.document import Document
|
|
16
|
-
from epstein_files.documents.email import DETECT_EMAIL_REGEX,
|
|
17
|
-
from epstein_files.documents.emails.email_header import AUTHOR
|
|
13
|
+
from epstein_files.documents.email import DETECT_EMAIL_REGEX, Email
|
|
18
14
|
from epstein_files.documents.json_file import JsonFile
|
|
19
15
|
from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
|
|
20
16
|
from epstein_files.documents.other_file import OtherFile
|
|
17
|
+
from epstein_files.person import Person
|
|
21
18
|
from epstein_files.util.constant.strings import *
|
|
22
19
|
from epstein_files.util.constants import *
|
|
23
|
-
from epstein_files.util.data import
|
|
20
|
+
from epstein_files.util.data import flatten, json_safe, listify, uniquify
|
|
24
21
|
from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
25
22
|
from epstein_files.util.env import DOCS_DIR, args, logger
|
|
26
23
|
from epstein_files.util.file_helper import file_size_str
|
|
27
|
-
from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames
|
|
28
|
-
from epstein_files.util.rich import
|
|
29
|
-
print_author_panel, print_centered, print_subtitle_panel)
|
|
24
|
+
from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames
|
|
25
|
+
from epstein_files.util.rich import NA_TXT, add_cols_to_table, build_table, console, print_centered
|
|
30
26
|
from epstein_files.util.search_result import SearchResult
|
|
31
27
|
from epstein_files.util.timer import Timer
|
|
32
28
|
|
|
33
|
-
|
|
34
|
-
DEVICE_SIGNATURE_SUBTITLE = f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown"
|
|
35
|
-
DEVICE_SIGNATURE = 'Device Signature'
|
|
36
|
-
DEVICE_SIGNATURE_PADDING = (1, 0)
|
|
29
|
+
DUPLICATE_PROPS_TO_COPY = ['author', 'recipients', 'timestamp']
|
|
37
30
|
PICKLED_PATH = Path("the_epstein_files.pkl.gz")
|
|
38
31
|
SLOW_FILE_SECONDS = 1.0
|
|
39
32
|
|
|
33
|
+
EMAILS_WITH_UNINTERESTING_CCS = [
|
|
34
|
+
'025329', # Krassner
|
|
35
|
+
'024923', # Krassner
|
|
36
|
+
'033568', # Krassner
|
|
37
|
+
]
|
|
38
|
+
|
|
40
39
|
|
|
41
40
|
@dataclass
|
|
42
41
|
class EpsteinFiles:
|
|
@@ -46,19 +45,13 @@ class EpsteinFiles:
|
|
|
46
45
|
json_files: list[JsonFile] = field(default_factory=list)
|
|
47
46
|
other_files: list[OtherFile] = field(default_factory=list)
|
|
48
47
|
timer: Timer = field(default_factory=lambda: Timer())
|
|
49
|
-
|
|
50
|
-
# Analytics / calculations
|
|
51
|
-
email_author_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
|
|
52
|
-
email_authors_to_device_signatures: dict[str, set] = field(default_factory=lambda: defaultdict(set))
|
|
53
|
-
email_device_signatures_to_authors: dict[str, set] = field(default_factory=lambda: defaultdict(set))
|
|
54
|
-
email_recipient_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
|
|
55
|
-
unknown_recipient_email_ids: set[str] = field(default_factory=set)
|
|
48
|
+
uninteresting_ccs: list[Name] = field(init=False)
|
|
56
49
|
|
|
57
50
|
def __post_init__(self):
|
|
58
51
|
"""Iterate through files and build appropriate objects."""
|
|
59
52
|
self.all_files = sorted([f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')])
|
|
60
53
|
documents = []
|
|
61
|
-
file_type_count = defaultdict(int)
|
|
54
|
+
file_type_count = defaultdict(int) # Hack used by --skip-other-files option
|
|
62
55
|
|
|
63
56
|
# Read through and classify all the files
|
|
64
57
|
for file_arg in self.all_files:
|
|
@@ -84,14 +77,15 @@ class EpsteinFiles:
|
|
|
84
77
|
self.imessage_logs = Document.sort_by_timestamp([d for d in documents if isinstance(d, MessengerLog)])
|
|
85
78
|
self.other_files = Document.sort_by_timestamp([d for d in documents if isinstance(d, (JsonFile, OtherFile))])
|
|
86
79
|
self.json_files = [doc for doc in self.other_files if isinstance(doc, JsonFile)]
|
|
87
|
-
self.
|
|
80
|
+
self._set_uninteresting_ccs()
|
|
81
|
+
self._copy_duplicate_email_properties()
|
|
88
82
|
|
|
89
83
|
@classmethod
|
|
90
84
|
def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
|
|
91
85
|
"""Alternate constructor that reads/writes a pickled version of the data ('timer' arg is for logging)."""
|
|
92
86
|
timer = timer or Timer()
|
|
93
87
|
|
|
94
|
-
if PICKLED_PATH.exists() and not args.overwrite_pickle:
|
|
88
|
+
if PICKLED_PATH.exists() and not args.overwrite_pickle and not args.skip_other_files:
|
|
95
89
|
with gzip.open(PICKLED_PATH, 'rb') as file:
|
|
96
90
|
epstein_files = pickle.load(file)
|
|
97
91
|
epstein_files.timer = timer
|
|
@@ -115,17 +109,7 @@ class EpsteinFiles:
|
|
|
115
109
|
def all_documents(self) -> Sequence[Document]:
|
|
116
110
|
return self.imessage_logs + self.emails + self.other_files
|
|
117
111
|
|
|
118
|
-
def
|
|
119
|
-
"""Returns all emailers except Epstein and EXCLUDED_EMAILERS, sorted from least frequent to most."""
|
|
120
|
-
names = [a for a in self.email_author_counts.keys()] + [r for r in self.email_recipient_counts.keys()]
|
|
121
|
-
names = names if include_useless else [e for e in names if e not in EXCLUDED_EMAILERS]
|
|
122
|
-
return sorted(list(set(names)), key=lambda e: self.email_author_counts[e] + self.email_recipient_counts[e])
|
|
123
|
-
|
|
124
|
-
def docs_matching(
|
|
125
|
-
self,
|
|
126
|
-
pattern: re.Pattern | str,
|
|
127
|
-
names: list[str | None] | None = None
|
|
128
|
-
) -> list[SearchResult]:
|
|
112
|
+
def docs_matching(self, pattern: re.Pattern | str, names: list[Name] | None = None) -> list[SearchResult]:
|
|
129
113
|
"""Find documents whose text matches a pattern (file_type and names args limit the documents searched)."""
|
|
130
114
|
results: list[SearchResult] = []
|
|
131
115
|
|
|
@@ -140,14 +124,39 @@ class EpsteinFiles:
|
|
|
140
124
|
|
|
141
125
|
return results
|
|
142
126
|
|
|
143
|
-
def earliest_email_at(self,
|
|
144
|
-
return self.emails_for(
|
|
127
|
+
def earliest_email_at(self, name: Name) -> datetime:
|
|
128
|
+
return self.emails_for(name)[0].timestamp
|
|
129
|
+
|
|
130
|
+
def last_email_at(self, name: Name) -> datetime:
|
|
131
|
+
return self.emails_for(name)[-1].timestamp
|
|
132
|
+
|
|
133
|
+
def email_author_counts(self) -> dict[Name, int]:
|
|
134
|
+
return {
|
|
135
|
+
person.name: len(person.unique_emails_by())
|
|
136
|
+
for person in self.emailers() if len(person.unique_emails_by()) > 0
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
def email_authors_to_device_signatures(self) -> dict[str, set[str]]:
|
|
140
|
+
signatures = defaultdict(set)
|
|
141
|
+
|
|
142
|
+
for email in [e for e in self.non_duplicate_emails() if e.sent_from_device]:
|
|
143
|
+
signatures[email.author_or_unknown()].add(email.sent_from_device)
|
|
144
|
+
|
|
145
|
+
return signatures
|
|
145
146
|
|
|
146
|
-
def
|
|
147
|
-
|
|
147
|
+
def email_device_signatures_to_authors(self) -> dict[str, set[str]]:
|
|
148
|
+
signatures = defaultdict(set)
|
|
148
149
|
|
|
149
|
-
|
|
150
|
-
|
|
150
|
+
for email in [e for e in self.non_duplicate_emails() if e.sent_from_device]:
|
|
151
|
+
signatures[email.sent_from_device].add(email.author_or_unknown())
|
|
152
|
+
|
|
153
|
+
return signatures
|
|
154
|
+
|
|
155
|
+
def email_recipient_counts(self) -> dict[Name, int]:
|
|
156
|
+
return {
|
|
157
|
+
person.name: len(person.unique_emails_to())
|
|
158
|
+
for person in self.emailers() if len(person.unique_emails_to()) > 0
|
|
159
|
+
}
|
|
151
160
|
|
|
152
161
|
def email_signature_substitution_counts(self) -> dict[str, int]:
|
|
153
162
|
"""Return the number of times an email signature was replaced with "<...snipped...>" for each author."""
|
|
@@ -159,30 +168,41 @@ class EpsteinFiles:
|
|
|
159
168
|
|
|
160
169
|
return substitution_counts
|
|
161
170
|
|
|
162
|
-
def
|
|
163
|
-
|
|
171
|
+
def emailers(self) -> list[Person]:
|
|
172
|
+
"""All the people who sent or received an email."""
|
|
173
|
+
authors = [email.author for email in self.emails]
|
|
174
|
+
recipients = flatten([email.recipients for email in self.emails])
|
|
175
|
+
return self.person_objs(uniquify(authors + recipients))
|
|
164
176
|
|
|
165
|
-
def emails_by(self, author:
|
|
177
|
+
def emails_by(self, author: Name) -> list[Email]:
|
|
166
178
|
return Document.sort_by_timestamp([e for e in self.emails if e.author == author])
|
|
167
179
|
|
|
168
|
-
def emails_for(self,
|
|
180
|
+
def emails_for(self, name: Name) -> list[Email]:
|
|
169
181
|
"""Returns emails to or from a given 'author' sorted chronologically."""
|
|
170
|
-
emails = self.emails_by(
|
|
182
|
+
emails = self.emails_by(name) + self.emails_to(name)
|
|
171
183
|
|
|
172
184
|
if len(emails) == 0:
|
|
173
|
-
raise RuntimeError(f"No emails found for '{
|
|
185
|
+
raise RuntimeError(f"No emails found for '{name}'")
|
|
174
186
|
|
|
175
187
|
return Document.sort_by_timestamp(Document.uniquify(emails))
|
|
176
188
|
|
|
177
|
-
def emails_to(self,
|
|
178
|
-
if
|
|
189
|
+
def emails_to(self, name: Name) -> list[Email]:
|
|
190
|
+
if name is None:
|
|
179
191
|
emails = [e for e in self.emails if len(e.recipients) == 0 or None in e.recipients]
|
|
180
192
|
else:
|
|
181
|
-
emails = [e for e in self.emails if
|
|
193
|
+
emails = [e for e in self.emails if name in e.recipients]
|
|
182
194
|
|
|
183
195
|
return Document.sort_by_timestamp(emails)
|
|
184
196
|
|
|
185
|
-
def
|
|
197
|
+
def email_for_id(self, file_id: str) -> Email:
|
|
198
|
+
docs = self.for_ids([file_id])
|
|
199
|
+
|
|
200
|
+
if docs and isinstance(docs[0], Email):
|
|
201
|
+
return docs[0]
|
|
202
|
+
else:
|
|
203
|
+
raise ValueError(f"No email found for {file_id}")
|
|
204
|
+
|
|
205
|
+
def for_ids(self, file_ids: str | list[str]) -> list[Document]:
|
|
186
206
|
file_ids = listify(file_ids)
|
|
187
207
|
docs = [doc for doc in self.all_documents() if doc.file_id in file_ids]
|
|
188
208
|
|
|
@@ -191,6 +211,9 @@ class EpsteinFiles:
|
|
|
191
211
|
|
|
192
212
|
return docs
|
|
193
213
|
|
|
214
|
+
def imessage_logs_for(self, name: Name) -> list[MessengerLog]:
|
|
215
|
+
return [log for log in self.imessage_logs if name == log.author]
|
|
216
|
+
|
|
194
217
|
def json_metadata(self) -> str:
|
|
195
218
|
"""Create a JSON string containing metadata for all the files."""
|
|
196
219
|
metadata = {
|
|
@@ -201,7 +224,7 @@ class EpsteinFiles:
|
|
|
201
224
|
OtherFile.__name__: _sorted_metadata(self.non_json_other_files()),
|
|
202
225
|
},
|
|
203
226
|
'people': {
|
|
204
|
-
name: highlighted_group.
|
|
227
|
+
name: highlighted_group.info_for(name, include_category=True)
|
|
205
228
|
for highlighted_group in HIGHLIGHTED_NAMES
|
|
206
229
|
if isinstance(highlighted_group, HighlightedNames)
|
|
207
230
|
for name, description in highlighted_group.emailers.items()
|
|
@@ -217,6 +240,19 @@ class EpsteinFiles:
|
|
|
217
240
|
def non_json_other_files(self) -> list[OtherFile]:
|
|
218
241
|
return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]
|
|
219
242
|
|
|
243
|
+
def person_objs(self, names: list[Name]) -> list[Person]:
|
|
244
|
+
"""Construct Person objects for a list of names."""
|
|
245
|
+
return [
|
|
246
|
+
Person(
|
|
247
|
+
name=name,
|
|
248
|
+
emails=self.emails_for(name),
|
|
249
|
+
imessage_logs=self.imessage_logs_for(name),
|
|
250
|
+
is_uninteresting_cc=name in self.uninteresting_emailers(),
|
|
251
|
+
other_files=[f for f in self.other_files if name and name == f.author]
|
|
252
|
+
)
|
|
253
|
+
for name in names
|
|
254
|
+
]
|
|
255
|
+
|
|
220
256
|
def print_files_summary(self) -> None:
|
|
221
257
|
table = build_table('File Overview')
|
|
222
258
|
add_cols_to_table(table, ['File Type', 'Count', 'Author Known', 'Author Unknown', 'Duplicates'])
|
|
@@ -240,61 +276,44 @@ class EpsteinFiles:
|
|
|
240
276
|
print_centered(table)
|
|
241
277
|
console.line()
|
|
242
278
|
|
|
243
|
-
def
|
|
244
|
-
"""
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
for email in emails:
|
|
261
|
-
if email.is_duplicate():
|
|
262
|
-
console.print(Padding(email.duplicate_file_txt().append('...'), (0, 0, 0, 4)))
|
|
263
|
-
last_printed_email_was_duplicate = True
|
|
264
|
-
else:
|
|
265
|
-
if last_printed_email_was_duplicate:
|
|
266
|
-
console.line()
|
|
267
|
-
|
|
268
|
-
console.print(email)
|
|
269
|
-
last_printed_email_was_duplicate = False
|
|
270
|
-
|
|
271
|
-
return emails
|
|
272
|
-
|
|
273
|
-
def print_emails_table_for(self, author: str | None) -> None:
|
|
274
|
-
emails = [email for email in self.emails_for(author) if not email.is_duplicate()] # Remove dupes
|
|
275
|
-
print_centered(Email.build_emails_table(emails, author))
|
|
276
|
-
console.line()
|
|
279
|
+
def unknown_recipient_ids(self) -> list[str]:
|
|
280
|
+
"""IDs of emails whose recipient is not known."""
|
|
281
|
+
return sorted([e.file_id for e in self.emails if None in e.recipients or not e.recipients])
|
|
282
|
+
|
|
283
|
+
def uninteresting_emailers(self) -> list[Name]:
|
|
284
|
+
if '_uninteresting_emailers' not in vars(self):
|
|
285
|
+
self._uninteresting_emailers = sorted(uniquify(UNINTERESTING_EMAILERS + self.uninteresting_ccs))
|
|
286
|
+
|
|
287
|
+
return self._uninteresting_emailers
|
|
288
|
+
|
|
289
|
+
def _copy_duplicate_email_properties(self) -> None:
|
|
290
|
+
"""Ensure dupe emails have the properties of the emails they duplicate to capture any repairs, config etc."""
|
|
291
|
+
for email in self.emails:
|
|
292
|
+
if not email.is_duplicate():
|
|
293
|
+
continue
|
|
294
|
+
|
|
295
|
+
original = self.email_for_id(email.duplicate_of_id())
|
|
277
296
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
console.print(_build_signature_table(self.email_authors_to_device_signatures, (AUTHOR, DEVICE_SIGNATURE)))
|
|
297
|
+
for field_name in DUPLICATE_PROPS_TO_COPY:
|
|
298
|
+
original_prop = getattr(original, field_name)
|
|
299
|
+
duplicate_prop = getattr(email, field_name)
|
|
282
300
|
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
self.email_author_counts[email.author] += 1
|
|
301
|
+
if original_prop != duplicate_prop:
|
|
302
|
+
email.warn(f"Replacing {field_name} {duplicate_prop} with {original_prop} from duplicated '{original.file_id}'")
|
|
303
|
+
setattr(email, field_name, original_prop)
|
|
287
304
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
self.email_recipient_counts[None] += 1
|
|
291
|
-
else:
|
|
292
|
-
for recipient in email.recipients:
|
|
293
|
-
self.email_recipient_counts[recipient] += 1
|
|
305
|
+
# Resort in case any timestamp were updated
|
|
306
|
+
self.emails = Document.sort_by_timestamp(self.emails)
|
|
294
307
|
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
308
|
+
def _set_uninteresting_ccs(self) -> None:
|
|
309
|
+
ross_gow_email = self.email_for_id('014797_1')
|
|
310
|
+
self.uninteresting_ccs = copy(cast(list[Name], ross_gow_email.header.bcc))
|
|
311
|
+
|
|
312
|
+
for id in EMAILS_WITH_UNINTERESTING_CCS:
|
|
313
|
+
self.uninteresting_ccs += self.email_for_id(id).recipients
|
|
314
|
+
|
|
315
|
+
self.uninteresting_ccs = sorted(uniquify(self.uninteresting_ccs))
|
|
316
|
+
logger.info(f"Extracted uninteresting_ccs: {self.uninteresting_ccs}")
|
|
298
317
|
|
|
299
318
|
|
|
300
319
|
def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
|
|
@@ -324,21 +343,6 @@ def document_cls(doc: Document) -> Type[Document]:
|
|
|
324
343
|
return OtherFile
|
|
325
344
|
|
|
326
345
|
|
|
327
|
-
def _build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
|
|
328
|
-
title = 'Signatures Used By Authors' if cols[0] == AUTHOR else 'Authors Seen Using Signatures'
|
|
329
|
-
table = build_table(title, header_style="bold reverse", show_lines=True)
|
|
330
|
-
|
|
331
|
-
for i, col in enumerate(cols):
|
|
332
|
-
table.add_column(col.title() + ('s' if i == 1 else ''))
|
|
333
|
-
|
|
334
|
-
new_dict = dict_sets_to_lists(keyed_sets)
|
|
335
|
-
|
|
336
|
-
for k in sorted(new_dict.keys()):
|
|
337
|
-
table.add_row(highlighter(k or UNKNOWN), highlighter(join_char.join(sorted(new_dict[k]))))
|
|
338
|
-
|
|
339
|
-
return Padding(table, DEVICE_SIGNATURE_PADDING)
|
|
340
|
-
|
|
341
|
-
|
|
342
346
|
def _sorted_metadata(docs: Sequence[Document]) -> list[Metadata]:
|
|
343
347
|
docs_sorted_by_id = sorted(docs, key=lambda d: d.file_id)
|
|
344
348
|
return [json_safe(d.metadata()) for d in docs_sorted_by_id]
|