epstein-files 1.1.5__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +12 -21
- epstein_files/documents/communication.py +0 -3
- epstein_files/documents/document.py +68 -21
- epstein_files/documents/email.py +54 -70
- epstein_files/documents/emails/email_header.py +14 -4
- epstein_files/documents/imessage/text_message.py +5 -4
- epstein_files/documents/messenger_log.py +7 -7
- epstein_files/documents/other_file.py +16 -34
- epstein_files/epstein_files.py +133 -141
- epstein_files/person.py +324 -0
- epstein_files/util/constant/names.py +46 -15
- epstein_files/util/constant/output_files.py +1 -0
- epstein_files/util/constant/strings.py +3 -3
- epstein_files/util/constant/urls.py +15 -2
- epstein_files/util/constants.py +75 -21
- epstein_files/util/data.py +1 -20
- epstein_files/util/doc_cfg.py +27 -17
- epstein_files/util/env.py +5 -3
- epstein_files/util/highlighted_group.py +248 -203
- epstein_files/util/logging.py +1 -1
- epstein_files/util/output.py +113 -157
- epstein_files/util/rich.py +20 -35
- epstein_files/util/timer.py +14 -0
- epstein_files/util/word_count.py +1 -1
- {epstein_files-1.1.5.dist-info → epstein_files-1.2.1.dist-info}/METADATA +6 -2
- epstein_files-1.2.1.dist-info/RECORD +34 -0
- epstein_files-1.1.5.dist-info/RECORD +0 -33
- {epstein_files-1.1.5.dist-info → epstein_files-1.2.1.dist-info}/LICENSE +0 -0
- {epstein_files-1.1.5.dist-info → epstein_files-1.2.1.dist-info}/WHEEL +0 -0
- {epstein_files-1.1.5.dist-info → epstein_files-1.2.1.dist-info}/entry_points.txt +0 -0
|
@@ -10,11 +10,11 @@ from rich.text import Text
|
|
|
10
10
|
|
|
11
11
|
from epstein_files.documents.communication import Communication
|
|
12
12
|
from epstein_files.documents.imessage.text_message import TextMessage
|
|
13
|
-
from epstein_files.util.constant.names import JEFFREY_EPSTEIN,
|
|
13
|
+
from epstein_files.util.constant.names import JEFFREY_EPSTEIN, Name
|
|
14
14
|
from epstein_files.util.constant.strings import AUTHOR, TIMESTAMP_STYLE
|
|
15
15
|
from epstein_files.util.data import days_between, days_between_str, iso_timestamp, sort_dict
|
|
16
16
|
from epstein_files.util.doc_cfg import Metadata, TextCfg
|
|
17
|
-
from epstein_files.util.highlighted_group import
|
|
17
|
+
from epstein_files.util.highlighted_group import styled_name
|
|
18
18
|
from epstein_files.util.logging import logger
|
|
19
19
|
from epstein_files.util.rich import LAST_TIMESTAMP_STYLE, build_table, highlighter
|
|
20
20
|
|
|
@@ -35,7 +35,7 @@ class MessengerLog(Communication):
|
|
|
35
35
|
super().__post_init__()
|
|
36
36
|
self.messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
|
|
37
37
|
|
|
38
|
-
def first_message_at(self, name:
|
|
38
|
+
def first_message_at(self, name: Name) -> datetime:
|
|
39
39
|
return self.messages_by(name)[0].parse_timestamp()
|
|
40
40
|
|
|
41
41
|
def info_txt(self) -> Text | None:
|
|
@@ -54,10 +54,10 @@ class MessengerLog(Communication):
|
|
|
54
54
|
|
|
55
55
|
return txt.append(')')
|
|
56
56
|
|
|
57
|
-
def last_message_at(self, name:
|
|
57
|
+
def last_message_at(self, name: Name) -> datetime:
|
|
58
58
|
return self.messages_by(name)[-1].parse_timestamp()
|
|
59
59
|
|
|
60
|
-
def messages_by(self, name:
|
|
60
|
+
def messages_by(self, name: Name) -> list[TextMessage]:
|
|
61
61
|
"""Return all messages by 'name'."""
|
|
62
62
|
return [m for m in self.messages if m.author == name]
|
|
63
63
|
|
|
@@ -129,9 +129,9 @@ class MessengerLog(Communication):
|
|
|
129
129
|
yield message
|
|
130
130
|
|
|
131
131
|
@classmethod
|
|
132
|
-
def count_authors(cls, imessage_logs: list['MessengerLog']) -> dict[
|
|
132
|
+
def count_authors(cls, imessage_logs: list['MessengerLog']) -> dict[Name, int]:
|
|
133
133
|
"""Count up how many texts were sent by each author."""
|
|
134
|
-
sender_counts: dict[
|
|
134
|
+
sender_counts: dict[Name, int] = defaultdict(int)
|
|
135
135
|
|
|
136
136
|
for message_log in imessage_logs:
|
|
137
137
|
for message in message_log.messages:
|
|
@@ -22,7 +22,7 @@ from epstein_files.util.data import days_between, escape_single_quotes, remove_t
|
|
|
22
22
|
from epstein_files.util.file_helper import FILENAME_LENGTH, file_size_to_str
|
|
23
23
|
from epstein_files.util.env import args
|
|
24
24
|
from epstein_files.util.highlighted_group import QUESTION_MARKS_TXT, styled_category
|
|
25
|
-
from epstein_files.util.rich import build_table, highlighter
|
|
25
|
+
from epstein_files.util.rich import add_cols_to_table, build_table, highlighter
|
|
26
26
|
from epstein_files.util.logging import logger
|
|
27
27
|
|
|
28
28
|
FIRST_FEW_LINES = 'First Few Lines'
|
|
@@ -209,39 +209,8 @@ class OtherFile(Document):
|
|
|
209
209
|
if num_days_spanned > MAX_DAYS_SPANNED_TO_BE_VALID and VAST_HOUSE not in self.text:
|
|
210
210
|
self.log_top_lines(15, msg=timestamps_log_msg, level=logging.DEBUG)
|
|
211
211
|
|
|
212
|
-
@
|
|
213
|
-
def
|
|
214
|
-
counts = defaultdict(int)
|
|
215
|
-
category_bytes = defaultdict(int)
|
|
216
|
-
|
|
217
|
-
for file in files:
|
|
218
|
-
if file.category() is None:
|
|
219
|
-
logger.warning(f"file {file.file_id} has no category")
|
|
220
|
-
|
|
221
|
-
counts[file.category()] += 1
|
|
222
|
-
category_bytes[file.category()] += file.file_size()
|
|
223
|
-
|
|
224
|
-
table = build_table(f'{title_pfx}Other Files Summary', ['Category', 'Count', 'Has Author', 'No Author', 'Size'])
|
|
225
|
-
table.columns[-1].justify = 'right'
|
|
226
|
-
table.columns[0].min_width = 14
|
|
227
|
-
table.columns[-1].style = 'dim'
|
|
228
|
-
|
|
229
|
-
for (category, count) in sort_dict(counts):
|
|
230
|
-
category_files = [f for f in files if f.category() == category]
|
|
231
|
-
known_author_count = Document.known_author_count(category_files)
|
|
232
|
-
|
|
233
|
-
table.add_row(
|
|
234
|
-
styled_category(category),
|
|
235
|
-
str(count),
|
|
236
|
-
str(known_author_count),
|
|
237
|
-
str(count - known_author_count),
|
|
238
|
-
file_size_to_str(category_bytes[category]),
|
|
239
|
-
)
|
|
240
|
-
|
|
241
|
-
return table
|
|
242
|
-
|
|
243
|
-
@staticmethod
|
|
244
|
-
def files_preview_table(files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
|
|
212
|
+
@classmethod
|
|
213
|
+
def files_preview_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
|
|
245
214
|
"""Build a table of OtherFile documents."""
|
|
246
215
|
table = build_table(f'{title_pfx}Other Files Details in Chronological Order', show_lines=True)
|
|
247
216
|
table.add_column('File', justify='center', width=FILENAME_LENGTH)
|
|
@@ -272,3 +241,16 @@ class OtherFile(Document):
|
|
|
272
241
|
)
|
|
273
242
|
|
|
274
243
|
return table
|
|
244
|
+
|
|
245
|
+
@classmethod
|
|
246
|
+
def summary_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
|
|
247
|
+
categories = uniquify([f.category() for f in files])
|
|
248
|
+
categories = sorted(categories, key=lambda c: -len([f for f in files if f.category() == c]))
|
|
249
|
+
table = cls.file_info_table(f'{title_pfx}Other Files Summary', 'Category')
|
|
250
|
+
|
|
251
|
+
for category in categories:
|
|
252
|
+
category_files = [f for f in files if f.category() == category]
|
|
253
|
+
table.add_row(styled_category(category), *cls.files_info_row(category_files))
|
|
254
|
+
|
|
255
|
+
table.columns = table.columns[:-2] + [table.columns[-1]] # Removee unknown author col
|
|
256
|
+
return table
|
epstein_files/epstein_files.py
CHANGED
|
@@ -3,39 +3,44 @@ import json
|
|
|
3
3
|
import pickle
|
|
4
4
|
import re
|
|
5
5
|
from collections import defaultdict
|
|
6
|
+
from copy import copy
|
|
6
7
|
from dataclasses import dataclass, field
|
|
7
8
|
from datetime import datetime
|
|
8
9
|
from pathlib import Path
|
|
9
|
-
from typing import Sequence, Type
|
|
10
|
+
from typing import Sequence, Type, cast
|
|
10
11
|
|
|
11
|
-
from rich.padding import Padding
|
|
12
12
|
from rich.table import Table
|
|
13
|
-
from rich.text import Text
|
|
14
13
|
|
|
15
14
|
from epstein_files.documents.document import Document
|
|
16
|
-
from epstein_files.documents.email import DETECT_EMAIL_REGEX,
|
|
17
|
-
from epstein_files.documents.emails.email_header import AUTHOR
|
|
15
|
+
from epstein_files.documents.email import DETECT_EMAIL_REGEX, Email
|
|
18
16
|
from epstein_files.documents.json_file import JsonFile
|
|
19
17
|
from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
|
|
20
18
|
from epstein_files.documents.other_file import OtherFile
|
|
19
|
+
from epstein_files.person import Person
|
|
21
20
|
from epstein_files.util.constant.strings import *
|
|
22
21
|
from epstein_files.util.constants import *
|
|
23
|
-
from epstein_files.util.data import
|
|
22
|
+
from epstein_files.util.data import flatten, json_safe, listify, uniquify
|
|
24
23
|
from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
25
24
|
from epstein_files.util.env import DOCS_DIR, args, logger
|
|
26
25
|
from epstein_files.util.file_helper import file_size_str
|
|
27
|
-
from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames
|
|
28
|
-
from epstein_files.util.rich import (NA_TXT, add_cols_to_table, build_table, console, highlighter,
|
|
29
|
-
print_author_panel, print_centered, print_subtitle_panel)
|
|
26
|
+
from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames
|
|
30
27
|
from epstein_files.util.search_result import SearchResult
|
|
31
28
|
from epstein_files.util.timer import Timer
|
|
32
29
|
|
|
33
|
-
|
|
34
|
-
DEVICE_SIGNATURE = 'Device Signature'
|
|
35
|
-
DEVICE_SIGNATURE_PADDING = (1, 0)
|
|
30
|
+
DUPLICATE_PROPS_TO_COPY = ['author', 'recipients', 'timestamp']
|
|
36
31
|
PICKLED_PATH = Path("the_epstein_files.pkl.gz")
|
|
37
32
|
SLOW_FILE_SECONDS = 1.0
|
|
38
33
|
|
|
34
|
+
EMAILS_WITH_UNINTERESTING_CCS = [
|
|
35
|
+
'025329', # Krassner
|
|
36
|
+
'024923', # Krassner
|
|
37
|
+
'033568', # Krassner
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
EMAILS_WITH_UNINTERESTING_BCCS = [
|
|
41
|
+
'014797_1', # Ross Gow
|
|
42
|
+
]
|
|
43
|
+
|
|
39
44
|
|
|
40
45
|
@dataclass
|
|
41
46
|
class EpsteinFiles:
|
|
@@ -45,19 +50,13 @@ class EpsteinFiles:
|
|
|
45
50
|
json_files: list[JsonFile] = field(default_factory=list)
|
|
46
51
|
other_files: list[OtherFile] = field(default_factory=list)
|
|
47
52
|
timer: Timer = field(default_factory=lambda: Timer())
|
|
48
|
-
|
|
49
|
-
# Analytics / calculations
|
|
50
|
-
email_author_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
|
|
51
|
-
email_authors_to_device_signatures: dict[str, set] = field(default_factory=lambda: defaultdict(set))
|
|
52
|
-
email_device_signatures_to_authors: dict[str, set] = field(default_factory=lambda: defaultdict(set))
|
|
53
|
-
email_recipient_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
|
|
54
|
-
unknown_recipient_email_ids: set[str] = field(default_factory=set)
|
|
53
|
+
uninteresting_ccs: list[Name] = field(default_factory=list)
|
|
55
54
|
|
|
56
55
|
def __post_init__(self):
|
|
57
56
|
"""Iterate through files and build appropriate objects."""
|
|
58
57
|
self.all_files = sorted([f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')])
|
|
59
58
|
documents = []
|
|
60
|
-
file_type_count = defaultdict(int)
|
|
59
|
+
file_type_count = defaultdict(int) # Hack used by --skip-other-files option
|
|
61
60
|
|
|
62
61
|
# Read through and classify all the files
|
|
63
62
|
for file_arg in self.all_files:
|
|
@@ -83,23 +82,23 @@ class EpsteinFiles:
|
|
|
83
82
|
self.imessage_logs = Document.sort_by_timestamp([d for d in documents if isinstance(d, MessengerLog)])
|
|
84
83
|
self.other_files = Document.sort_by_timestamp([d for d in documents if isinstance(d, (JsonFile, OtherFile))])
|
|
85
84
|
self.json_files = [doc for doc in self.other_files if isinstance(doc, JsonFile)]
|
|
86
|
-
self.
|
|
85
|
+
self._set_uninteresting_ccs()
|
|
86
|
+
self._copy_duplicate_email_properties()
|
|
87
87
|
|
|
88
88
|
@classmethod
|
|
89
89
|
def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
|
|
90
90
|
"""Alternate constructor that reads/writes a pickled version of the data ('timer' arg is for logging)."""
|
|
91
91
|
timer = timer or Timer()
|
|
92
92
|
|
|
93
|
-
if PICKLED_PATH.exists() and not args.overwrite_pickle:
|
|
93
|
+
if PICKLED_PATH.exists() and not args.overwrite_pickle and not args.skip_other_files:
|
|
94
94
|
with gzip.open(PICKLED_PATH, 'rb') as file:
|
|
95
95
|
epstein_files = pickle.load(file)
|
|
96
|
-
epstein_files.timer = timer
|
|
97
96
|
timer_msg = f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}'"
|
|
98
|
-
|
|
97
|
+
timer.print_at_checkpoint(f"{timer_msg} ({file_size_str(PICKLED_PATH)})")
|
|
99
98
|
return epstein_files
|
|
100
99
|
|
|
101
100
|
logger.warning(f"Building new cache file, this will take a few minutes...")
|
|
102
|
-
epstein_files = EpsteinFiles(
|
|
101
|
+
epstein_files = EpsteinFiles()
|
|
103
102
|
|
|
104
103
|
if args.skip_other_files:
|
|
105
104
|
logger.warning(f"Not writing pickled data because --skip-other-files")
|
|
@@ -114,17 +113,7 @@ class EpsteinFiles:
|
|
|
114
113
|
def all_documents(self) -> Sequence[Document]:
|
|
115
114
|
return self.imessage_logs + self.emails + self.other_files
|
|
116
115
|
|
|
117
|
-
def
|
|
118
|
-
"""Returns all emailers USELESS_EMAILERS, sorted from least frequent to most."""
|
|
119
|
-
names = [a for a in self.email_author_counts.keys()] + [r for r in self.email_recipient_counts.keys()]
|
|
120
|
-
names = names if include_useless else [e for e in names if e not in USELESS_EMAILERS]
|
|
121
|
-
return sorted(list(set(names)), key=lambda e: self.email_author_counts[e] + self.email_recipient_counts[e])
|
|
122
|
-
|
|
123
|
-
def docs_matching(
|
|
124
|
-
self,
|
|
125
|
-
pattern: re.Pattern | str,
|
|
126
|
-
names: list[str | None] | None = None
|
|
127
|
-
) -> list[SearchResult]:
|
|
116
|
+
def docs_matching(self, pattern: re.Pattern | str, names: list[Name] | None = None) -> list[SearchResult]:
|
|
128
117
|
"""Find documents whose text matches a pattern (file_type and names args limit the documents searched)."""
|
|
129
118
|
results: list[SearchResult] = []
|
|
130
119
|
|
|
@@ -139,14 +128,39 @@ class EpsteinFiles:
|
|
|
139
128
|
|
|
140
129
|
return results
|
|
141
130
|
|
|
142
|
-
def earliest_email_at(self,
|
|
143
|
-
return self.emails_for(
|
|
131
|
+
def earliest_email_at(self, name: Name) -> datetime:
|
|
132
|
+
return self.emails_for(name)[0].timestamp
|
|
133
|
+
|
|
134
|
+
def last_email_at(self, name: Name) -> datetime:
|
|
135
|
+
return self.emails_for(name)[-1].timestamp
|
|
136
|
+
|
|
137
|
+
def email_author_counts(self) -> dict[Name, int]:
|
|
138
|
+
return {
|
|
139
|
+
person.name: len(person.unique_emails_by())
|
|
140
|
+
for person in self.emailers() if len(person.unique_emails_by()) > 0
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
def email_authors_to_device_signatures(self) -> dict[str, set[str]]:
|
|
144
|
+
signatures = defaultdict(set)
|
|
145
|
+
|
|
146
|
+
for email in [e for e in self.non_duplicate_emails() if e.sent_from_device]:
|
|
147
|
+
signatures[email.author_or_unknown()].add(email.sent_from_device)
|
|
144
148
|
|
|
145
|
-
|
|
146
|
-
return self.emails_for(author)[-1].timestamp
|
|
149
|
+
return signatures
|
|
147
150
|
|
|
148
|
-
def
|
|
149
|
-
|
|
151
|
+
def email_device_signatures_to_authors(self) -> dict[str, set[str]]:
|
|
152
|
+
signatures = defaultdict(set)
|
|
153
|
+
|
|
154
|
+
for email in [e for e in self.non_duplicate_emails() if e.sent_from_device]:
|
|
155
|
+
signatures[email.sent_from_device].add(email.author_or_unknown())
|
|
156
|
+
|
|
157
|
+
return signatures
|
|
158
|
+
|
|
159
|
+
def email_recipient_counts(self) -> dict[Name, int]:
|
|
160
|
+
return {
|
|
161
|
+
person.name: len(person.unique_emails_to())
|
|
162
|
+
for person in self.emailers() if len(person.unique_emails_to()) > 0
|
|
163
|
+
}
|
|
150
164
|
|
|
151
165
|
def email_signature_substitution_counts(self) -> dict[str, int]:
|
|
152
166
|
"""Return the number of times an email signature was replaced with "<...snipped...>" for each author."""
|
|
@@ -158,32 +172,40 @@ class EpsteinFiles:
|
|
|
158
172
|
|
|
159
173
|
return substitution_counts
|
|
160
174
|
|
|
161
|
-
def
|
|
162
|
-
|
|
175
|
+
def emailers(self) -> list[Person]:
|
|
176
|
+
"""All the people who sent or received an email."""
|
|
177
|
+
authors = [email.author for email in self.emails]
|
|
178
|
+
recipients = flatten([email.recipients for email in self.emails])
|
|
179
|
+
return self.person_objs(uniquify(authors + recipients))
|
|
163
180
|
|
|
164
|
-
def emails_by(self, author:
|
|
181
|
+
def emails_by(self, author: Name) -> list[Email]:
|
|
165
182
|
return Document.sort_by_timestamp([e for e in self.emails if e.author == author])
|
|
166
183
|
|
|
167
|
-
def emails_for(self,
|
|
184
|
+
def emails_for(self, name: Name) -> list[Email]:
|
|
168
185
|
"""Returns emails to or from a given 'author' sorted chronologically."""
|
|
169
|
-
|
|
170
|
-
emails = [e for e in self.emails_by(JEFFREY_EPSTEIN) if e.is_note_to_self()]
|
|
171
|
-
else:
|
|
172
|
-
emails = self.emails_by(author) + self.emails_to(author)
|
|
186
|
+
emails = self.emails_by(name) + self.emails_to(name)
|
|
173
187
|
|
|
174
188
|
if len(emails) == 0:
|
|
175
|
-
raise RuntimeError(f"No emails found for '{
|
|
189
|
+
raise RuntimeError(f"No emails found for '{name}'")
|
|
176
190
|
|
|
177
191
|
return Document.sort_by_timestamp(Document.uniquify(emails))
|
|
178
192
|
|
|
179
|
-
def emails_to(self,
|
|
180
|
-
if
|
|
193
|
+
def emails_to(self, name: Name) -> list[Email]:
|
|
194
|
+
if name is None:
|
|
181
195
|
emails = [e for e in self.emails if len(e.recipients) == 0 or None in e.recipients]
|
|
182
196
|
else:
|
|
183
|
-
emails = [e for e in self.emails if
|
|
197
|
+
emails = [e for e in self.emails if name in e.recipients]
|
|
184
198
|
|
|
185
199
|
return Document.sort_by_timestamp(emails)
|
|
186
200
|
|
|
201
|
+
def email_for_id(self, file_id: str) -> Email:
|
|
202
|
+
docs = self.for_ids([file_id])
|
|
203
|
+
|
|
204
|
+
if docs and isinstance(docs[0], Email):
|
|
205
|
+
return docs[0]
|
|
206
|
+
else:
|
|
207
|
+
raise ValueError(f"No email found for {file_id}")
|
|
208
|
+
|
|
187
209
|
def for_ids(self, file_ids: str | list[str]) -> list[Document]:
|
|
188
210
|
file_ids = listify(file_ids)
|
|
189
211
|
docs = [doc for doc in self.all_documents() if doc.file_id in file_ids]
|
|
@@ -193,6 +215,9 @@ class EpsteinFiles:
|
|
|
193
215
|
|
|
194
216
|
return docs
|
|
195
217
|
|
|
218
|
+
def imessage_logs_for(self, name: Name) -> list[MessengerLog]:
|
|
219
|
+
return [log for log in self.imessage_logs if name == log.author]
|
|
220
|
+
|
|
196
221
|
def json_metadata(self) -> str:
|
|
197
222
|
"""Create a JSON string containing metadata for all the files."""
|
|
198
223
|
metadata = {
|
|
@@ -203,7 +228,7 @@ class EpsteinFiles:
|
|
|
203
228
|
OtherFile.__name__: _sorted_metadata(self.non_json_other_files()),
|
|
204
229
|
},
|
|
205
230
|
'people': {
|
|
206
|
-
name: highlighted_group.
|
|
231
|
+
name: highlighted_group.info_for(name, include_category=True)
|
|
207
232
|
for highlighted_group in HIGHLIGHTED_NAMES
|
|
208
233
|
if isinstance(highlighted_group, HighlightedNames)
|
|
209
234
|
for name, description in highlighted_group.emailers.items()
|
|
@@ -214,89 +239,71 @@ class EpsteinFiles:
|
|
|
214
239
|
return json.dumps(metadata, indent=4, sort_keys=True)
|
|
215
240
|
|
|
216
241
|
def non_duplicate_emails(self) -> list[Email]:
|
|
217
|
-
return
|
|
242
|
+
return Document.without_dupes(self.emails)
|
|
218
243
|
|
|
219
244
|
def non_json_other_files(self) -> list[OtherFile]:
|
|
220
245
|
return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]
|
|
221
246
|
|
|
222
|
-
def
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
label,
|
|
232
|
-
f"{len(docs):,}",
|
|
233
|
-
f"{known:,}" if known is not None else NA_TXT,
|
|
234
|
-
f"{len(docs) - known:,}" if known is not None else NA_TXT,
|
|
235
|
-
f"{len([d for d in docs if d.is_duplicate()])}",
|
|
247
|
+
def person_objs(self, names: list[Name]) -> list[Person]:
|
|
248
|
+
"""Construct Person objects for a list of names."""
|
|
249
|
+
return [
|
|
250
|
+
Person(
|
|
251
|
+
name=name,
|
|
252
|
+
emails=self.emails_for(name),
|
|
253
|
+
imessage_logs=self.imessage_logs_for(name),
|
|
254
|
+
is_uninteresting_cc=name in self.uninteresting_emailers(),
|
|
255
|
+
other_files=[f for f in self.other_files if name and name == f.author]
|
|
236
256
|
)
|
|
257
|
+
for name in names
|
|
258
|
+
]
|
|
259
|
+
|
|
260
|
+
def overview_table(self) -> Table:
|
|
261
|
+
table = Document.file_info_table('Files Overview', 'File Type')
|
|
262
|
+
table.add_row('Emails', *Document.files_info_row(self.emails))
|
|
263
|
+
table.add_row('iMessage Logs', *Document.files_info_row(self.imessage_logs))
|
|
264
|
+
table.add_row('JSON Data', *Document.files_info_row(self.json_files, True))
|
|
265
|
+
table.add_row('Other', *Document.files_info_row(self.non_json_other_files()))
|
|
266
|
+
return table
|
|
267
|
+
|
|
268
|
+
def unknown_recipient_ids(self) -> list[str]:
|
|
269
|
+
"""IDs of emails whose recipient is not known."""
|
|
270
|
+
return sorted([e.file_id for e in self.emails if None in e.recipients or not e.recipients])
|
|
271
|
+
|
|
272
|
+
def uninteresting_emailers(self) -> list[Name]:
|
|
273
|
+
"""Emailers whom we don't want to print a separate section for because they're just CCed."""
|
|
274
|
+
if '_uninteresting_emailers' not in vars(self):
|
|
275
|
+
self._uninteresting_emailers = sorted(uniquify(UNINTERESTING_EMAILERS + self.uninteresting_ccs))
|
|
276
|
+
|
|
277
|
+
return self._uninteresting_emailers
|
|
278
|
+
|
|
279
|
+
def _copy_duplicate_email_properties(self) -> None:
|
|
280
|
+
"""Ensure dupe emails have the properties of the emails they duplicate to capture any repairs, config etc."""
|
|
281
|
+
for email in self.emails:
|
|
282
|
+
if not email.is_duplicate():
|
|
283
|
+
continue
|
|
237
284
|
|
|
238
|
-
|
|
239
|
-
add_row('iMessage Logs', self.imessage_logs)
|
|
240
|
-
add_row('JSON Data', self.json_files)
|
|
241
|
-
add_row('Other', self.non_json_other_files())
|
|
242
|
-
print_centered(table)
|
|
243
|
-
console.line()
|
|
244
|
-
|
|
245
|
-
def print_emails_for(self, _author: str | None) -> list[Email]:
|
|
246
|
-
"""Print complete emails to or from a particular 'author'. Returns the Emails that were printed."""
|
|
247
|
-
emails = self.emails_for(_author)
|
|
248
|
-
num_days = self.email_conversation_length_in_days(_author)
|
|
249
|
-
unique_emails = [email for email in emails if not email.is_duplicate()]
|
|
250
|
-
start_date = emails[0].timestamp.date()
|
|
251
|
-
author = _author or UNKNOWN
|
|
252
|
-
title = f"Found {len(unique_emails)} emails"
|
|
253
|
-
|
|
254
|
-
if author == JEFFREY_EPSTEIN:
|
|
255
|
-
title += f" sent by {JEFFREY_EPSTEIN} to himself"
|
|
256
|
-
else:
|
|
257
|
-
title += f" to/from {author} starting {start_date} covering {num_days:,} days"
|
|
258
|
-
|
|
259
|
-
print_author_panel(title, get_info_for_name(author), get_style_for_name(author))
|
|
260
|
-
self.print_emails_table_for(_author)
|
|
261
|
-
last_printed_email_was_duplicate = False
|
|
262
|
-
|
|
263
|
-
for email in emails:
|
|
264
|
-
if email.is_duplicate():
|
|
265
|
-
console.print(Padding(email.duplicate_file_txt().append('...'), (0, 0, 0, 4)))
|
|
266
|
-
last_printed_email_was_duplicate = True
|
|
267
|
-
else:
|
|
268
|
-
if last_printed_email_was_duplicate:
|
|
269
|
-
console.line()
|
|
270
|
-
|
|
271
|
-
console.print(email)
|
|
272
|
-
last_printed_email_was_duplicate = False
|
|
285
|
+
original = self.email_for_id(email.duplicate_of_id())
|
|
273
286
|
|
|
274
|
-
|
|
287
|
+
for field_name in DUPLICATE_PROPS_TO_COPY:
|
|
288
|
+
original_prop = getattr(original, field_name)
|
|
289
|
+
duplicate_prop = getattr(email, field_name)
|
|
275
290
|
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
291
|
+
if original_prop != duplicate_prop:
|
|
292
|
+
email.warn(f"Replacing {field_name} {duplicate_prop} with {original_prop} from duplicated '{original.file_id}'")
|
|
293
|
+
setattr(email, field_name, original_prop)
|
|
279
294
|
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
console.print(_build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))
|
|
283
|
-
console.print(_build_signature_table(self.email_authors_to_device_signatures, (AUTHOR, DEVICE_SIGNATURE)))
|
|
295
|
+
# Resort in case any timestamp were updated
|
|
296
|
+
self.emails = Document.sort_by_timestamp(self.emails)
|
|
284
297
|
|
|
285
|
-
def
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
self.email_author_counts[email.author] += 1
|
|
298
|
+
def _set_uninteresting_ccs(self) -> None:
|
|
299
|
+
for id in EMAILS_WITH_UNINTERESTING_BCCS:
|
|
300
|
+
self.uninteresting_ccs += copy(cast(list[Name], self.email_for_id(id).header.bcc))
|
|
289
301
|
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
self.email_recipient_counts[None] += 1
|
|
293
|
-
else:
|
|
294
|
-
for recipient in email.recipients:
|
|
295
|
-
self.email_recipient_counts[recipient] += 1
|
|
302
|
+
for id in EMAILS_WITH_UNINTERESTING_CCS:
|
|
303
|
+
self.uninteresting_ccs += self.email_for_id(id).recipients
|
|
296
304
|
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())
|
|
305
|
+
self.uninteresting_ccs = sorted(uniquify(self.uninteresting_ccs))
|
|
306
|
+
logger.info(f"Extracted uninteresting_ccs: {self.uninteresting_ccs}")
|
|
300
307
|
|
|
301
308
|
|
|
302
309
|
def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
|
|
@@ -326,21 +333,6 @@ def document_cls(doc: Document) -> Type[Document]:
|
|
|
326
333
|
return OtherFile
|
|
327
334
|
|
|
328
335
|
|
|
329
|
-
def _build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
|
|
330
|
-
title = 'Signatures Used By Authors' if cols[0] == AUTHOR else 'Authors Seen Using Signatures'
|
|
331
|
-
table = build_table(title, header_style="bold reverse", show_lines=True)
|
|
332
|
-
|
|
333
|
-
for i, col in enumerate(cols):
|
|
334
|
-
table.add_column(col.title() + ('s' if i == 1 else ''))
|
|
335
|
-
|
|
336
|
-
new_dict = dict_sets_to_lists(keyed_sets)
|
|
337
|
-
|
|
338
|
-
for k in sorted(new_dict.keys()):
|
|
339
|
-
table.add_row(highlighter(k or UNKNOWN), highlighter(join_char.join(sorted(new_dict[k]))))
|
|
340
|
-
|
|
341
|
-
return Padding(table, DEVICE_SIGNATURE_PADDING)
|
|
342
|
-
|
|
343
|
-
|
|
344
336
|
def _sorted_metadata(docs: Sequence[Document]) -> list[Metadata]:
|
|
345
337
|
docs_sorted_by_id = sorted(docs, key=lambda d: d.file_id)
|
|
346
338
|
return [json_safe(d.metadata()) for d in docs_sorted_by_id]
|