epstein-files 1.1.3__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +15 -7
- epstein_files/documents/communication.py +3 -3
- epstein_files/documents/document.py +10 -3
- epstein_files/documents/email.py +105 -107
- epstein_files/documents/emails/email_header.py +4 -2
- epstein_files/documents/imessage/text_message.py +8 -12
- epstein_files/documents/messenger_log.py +8 -8
- epstein_files/epstein_files.py +123 -119
- epstein_files/person.py +350 -0
- epstein_files/util/constant/names.py +66 -50
- epstein_files/util/constant/output_files.py +1 -0
- epstein_files/util/constant/strings.py +3 -1
- epstein_files/util/constant/urls.py +14 -2
- epstein_files/util/constants.py +134 -26
- epstein_files/util/data.py +1 -12
- epstein_files/util/doc_cfg.py +30 -14
- epstein_files/util/env.py +3 -1
- epstein_files/util/file_helper.py +4 -1
- epstein_files/util/highlighted_group.py +228 -166
- epstein_files/util/output.py +108 -165
- epstein_files/util/rich.py +23 -45
- epstein_files/util/word_count.py +2 -3
- {epstein_files-1.1.3.dist-info → epstein_files-1.2.0.dist-info}/METADATA +2 -1
- epstein_files-1.2.0.dist-info/RECORD +34 -0
- epstein_files-1.1.3.dist-info/RECORD +0 -33
- {epstein_files-1.1.3.dist-info → epstein_files-1.2.0.dist-info}/LICENSE +0 -0
- {epstein_files-1.1.3.dist-info → epstein_files-1.2.0.dist-info}/WHEEL +0 -0
- {epstein_files-1.1.3.dist-info → epstein_files-1.2.0.dist-info}/entry_points.txt +0 -0
epstein_files/__init__.py
CHANGED
|
@@ -21,7 +21,8 @@ from epstein_files.util.env import args
|
|
|
21
21
|
from epstein_files.util.file_helper import coerce_file_path, extract_file_id
|
|
22
22
|
from epstein_files.util.logging import exit_with_error, logger
|
|
23
23
|
from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
|
|
24
|
-
print_other_files_section, print_text_messages_section, print_email_timeline,
|
|
24
|
+
print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info_png,
|
|
25
|
+
print_json_metadata, write_urls)
|
|
25
26
|
from epstein_files.util.rich import (build_highlighter, console, print_color_key, print_title_page_header,
|
|
26
27
|
print_title_page_tables, print_subtitle_panel, write_html)
|
|
27
28
|
from epstein_files.util.timer import Timer
|
|
@@ -43,8 +44,11 @@ def generate_html() -> None:
|
|
|
43
44
|
elif args.json_files:
|
|
44
45
|
print_json_files(epstein_files)
|
|
45
46
|
exit()
|
|
47
|
+
elif args.emailers_info_png:
|
|
48
|
+
print_emailers_info_png(epstein_files)
|
|
49
|
+
exit()
|
|
46
50
|
|
|
47
|
-
print_title_page_header(
|
|
51
|
+
print_title_page_header()
|
|
48
52
|
|
|
49
53
|
if args.email_timeline:
|
|
50
54
|
print_color_key()
|
|
@@ -96,8 +100,7 @@ def epstein_search():
|
|
|
96
100
|
for search_term in args.positional_args:
|
|
97
101
|
temp_highlighter = build_highlighter(search_term)
|
|
98
102
|
search_results = epstein_files.docs_matching(search_term, args.names)
|
|
99
|
-
|
|
100
|
-
print_subtitle_panel(f"Found {len(search_results)} documents matching '{search_term}'", padding=(0, 0, 0, 3))
|
|
103
|
+
print_subtitle_panel(f"Found {len(search_results)} documents matching '{search_term}'")
|
|
101
104
|
|
|
102
105
|
for search_result in search_results:
|
|
103
106
|
console.line()
|
|
@@ -115,11 +118,16 @@ def epstein_search():
|
|
|
115
118
|
def epstein_show():
|
|
116
119
|
"""Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
|
|
117
120
|
_assert_positional_args()
|
|
118
|
-
|
|
119
|
-
raw_docs = [Document(coerce_file_path(id)) for id in ids]
|
|
120
|
-
docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
|
|
121
|
+
raw_docs: list[Document] = []
|
|
121
122
|
console.line()
|
|
122
123
|
|
|
124
|
+
try:
|
|
125
|
+
ids = [extract_file_id(arg) for arg in args.positional_args]
|
|
126
|
+
raw_docs = [Document(coerce_file_path(id)) for id in ids]
|
|
127
|
+
docs = Document.sort_by_timestamp([document_cls(doc)(doc.file_path) for doc in raw_docs])
|
|
128
|
+
except Exception as e:
|
|
129
|
+
exit_with_error(str(e))
|
|
130
|
+
|
|
123
131
|
for doc in docs:
|
|
124
132
|
console.print('\n', doc, '\n')
|
|
125
133
|
|
|
@@ -9,7 +9,7 @@ from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, Document
|
|
|
9
9
|
from epstein_files.util.constant.names import UNKNOWN
|
|
10
10
|
from epstein_files.util.constants import FALLBACK_TIMESTAMP
|
|
11
11
|
from epstein_files.util.doc_cfg import CommunicationCfg
|
|
12
|
-
from epstein_files.util.highlighted_group import get_style_for_name
|
|
12
|
+
from epstein_files.util.highlighted_group import get_style_for_name, styled_name
|
|
13
13
|
from epstein_files.util.rich import key_value_txt
|
|
14
14
|
|
|
15
15
|
TIMESTAMP_SECONDS_REGEX = re.compile(r":\d{2}$")
|
|
@@ -25,10 +25,10 @@ class Communication(Document):
|
|
|
25
25
|
return self.author or UNKNOWN
|
|
26
26
|
|
|
27
27
|
def author_style(self) -> str:
|
|
28
|
-
return get_style_for_name(self.
|
|
28
|
+
return get_style_for_name(self.author)
|
|
29
29
|
|
|
30
30
|
def author_txt(self) -> Text:
|
|
31
|
-
return
|
|
31
|
+
return styled_name(self.author)
|
|
32
32
|
|
|
33
33
|
def external_links_txt(self, _style: str = '', include_alt_links: bool = True) -> Text:
|
|
34
34
|
"""Overrides super() method to apply self.author_style."""
|
|
@@ -63,7 +63,7 @@ class Document:
|
|
|
63
63
|
|
|
64
64
|
Attributes:
|
|
65
65
|
file_path (Path): Local path to file
|
|
66
|
-
author (
|
|
66
|
+
author (Name): Who is responsible for the text in the file
|
|
67
67
|
config (DocCfg): Information about this fil
|
|
68
68
|
file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
|
|
69
69
|
filename (str): File's basename
|
|
@@ -74,7 +74,7 @@ class Document:
|
|
|
74
74
|
"""
|
|
75
75
|
file_path: Path
|
|
76
76
|
# Optional fields
|
|
77
|
-
author:
|
|
77
|
+
author: Name = None
|
|
78
78
|
config: EmailCfg | DocCfg | TextCfg | None = None
|
|
79
79
|
file_id: str = field(init=False)
|
|
80
80
|
filename: str = field(init=False)
|
|
@@ -88,6 +88,9 @@ class Document:
|
|
|
88
88
|
strip_whitespace: ClassVar[bool] = True # Overridden in JsonFile
|
|
89
89
|
|
|
90
90
|
def __post_init__(self):
|
|
91
|
+
if not self.file_path.exists():
|
|
92
|
+
raise FileNotFoundError(f"File '{self.file_path.name}' does not exist!")
|
|
93
|
+
|
|
91
94
|
self.filename = self.file_path.name
|
|
92
95
|
self.file_id = extract_file_id(self.filename)
|
|
93
96
|
# config and url_slug could have been pre-set in Email
|
|
@@ -118,6 +121,10 @@ class Document:
|
|
|
118
121
|
txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
|
|
119
122
|
return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))
|
|
120
123
|
|
|
124
|
+
def duplicate_of_id(self) -> str | None:
|
|
125
|
+
if self.config and self.config.duplicate_of_id:
|
|
126
|
+
return self.config.duplicate_of_id
|
|
127
|
+
|
|
121
128
|
def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
122
129
|
return self.external_link(epsteinify_doc_url, style, link_txt)
|
|
123
130
|
|
|
@@ -175,7 +182,7 @@ class Document:
|
|
|
175
182
|
return None
|
|
176
183
|
|
|
177
184
|
def is_duplicate(self) -> bool:
|
|
178
|
-
return bool(self.
|
|
185
|
+
return bool(self.duplicate_of_id())
|
|
179
186
|
|
|
180
187
|
def is_local_extract_file(self) -> bool:
|
|
181
188
|
"""True if extracted from other file (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
|
epstein_files/documents/email.py
CHANGED
|
@@ -20,11 +20,11 @@ from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAI
|
|
|
20
20
|
from epstein_files.util.constant.names import *
|
|
21
21
|
from epstein_files.util.constant.strings import REDACTED
|
|
22
22
|
from epstein_files.util.constants import *
|
|
23
|
-
from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes,
|
|
23
|
+
from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes,
|
|
24
24
|
flatten, listify, remove_timezone, uniquify)
|
|
25
25
|
from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
26
26
|
from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
|
|
27
|
-
from epstein_files.util.highlighted_group import get_style_for_name
|
|
27
|
+
from epstein_files.util.highlighted_group import JUNK_EMAILERS, get_style_for_name
|
|
28
28
|
from epstein_files.util.logging import logger
|
|
29
29
|
from epstein_files.util.rich import *
|
|
30
30
|
|
|
@@ -55,6 +55,7 @@ REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
|
|
|
55
55
|
|
|
56
56
|
OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
57
57
|
re.compile(r'grnail\.com'): 'gmail.com',
|
|
58
|
+
'Newsmax. corn': 'Newsmax.com',
|
|
58
59
|
re.compile(r"^(From|To)(: )?[_1.]{5,}", re.MULTILINE): rf"\1: {REDACTED}", # Redacted email addresses
|
|
59
60
|
# These 3 must come in this order!
|
|
60
61
|
re.compile(r'([/vkT]|Ai|li|(I|7)v)rote:'): 'wrote:',
|
|
@@ -71,6 +72,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
|
71
72
|
# Signatures
|
|
72
73
|
'BlackBerry by AT &T': 'BlackBerry by AT&T',
|
|
73
74
|
'BlackBerry from T- Mobile': 'BlackBerry from T-Mobile',
|
|
75
|
+
'Envoy& de mon iPhone': 'Envoyé de mon iPhone',
|
|
74
76
|
"from my 'Phone": 'from my iPhone',
|
|
75
77
|
'from Samsung Mob.le': 'from Samsung Mobile',
|
|
76
78
|
'gJeremyRubin': '@JeremyRubin',
|
|
@@ -78,6 +80,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
|
78
80
|
'twitter glhsummers': 'twitter @lhsummers',
|
|
79
81
|
re.compile(r"twitter\.com[i/][lI]krauss[1lt]"): "twitter.com/lkrauss1",
|
|
80
82
|
re.compile(r'from my BlackBerry[0°] wireless device'): 'from my BlackBerry® wireless device',
|
|
83
|
+
re.compile(r'^INW$', re.MULTILINE): REDACTED,
|
|
81
84
|
# links
|
|
82
85
|
'Imps ://': 'https://',
|
|
83
86
|
re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
|
|
@@ -133,19 +136,24 @@ MAILING_LISTS = [
|
|
|
133
136
|
JP_MORGAN_USGIO,
|
|
134
137
|
]
|
|
135
138
|
|
|
136
|
-
|
|
139
|
+
BBC_LISTS = JUNK_EMAILERS + MAILING_LISTS
|
|
140
|
+
|
|
141
|
+
TRUNCATE_ALL_EMAILS_FROM = BBC_LISTS + [
|
|
137
142
|
'Alan S Halperin',
|
|
138
143
|
'Mitchell Bard',
|
|
139
144
|
'Skip Rimer',
|
|
145
|
+
'Steven Victor MD',
|
|
140
146
|
]
|
|
141
147
|
|
|
142
148
|
TRUNCATION_LENGTHS = {
|
|
143
149
|
'023627': 16_800, # Micheal Wolff article with brock pierce
|
|
144
|
-
'030245':
|
|
145
|
-
'030781':
|
|
146
|
-
'032906':
|
|
150
|
+
'030245': None, # Epstein rationalizes his behavior in an open letter to the world
|
|
151
|
+
'030781': None, # Bannon email about crypto coin issues
|
|
152
|
+
'032906': None, # David Blaine email
|
|
147
153
|
'026036': 6000, # Gino Yu blockchain mention
|
|
148
|
-
'023208':
|
|
154
|
+
'023208': None, # Long discussion about leon black's finances
|
|
155
|
+
'029609': None, # Joi Ito
|
|
156
|
+
'025233': None, # Reputation.com discussion
|
|
149
157
|
}
|
|
150
158
|
|
|
151
159
|
# These are long forwarded articles so we force a trim to 1,333 chars if these strings exist
|
|
@@ -242,66 +250,15 @@ TRUNCATE_TERMS = [
|
|
|
242
250
|
'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
|
|
243
251
|
]
|
|
244
252
|
|
|
245
|
-
# Some Paul Krassner emails have a ton of CCed parties we don't care about
|
|
246
|
-
KRASSNER_RECIPIENTS = uniquify(flatten([ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']]))
|
|
247
|
-
|
|
248
|
-
# No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
|
|
249
|
-
USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIPIENTS + [
|
|
250
|
-
'Alan Dlugash', # CCed with Richard Kahn
|
|
251
|
-
'Alan Rogers', # Random CC
|
|
252
|
-
'Andrew Friendly', # Presumably some relation of Kelly Friendly
|
|
253
|
-
'BS Stern', # A random fwd of email we have
|
|
254
|
-
'Cheryl Kleen', # Single email from Anne Boyles, displayed under Anne Boyles
|
|
255
|
-
'Connie Zaguirre', # Random CC
|
|
256
|
-
'Dan Fleuette', # CC from sean bannon
|
|
257
|
-
'Danny Goldberg', # Random Paul Krassner emails
|
|
258
|
-
GERALD_LEFCOURT, # Single CC
|
|
259
|
-
GORDON_GETTY, # Random CC
|
|
260
|
-
JEFF_FULLER, # Random Jean Luc Brunel CC
|
|
261
|
-
'Jojo Fontanilla', # Random CC
|
|
262
|
-
'Joseph Vinciguerra', # Random CC
|
|
263
|
-
'Larry Cohen', # Random Bill Gates CC
|
|
264
|
-
'Lyn Fontanilla', # Random CC
|
|
265
|
-
'Mark Albert', # Random CC
|
|
266
|
-
'Matthew Schafer', # Random CC
|
|
267
|
-
MICHAEL_BUCHHOLTZ, # Terry Kafka CC
|
|
268
|
-
'Nancy Dahl', # covered by Lawrence Krauss (her husband)
|
|
269
|
-
'Michael Simmons', # Random CC
|
|
270
|
-
'Nancy Portland', # Lawrence Krauss CC
|
|
271
|
-
'Oliver Goodenough', # Robert Trivers CC
|
|
272
|
-
'Peter Aldhous', # Lawrence Krauss CC
|
|
273
|
-
'Players2', # Hoffenberg CC
|
|
274
|
-
'Sam Harris', # Lawrence Krauss CC
|
|
275
|
-
SAMUEL_LEFF, # Random CC
|
|
276
|
-
'Sean T Lehane', # Random CC
|
|
277
|
-
'Stephen Rubin', # Random CC
|
|
278
|
-
'Tim Kane', # Random CC
|
|
279
|
-
'Travis Pangburn', # Random CC
|
|
280
|
-
'Vahe Stepanian', # Random CC
|
|
281
|
-
# Ross Gow BCC
|
|
282
|
-
'david.brown@thetimes.co.uk',
|
|
283
|
-
'io-anne.pugh@bbc.co.uk',
|
|
284
|
-
'martin.robinson@mailonline.co.uk',
|
|
285
|
-
'nick.alwav@bbc.co.uk'
|
|
286
|
-
'nick.sommerlad@mirror.co.uk',
|
|
287
|
-
'p.peachev@independent.co.uk',
|
|
288
|
-
]
|
|
289
|
-
|
|
290
|
-
# Emails sent by epstein to himself that are just notes
|
|
291
|
-
SELF_EMAILS_FILE_IDS = [
|
|
292
|
-
'026677',
|
|
293
|
-
'029752', # TODO: jokeland...
|
|
294
|
-
'030238',
|
|
295
|
-
# '033274', # TODO: Epstein's note to self doesn't get printed if we don't set the recipients to [None]
|
|
296
|
-
]
|
|
297
|
-
|
|
298
253
|
METADATA_FIELDS = [
|
|
299
254
|
'is_junk_mail',
|
|
255
|
+
'is_mailing_list',
|
|
300
256
|
'recipients',
|
|
301
257
|
'sent_from_device',
|
|
302
258
|
'subject',
|
|
303
259
|
]
|
|
304
260
|
|
|
261
|
+
# Note the line repair happens *after* 'Importance: High' is removed
|
|
305
262
|
LINE_REPAIR_MERGES = {
|
|
306
263
|
'017523': 4,
|
|
307
264
|
'019407': [2, 4],
|
|
@@ -309,9 +266,14 @@ LINE_REPAIR_MERGES = {
|
|
|
309
266
|
'022673': 9,
|
|
310
267
|
'022684': 9,
|
|
311
268
|
'022695': 4,
|
|
269
|
+
'029773': [2, 5],
|
|
312
270
|
'023067': 3,
|
|
313
271
|
'025790': 2,
|
|
272
|
+
'029841': 3,
|
|
273
|
+
'026345': 3,
|
|
314
274
|
'026609': 4,
|
|
275
|
+
'033299': 3,
|
|
276
|
+
'026829': 3,
|
|
315
277
|
'026924': [2, 4],
|
|
316
278
|
'028931': [3, 6],
|
|
317
279
|
'029154': [2, 5],
|
|
@@ -322,6 +284,7 @@ LINE_REPAIR_MERGES = {
|
|
|
322
284
|
'029501': 2,
|
|
323
285
|
'029835': [2, 4],
|
|
324
286
|
'029889': 2,
|
|
287
|
+
'029545': [3, 5],
|
|
325
288
|
'029976': 3,
|
|
326
289
|
'030299': [7, 10],
|
|
327
290
|
'030381': [2, 4],
|
|
@@ -337,6 +300,7 @@ LINE_REPAIR_MERGES = {
|
|
|
337
300
|
'032405': 4,
|
|
338
301
|
'033097': 2,
|
|
339
302
|
'033144': [2, 4],
|
|
303
|
+
'033217': 3,
|
|
340
304
|
'033228': [3, 5],
|
|
341
305
|
'033357': [2, 4],
|
|
342
306
|
'033486': [7, 9],
|
|
@@ -354,14 +318,14 @@ class Email(Communication):
|
|
|
354
318
|
actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
|
|
355
319
|
config (EmailCfg | None) - manual config for this email (if it exists)
|
|
356
320
|
header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
|
|
357
|
-
recipients (list[
|
|
321
|
+
recipients (list[Name]) - who this email was sent to
|
|
358
322
|
sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
|
|
359
323
|
signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
|
|
360
324
|
"""
|
|
361
325
|
actual_text: str = field(init=False)
|
|
362
326
|
config: EmailCfg | None = None
|
|
363
327
|
header: EmailHeader = field(init=False)
|
|
364
|
-
recipients: list[
|
|
328
|
+
recipients: list[Name] = field(default_factory=list)
|
|
365
329
|
sent_from_device: str | None = None
|
|
366
330
|
signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
|
|
367
331
|
|
|
@@ -382,25 +346,21 @@ class Email(Communication):
|
|
|
382
346
|
|
|
383
347
|
super().__post_init__()
|
|
384
348
|
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
# Remove self CCs
|
|
402
|
-
recipients = [r for r in self.recipients if r != self.author or self.file_id in SELF_EMAILS_FILE_IDS]
|
|
403
|
-
self.recipients = list(set(recipients))
|
|
349
|
+
if self.config and self.config.recipients:
|
|
350
|
+
self.recipients = self.config.recipients
|
|
351
|
+
else:
|
|
352
|
+
for recipient in self.header.recipients():
|
|
353
|
+
self.recipients.extend(self._extract_emailer_names(recipient))
|
|
354
|
+
|
|
355
|
+
# Assume mailing list emails are to Epstein
|
|
356
|
+
if self.author in BBC_LISTS and (self.is_note_to_self() or not self.recipients):
|
|
357
|
+
self.recipients = [JEFFREY_EPSTEIN]
|
|
358
|
+
|
|
359
|
+
# Remove self CCs but preserve self emails
|
|
360
|
+
if not self.is_note_to_self():
|
|
361
|
+
self.recipients = [r for r in self.recipients if r != self.author]
|
|
362
|
+
|
|
363
|
+
self.recipients = sorted(list(set(self.recipients)), key=lambda r: r or UNKNOWN)
|
|
404
364
|
self.text = self._prettify_text()
|
|
405
365
|
self.actual_text = self._actual_text()
|
|
406
366
|
self.sent_from_device = self._sent_from_device()
|
|
@@ -410,18 +370,30 @@ class Email(Communication):
|
|
|
410
370
|
|
|
411
371
|
def info_txt(self) -> Text:
|
|
412
372
|
email_type = 'fwded article' if self.is_fwded_article() else 'email'
|
|
413
|
-
txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt())
|
|
414
|
-
|
|
373
|
+
txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt())
|
|
374
|
+
|
|
375
|
+
if self.config and self.config.is_attribution_uncertain:
|
|
376
|
+
txt.append(f" {QUESTION_MARKS}", style=self.author_style())
|
|
377
|
+
|
|
378
|
+
txt.append(' to ').append(self.recipients_txt())
|
|
379
|
+
return txt.append(highlighter(f" probably sent at {self.timestamp}"))
|
|
415
380
|
|
|
416
381
|
def is_fwded_article(self) -> bool:
|
|
417
382
|
return bool(self.config and self.config.is_fwded_article)
|
|
418
383
|
|
|
419
384
|
def is_junk_mail(self) -> bool:
|
|
420
|
-
return self.author in JUNK_EMAILERS
|
|
385
|
+
return self.author in JUNK_EMAILERS
|
|
386
|
+
|
|
387
|
+
def is_mailing_list(self) -> bool:
|
|
388
|
+
return self.author in MAILING_LISTS or self.is_junk_mail()
|
|
389
|
+
|
|
390
|
+
def is_note_to_self(self) -> bool:
|
|
391
|
+
return self.recipients == [self.author]
|
|
421
392
|
|
|
422
393
|
def metadata(self) -> Metadata:
|
|
423
394
|
local_metadata = asdict(self)
|
|
424
395
|
local_metadata['is_junk_mail'] = self.is_junk_mail()
|
|
396
|
+
local_metadata['is_mailing_list'] = self.is_junk_mail()
|
|
425
397
|
local_metadata['subject'] = self.subject() or None
|
|
426
398
|
metadata = super().metadata()
|
|
427
399
|
metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
|
|
@@ -438,7 +410,10 @@ class Email(Communication):
|
|
|
438
410
|
], join=', ')
|
|
439
411
|
|
|
440
412
|
def subject(self) -> str:
|
|
441
|
-
|
|
413
|
+
if self.config and self.config.subject:
|
|
414
|
+
return self.config.subject
|
|
415
|
+
else:
|
|
416
|
+
return self.header.subject or ''
|
|
442
417
|
|
|
443
418
|
def summary(self) -> Text:
|
|
444
419
|
"""One line summary mostly for logging."""
|
|
@@ -489,11 +464,8 @@ class Email(Communication):
|
|
|
489
464
|
|
|
490
465
|
def _border_style(self) -> str:
|
|
491
466
|
"""Color emails from epstein to others with the color for the first recipient."""
|
|
492
|
-
if self.author == JEFFREY_EPSTEIN:
|
|
493
|
-
|
|
494
|
-
style = self.author_style()
|
|
495
|
-
else:
|
|
496
|
-
style = get_style_for_name(self.recipients[0])
|
|
467
|
+
if self.author == JEFFREY_EPSTEIN and len(self.recipients) > 0:
|
|
468
|
+
style = get_style_for_name(self.recipients[0])
|
|
497
469
|
else:
|
|
498
470
|
style = self.author_style()
|
|
499
471
|
|
|
@@ -541,6 +513,8 @@ class Email(Communication):
|
|
|
541
513
|
self.log_top_lines(msg='No email header match found!', level=log_level)
|
|
542
514
|
self.header = EmailHeader(field_names=[])
|
|
543
515
|
|
|
516
|
+
logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
|
|
517
|
+
|
|
544
518
|
def _extract_timestamp(self) -> datetime:
|
|
545
519
|
if self.config and self.config.timestamp:
|
|
546
520
|
return self.config.timestamp
|
|
@@ -665,6 +639,9 @@ class Email(Communication):
|
|
|
665
639
|
elif self.file_id in ['025329']:
|
|
666
640
|
for _i in range(9):
|
|
667
641
|
self._merge_lines(2)
|
|
642
|
+
elif self.file_id in ['025812']:
|
|
643
|
+
for _i in range(2):
|
|
644
|
+
self._merge_lines(3)
|
|
668
645
|
elif self.file_id == '014860':
|
|
669
646
|
self._merge_lines(3)
|
|
670
647
|
self._merge_lines(4)
|
|
@@ -763,7 +740,7 @@ class Email(Communication):
|
|
|
763
740
|
if args.whole_file:
|
|
764
741
|
num_chars = len(self.text)
|
|
765
742
|
elif self.file_id in TRUNCATION_LENGTHS:
|
|
766
|
-
num_chars = TRUNCATION_LENGTHS[self.file_id]
|
|
743
|
+
num_chars = TRUNCATION_LENGTHS[self.file_id] or self.file_size()
|
|
767
744
|
elif self.author in TRUNCATE_ALL_EMAILS_FROM or includes_truncate_term:
|
|
768
745
|
num_chars = int(MAX_CHARS_TO_PRINT / 3)
|
|
769
746
|
elif quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
|
|
@@ -830,26 +807,47 @@ class Email(Communication):
|
|
|
830
807
|
self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
|
|
831
808
|
|
|
832
809
|
@staticmethod
|
|
833
|
-
def build_emails_table(emails: list['Email'],
|
|
834
|
-
"""Turn a set of Emails
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
810
|
+
def build_emails_table(emails: list['Email'], name: Name = '', title: str = '', show_length: bool = False) -> Table:
|
|
811
|
+
"""Turn a set of Emails into a Table."""
|
|
812
|
+
if title and name:
|
|
813
|
+
raise ValueError(f"Can't provide both 'author' and 'title' args")
|
|
814
|
+
elif name == '' and title == '':
|
|
815
|
+
raise ValueError(f"Must provide either 'author' or 'title' arg")
|
|
816
|
+
|
|
817
|
+
author_style = get_style_for_name(name, allow_bold=False)
|
|
818
|
+
link_style = author_style if name else ARCHIVE_LINK_COLOR
|
|
819
|
+
min_width = len(name or UNKNOWN)
|
|
820
|
+
max_width = max(20, min_width)
|
|
821
|
+
|
|
822
|
+
columns = [
|
|
823
|
+
{'name': 'Sent At', 'justify': 'left', 'style': TIMESTAMP_DIM},
|
|
824
|
+
{'name': 'From', 'justify': 'left', 'min_width': min_width, 'max_width': max_width},
|
|
825
|
+
{'name': 'To', 'justify': 'left', 'min_width': min_width, 'max_width': max_width + 2},
|
|
826
|
+
{'name': 'Length', 'justify': 'right', 'style': 'wheat4'},
|
|
827
|
+
{'name': 'Subject', 'justify': 'left', 'min_width': 35, 'style': 'honeydew2'},
|
|
828
|
+
]
|
|
829
|
+
|
|
830
|
+
table = build_table(
|
|
831
|
+
title or None,
|
|
832
|
+
cols=[col for col in columns if show_length or col['name'] not in ['Length']],
|
|
833
|
+
border_style=DEFAULT_TABLE_KWARGS['border_style'] if title else author_style,
|
|
834
|
+
header_style="bold",
|
|
835
|
+
highlight=True,
|
|
841
836
|
)
|
|
842
837
|
|
|
843
|
-
table.add_column('From', justify='left')
|
|
844
|
-
table.add_column('Timestamp', justify='center')
|
|
845
|
-
table.add_column('Subject', justify='left', style='honeydew2', min_width=70)
|
|
846
|
-
|
|
847
838
|
for email in emails:
|
|
848
|
-
|
|
839
|
+
fields = [
|
|
840
|
+
email.epstein_media_link(link_txt=email.timestamp_without_seconds(), style=link_style),
|
|
849
841
|
email.author_txt(),
|
|
850
|
-
email.
|
|
851
|
-
|
|
852
|
-
|
|
842
|
+
email.recipients_txt(max_full_names=1),
|
|
843
|
+
f"{email.length()}",
|
|
844
|
+
email.subject(),
|
|
845
|
+
]
|
|
846
|
+
|
|
847
|
+
if not show_length:
|
|
848
|
+
del fields[3]
|
|
849
|
+
|
|
850
|
+
table.add_row(*fields)
|
|
853
851
|
|
|
854
852
|
return table
|
|
855
853
|
|
|
@@ -8,13 +8,13 @@ from epstein_files.util.doc_cfg import EmailCfg
|
|
|
8
8
|
from epstein_files.util.logging import logger
|
|
9
9
|
from epstein_files.util.rich import UNKNOWN
|
|
10
10
|
|
|
11
|
-
FIELD_NAMES = ['
|
|
11
|
+
FIELD_NAMES = ['Date', 'From', 'Sent', 'Subject']
|
|
12
12
|
NON_HEADER_FIELDS = ['field_names', 'num_header_rows', 'was_initially_empty']
|
|
13
13
|
ON_BEHALF_OF = 'on behalf of'
|
|
14
14
|
TO_FIELDS = ['bcc', 'cc', 'to']
|
|
15
15
|
EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
|
|
16
16
|
|
|
17
|
-
HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
|
|
17
|
+
HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments|Classification|Flag):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
|
|
18
18
|
EMAIL_SIMPLE_HEADER_REGEX = re.compile(rf'^{HEADER_REGEX_STR}')
|
|
19
19
|
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX = re.compile(HEADER_REGEX_STR)
|
|
20
20
|
EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTALL) # Match up to the next email header section
|
|
@@ -41,6 +41,8 @@ class EmailHeader:
|
|
|
41
41
|
subject: str | None = None
|
|
42
42
|
bcc: list[str] | None = None
|
|
43
43
|
cc: list[str] | None = None
|
|
44
|
+
classification: str | None = None
|
|
45
|
+
flag: str | None = None
|
|
44
46
|
importance: str | None = None
|
|
45
47
|
attachments: str | None = None
|
|
46
48
|
to: list[str] | None = None
|
|
@@ -4,38 +4,35 @@ from datetime import datetime
|
|
|
4
4
|
|
|
5
5
|
from rich.text import Text
|
|
6
6
|
|
|
7
|
-
from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN
|
|
7
|
+
from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN, Name, extract_last_name
|
|
8
8
|
from epstein_files.util.constant.strings import TIMESTAMP_DIM
|
|
9
|
-
from epstein_files.util.data import
|
|
9
|
+
from epstein_files.util.data import iso_timestamp
|
|
10
10
|
from epstein_files.util.highlighted_group import get_style_for_name
|
|
11
11
|
from epstein_files.util.logging import logger
|
|
12
12
|
from epstein_files.util.rich import TEXT_LINK, highlighter
|
|
13
13
|
|
|
14
|
+
EPSTEIN_TEXTERS = ['e:', 'e:jeeitunes@gmail.com']
|
|
14
15
|
MSG_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
|
|
15
16
|
PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
|
|
17
|
+
UNCERTAIN_SUFFIX = ' (?)'
|
|
16
18
|
|
|
17
19
|
DISPLAY_LAST_NAME_ONLY = [
|
|
18
20
|
JEFFREY_EPSTEIN,
|
|
19
21
|
STEVE_BANNON,
|
|
20
22
|
]
|
|
21
23
|
|
|
22
|
-
TEXTER_MAPPING = {
|
|
23
|
-
'e:': JEFFREY_EPSTEIN,
|
|
24
|
-
'e:jeeitunes@gmail.com': JEFFREY_EPSTEIN,
|
|
25
|
-
}
|
|
26
|
-
|
|
27
24
|
|
|
28
25
|
@dataclass(kw_only=True)
|
|
29
26
|
class TextMessage:
|
|
30
27
|
"""Class representing a single iMessage text message."""
|
|
31
|
-
author:
|
|
28
|
+
author: Name
|
|
32
29
|
author_str: str = ''
|
|
33
30
|
is_id_confirmed: bool = False
|
|
34
31
|
text: str
|
|
35
32
|
timestamp_str: str
|
|
36
33
|
|
|
37
34
|
def __post_init__(self):
|
|
38
|
-
self.author =
|
|
35
|
+
self.author = JEFFREY_EPSTEIN if self.author in EPSTEIN_TEXTERS else self.author
|
|
39
36
|
|
|
40
37
|
if not self.author:
|
|
41
38
|
self.author_str = UNKNOWN
|
|
@@ -45,7 +42,7 @@ class TextMessage:
|
|
|
45
42
|
self.author_str = self.author_str or self.author
|
|
46
43
|
|
|
47
44
|
if not self.is_id_confirmed and self.author is not None and self.author != JEFFREY_EPSTEIN:
|
|
48
|
-
self.author_str +=
|
|
45
|
+
self.author_str += UNCERTAIN_SUFFIX
|
|
49
46
|
|
|
50
47
|
if self.is_link():
|
|
51
48
|
self.text = self.text.replace('\n', '').replace(' ', '_')
|
|
@@ -59,12 +56,11 @@ class TextMessage:
|
|
|
59
56
|
return datetime.strptime(self.timestamp_str, MSG_DATE_FORMAT)
|
|
60
57
|
|
|
61
58
|
def timestamp_txt(self) -> Text:
|
|
62
|
-
timestamp_str = self.timestamp_str
|
|
63
|
-
|
|
64
59
|
try:
|
|
65
60
|
timestamp_str = iso_timestamp(self.parse_timestamp())
|
|
66
61
|
except Exception as e:
|
|
67
62
|
logger.warning(f"Failed to parse timestamp for {self}")
|
|
63
|
+
timestamp_str = self.timestamp_str
|
|
68
64
|
|
|
69
65
|
return Text(f"[{timestamp_str}]", style=TIMESTAMP_DIM)
|
|
70
66
|
|
|
@@ -10,11 +10,11 @@ from rich.text import Text
|
|
|
10
10
|
|
|
11
11
|
from epstein_files.documents.communication import Communication
|
|
12
12
|
from epstein_files.documents.imessage.text_message import TextMessage
|
|
13
|
-
from epstein_files.util.constant.names import JEFFREY_EPSTEIN,
|
|
13
|
+
from epstein_files.util.constant.names import JEFFREY_EPSTEIN, Name
|
|
14
14
|
from epstein_files.util.constant.strings import AUTHOR, TIMESTAMP_STYLE
|
|
15
15
|
from epstein_files.util.data import days_between, days_between_str, iso_timestamp, sort_dict
|
|
16
16
|
from epstein_files.util.doc_cfg import Metadata, TextCfg
|
|
17
|
-
from epstein_files.util.highlighted_group import
|
|
17
|
+
from epstein_files.util.highlighted_group import styled_name
|
|
18
18
|
from epstein_files.util.logging import logger
|
|
19
19
|
from epstein_files.util.rich import LAST_TIMESTAMP_STYLE, build_table, highlighter
|
|
20
20
|
|
|
@@ -35,7 +35,7 @@ class MessengerLog(Communication):
|
|
|
35
35
|
super().__post_init__()
|
|
36
36
|
self.messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
|
|
37
37
|
|
|
38
|
-
def first_message_at(self, name:
|
|
38
|
+
def first_message_at(self, name: Name) -> datetime:
|
|
39
39
|
return self.messages_by(name)[0].parse_timestamp()
|
|
40
40
|
|
|
41
41
|
def info_txt(self) -> Text | None:
|
|
@@ -54,10 +54,10 @@ class MessengerLog(Communication):
|
|
|
54
54
|
|
|
55
55
|
return txt.append(')')
|
|
56
56
|
|
|
57
|
-
def last_message_at(self, name:
|
|
57
|
+
def last_message_at(self, name: Name) -> datetime:
|
|
58
58
|
return self.messages_by(name)[-1].parse_timestamp()
|
|
59
59
|
|
|
60
|
-
def messages_by(self, name:
|
|
60
|
+
def messages_by(self, name: Name) -> list[TextMessage]:
|
|
61
61
|
"""Return all messages by 'name'."""
|
|
62
62
|
return [m for m in self.messages if m.author == name]
|
|
63
63
|
|
|
@@ -129,9 +129,9 @@ class MessengerLog(Communication):
|
|
|
129
129
|
yield message
|
|
130
130
|
|
|
131
131
|
@classmethod
|
|
132
|
-
def count_authors(cls, imessage_logs: list['MessengerLog']) -> dict[
|
|
132
|
+
def count_authors(cls, imessage_logs: list['MessengerLog']) -> dict[Name, int]:
|
|
133
133
|
"""Count up how many texts were sent by each author."""
|
|
134
|
-
sender_counts: dict[
|
|
134
|
+
sender_counts: dict[Name, int] = defaultdict(int)
|
|
135
135
|
|
|
136
136
|
for message_log in imessage_logs:
|
|
137
137
|
for message in message_log.messages:
|
|
@@ -160,7 +160,7 @@ class MessengerLog(Communication):
|
|
|
160
160
|
last_at = logs[-1].first_message_at(name)
|
|
161
161
|
|
|
162
162
|
counts_table.add_row(
|
|
163
|
-
|
|
163
|
+
styled_name(name),
|
|
164
164
|
str(len(logs)),
|
|
165
165
|
f"{count:,}",
|
|
166
166
|
iso_timestamp(first_at),
|