epstein-files 1.1.0__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +16 -27
- epstein_files/documents/communication.py +10 -14
- epstein_files/documents/document.py +1 -1
- epstein_files/documents/email.py +152 -75
- epstein_files/documents/imessage/text_message.py +42 -25
- epstein_files/documents/messenger_log.py +31 -12
- epstein_files/documents/other_file.py +13 -12
- epstein_files/epstein_files.py +20 -81
- epstein_files/util/constant/common_words.py +3 -3
- epstein_files/util/constant/html.py +4 -5
- epstein_files/util/constant/names.py +18 -6
- epstein_files/util/constant/strings.py +6 -2
- epstein_files/util/constant/urls.py +1 -1
- epstein_files/util/constants.py +19 -23
- epstein_files/util/env.py +55 -36
- epstein_files/util/file_helper.py +1 -2
- epstein_files/util/highlighted_group.py +1019 -189
- epstein_files/util/logging.py +8 -1
- epstein_files/util/output.py +183 -89
- epstein_files/util/rich.py +35 -69
- epstein_files/util/timer.py +1 -1
- epstein_files/util/word_count.py +3 -4
- {epstein_files-1.1.0.dist-info → epstein_files-1.1.3.dist-info}/METADATA +4 -1
- epstein_files-1.1.3.dist-info/RECORD +33 -0
- epstein_files-1.1.0.dist-info/RECORD +0 -33
- {epstein_files-1.1.0.dist-info → epstein_files-1.1.3.dist-info}/LICENSE +0 -0
- {epstein_files-1.1.0.dist-info → epstein_files-1.1.3.dist-info}/WHEEL +0 -0
- {epstein_files-1.1.0.dist-info → epstein_files-1.1.3.dist-info}/entry_points.txt +0 -0
epstein_files/__init__.py
CHANGED
@@ -16,13 +16,14 @@ from rich.text import Text
 from epstein_files.epstein_files import EpsteinFiles, document_cls
 from epstein_files.documents.document import INFO_PADDING, Document
 from epstein_files.documents.email import Email
-from epstein_files.util.constant.output_files import
+from epstein_files.util.constant.output_files import make_clean
 from epstein_files.util.env import args
 from epstein_files.util.file_helper import coerce_file_path, extract_file_id
-from epstein_files.util.logging import logger
+from epstein_files.util.logging import exit_with_error, logger
 from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
-                                       print_other_files_section, print_text_messages_section,
-from epstein_files.util.rich import build_highlighter, console,
+                                       print_other_files_section, print_text_messages_section, print_email_timeline, print_json_metadata, write_urls)
+from epstein_files.util.rich import (build_highlighter, console, print_color_key, print_title_page_header,
+                                     print_title_page_tables, print_subtitle_panel, write_html)
 from epstein_files.util.timer import Timer
 from epstein_files.util.word_count import write_word_counts_html

@@ -37,7 +38,7 @@ def generate_html() -> None:
     epstein_files = EpsteinFiles.get_files(timer)

     if args.json_metadata:
-
+        print_json_metadata(epstein_files)
         exit()
     elif args.json_files:
         print_json_files(epstein_files)
@@ -45,21 +46,24 @@ def generate_html() -> None:

     print_title_page_header(epstein_files)

-    if
+    if args.email_timeline:
+        print_color_key()
+    else:
         print_title_page_tables(epstein_files)

     if args.colors_only:
         exit()

     if args.output_texts:
-
-
+        imessage_logs = [log for log in epstein_files.imessage_logs if not args.names or log.author in args.names]
+        print_text_messages_section(imessage_logs)
+        timer.print_at_checkpoint(f'Printed {len(imessage_logs)} text message log files')

     if args.output_emails:
         emails_that_were_printed = print_emails_section(epstein_files)
         timer.print_at_checkpoint(f"Printed {len(emails_that_were_printed):,} emails")
     elif args.email_timeline:
-
+        print_email_timeline(epstein_files)
         timer.print_at_checkpoint(f"Printed chronological emails table")

     if args.output_other:
@@ -71,15 +75,7 @@ def generate_html() -> None:
         print_other_files_section(files, epstein_files)
         timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")

-
-    if args.all_emails:
-        output_path = ALL_EMAILS_PATH
-    elif args.email_timeline:
-        output_path = CHRONOLOGICAL_EMAILS_PATH
-    else:
-        output_path = TEXT_MSGS_HTML_PATH
-
-    write_html(output_path)
+    write_html(args.build)
     logger.warning(f"Total time: {timer.seconds_since_start_str()}")

     # JSON stats (mostly used for building pytest checks)
@@ -101,15 +97,12 @@ def epstein_search():
     temp_highlighter = build_highlighter(search_term)
     search_results = epstein_files.docs_matching(search_term, args.names)
     console.line(2)
-
+    print_subtitle_panel(f"Found {len(search_results)} documents matching '{search_term}'", padding=(0, 0, 0, 3))

     for search_result in search_results:
         console.line()

         if args.whole_file:
-            if isinstance(search_result.document, Email):
-                search_result.document._truncation_allowed = False
-
             console.print(search_result.document)
         else:
             console.print(search_result.document.summary_panel())
@@ -128,9 +121,6 @@ def epstein_show():
    console.line()

    for doc in docs:
-        if isinstance(doc, Email):
-            doc._truncation_allowed = False
-
        console.print('\n', doc, '\n')

    if args.raw:
@@ -148,5 +138,4 @@ def epstein_word_count() -> None:

def _assert_positional_args():
    if not args.positional_args:
-
-        exit(1)
+        exit_with_error(f"No positional args provided!\n")
epstein_files/documents/communication.py
CHANGED
@@ -18,25 +18,24 @@ TIMESTAMP_SECONDS_REGEX = re.compile(r":\d{2}$")
 @dataclass
 class Communication(Document):
     """Superclass for Email and MessengerLog."""
-    author_style: str = 'white'
-    author_txt: Text = field(init=False)
     config: CommunicationCfg | None = None
     timestamp: datetime = FALLBACK_TIMESTAMP # TODO this default sucks (though it never happens)

-    def __post_init__(self):
-        super().__post_init__()
-        self.author_style = get_style_for_name(self.author_or_unknown())
-        self.author_txt = Text(self.author_or_unknown(), style=self.author_style)
-
     def author_or_unknown(self) -> str:
         return self.author or UNKNOWN

-    def
-        return
+    def author_style(self) -> str:
+        return get_style_for_name(self.author_or_unknown())
+
+    def author_txt(self) -> Text:
+        return Text(self.author_or_unknown(), style=self.author_style())

     def external_links_txt(self, _style: str = '', include_alt_links: bool = True) -> Text:
         """Overrides super() method to apply self.author_style."""
-        return super().external_links_txt(self.author_style, include_alt_links=include_alt_links)
+        return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
+
+    def is_attribution_uncertain(self) -> bool:
+        return bool(self.config and self.config.is_attribution_uncertain)

     def summary(self) -> Text:
         return self._summary().append(CLOSE_PROPERTIES_CHAR)
@@ -47,7 +46,4 @@ class Communication(Document):
     def _summary(self) -> Text:
         """One line summary mostly for logging."""
         txt = super().summary().append(', ')
-        return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style)))
-
-
-CommunicationType = TypeVar('CommunicationType', bound=Document)
+        return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style())))
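A note on the pattern in the communication.py hunks above: `author_style` and `author_txt` stop being dataclass fields filled in by `__post_init__()` and become plain methods, which is why later hunks in this diff switch call sites from `self.author_style` to `self.author_style()`. A minimal, self-contained sketch of that refactor (the style lookup below is an illustrative stand-in, not the package's real `get_style_for_name`):

    from dataclasses import dataclass

    def get_style_for_name(name: str) -> str:
        # Stand-in for epstein_files.util.highlighted_group.get_style_for_name
        return 'bold cyan' if name == 'UNKNOWN' else 'white'

    @dataclass
    class Comm:
        author: str | None = None

        def author_or_unknown(self) -> str:
            return self.author or 'UNKNOWN'

        # Computed on demand instead of being stored by __post_init__(), so
        # callers write comm.author_style() rather than comm.author_style.
        def author_style(self) -> str:
            return get_style_for_name(self.author_or_unknown())

    print(Comm(author=None).author_style())  # -> 'bold cyan'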
epstein_files/documents/document.py
CHANGED
@@ -251,7 +251,7 @@ class Document:
     def summary(self) -> Text:
         """Summary of this file for logging. Brackets are left open for subclasses to add stuff."""
         txt = Text('').append(self._class_name(), style=self._class_style())
-        txt.append(f" {self.
+        txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)

         if self.timestamp:
             timestamp_str = remove_zero_time_from_timestamp_str(self.timestamp).replace('T', ' ')
epstein_files/documents/email.py
CHANGED
@@ -1,3 +1,4 @@
+import json
 import logging
 import re
 from copy import deepcopy
@@ -20,7 +21,7 @@ from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import REDACTED
 from epstein_files.util.constants import *
 from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
-                                     flatten, remove_timezone, uniquify)
+                                     flatten, listify, remove_timezone, uniquify)
 from epstein_files.util.doc_cfg import EmailCfg, Metadata
 from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
 from epstein_files.util.highlighted_group import get_style_for_name
@@ -42,7 +43,7 @@ LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
 SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
 REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
 URL_SIGNIFIERS = ['gclid', 'htm', 'ref=', 'utm']
-APPEARS_IN = '
+APPEARS_IN = 'appears in'
 MAX_CHARS_TO_PRINT = 4000
 MAX_NUM_HEADER_LINES = 14
 MAX_QUOTED_REPLIES = 2
@@ -125,15 +126,6 @@ EMAIL_SIGNATURE_REGEXES = {
     UNKNOWN: re.compile(r"(This message is directed to and is for the use of the above-noted addressee only.*\nhereon\.)", re.DOTALL),
 }

-# Invalid for links to EpsteinWeb
-JUNK_EMAILERS = [
-    'asmallworld@travel.asmallworld.net',
-    "digest-noreply@quora.com",
-    'editorialstaff@flipboard.com',
-    'How To Academy',
-    'Jokeland',
-]
-
 MAILING_LISTS = [
     CAROLYN_RANGEL,
     INTELLIGENCE_SQUARED,
@@ -152,6 +144,8 @@ TRUNCATION_LENGTHS = {
     '030245': 7_500, # Epstein rationalizes his behavior in an open letter to the world
     '030781': 1_700, # Bannon email about crypto coin issues
     '032906': 750, # David Blaine email
+    '026036': 6000, # Gino Yu blockchain mention
+    '023208': 350_000, # Long discussion about leon black's finances
 }

 # These are long forwarded articles so we force a trim to 1,333 chars if these strings exist
@@ -276,6 +270,7 @@ USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIP
     'Nancy Portland', # Lawrence Krauss CC
     'Oliver Goodenough', # Robert Trivers CC
     'Peter Aldhous', # Lawrence Krauss CC
+    'Players2', # Hoffenberg CC
     'Sam Harris', # Lawrence Krauss CC
     SAMUEL_LEFF, # Random CC
     'Sean T Lehane', # Random CC
@@ -283,6 +278,13 @@ USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIP
     'Tim Kane', # Random CC
     'Travis Pangburn', # Random CC
     'Vahe Stepanian', # Random CC
+    # Ross Gow BCC
+    'david.brown@thetimes.co.uk',
+    'io-anne.pugh@bbc.co.uk',
+    'martin.robinson@mailonline.co.uk',
+    'nick.alwav@bbc.co.uk'
+    'nick.sommerlad@mirror.co.uk',
+    'p.peachev@independent.co.uk',
 ]

 # Emails sent by epstein to himself that are just notes
@@ -300,6 +302,50 @@ METADATA_FIELDS = [
     'subject',
 ]

+LINE_REPAIR_MERGES = {
+    '017523': 4,
+    '019407': [2, 4],
+    '021729': 2,
+    '022673': 9,
+    '022684': 9,
+    '022695': 4,
+    '023067': 3,
+    '025790': 2,
+    '026609': 4,
+    '026924': [2, 4],
+    '028931': [3, 6],
+    '029154': [2, 5],
+    '029163': [2, 5],
+    '029282': 2,
+    '029402': 5,
+    '029498': 2,
+    '029501': 2,
+    '029835': [2, 4],
+    '029889': 2,
+    '029976': 3,
+    '030299': [7, 10],
+    '030381': [2, 4],
+    '030384': [2, 4],
+    '030626': 2,
+    '030999': [2, 4],
+    '031384': 2,
+    '031428': 2,
+    '031442': 0,
+    '031980': [2, 4],
+    '032063': [3, 5],
+    '032272': 3,
+    '032405': 4,
+    '033097': 2,
+    '033144': [2, 4],
+    '033228': [3, 5],
+    '033357': [2, 4],
+    '033486': [7, 9],
+    '033512': 2,
+    '033575': [2, 4],
+    '033576': 3,
+    '033583': 2,
+}
+

 @dataclass
 class Email(Communication):
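The `LINE_REPAIR_MERGES` table added above replaces a long per-file `elif` chain of OCR line merges (removed in the `@@ -579,44 +624,47 @@` hunk below); each value is either a single merge index or a list of them. A rough sketch of that table-driven dispatch, using a simplified stand-in for `_merge_lines()` whose exact merge semantics in the package may differ:

    LINE_REPAIR_MERGES: dict[str, int | list[int]] = {
        '017523': 4,
        '019407': [2, 4],
    }

    class OcrDoc:
        """Illustrative stand-in for Email's line-repair logic."""
        def __init__(self, file_id: str, lines: list[str]):
            self.file_id = file_id
            self.lines = lines

        def _merge_lines(self, *line_numbers: int) -> None:
            # Join each listed line with the one after it (simplified behavior).
            for n in sorted(line_numbers, reverse=True):
                self.lines[n:n + 2] = [' '.join(self.lines[n:n + 2])]

        def repair(self) -> None:
            if self.file_id in LINE_REPAIR_MERGES:
                merge = LINE_REPAIR_MERGES[self.file_id]
                merge_args = merge if isinstance(merge, list) else [merge]
                self._merge_lines(*merge_args)

    doc = OcrDoc('017523', ['From: X', 'To: Y', 'Subject: Z', 'Importance: High', 'Sent:', 'Mon, June 1 2015'])
    doc.repair()
    print(doc.lines)  # -> ['From: X', 'To: Y', 'Subject: Z', 'Importance: High', 'Sent: Mon, June 1 2015']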
@@ -318,7 +364,6 @@ class Email(Communication):
     recipients: list[str | None] = field(default_factory=list)
     sent_from_device: str | None = None
     signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
-    _truncation_allowed: bool = True # Hacky way to get __rich_console__() not to truncate in epstein_show script

     # For logging how many headers we prettified while printing, kind of janky
     rewritten_header_ids: ClassVar[set[str]] = set([])
@@ -342,7 +387,7 @@ class Email(Communication):
             self.recipients = self.config.recipients
         else:
             for recipient in self.header.recipients():
-                self.recipients.extend(self.
+                self.recipients.extend(self._extract_emailer_names(recipient))

         if self.author in MAILING_LISTS and (len(self.recipients) == 0 or self.recipients == [self.author]):
             self.recipients = [JEFFREY_EPSTEIN] # Assume mailing list emails are to Epstein
@@ -365,7 +410,7 @@ class Email(Communication):

     def info_txt(self) -> Text:
         email_type = 'fwded article' if self.is_fwded_article() else 'email'
-        txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt).append(' to ')
+        txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt()).append(' to ')
         return txt.append(self.recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))

     def is_fwded_article(self) -> bool:
@@ -446,15 +491,23 @@ class Email(Communication):
         """Color emails from epstein to others with the color for the first recipient."""
         if self.author == JEFFREY_EPSTEIN:
             if len(self.recipients) == 0 or self.recipients == [None]:
-                style = self.author_style
+                style = self.author_style()
             else:
                 style = get_style_for_name(self.recipients[0])
         else:
-            style = self.author_style
+            style = self.author_style()

         return style.replace('bold', '').strip()

-    def
+    def _extract_author(self) -> None:
+        self._extract_header()
+        super()._extract_author()
+
+        if not self.author and self.header.author:
+            authors = self._extract_emailer_names(self.header.author)
+            self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
+
+    def _extract_emailer_names(self, emailer_str: str) -> list[str]:
         """Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
         emailer_str = EmailHeader.cleanup_str(emailer_str)

@@ -474,14 +527,6 @@ class Email(Communication):
         names_found = names_found or [emailer_str]
         return [_reverse_first_and_last_names(name) for name in names_found]

-    def _extract_author(self) -> None:
-        self._extract_header()
-        super()._extract_author()
-
-        if not self.author and self.header.author:
-            authors = self._emailer_names(self.header.author)
-            self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
-
     def _extract_header(self) -> None:
         """Extract an EmailHeader object from the OCR text."""
         header_match = EMAIL_SIMPLE_HEADER_REGEX.search(self.text)
@@ -579,44 +624,47 @@ class Email(Communication):
         self._set_computed_fields(lines=[line for line in self.lines if not BAD_LINE_REGEX.match(line)])
         old_text = self.text

-        if self.file_id in
-            self.
-
-            self._merge_lines(
+        if self.file_id in LINE_REPAIR_MERGES:
+            merge = LINE_REPAIR_MERGES[self.file_id]
+            merge_args = merge if isinstance(merge, list) else [merge]
+            self._merge_lines(*merge_args)

-
-
-
-
-        elif self.file_id in ['029498', '031428']:
-            self._merge_lines(2, 4)
-        elif self.file_id in ['029976', '023067', '033576']:
-            self._merge_lines(3) # Merge 4th and 5th rows
-        elif self.file_id in '026609 029402 032405 022695'.split():
-            self._merge_lines(4) # Merge 5th and 6th rows
-        elif self.file_id in ['019407', '031980', '030384', '033144', '030999', '033575', '029835', '030381', '033357', '026924']:
-            self._merge_lines(2, 4)
-        elif self.file_id in ['029154', '029163']:
+        # These already had 2nd line merged
+        if self.file_id in ['030626']: # Merge 6th and 7th (now 5th and 6th) rows
+            self._merge_lines(4)
+        elif self.file_id == '029889':
             self._merge_lines(2, 5)
-        elif self.file_id in ['
-            self._merge_lines(
-
-
-
+        elif self.file_id in ['029498', '031428']:
+            self._merge_lines(2, 4)
+
+        # Multiline
+        if self.file_id == '013415':
+            for _i in range(2):
+                self._merge_lines(4)
+        elif self.file_id == '013405':
+            for _i in range(2):
+                self._merge_lines(4)
+        elif self.file_id == '029458':
+            for _i in range(3):
+                self._merge_lines(4)
+        elif self.file_id in ['025233']:
             for _i in range(2):
                 self._merge_lines(4)
+
+            self.lines[4] = f"Attachments: {self.lines[4]}"
+            self._set_computed_fields(lines=self.lines)
+        elif self.file_id in ['023001']:
+            for _i in range(3):
+                self._merge_lines(5)
+        elif self.file_id in ['019105']:
+            for _i in range(4):
+                self._merge_lines(5)
         elif self.file_id in ['033568']:
             for _i in range(5):
                 self._merge_lines(5)
         elif self.file_id in ['025329']:
             for _i in range(9):
                 self._merge_lines(2)
-        elif self.file_id == '033486':
-            self._merge_lines(7, 9)
-        elif self.file_id == '030299':
-            self._merge_lines(7, 10)
-        elif self.file_id in ['022673', '022684']:
-            self._merge_lines(9)
         elif self.file_id == '014860':
             self._merge_lines(3)
             self._merge_lines(4)
@@ -629,7 +677,15 @@ class Email(Communication):

             self._merge_lines(4)
             self._merge_lines(2, 4)
-        elif self.file_id
+        elif self.file_id in ['033252']:
+            for _i in range(2):
+                self._merge_lines(9)
+        elif self.file_id in ['032637']:
+            for _i in range(3):
+                self._merge_lines(9)
+
+        # Bad line removal
+        if self.file_id == '025041':
             self._remove_line(4)
             self._remove_line(4)
         elif self.file_id == '029692':
@@ -679,7 +735,7 @@ class Email(Communication):
         """Copy info from original config for file this document was extracted from."""
         if self.file_id in ALL_FILE_CONFIGS:
             self.config = cast(EmailCfg, deepcopy(ALL_FILE_CONFIGS[self.file_id]))
-            self.warn(f"Merging existing
+            self.warn(f"Merging existing cfg for '{self.file_id}' with cfg for extracted document...")
         else:
             self.config = EmailCfg(id=self.file_id)
@@ -692,33 +748,55 @@ class Email(Communication):
             extracted_description += ' email'

         if self.config.description:
-            self.warn(f"Overwriting description '{self.config.description}' with extract
+            self.warn(f"Overwriting description '{self.config.description}' with extract's '{self.config.description}'")

         self.config.description = extracted_description

         self.config.is_interesting = self.config.is_interesting or extracted_from_doc_cfg.is_interesting
         self.log(f"Constructed synthetic config: {self.config}")

-    def
-
-        yield self.file_info_panel()
-        should_rewrite_header = self.header.was_initially_empty and self.header.num_header_rows > 0
+    def _truncate_to_length(self) -> int:
+        """When printing truncate this email to this length."""
         quote_cutoff = self._idx_of_nth_quoted_reply(text=self.text) # Trim if there's many quoted replies
-
-        trim_footer_txt = None
-        text = self.text
+        includes_truncate_term = next((term for term in TRUNCATE_TERMS if term in self.text), None)

-        if
+        if args.whole_file:
+            num_chars = len(self.text)
+        elif self.file_id in TRUNCATION_LENGTHS:
             num_chars = TRUNCATION_LENGTHS[self.file_id]
-        elif self.author in TRUNCATE_ALL_EMAILS_FROM or
+        elif self.author in TRUNCATE_ALL_EMAILS_FROM or includes_truncate_term:
             num_chars = int(MAX_CHARS_TO_PRINT / 3)
         elif quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
             num_chars = quote_cutoff
+        else:
+            num_chars = MAX_CHARS_TO_PRINT
+
+        if num_chars != MAX_CHARS_TO_PRINT and not self.is_duplicate():
+            log_args = {
+                'num_chars': num_chars,
+                'author_truncate': self.author in TRUNCATE_ALL_EMAILS_FROM,
+                'is_fwded_article': self.is_fwded_article(),
+                'is_quote_cutoff': quote_cutoff == num_chars,
+                'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
+                'quote_cutoff': quote_cutoff,
+            }
+
+            if quote_cutoff != num_chars:
+                logger.debug(f'{self.summary()} truncating: ' + ', '.join([f"{k}={v}" for k, v in log_args.items() if v]) + '\n')
+
+        return num_chars
+
+    def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
+        logger.debug(f"Printing '{self.filename}'...")
+        should_rewrite_header = self.header.was_initially_empty and self.header.num_header_rows > 0
+        num_chars = self._truncate_to_length()
+        trim_footer_txt = None
+        text = self.text

         # Truncate long emails but leave a note explaining what happened w/link to source document
-        if len(text) > num_chars
+        if len(text) > num_chars:
             text = text[0:num_chars]
-            doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
+            doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style())
             trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
             trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))

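The hunk above splits the length-selection logic out of `__rich_console__()` into `_truncate_to_length()`. A simplified, self-contained sketch of that selection order (the constants, terms, and quoted-reply cutoff below are placeholders rather than the package's real values, and the per-author rule is omitted):

    MAX_CHARS_TO_PRINT = 4000                  # same default the diff shows
    TRUNCATION_LENGTHS = {'030781': 1_700}     # per-file overrides (subset)
    TRUNCATE_TERMS = ['Unsubscribe']           # placeholder "noisy email" markers

    def truncate_to_length(file_id: str, text: str, quote_cutoff: int | None, whole_file: bool) -> int:
        """Pick a print length: whole-file mode wins, then the per-file table,
        then noisy terms, then the quoted-reply cutoff, else the default."""
        if whole_file:
            return len(text)
        if file_id in TRUNCATION_LENGTHS:
            return TRUNCATION_LENGTHS[file_id]
        if any(term in text for term in TRUNCATE_TERMS):
            return MAX_CHARS_TO_PRINT // 3
        if quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
            return quote_cutoff
        return MAX_CHARS_TO_PRINT

    print(truncate_to_length('030781', 'x' * 10_000, None, whole_file=False))  # -> 1700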
@@ -738,38 +816,37 @@ class Email(Communication):
             text = _add_line_breaks(text) # This was skipped when _prettify_text() w/a broken header so we do it now
             self.rewritten_header_ids.add(self.file_id)

-        panel_txt = highlighter(text)
-
         email_txt_panel = Panel(
-
+            highlighter(text).append('\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
             border_style=self._border_style(),
             expand=False,
             subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
         )

+        yield self.file_info_panel()
         yield Padding(email_txt_panel, (0, 0, 1, INFO_INDENT))

         if should_rewrite_header:
             self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')

     @staticmethod
-    def
-        """Turn a set of
+    def build_emails_table(emails: list['Email'], _author: str | None, include_title: bool = False) -> Table:
+        """Turn a set of Emails to/from a given _author into a Table."""
         author = _author or UNKNOWN

         table = Table(
-            title=f"Emails to/from {author} starting {emails[0].timestamp.date()}",
+            title=f"Emails to/from {author} starting {emails[0].timestamp.date()}" if include_title else None,
             border_style=get_style_for_name(author, allow_bold=False),
             header_style="bold"
         )

         table.add_column('From', justify='left')
         table.add_column('Timestamp', justify='center')
-        table.add_column('Subject', justify='left', style='honeydew2', min_width=
+        table.add_column('Subject', justify='left', style='honeydew2', min_width=70)

         for email in emails:
             table.add_row(
-                email.author_txt,
+                email.author_txt(),
                 email.epstein_media_link(link_txt=email.timestamp_without_seconds()),
                 highlighter(email.subject())
             )
epstein_files/documents/imessage/text_message.py
CHANGED
@@ -1,12 +1,12 @@
 import re
-from dataclasses import dataclass
+from dataclasses import dataclass, field, fields
 from datetime import datetime

 from rich.text import Text

 from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN
 from epstein_files.util.constant.strings import TIMESTAMP_DIM
-from epstein_files.util.data import extract_last_name
+from epstein_files.util.data import extract_last_name, iso_timestamp
 from epstein_files.util.highlighted_group import get_style_for_name
 from epstein_files.util.logging import logger
 from epstein_files.util.rich import TEXT_LINK, highlighter
@@ -30,7 +30,7 @@ class TextMessage:
     """Class representing a single iMessage text message."""
     author: str | None
     author_str: str = ''
-
+    is_id_confirmed: bool = False
     text: str
     timestamp_str: str

@@ -44,38 +44,55 @@ class TextMessage:
         else:
             self.author_str = self.author_str or self.author

-        if not self.
+        if not self.is_id_confirmed and self.author is not None and self.author != JEFFREY_EPSTEIN:
             self.author_str += ' (?)'

-
-
+        if self.is_link():
+            self.text = self.text.replace('\n', '').replace(' ', '_')
+        else:
+            self.text = self.text.replace('\n', ' ')

-    def
-
+    def is_link(self) -> bool:
+        return self.text.startswith('http')

-
-
-        text = self.text
+    def parse_timestamp(self) -> datetime:
+        return datetime.strptime(self.timestamp_str, MSG_DATE_FORMAT)

-
-
-            text = text.replace('\n', '', 2)
-        else:
-            text = text.replace('\n', '', 1)
+    def timestamp_txt(self) -> Text:
+        timestamp_str = self.timestamp_str

-
-
-
+        try:
+            timestamp_str = iso_timestamp(self.parse_timestamp())
+        except Exception as e:
+            logger.warning(f"Failed to parse timestamp for {self}")

-
-            msg_txt.append('\n' + ' '.join(lines))
-        else:
-            msg_txt = highlighter(' '.join(lines)) # remove newlines
+        return Text(f"[{timestamp_str}]", style=TIMESTAMP_DIM)

-
+    def _message(self) -> Text:
+        if self.is_link():
+            return Text.from_markup(f"[link={self.text}]{self.text}[/link]", style=TEXT_LINK)
+        else:
+            return highlighter(self.text)

     def __rich__(self) -> Text:
-        timestamp_txt =
+        timestamp_txt = self.timestamp_txt().append(' ')
         author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
         author_txt = Text(self.author_str, style=author_style)
         return Text('').append(timestamp_txt).append(author_txt).append(': ', style='dim').append(self._message())
+
+    def __repr__(self) -> str:
+        props = []
+        add_prop = lambda k, v: props.append(f"{k}={v}")
+
+        for _field in sorted(fields(self), key=lambda f: f.name):
+            key = _field.name
+            value = getattr(self, key)
+
+            if key == 'author_str' and self.author and self.author_str.startswith(value):
+                continue
+            elif isinstance(value, str):
+                add_prop(key, f'"{value}"')
+            else:
+                add_prop(key, value)
+
+        return f"{type(self).__name__}(" + ', '.join(props) + f')'