epstein-files 1.0.16__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +26 -17
- epstein_files/documents/communication.py +10 -14
- epstein_files/documents/document.py +5 -1
- epstein_files/documents/email.py +164 -78
- epstein_files/documents/imessage/text_message.py +42 -25
- epstein_files/documents/messenger_log.py +31 -12
- epstein_files/documents/other_file.py +13 -12
- epstein_files/epstein_files.py +19 -80
- epstein_files/util/constant/common_words.py +3 -3
- epstein_files/util/constant/html.py +13 -6
- epstein_files/util/constant/names.py +10 -7
- epstein_files/util/constant/output_files.py +3 -0
- epstein_files/util/constant/strings.py +6 -2
- epstein_files/util/constant/urls.py +1 -1
- epstein_files/util/constants.py +18 -22
- epstein_files/util/env.py +46 -36
- epstein_files/util/file_helper.py +1 -2
- epstein_files/util/highlighted_group.py +1007 -187
- epstein_files/util/logging.py +8 -1
- epstein_files/util/output.py +166 -51
- epstein_files/util/rich.py +55 -79
- epstein_files/util/timer.py +1 -1
- epstein_files/util/word_count.py +3 -4
- {epstein_files-1.0.16.dist-info → epstein_files-1.1.2.dist-info}/METADATA +1 -1
- epstein_files-1.1.2.dist-info/RECORD +33 -0
- epstein_files-1.0.16.dist-info/RECORD +0 -33
- {epstein_files-1.0.16.dist-info → epstein_files-1.1.2.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.16.dist-info → epstein_files-1.1.2.dist-info}/WHEEL +0 -0
- {epstein_files-1.0.16.dist-info → epstein_files-1.1.2.dist-info}/entry_points.txt +0 -0
epstein_files/__init__.py
CHANGED
|
@@ -16,13 +16,13 @@ from rich.text import Text
|
|
|
16
16
|
from epstein_files.epstein_files import EpsteinFiles, document_cls
|
|
17
17
|
from epstein_files.documents.document import INFO_PADDING, Document
|
|
18
18
|
from epstein_files.documents.email import Email
|
|
19
|
-
from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, TEXT_MSGS_HTML_PATH, make_clean
|
|
19
|
+
from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, CHRONOLOGICAL_EMAILS_PATH, TEXT_MSGS_HTML_PATH, make_clean
|
|
20
20
|
from epstein_files.util.env import args
|
|
21
21
|
from epstein_files.util.file_helper import coerce_file_path, extract_file_id
|
|
22
|
-
from epstein_files.util.logging import logger
|
|
22
|
+
from epstein_files.util.logging import exit_with_error, logger
|
|
23
23
|
from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
|
|
24
|
-
print_other_files_section, print_text_messages_section, write_json_metadata, write_urls)
|
|
25
|
-
from epstein_files.util.rich import build_highlighter, console,
|
|
24
|
+
print_other_files_section, print_text_messages_section, write_complete_emails_timeline, write_json_metadata, write_urls)
|
|
25
|
+
from epstein_files.util.rich import build_highlighter, console, print_color_key, print_title_page_header, print_title_page_tables, print_subtitle_panel, write_html
|
|
26
26
|
from epstein_files.util.timer import Timer
|
|
27
27
|
from epstein_files.util.word_count import write_word_counts_html
|
|
28
28
|
|
|
@@ -43,18 +43,27 @@ def generate_html() -> None:
|
|
|
43
43
|
print_json_files(epstein_files)
|
|
44
44
|
exit()
|
|
45
45
|
|
|
46
|
-
|
|
46
|
+
print_title_page_header(epstein_files)
|
|
47
|
+
|
|
48
|
+
if args.email_timeline:
|
|
49
|
+
print_color_key()
|
|
50
|
+
else:
|
|
51
|
+
print_title_page_tables(epstein_files)
|
|
47
52
|
|
|
48
53
|
if args.colors_only:
|
|
49
54
|
exit()
|
|
50
55
|
|
|
51
56
|
if args.output_texts:
|
|
52
|
-
|
|
53
|
-
|
|
57
|
+
imessage_logs = [log for log in epstein_files.imessage_logs if not args.names or log.author in args.names]
|
|
58
|
+
print_text_messages_section(imessage_logs)
|
|
59
|
+
timer.print_at_checkpoint(f'Printed {len(imessage_logs)} text message log files')
|
|
54
60
|
|
|
55
61
|
if args.output_emails:
|
|
56
62
|
emails_that_were_printed = print_emails_section(epstein_files)
|
|
57
63
|
timer.print_at_checkpoint(f"Printed {len(emails_that_were_printed):,} emails")
|
|
64
|
+
elif args.email_timeline:
|
|
65
|
+
write_complete_emails_timeline(epstein_files)
|
|
66
|
+
timer.print_at_checkpoint(f"Printed chronological emails table")
|
|
58
67
|
|
|
59
68
|
if args.output_other:
|
|
60
69
|
if args.uninteresting:
|
|
@@ -66,7 +75,14 @@ def generate_html() -> None:
|
|
|
66
75
|
timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")
|
|
67
76
|
|
|
68
77
|
# Save output
|
|
69
|
-
|
|
78
|
+
if args.all_emails:
|
|
79
|
+
output_path = ALL_EMAILS_PATH
|
|
80
|
+
elif args.email_timeline:
|
|
81
|
+
output_path = CHRONOLOGICAL_EMAILS_PATH
|
|
82
|
+
else:
|
|
83
|
+
output_path = TEXT_MSGS_HTML_PATH
|
|
84
|
+
|
|
85
|
+
write_html(output_path)
|
|
70
86
|
logger.warning(f"Total time: {timer.seconds_since_start_str()}")
|
|
71
87
|
|
|
72
88
|
# JSON stats (mostly used for building pytest checks)
|
|
@@ -88,15 +104,12 @@ def epstein_search():
|
|
|
88
104
|
temp_highlighter = build_highlighter(search_term)
|
|
89
105
|
search_results = epstein_files.docs_matching(search_term, args.names)
|
|
90
106
|
console.line(2)
|
|
91
|
-
|
|
107
|
+
print_subtitle_panel(f"Found {len(search_results)} documents matching '{search_term}'", padding=(0, 0, 0, 3))
|
|
92
108
|
|
|
93
109
|
for search_result in search_results:
|
|
94
110
|
console.line()
|
|
95
111
|
|
|
96
112
|
if args.whole_file:
|
|
97
|
-
if isinstance(search_result.document, Email):
|
|
98
|
-
search_result.document._truncation_allowed = False
|
|
99
|
-
|
|
100
113
|
console.print(search_result.document)
|
|
101
114
|
else:
|
|
102
115
|
console.print(search_result.document.summary_panel())
|
|
@@ -115,9 +128,6 @@ def epstein_show():
|
|
|
115
128
|
console.line()
|
|
116
129
|
|
|
117
130
|
for doc in docs:
|
|
118
|
-
if isinstance(doc, Email):
|
|
119
|
-
doc._truncation_allowed = False
|
|
120
|
-
|
|
121
131
|
console.print('\n', doc, '\n')
|
|
122
132
|
|
|
123
133
|
if args.raw:
|
|
@@ -135,5 +145,4 @@ def epstein_word_count() -> None:
|
|
|
135
145
|
|
|
136
146
|
def _assert_positional_args():
|
|
137
147
|
if not args.positional_args:
|
|
138
|
-
|
|
139
|
-
exit(1)
|
|
148
|
+
exit_with_error(f"No positional args provided!\n")
|
|
@@ -18,25 +18,24 @@ TIMESTAMP_SECONDS_REGEX = re.compile(r":\d{2}$")
|
|
|
18
18
|
@dataclass
|
|
19
19
|
class Communication(Document):
|
|
20
20
|
"""Superclass for Email and MessengerLog."""
|
|
21
|
-
author_style: str = 'white'
|
|
22
|
-
author_txt: Text = field(init=False)
|
|
23
21
|
config: CommunicationCfg | None = None
|
|
24
22
|
timestamp: datetime = FALLBACK_TIMESTAMP # TODO this default sucks (though it never happens)
|
|
25
23
|
|
|
26
|
-
def __post_init__(self):
|
|
27
|
-
super().__post_init__()
|
|
28
|
-
self.author_style = get_style_for_name(self.author_or_unknown())
|
|
29
|
-
self.author_txt = Text(self.author_or_unknown(), style=self.author_style)
|
|
30
|
-
|
|
31
24
|
def author_or_unknown(self) -> str:
|
|
32
25
|
return self.author or UNKNOWN
|
|
33
26
|
|
|
34
|
-
def
|
|
35
|
-
return
|
|
27
|
+
def author_style(self) -> str:
|
|
28
|
+
return get_style_for_name(self.author_or_unknown())
|
|
29
|
+
|
|
30
|
+
def author_txt(self) -> Text:
|
|
31
|
+
return Text(self.author_or_unknown(), style=self.author_style())
|
|
36
32
|
|
|
37
33
|
def external_links_txt(self, _style: str = '', include_alt_links: bool = True) -> Text:
|
|
38
34
|
"""Overrides super() method to apply self.author_style."""
|
|
39
|
-
return super().external_links_txt(self.author_style, include_alt_links=include_alt_links)
|
|
35
|
+
return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
|
|
36
|
+
|
|
37
|
+
def is_attribution_uncertain(self) -> bool:
|
|
38
|
+
return bool(self.config and self.config.is_attribution_uncertain)
|
|
40
39
|
|
|
41
40
|
def summary(self) -> Text:
|
|
42
41
|
return self._summary().append(CLOSE_PROPERTIES_CHAR)
|
|
@@ -47,7 +46,4 @@ class Communication(Document):
|
|
|
47
46
|
def _summary(self) -> Text:
|
|
48
47
|
"""One line summary mostly for logging."""
|
|
49
48
|
txt = super().summary().append(', ')
|
|
50
|
-
return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style)))
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
CommunicationType = TypeVar('CommunicationType', bound=Document)
|
|
49
|
+
return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style())))
|
|
@@ -244,10 +244,14 @@ class Document:
|
|
|
244
244
|
|
|
245
245
|
return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
|
|
246
246
|
|
|
247
|
+
def source_file_id(self) -> str:
|
|
248
|
+
"""Strip off the _1, _2, etc. suffixes for extracted documents."""
|
|
249
|
+
return self.file_id[0:6]
|
|
250
|
+
|
|
247
251
|
def summary(self) -> Text:
|
|
248
252
|
"""Summary of this file for logging. Brackets are left open for subclasses to add stuff."""
|
|
249
253
|
txt = Text('').append(self._class_name(), style=self._class_style())
|
|
250
|
-
txt.append(f" {self.
|
|
254
|
+
txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)
|
|
251
255
|
|
|
252
256
|
if self.timestamp:
|
|
253
257
|
timestamp_str = remove_zero_time_from_timestamp_str(self.timestamp).replace('T', ' ')
|
epstein_files/documents/email.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import json
|
|
1
2
|
import logging
|
|
2
3
|
import re
|
|
3
4
|
from copy import deepcopy
|
|
@@ -20,7 +21,7 @@ from epstein_files.util.constant.names import *
|
|
|
20
21
|
from epstein_files.util.constant.strings import REDACTED
|
|
21
22
|
from epstein_files.util.constants import *
|
|
22
23
|
from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
|
|
23
|
-
flatten, remove_timezone, uniquify)
|
|
24
|
+
flatten, listify, remove_timezone, uniquify)
|
|
24
25
|
from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
25
26
|
from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
|
|
26
27
|
from epstein_files.util.highlighted_group import get_style_for_name
|
|
@@ -42,7 +43,7 @@ LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
|
|
|
42
43
|
SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
|
|
43
44
|
REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
|
|
44
45
|
URL_SIGNIFIERS = ['gclid', 'htm', 'ref=', 'utm']
|
|
45
|
-
APPEARS_IN = '
|
|
46
|
+
APPEARS_IN = 'appears in'
|
|
46
47
|
MAX_CHARS_TO_PRINT = 4000
|
|
47
48
|
MAX_NUM_HEADER_LINES = 14
|
|
48
49
|
MAX_QUOTED_REPLIES = 2
|
|
@@ -152,6 +153,8 @@ TRUNCATION_LENGTHS = {
|
|
|
152
153
|
'030245': 7_500, # Epstein rationalizes his behavior in an open letter to the world
|
|
153
154
|
'030781': 1_700, # Bannon email about crypto coin issues
|
|
154
155
|
'032906': 750, # David Blaine email
|
|
156
|
+
'026036': 6000, # Gino Yu blockchain mention
|
|
157
|
+
'023208': 350_000, # Long discussion about leon black's finances
|
|
155
158
|
}
|
|
156
159
|
|
|
157
160
|
# These are long forwarded articles so we force a trim to 1,333 chars if these strings exist
|
|
@@ -276,6 +279,7 @@ USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIP
|
|
|
276
279
|
'Nancy Portland', # Lawrence Krauss CC
|
|
277
280
|
'Oliver Goodenough', # Robert Trivers CC
|
|
278
281
|
'Peter Aldhous', # Lawrence Krauss CC
|
|
282
|
+
'Players2', # Hoffenberg CC
|
|
279
283
|
'Sam Harris', # Lawrence Krauss CC
|
|
280
284
|
SAMUEL_LEFF, # Random CC
|
|
281
285
|
'Sean T Lehane', # Random CC
|
|
@@ -283,6 +287,13 @@ USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIP
|
|
|
283
287
|
'Tim Kane', # Random CC
|
|
284
288
|
'Travis Pangburn', # Random CC
|
|
285
289
|
'Vahe Stepanian', # Random CC
|
|
290
|
+
# Ross Gow BCC
|
|
291
|
+
'david.brown@thetimes.co.uk',
|
|
292
|
+
'io-anne.pugh@bbc.co.uk',
|
|
293
|
+
'martin.robinson@mailonline.co.uk',
|
|
294
|
+
'nick.alwav@bbc.co.uk',
|
|
295
|
+
'nick.sommerlad@mirror.co.uk',
|
|
296
|
+
'p.peachev@independent.co.uk',
|
|
286
297
|
]
|
|
287
298
|
|
|
288
299
|
# Emails sent by epstein to himself that are just notes
|
|
@@ -300,6 +311,50 @@ METADATA_FIELDS = [
|
|
|
300
311
|
'subject',
|
|
301
312
|
]
|
|
302
313
|
|
|
314
|
+
LINE_REPAIR_MERGES = {
|
|
315
|
+
'017523': 4,
|
|
316
|
+
'019407': [2, 4],
|
|
317
|
+
'021729': 2,
|
|
318
|
+
'022673': 9,
|
|
319
|
+
'022684': 9,
|
|
320
|
+
'022695': 4,
|
|
321
|
+
'023067': 3,
|
|
322
|
+
'025790': 2,
|
|
323
|
+
'026609': 4,
|
|
324
|
+
'026924': [2, 4],
|
|
325
|
+
'028931': [3, 6],
|
|
326
|
+
'029154': [2, 5],
|
|
327
|
+
'029163': [2, 5],
|
|
328
|
+
'029282': 2,
|
|
329
|
+
'029402': 5,
|
|
330
|
+
'029498': 2,
|
|
331
|
+
'029501': 2,
|
|
332
|
+
'029835': [2, 4],
|
|
333
|
+
'029889': 2,
|
|
334
|
+
'029976': 3,
|
|
335
|
+
'030299': [7, 10],
|
|
336
|
+
'030381': [2, 4],
|
|
337
|
+
'030384': [2, 4],
|
|
338
|
+
'030626': 2,
|
|
339
|
+
'030999': [2, 4],
|
|
340
|
+
'031384': 2,
|
|
341
|
+
'031428': 2,
|
|
342
|
+
'031442': 0,
|
|
343
|
+
'031980': [2, 4],
|
|
344
|
+
'032063': [3, 5],
|
|
345
|
+
'032272': 3,
|
|
346
|
+
'032405': 4,
|
|
347
|
+
'033097': 2,
|
|
348
|
+
'033144': [2, 4],
|
|
349
|
+
'033228': [3, 5],
|
|
350
|
+
'033357': [2, 4],
|
|
351
|
+
'033486': [7, 9],
|
|
352
|
+
'033512': 2,
|
|
353
|
+
'033575': [2, 4],
|
|
354
|
+
'033576': 3,
|
|
355
|
+
'033583': 2,
|
|
356
|
+
}
|
|
357
|
+
|
|
303
358
|
|
|
304
359
|
@dataclass
|
|
305
360
|
class Email(Communication):
|
|
@@ -318,7 +373,6 @@ class Email(Communication):
|
|
|
318
373
|
recipients: list[str | None] = field(default_factory=list)
|
|
319
374
|
sent_from_device: str | None = None
|
|
320
375
|
signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
|
|
321
|
-
_truncation_allowed: bool = True # Hacky way to get __rich_console__() not to truncate in epstein_show script
|
|
322
376
|
|
|
323
377
|
# For logging how many headers we prettified while printing, kind of janky
|
|
324
378
|
rewritten_header_ids: ClassVar[set[str]] = set([])
|
|
@@ -342,7 +396,7 @@ class Email(Communication):
|
|
|
342
396
|
self.recipients = self.config.recipients
|
|
343
397
|
else:
|
|
344
398
|
for recipient in self.header.recipients():
|
|
345
|
-
self.recipients.extend(self.
|
|
399
|
+
self.recipients.extend(self._extract_emailer_names(recipient))
|
|
346
400
|
|
|
347
401
|
if self.author in MAILING_LISTS and (len(self.recipients) == 0 or self.recipients == [self.author]):
|
|
348
402
|
self.recipients = [JEFFREY_EPSTEIN] # Assume mailing list emails are to Epstein
|
|
@@ -365,8 +419,8 @@ class Email(Communication):
|
|
|
365
419
|
|
|
366
420
|
def info_txt(self) -> Text:
|
|
367
421
|
email_type = 'fwded article' if self.is_fwded_article() else 'email'
|
|
368
|
-
txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt).append(' to ')
|
|
369
|
-
return txt.append(self.
|
|
422
|
+
txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt()).append(' to ')
|
|
423
|
+
return txt.append(self.recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))
|
|
370
424
|
|
|
371
425
|
def is_fwded_article(self) -> bool:
|
|
372
426
|
return bool(self.config and self.config.is_fwded_article)
|
|
@@ -382,6 +436,16 @@ class Email(Communication):
|
|
|
382
436
|
metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
|
|
383
437
|
return metadata
|
|
384
438
|
|
|
439
|
+
def recipients_txt(self, max_full_names: int = 2) -> Text:
|
|
440
|
+
"""Text object with comma separated colored versions of all recipients."""
|
|
441
|
+
recipients = [r or UNKNOWN for r in self.recipients] if len(self.recipients) > 0 else [UNKNOWN]
|
|
442
|
+
|
|
443
|
+
# Use just the last name for each recipient if there's 3 or more recipients
|
|
444
|
+
return join_texts([
|
|
445
|
+
Text(r if len(recipients) <= max_full_names else extract_last_name(r), style=get_style_for_name(r))
|
|
446
|
+
for r in recipients
|
|
447
|
+
], join=', ')
|
|
448
|
+
|
|
385
449
|
def subject(self) -> str:
|
|
386
450
|
return self.header.subject or ''
|
|
387
451
|
|
|
@@ -390,7 +454,7 @@ class Email(Communication):
|
|
|
390
454
|
txt = self._summary()
|
|
391
455
|
|
|
392
456
|
if len(self.recipients) > 0:
|
|
393
|
-
txt.append(', ').append(key_value_txt('recipients', self.
|
|
457
|
+
txt.append(', ').append(key_value_txt('recipients', self.recipients_txt()))
|
|
394
458
|
|
|
395
459
|
return txt.append(CLOSE_PROPERTIES_CHAR)
|
|
396
460
|
|
|
@@ -436,15 +500,23 @@ class Email(Communication):
|
|
|
436
500
|
"""Color emails from epstein to others with the color for the first recipient."""
|
|
437
501
|
if self.author == JEFFREY_EPSTEIN:
|
|
438
502
|
if len(self.recipients) == 0 or self.recipients == [None]:
|
|
439
|
-
style = self.author_style
|
|
503
|
+
style = self.author_style()
|
|
440
504
|
else:
|
|
441
505
|
style = get_style_for_name(self.recipients[0])
|
|
442
506
|
else:
|
|
443
|
-
style = self.author_style
|
|
507
|
+
style = self.author_style()
|
|
444
508
|
|
|
445
509
|
return style.replace('bold', '').strip()
|
|
446
510
|
|
|
447
|
-
def
|
|
511
|
+
def _extract_author(self) -> None:
|
|
512
|
+
self._extract_header()
|
|
513
|
+
super()._extract_author()
|
|
514
|
+
|
|
515
|
+
if not self.author and self.header.author:
|
|
516
|
+
authors = self._extract_emailer_names(self.header.author)
|
|
517
|
+
self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
|
|
518
|
+
|
|
519
|
+
def _extract_emailer_names(self, emailer_str: str) -> list[str]:
|
|
448
520
|
"""Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
|
|
449
521
|
emailer_str = EmailHeader.cleanup_str(emailer_str)
|
|
450
522
|
|
|
@@ -464,14 +536,6 @@ class Email(Communication):
|
|
|
464
536
|
names_found = names_found or [emailer_str]
|
|
465
537
|
return [_reverse_first_and_last_names(name) for name in names_found]
|
|
466
538
|
|
|
467
|
-
def _extract_author(self) -> None:
|
|
468
|
-
self._extract_header()
|
|
469
|
-
super()._extract_author()
|
|
470
|
-
|
|
471
|
-
if not self.author and self.header.author:
|
|
472
|
-
authors = self._emailer_names(self.header.author)
|
|
473
|
-
self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
|
|
474
|
-
|
|
475
539
|
def _extract_header(self) -> None:
|
|
476
540
|
"""Extract an EmailHeader object from the OCR text."""
|
|
477
541
|
header_match = EMAIL_SIMPLE_HEADER_REGEX.search(self.text)
|
|
@@ -553,16 +617,6 @@ class Email(Communication):
|
|
|
553
617
|
|
|
554
618
|
return collapse_newlines(text).strip()
|
|
555
619
|
|
|
556
|
-
def _recipients_txt(self) -> Text:
|
|
557
|
-
"""Text object with comma separated colored versions of all recipients."""
|
|
558
|
-
recipients = [r or UNKNOWN for r in self.recipients] if len(self.recipients) > 0 else [UNKNOWN]
|
|
559
|
-
|
|
560
|
-
# Use just the last name for each recipient if there's 3 or more recipients
|
|
561
|
-
return join_texts([
|
|
562
|
-
Text(r if len(recipients) < 3 else extract_last_name(r), style=get_style_for_name(r))
|
|
563
|
-
for r in recipients
|
|
564
|
-
], join=', ')
|
|
565
|
-
|
|
566
620
|
def _remove_line(self, idx: int) -> None:
|
|
567
621
|
"""Remove a line from self.lines."""
|
|
568
622
|
num_lines = idx * 2
|
|
@@ -579,44 +633,47 @@ class Email(Communication):
|
|
|
579
633
|
self._set_computed_fields(lines=[line for line in self.lines if not BAD_LINE_REGEX.match(line)])
|
|
580
634
|
old_text = self.text
|
|
581
635
|
|
|
582
|
-
if self.file_id in
|
|
583
|
-
self.
|
|
584
|
-
|
|
585
|
-
self._merge_lines(
|
|
636
|
+
if self.file_id in LINE_REPAIR_MERGES:
|
|
637
|
+
merge = LINE_REPAIR_MERGES[self.file_id]
|
|
638
|
+
merge_args = merge if isinstance(merge, list) else [merge]
|
|
639
|
+
self._merge_lines(*merge_args)
|
|
586
640
|
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
elif self.file_id in ['029498', '031428']:
|
|
592
|
-
self._merge_lines(2, 4)
|
|
593
|
-
elif self.file_id in ['029976', '023067', '033576']:
|
|
594
|
-
self._merge_lines(3) # Merge 4th and 5th rows
|
|
595
|
-
elif self.file_id in '026609 029402 032405 022695'.split():
|
|
596
|
-
self._merge_lines(4) # Merge 5th and 6th rows
|
|
597
|
-
elif self.file_id in ['019407', '031980', '030384', '033144', '030999', '033575', '029835', '030381', '033357', '026924']:
|
|
598
|
-
self._merge_lines(2, 4)
|
|
599
|
-
elif self.file_id in ['029154', '029163']:
|
|
641
|
+
# These already had 2nd line merged
|
|
642
|
+
if self.file_id in ['030626']: # Merge 6th and 7th (now 5th and 6th) rows
|
|
643
|
+
self._merge_lines(4)
|
|
644
|
+
elif self.file_id == '029889':
|
|
600
645
|
self._merge_lines(2, 5)
|
|
601
|
-
elif self.file_id in ['
|
|
602
|
-
self._merge_lines(
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
646
|
+
elif self.file_id in ['029498', '031428']:
|
|
647
|
+
self._merge_lines(2, 4)
|
|
648
|
+
|
|
649
|
+
# Multiline
|
|
650
|
+
if self.file_id == '013415':
|
|
651
|
+
for _i in range(2):
|
|
652
|
+
self._merge_lines(4)
|
|
653
|
+
elif self.file_id == '013405':
|
|
654
|
+
for _i in range(2):
|
|
655
|
+
self._merge_lines(4)
|
|
656
|
+
elif self.file_id == '029458':
|
|
657
|
+
for _i in range(3):
|
|
658
|
+
self._merge_lines(4)
|
|
659
|
+
elif self.file_id in ['025233']:
|
|
606
660
|
for _i in range(2):
|
|
607
661
|
self._merge_lines(4)
|
|
662
|
+
|
|
663
|
+
self.lines[4] = f"Attachments: {self.lines[4]}"
|
|
664
|
+
self._set_computed_fields(lines=self.lines)
|
|
665
|
+
elif self.file_id in ['023001']:
|
|
666
|
+
for _i in range(3):
|
|
667
|
+
self._merge_lines(5)
|
|
668
|
+
elif self.file_id in ['019105']:
|
|
669
|
+
for _i in range(4):
|
|
670
|
+
self._merge_lines(5)
|
|
608
671
|
elif self.file_id in ['033568']:
|
|
609
672
|
for _i in range(5):
|
|
610
673
|
self._merge_lines(5)
|
|
611
674
|
elif self.file_id in ['025329']:
|
|
612
675
|
for _i in range(9):
|
|
613
676
|
self._merge_lines(2)
|
|
614
|
-
elif self.file_id == '033486':
|
|
615
|
-
self._merge_lines(7, 9)
|
|
616
|
-
elif self.file_id == '030299':
|
|
617
|
-
self._merge_lines(7, 10)
|
|
618
|
-
elif self.file_id in ['022673', '022684']:
|
|
619
|
-
self._merge_lines(9)
|
|
620
677
|
elif self.file_id == '014860':
|
|
621
678
|
self._merge_lines(3)
|
|
622
679
|
self._merge_lines(4)
|
|
@@ -629,7 +686,15 @@ class Email(Communication):
|
|
|
629
686
|
|
|
630
687
|
self._merge_lines(4)
|
|
631
688
|
self._merge_lines(2, 4)
|
|
632
|
-
elif self.file_id
|
|
689
|
+
elif self.file_id in ['033252']:
|
|
690
|
+
for _i in range(2):
|
|
691
|
+
self._merge_lines(9)
|
|
692
|
+
elif self.file_id in ['032637']:
|
|
693
|
+
for _i in range(3):
|
|
694
|
+
self._merge_lines(9)
|
|
695
|
+
|
|
696
|
+
# Bad line removal
|
|
697
|
+
if self.file_id == '025041':
|
|
633
698
|
self._remove_line(4)
|
|
634
699
|
self._remove_line(4)
|
|
635
700
|
elif self.file_id == '029692':
|
|
@@ -679,7 +744,7 @@ class Email(Communication):
|
|
|
679
744
|
"""Copy info from original config for file this document was extracted from."""
|
|
680
745
|
if self.file_id in ALL_FILE_CONFIGS:
|
|
681
746
|
self.config = cast(EmailCfg, deepcopy(ALL_FILE_CONFIGS[self.file_id]))
|
|
682
|
-
self.warn(f"Merging existing
|
|
747
|
+
self.warn(f"Merging existing cfg for '{self.file_id}' with cfg for extracted document...")
|
|
683
748
|
else:
|
|
684
749
|
self.config = EmailCfg(id=self.file_id)
|
|
685
750
|
|
|
@@ -692,33 +757,55 @@ class Email(Communication):
|
|
|
692
757
|
extracted_description += ' email'
|
|
693
758
|
|
|
694
759
|
if self.config.description:
|
|
695
|
-
self.warn(f"Overwriting description '{self.config.description}' with extract
|
|
760
|
+
self.warn(f"Overwriting description '{self.config.description}' with extract's '{extracted_description}'")
|
|
696
761
|
|
|
697
762
|
self.config.description = extracted_description
|
|
698
763
|
|
|
699
764
|
self.config.is_interesting = self.config.is_interesting or extracted_from_doc_cfg.is_interesting
|
|
700
765
|
self.log(f"Constructed synthetic config: {self.config}")
|
|
701
766
|
|
|
702
|
-
def
|
|
703
|
-
|
|
704
|
-
yield self.file_info_panel()
|
|
705
|
-
should_rewrite_header = self.header.was_initially_empty and self.header.num_header_rows > 0
|
|
767
|
+
def _truncate_to_length(self) -> int:
|
|
768
|
+
"""When printing truncate this email to this length."""
|
|
706
769
|
quote_cutoff = self._idx_of_nth_quoted_reply(text=self.text) # Trim if there's many quoted replies
|
|
707
|
-
|
|
708
|
-
trim_footer_txt = None
|
|
709
|
-
text = self.text
|
|
770
|
+
includes_truncate_term = next((term for term in TRUNCATE_TERMS if term in self.text), None)
|
|
710
771
|
|
|
711
|
-
if
|
|
772
|
+
if args.whole_file:
|
|
773
|
+
num_chars = len(self.text)
|
|
774
|
+
elif self.file_id in TRUNCATION_LENGTHS:
|
|
712
775
|
num_chars = TRUNCATION_LENGTHS[self.file_id]
|
|
713
|
-
elif self.author in TRUNCATE_ALL_EMAILS_FROM or
|
|
776
|
+
elif self.author in TRUNCATE_ALL_EMAILS_FROM or includes_truncate_term:
|
|
714
777
|
num_chars = int(MAX_CHARS_TO_PRINT / 3)
|
|
715
778
|
elif quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
|
|
716
779
|
num_chars = quote_cutoff
|
|
780
|
+
else:
|
|
781
|
+
num_chars = MAX_CHARS_TO_PRINT
|
|
782
|
+
|
|
783
|
+
if num_chars != MAX_CHARS_TO_PRINT and not self.is_duplicate():
|
|
784
|
+
log_args = {
|
|
785
|
+
'num_chars': num_chars,
|
|
786
|
+
'author_truncate': self.author in TRUNCATE_ALL_EMAILS_FROM,
|
|
787
|
+
'is_fwded_article': self.is_fwded_article(),
|
|
788
|
+
'is_quote_cutoff': quote_cutoff == num_chars,
|
|
789
|
+
'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
|
|
790
|
+
'quote_cutoff': quote_cutoff,
|
|
791
|
+
}
|
|
792
|
+
|
|
793
|
+
if quote_cutoff != num_chars:
|
|
794
|
+
logger.debug(f'{self.summary()} truncating: ' + ', '.join([f"{k}={v}" for k, v in log_args.items() if v]) + '\n')
|
|
795
|
+
|
|
796
|
+
return num_chars
|
|
797
|
+
|
|
798
|
+
def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
|
|
799
|
+
logger.debug(f"Printing '{self.filename}'...")
|
|
800
|
+
should_rewrite_header = self.header.was_initially_empty and self.header.num_header_rows > 0
|
|
801
|
+
num_chars = self._truncate_to_length()
|
|
802
|
+
trim_footer_txt = None
|
|
803
|
+
text = self.text
|
|
717
804
|
|
|
718
805
|
# Truncate long emails but leave a note explaining what happened w/link to source document
|
|
719
|
-
if len(text) > num_chars
|
|
806
|
+
if len(text) > num_chars:
|
|
720
807
|
text = text[0:num_chars]
|
|
721
|
-
doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
|
|
808
|
+
doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style())
|
|
722
809
|
trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
|
|
723
810
|
trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
|
|
724
811
|
|
|
@@ -738,38 +825,37 @@ class Email(Communication):
|
|
|
738
825
|
text = _add_line_breaks(text) # This was skipped when _prettify_text() w/a broken header so we do it now
|
|
739
826
|
self.rewritten_header_ids.add(self.file_id)
|
|
740
827
|
|
|
741
|
-
panel_txt = highlighter(text)
|
|
742
|
-
|
|
743
828
|
email_txt_panel = Panel(
|
|
744
|
-
|
|
829
|
+
highlighter(text).append('\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
|
|
745
830
|
border_style=self._border_style(),
|
|
746
831
|
expand=False,
|
|
747
832
|
subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
|
|
748
833
|
)
|
|
749
834
|
|
|
835
|
+
yield self.file_info_panel()
|
|
750
836
|
yield Padding(email_txt_panel, (0, 0, 1, INFO_INDENT))
|
|
751
837
|
|
|
752
838
|
if should_rewrite_header:
|
|
753
839
|
self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
|
|
754
840
|
|
|
755
841
|
@staticmethod
|
|
756
|
-
def
|
|
757
|
-
"""Turn a set of
|
|
842
|
+
def build_emails_table(emails: list['Email'], _author: str | None, include_title: bool = False) -> Table:
|
|
843
|
+
"""Turn a set of Emails to/from a given _author into a Table."""
|
|
758
844
|
author = _author or UNKNOWN
|
|
759
845
|
|
|
760
846
|
table = Table(
|
|
761
|
-
title=f"Emails to/from {author} starting {emails[0].timestamp.date()}",
|
|
847
|
+
title=f"Emails to/from {author} starting {emails[0].timestamp.date()}" if include_title else None,
|
|
762
848
|
border_style=get_style_for_name(author, allow_bold=False),
|
|
763
849
|
header_style="bold"
|
|
764
850
|
)
|
|
765
851
|
|
|
766
852
|
table.add_column('From', justify='left')
|
|
767
853
|
table.add_column('Timestamp', justify='center')
|
|
768
|
-
table.add_column('Subject', justify='left', style='honeydew2', min_width=
|
|
854
|
+
table.add_column('Subject', justify='left', style='honeydew2', min_width=70)
|
|
769
855
|
|
|
770
856
|
for email in emails:
|
|
771
857
|
table.add_row(
|
|
772
|
-
email.author_txt,
|
|
858
|
+
email.author_txt(),
|
|
773
859
|
email.epstein_media_link(link_txt=email.timestamp_without_seconds()),
|
|
774
860
|
highlighter(email.subject())
|
|
775
861
|
)
|