epstein-files 1.1.0__tar.gz → 1.1.2__tar.gz
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registry.
- {epstein_files-1.1.0 → epstein_files-1.1.2}/PKG-INFO +1 -1
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/__init__.py +10 -14
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/documents/communication.py +10 -14
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/documents/document.py +1 -1
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/documents/email.py +152 -66
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/documents/imessage/text_message.py +42 -25
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/documents/messenger_log.py +31 -12
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/documents/other_file.py +13 -12
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/epstein_files.py +18 -79
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/constant/common_words.py +3 -3
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/constant/html.py +4 -5
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/constant/names.py +9 -6
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/constant/strings.py +6 -2
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/constant/urls.py +1 -1
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/constants.py +18 -22
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/env.py +45 -36
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/file_helper.py +1 -2
- epstein_files-1.1.2/epstein_files/util/highlighted_group.py +1513 -0
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/logging.py +8 -1
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/output.py +147 -60
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/rich.py +33 -67
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/timer.py +1 -1
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/word_count.py +3 -4
- {epstein_files-1.1.0 → epstein_files-1.1.2}/pyproject.toml +1 -1
- epstein_files-1.1.0/epstein_files/util/highlighted_group.py +0 -695
- {epstein_files-1.1.0 → epstein_files-1.1.2}/LICENSE +0 -0
- {epstein_files-1.1.0 → epstein_files-1.1.2}/README.md +0 -0
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/documents/emails/email_header.py +0 -0
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/documents/json_file.py +0 -0
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/constant/output_files.py +0 -0
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/data.py +0 -0
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/doc_cfg.py +0 -0
- {epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/util/search_result.py +0 -0

{epstein_files-1.1.0 → epstein_files-1.1.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: epstein-files
-Version: 1.1.0
+Version: 1.1.2
 Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
 Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
 License: GPL-3.0-or-later

{epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/__init__.py
@@ -19,10 +19,10 @@ from epstein_files.documents.email import Email
 from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, CHRONOLOGICAL_EMAILS_PATH, TEXT_MSGS_HTML_PATH, make_clean
 from epstein_files.util.env import args
 from epstein_files.util.file_helper import coerce_file_path, extract_file_id
-from epstein_files.util.logging import logger
+from epstein_files.util.logging import exit_with_error, logger
 from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
     print_other_files_section, print_text_messages_section, write_complete_emails_timeline, write_json_metadata, write_urls)
-from epstein_files.util.rich import build_highlighter, console, print_title_page_header, print_title_page_tables,
+from epstein_files.util.rich import build_highlighter, console, print_color_key, print_title_page_header, print_title_page_tables, print_subtitle_panel, write_html
 from epstein_files.util.timer import Timer
 from epstein_files.util.word_count import write_word_counts_html
 
@@ -45,15 +45,18 @@ def generate_html() -> None:
 
     print_title_page_header(epstein_files)
 
-    if
+    if args.email_timeline:
+        print_color_key()
+    else:
         print_title_page_tables(epstein_files)
 
     if args.colors_only:
         exit()
 
     if args.output_texts:
-
-
+        imessage_logs = [log for log in epstein_files.imessage_logs if not args.names or log.author in args.names]
+        print_text_messages_section(imessage_logs)
+        timer.print_at_checkpoint(f'Printed {len(imessage_logs)} text message log files')
 
     if args.output_emails:
         emails_that_were_printed = print_emails_section(epstein_files)
@@ -101,15 +104,12 @@ def epstein_search():
     temp_highlighter = build_highlighter(search_term)
     search_results = epstein_files.docs_matching(search_term, args.names)
     console.line(2)
-
+    print_subtitle_panel(f"Found {len(search_results)} documents matching '{search_term}'", padding=(0, 0, 0, 3))
 
     for search_result in search_results:
         console.line()
 
         if args.whole_file:
-            if isinstance(search_result.document, Email):
-                search_result.document._truncation_allowed = False
-
             console.print(search_result.document)
         else:
             console.print(search_result.document.summary_panel())
@@ -128,9 +128,6 @@ def epstein_show():
     console.line()
 
     for doc in docs:
-        if isinstance(doc, Email):
-            doc._truncation_allowed = False
-
         console.print('\n', doc, '\n')
 
     if args.raw:
@@ -148,5 +145,4 @@ def epstein_word_count() -> None:
 
 def _assert_positional_args():
     if not args.positional_args:
-
-        exit(1)
+        exit_with_error(f"No positional args provided!\n")

{epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/documents/communication.py
@@ -18,25 +18,24 @@ TIMESTAMP_SECONDS_REGEX = re.compile(r":\d{2}$")
 @dataclass
 class Communication(Document):
     """Superclass for Email and MessengerLog."""
-    author_style: str = 'white'
-    author_txt: Text = field(init=False)
     config: CommunicationCfg | None = None
     timestamp: datetime = FALLBACK_TIMESTAMP # TODO this default sucks (though it never happens)
 
-    def __post_init__(self):
-        super().__post_init__()
-        self.author_style = get_style_for_name(self.author_or_unknown())
-        self.author_txt = Text(self.author_or_unknown(), style=self.author_style)
-
     def author_or_unknown(self) -> str:
         return self.author or UNKNOWN
 
-    def
-        return
+    def author_style(self) -> str:
+        return get_style_for_name(self.author_or_unknown())
+
+    def author_txt(self) -> Text:
+        return Text(self.author_or_unknown(), style=self.author_style())
 
     def external_links_txt(self, _style: str = '', include_alt_links: bool = True) -> Text:
         """Overrides super() method to apply self.author_style."""
-        return super().external_links_txt(self.author_style, include_alt_links=include_alt_links)
+        return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
+
+    def is_attribution_uncertain(self) -> bool:
+        return bool(self.config and self.config.is_attribution_uncertain)
 
     def summary(self) -> Text:
         return self._summary().append(CLOSE_PROPERTIES_CHAR)
@@ -47,7 +46,4 @@ class Communication(Document):
     def _summary(self) -> Text:
         """One line summary mostly for logging."""
         txt = super().summary().append(', ')
-        return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style)))
-
-
-CommunicationType = TypeVar('CommunicationType', bound=Document)
+        return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style())))
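
Note on the Communication change above: `author_style` and `author_txt` are no longer dataclass fields populated in `__post_init__`; they are now plain methods, which is why call sites throughout the Email diffs below gain parentheses. A minimal, self-contained sketch of the before/after pattern (the class and helper names here are illustrative, not taken from the package):

from dataclasses import dataclass, field

def get_style(name: str) -> str:  # stand-in for get_style_for_name()
    return 'bold red' if name == 'UNKNOWN' else 'white'

@dataclass
class Before:
    author: str
    author_style: str = field(init=False)  # derived value stored on the instance

    def __post_init__(self):
        self.author_style = get_style(self.author)

@dataclass
class After:
    author: str

    def author_style(self) -> str:  # derived value computed on demand, nothing stored
        return get_style(self.author)

print(Before('UNKNOWN').author_style)   # attribute access in 1.1.0
print(After('UNKNOWN').author_style())  # method call in 1.1.2

Dropping the stored fields presumably keeps derived presentation state out of the dataclass itself (e.g. out of asdict()), at the cost of recomputing the style lookup on each call.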

{epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/documents/document.py
@@ -251,7 +251,7 @@ class Document:
     def summary(self) -> Text:
         """Summary of this file for logging. Brackets are left open for subclasses to add stuff."""
         txt = Text('').append(self._class_name(), style=self._class_style())
-        txt.append(f" {self.
+        txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)
 
         if self.timestamp:
             timestamp_str = remove_zero_time_from_timestamp_str(self.timestamp).replace('T', ' ')

{epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/documents/email.py
@@ -1,3 +1,4 @@
+import json
 import logging
 import re
 from copy import deepcopy
@@ -20,7 +21,7 @@ from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import REDACTED
 from epstein_files.util.constants import *
 from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
-    flatten, remove_timezone, uniquify)
+    flatten, listify, remove_timezone, uniquify)
 from epstein_files.util.doc_cfg import EmailCfg, Metadata
 from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
 from epstein_files.util.highlighted_group import get_style_for_name
@@ -42,7 +43,7 @@ LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
 SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
 REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
 URL_SIGNIFIERS = ['gclid', 'htm', 'ref=', 'utm']
-APPEARS_IN = '
+APPEARS_IN = 'appears in'
 MAX_CHARS_TO_PRINT = 4000
 MAX_NUM_HEADER_LINES = 14
 MAX_QUOTED_REPLIES = 2
@@ -152,6 +153,8 @@ TRUNCATION_LENGTHS = {
     '030245': 7_500, # Epstein rationalizes his behavior in an open letter to the world
     '030781': 1_700, # Bannon email about crypto coin issues
     '032906': 750, # David Blaine email
+    '026036': 6000, # Gino Yu blockchain mention
+    '023208': 350_000, # Long discussion about leon black's finances
 }
 
 # These are long forwarded articles so we force a trim to 1,333 chars if these strings exist
@@ -276,6 +279,7 @@ USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIP
     'Nancy Portland', # Lawrence Krauss CC
     'Oliver Goodenough', # Robert Trivers CC
     'Peter Aldhous', # Lawrence Krauss CC
+    'Players2', # Hoffenberg CC
     'Sam Harris', # Lawrence Krauss CC
     SAMUEL_LEFF, # Random CC
     'Sean T Lehane', # Random CC
@@ -283,6 +287,13 @@ USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIP
     'Tim Kane', # Random CC
     'Travis Pangburn', # Random CC
     'Vahe Stepanian', # Random CC
+    # Ross Gow BCC
+    'david.brown@thetimes.co.uk',
+    'io-anne.pugh@bbc.co.uk',
+    'martin.robinson@mailonline.co.uk',
+    'nick.alwav@bbc.co.uk'
+    'nick.sommerlad@mirror.co.uk',
+    'p.peachev@independent.co.uk',
 ]
 
 # Emails sent by epstein to himself that are just notes
@@ -300,6 +311,50 @@ METADATA_FIELDS = [
     'subject',
 ]
 
+LINE_REPAIR_MERGES = {
+    '017523': 4,
+    '019407': [2, 4],
+    '021729': 2,
+    '022673': 9,
+    '022684': 9,
+    '022695': 4,
+    '023067': 3,
+    '025790': 2,
+    '026609': 4,
+    '026924': [2, 4],
+    '028931': [3, 6],
+    '029154': [2, 5],
+    '029163': [2, 5],
+    '029282': 2,
+    '029402': 5,
+    '029498': 2,
+    '029501': 2,
+    '029835': [2, 4],
+    '029889': 2,
+    '029976': 3,
+    '030299': [7, 10],
+    '030381': [2, 4],
+    '030384': [2, 4],
+    '030626': 2,
+    '030999': [2, 4],
+    '031384': 2,
+    '031428': 2,
+    '031442': 0,
+    '031980': [2, 4],
+    '032063': [3, 5],
+    '032272': 3,
+    '032405': 4,
+    '033097': 2,
+    '033144': [2, 4],
+    '033228': [3, 5],
+    '033357': [2, 4],
+    '033486': [7, 9],
+    '033512': 2,
+    '033575': [2, 4],
+    '033576': 3,
+    '033583': 2,
+}
+
 
 @dataclass
 class Email(Communication):
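
The new LINE_REPAIR_MERGES table maps a file ID to either a single line index or a list of indexes, replacing the long `elif` chain removed further down in this diff. A minimal sketch of how an int-or-list value can be normalized into positional arguments, modeled on the `_merge_lines(*merge_args)` call added later; `merge_lines` here is a simplified stand-in, not the package's implementation:

LINE_REPAIR_MERGES = {
    '017523': 4,       # single index -> one merge
    '019407': [2, 4],  # list -> several merges, applied in order
}

def merge_lines(lines: list[str], *indexes: int) -> list[str]:
    """Simplified stand-in: join the line at each index with the line that follows it."""
    for i in indexes:  # later indexes refer to the already-merged list
        lines[i:i + 2] = [' '.join(lines[i:i + 2])]
    return lines

def repair(file_id: str, lines: list[str]) -> list[str]:
    merge = LINE_REPAIR_MERGES.get(file_id)
    if merge is None:
        return lines
    merge_args = merge if isinstance(merge, list) else [merge]  # normalize int -> [int]
    return merge_lines(lines, *merge_args)

print(repair('017523', ['To:', 'x', 'From:', 'y', 'Subject: broken', 'OCR header']))
# ['To:', 'x', 'From:', 'y', 'Subject: broken OCR header']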
@@ -318,7 +373,6 @@ class Email(Communication):
     recipients: list[str | None] = field(default_factory=list)
     sent_from_device: str | None = None
     signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
-    _truncation_allowed: bool = True # Hacky way to get __rich_console__() not to truncate in epstein_show script
 
     # For logging how many headers we prettified while printing, kind of janky
     rewritten_header_ids: ClassVar[set[str]] = set([])
@@ -342,7 +396,7 @@
             self.recipients = self.config.recipients
         else:
             for recipient in self.header.recipients():
-                self.recipients.extend(self.
+                self.recipients.extend(self._extract_emailer_names(recipient))
 
         if self.author in MAILING_LISTS and (len(self.recipients) == 0 or self.recipients == [self.author]):
             self.recipients = [JEFFREY_EPSTEIN] # Assume mailing list emails are to Epstein
@@ -365,7 +419,7 @@
 
     def info_txt(self) -> Text:
         email_type = 'fwded article' if self.is_fwded_article() else 'email'
-        txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt).append(' to ')
+        txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt()).append(' to ')
         return txt.append(self.recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))
 
     def is_fwded_article(self) -> bool:
@@ -446,15 +500,23 @@
         """Color emails from epstein to others with the color for the first recipient."""
         if self.author == JEFFREY_EPSTEIN:
             if len(self.recipients) == 0 or self.recipients == [None]:
-                style = self.author_style
+                style = self.author_style()
             else:
                 style = get_style_for_name(self.recipients[0])
         else:
-            style = self.author_style
+            style = self.author_style()
 
         return style.replace('bold', '').strip()
 
-    def
+    def _extract_author(self) -> None:
+        self._extract_header()
+        super()._extract_author()
+
+        if not self.author and self.header.author:
+            authors = self._extract_emailer_names(self.header.author)
+            self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
+
+    def _extract_emailer_names(self, emailer_str: str) -> list[str]:
         """Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
         emailer_str = EmailHeader.cleanup_str(emailer_str)
 
@@ -474,14 +536,6 @@
         names_found = names_found or [emailer_str]
         return [_reverse_first_and_last_names(name) for name in names_found]
 
-    def _extract_author(self) -> None:
-        self._extract_header()
-        super()._extract_author()
-
-        if not self.author and self.header.author:
-            authors = self._emailer_names(self.header.author)
-            self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
-
     def _extract_header(self) -> None:
         """Extract an EmailHeader object from the OCR text."""
         header_match = EMAIL_SIMPLE_HEADER_REGEX.search(self.text)
@@ -579,44 +633,47 @@
         self._set_computed_fields(lines=[line for line in self.lines if not BAD_LINE_REGEX.match(line)])
         old_text = self.text
 
-        if self.file_id in
-            self.
-
-            self._merge_lines(
+        if self.file_id in LINE_REPAIR_MERGES:
+            merge = LINE_REPAIR_MERGES[self.file_id]
+            merge_args = merge if isinstance(merge, list) else [merge]
+            self._merge_lines(*merge_args)
 
-
-
-
-
-        elif self.file_id in ['029498', '031428']:
-            self._merge_lines(2, 4)
-        elif self.file_id in ['029976', '023067', '033576']:
-            self._merge_lines(3) # Merge 4th and 5th rows
-        elif self.file_id in '026609 029402 032405 022695'.split():
-            self._merge_lines(4) # Merge 5th and 6th rows
-        elif self.file_id in ['019407', '031980', '030384', '033144', '030999', '033575', '029835', '030381', '033357', '026924']:
-            self._merge_lines(2, 4)
-        elif self.file_id in ['029154', '029163']:
+        # These already had 2nd line merged
+        if self.file_id in ['030626']: # Merge 6th and 7th (now 5th and 6th) rows
+            self._merge_lines(4)
+        elif self.file_id == '029889':
             self._merge_lines(2, 5)
-        elif self.file_id in ['
-            self._merge_lines(
-
-
-
+        elif self.file_id in ['029498', '031428']:
+            self._merge_lines(2, 4)
+
+        # Multiline
+        if self.file_id == '013415':
+            for _i in range(2):
+                self._merge_lines(4)
+        elif self.file_id == '013405':
+            for _i in range(2):
+                self._merge_lines(4)
+        elif self.file_id == '029458':
+            for _i in range(3):
+                self._merge_lines(4)
+        elif self.file_id in ['025233']:
             for _i in range(2):
                 self._merge_lines(4)
+
+            self.lines[4] = f"Attachments: {self.lines[4]}"
+            self._set_computed_fields(lines=self.lines)
+        elif self.file_id in ['023001']:
+            for _i in range(3):
+                self._merge_lines(5)
+        elif self.file_id in ['019105']:
+            for _i in range(4):
+                self._merge_lines(5)
         elif self.file_id in ['033568']:
             for _i in range(5):
                 self._merge_lines(5)
         elif self.file_id in ['025329']:
             for _i in range(9):
                 self._merge_lines(2)
-        elif self.file_id == '033486':
-            self._merge_lines(7, 9)
-        elif self.file_id == '030299':
-            self._merge_lines(7, 10)
-        elif self.file_id in ['022673', '022684']:
-            self._merge_lines(9)
         elif self.file_id == '014860':
             self._merge_lines(3)
             self._merge_lines(4)
@@ -629,7 +686,15 @@
 
             self._merge_lines(4)
             self._merge_lines(2, 4)
-        elif self.file_id
+        elif self.file_id in ['033252']:
+            for _i in range(2):
+                self._merge_lines(9)
+        elif self.file_id in ['032637']:
+            for _i in range(3):
+                self._merge_lines(9)
+
+        # Bad line removal
+        if self.file_id == '025041':
             self._remove_line(4)
             self._remove_line(4)
         elif self.file_id == '029692':
@@ -679,7 +744,7 @@
         """Copy info from original config for file this document was extracted from."""
         if self.file_id in ALL_FILE_CONFIGS:
             self.config = cast(EmailCfg, deepcopy(ALL_FILE_CONFIGS[self.file_id]))
-            self.warn(f"Merging existing
+            self.warn(f"Merging existing cfg for '{self.file_id}' with cfg for extracted document...")
         else:
             self.config = EmailCfg(id=self.file_id)
 
@@ -692,33 +757,55 @@
         extracted_description += ' email'
 
         if self.config.description:
-            self.warn(f"Overwriting description '{self.config.description}' with extract
+            self.warn(f"Overwriting description '{self.config.description}' with extract's '{self.config.description}'")
 
         self.config.description = extracted_description
 
         self.config.is_interesting = self.config.is_interesting or extracted_from_doc_cfg.is_interesting
         self.log(f"Constructed synthetic config: {self.config}")
 
-    def
-
-        yield self.file_info_panel()
-        should_rewrite_header = self.header.was_initially_empty and self.header.num_header_rows > 0
+    def _truncate_to_length(self) -> int:
+        """When printing truncate this email to this length."""
         quote_cutoff = self._idx_of_nth_quoted_reply(text=self.text) # Trim if there's many quoted replies
-
-        trim_footer_txt = None
-        text = self.text
+        includes_truncate_term = next((term for term in TRUNCATE_TERMS if term in self.text), None)
 
-        if
+        if args.whole_file:
+            num_chars = len(self.text)
+        elif self.file_id in TRUNCATION_LENGTHS:
             num_chars = TRUNCATION_LENGTHS[self.file_id]
-        elif self.author in TRUNCATE_ALL_EMAILS_FROM or
+        elif self.author in TRUNCATE_ALL_EMAILS_FROM or includes_truncate_term:
             num_chars = int(MAX_CHARS_TO_PRINT / 3)
         elif quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
             num_chars = quote_cutoff
+        else:
+            num_chars = MAX_CHARS_TO_PRINT
+
+        if num_chars != MAX_CHARS_TO_PRINT and not self.is_duplicate():
+            log_args = {
+                'num_chars': num_chars,
+                'author_truncate': self.author in TRUNCATE_ALL_EMAILS_FROM,
+                'is_fwded_article': self.is_fwded_article(),
+                'is_quote_cutoff': quote_cutoff == num_chars,
+                'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
+                'quote_cutoff': quote_cutoff,
+            }
+
+            if quote_cutoff != num_chars:
+                logger.debug(f'{self.summary()} truncating: ' + ', '.join([f"{k}={v}" for k, v in log_args.items() if v]) + '\n')
+
+        return num_chars
+
+    def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
+        logger.debug(f"Printing '{self.filename}'...")
+        should_rewrite_header = self.header.was_initially_empty and self.header.num_header_rows > 0
+        num_chars = self._truncate_to_length()
+        trim_footer_txt = None
+        text = self.text
 
         # Truncate long emails but leave a note explaining what happened w/link to source document
-        if len(text) > num_chars
+        if len(text) > num_chars:
             text = text[0:num_chars]
-            doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
+            doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style())
             trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
             trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
 
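
The new `_truncate_to_length()` gathers the truncation rules that previously lived inline in `__rich_console__()`. The precedence it encodes: `--whole-file` disables truncation entirely, a per-file entry in TRUNCATION_LENGTHS wins next, then author- or keyword-triggered trimming to a third of the cap, then the quoted-reply cutoff, and finally the MAX_CHARS_TO_PRINT default. A condensed sketch of that precedence with the package-specific lookups replaced by plain parameters (the function below is illustrative, not the package's code):

MAX_CHARS_TO_PRINT = 4000

def truncate_to_length(
    text_len: int,
    whole_file: bool = False,
    per_file_override: int | None = None,  # TRUNCATION_LENGTHS[file_id], if any
    force_trim: bool = False,              # author in TRUNCATE_ALL_EMAILS_FROM or a truncate term matched
    quote_cutoff: int | None = None,       # index of the Nth quoted reply, if any
) -> int:
    if whole_file:
        return text_len
    if per_file_override is not None:
        return per_file_override
    if force_trim:
        return MAX_CHARS_TO_PRINT // 3
    if quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
        return quote_cutoff
    return MAX_CHARS_TO_PRINT

assert truncate_to_length(9000) == 4000
assert truncate_to_length(9000, force_trim=True) == 1333
assert truncate_to_length(9000, quote_cutoff=2500) == 2500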
@@ -738,38 +825,37 @@
             text = _add_line_breaks(text) # This was skipped when _prettify_text() w/a broken header so we do it now
             self.rewritten_header_ids.add(self.file_id)
 
-        panel_txt = highlighter(text)
-
         email_txt_panel = Panel(
-
+            highlighter(text).append('\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
             border_style=self._border_style(),
             expand=False,
             subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
         )
 
+        yield self.file_info_panel()
         yield Padding(email_txt_panel, (0, 0, 1, INFO_INDENT))
 
         if should_rewrite_header:
            self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
 
     @staticmethod
-    def
-        """Turn a set of
+    def build_emails_table(emails: list['Email'], _author: str | None, include_title: bool = False) -> Table:
+        """Turn a set of Emails to/from a given _author into a Table."""
         author = _author or UNKNOWN
 
         table = Table(
-            title=f"Emails to/from {author} starting {emails[0].timestamp.date()}",
+            title=f"Emails to/from {author} starting {emails[0].timestamp.date()}" if include_title else None,
            border_style=get_style_for_name(author, allow_bold=False),
            header_style="bold"
         )
 
         table.add_column('From', justify='left')
         table.add_column('Timestamp', justify='center')
-        table.add_column('Subject', justify='left', style='honeydew2', min_width=
+        table.add_column('Subject', justify='left', style='honeydew2', min_width=70)
 
         for email in emails:
             table.add_row(
-                email.author_txt,
+                email.author_txt(),
                 email.epstein_media_link(link_txt=email.timestamp_without_seconds()),
                 highlighter(email.subject())
             )
{epstein_files-1.1.0 → epstein_files-1.1.2}/epstein_files/documents/imessage/text_message.py
RENAMED
@@ -1,12 +1,12 @@
 import re
-from dataclasses import dataclass
+from dataclasses import dataclass, field, fields
 from datetime import datetime
 
 from rich.text import Text
 
 from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN
 from epstein_files.util.constant.strings import TIMESTAMP_DIM
-from epstein_files.util.data import extract_last_name
+from epstein_files.util.data import extract_last_name, iso_timestamp
 from epstein_files.util.highlighted_group import get_style_for_name
 from epstein_files.util.logging import logger
 from epstein_files.util.rich import TEXT_LINK, highlighter
@@ -30,7 +30,7 @@ class TextMessage:
     """Class representing a single iMessage text message."""
     author: str | None
    author_str: str = ''
-
+    is_id_confirmed: bool = False
     text: str
     timestamp_str: str
 
@@ -44,38 +44,55 @@ class TextMessage:
         else:
             self.author_str = self.author_str or self.author
 
-        if not self.
+        if not self.is_id_confirmed and self.author is not None and self.author != JEFFREY_EPSTEIN:
             self.author_str += ' (?)'
 
-
-
+        if self.is_link():
+            self.text = self.text.replace('\n', '').replace(' ', '_')
+        else:
+            self.text = self.text.replace('\n', ' ')
 
-    def
-
+    def is_link(self) -> bool:
+        return self.text.startswith('http')
 
-
-
-        text = self.text
+    def parse_timestamp(self) -> datetime:
+        return datetime.strptime(self.timestamp_str, MSG_DATE_FORMAT)
 
-
-
-            text = text.replace('\n', '', 2)
-        else:
-            text = text.replace('\n', '', 1)
+    def timestamp_txt(self) -> Text:
+        timestamp_str = self.timestamp_str
 
-
-
-
+        try:
+            timestamp_str = iso_timestamp(self.parse_timestamp())
+        except Exception as e:
+            logger.warning(f"Failed to parse timestamp for {self}")
 
-
-            msg_txt.append('\n' + ' '.join(lines))
-        else:
-            msg_txt = highlighter(' '.join(lines)) # remove newlines
+        return Text(f"[{timestamp_str}]", style=TIMESTAMP_DIM)
 
-
+    def _message(self) -> Text:
+        if self.is_link():
+            return Text.from_markup(f"[link={self.text}]{self.text}[/link]", style=TEXT_LINK)
+        else:
+            return highlighter(self.text)
 
     def __rich__(self) -> Text:
-        timestamp_txt =
+        timestamp_txt = self.timestamp_txt().append(' ')
         author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
         author_txt = Text(self.author_str, style=author_style)
         return Text('').append(timestamp_txt).append(author_txt).append(': ', style='dim').append(self._message())
+
+    def __repr__(self) -> str:
+        props = []
+        add_prop = lambda k, v: props.append(f"{k}={v}")
+
+        for _field in sorted(fields(self), key=lambda f: f.name):
+            key = _field.name
+            value = getattr(self, key)
+
+            if key == 'author_str' and self.author and self.author_str.startswith(value):
+                continue
+            elif isinstance(value, str):
+                add_prop(key, f'"{value}"')
+            else:
+                add_prop(key, value)
+
+        return f"{type(self).__name__}(" + ', '.join(props) + f')'
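
The new `TextMessage.__repr__` builds its output by walking `dataclasses.fields()` in alphabetical order rather than relying on the generated repr, and skips the `author_str` field whenever an `author` is set. A standalone illustration of the fields()-based pattern on a throwaway dataclass, not the package's class:

from dataclasses import dataclass, fields

@dataclass(repr=False)
class Msg:
    text: str
    author: str | None = None
    is_id_confirmed: bool = False

    def __repr__(self) -> str:
        props = []

        for f in sorted(fields(self), key=lambda f: f.name):  # alphabetical field order
            value = getattr(self, f.name)
            props.append(f'{f.name}="{value}"' if isinstance(value, str) else f"{f.name}={value}")

        return f"{type(self).__name__}(" + ', '.join(props) + ")"

print(Msg("hello", author="Steve"))
# Msg(author="Steve", is_id_confirmed=False, text="hello")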