epstein-files 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +66 -131
- epstein_files/documents/document.py +12 -3
- epstein_files/documents/email.py +33 -13
- epstein_files/documents/imessage/text_message.py +11 -15
- epstein_files/documents/messenger_log.py +15 -11
- epstein_files/documents/other_file.py +13 -8
- epstein_files/epstein_files.py +51 -43
- epstein_files/util/constant/names.py +21 -24
- epstein_files/util/constant/output_files.py +29 -0
- epstein_files/util/constant/strings.py +8 -2
- epstein_files/util/constant/urls.py +11 -7
- epstein_files/util/constants.py +325 -227
- epstein_files/util/data.py +12 -33
- epstein_files/util/doc_cfg.py +7 -14
- epstein_files/util/env.py +5 -3
- epstein_files/util/file_helper.py +0 -22
- epstein_files/util/highlighted_group.py +31 -26
- epstein_files/util/logging.py +7 -0
- epstein_files/util/output.py +179 -0
- epstein_files/util/rich.py +22 -10
- {epstein_files-1.0.1.dist-info → epstein_files-1.0.3.dist-info}/METADATA +32 -7
- epstein_files-1.0.3.dist-info/RECORD +33 -0
- epstein_files-1.0.3.dist-info/entry_points.txt +7 -0
- epstein_files-1.0.1.dist-info/RECORD +0 -30
- {epstein_files-1.0.1.dist-info → epstein_files-1.0.3.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.1.dist-info → epstein_files-1.0.3.dist-info}/WHEEL +0 -0
epstein_files/__init__.py
CHANGED
|
@@ -4,73 +4,30 @@ Reformat Epstein text message files for readability and count email senders.
|
|
|
4
4
|
For use with iMessage log files from https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_
|
|
5
5
|
|
|
6
6
|
Install: 'poetry install'
|
|
7
|
-
Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT
|
|
7
|
+
Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT epstein_generate'
|
|
8
8
|
"""
|
|
9
9
|
from sys import exit
|
|
10
10
|
|
|
11
11
|
from dotenv import load_dotenv
|
|
12
12
|
load_dotenv()
|
|
13
|
+
|
|
14
|
+
from rich.markup import escape
|
|
13
15
|
from rich.padding import Padding
|
|
16
|
+
from rich.panel import Panel
|
|
14
17
|
|
|
18
|
+
from epstein_files.epstein_files import EpsteinFiles, document_cls
|
|
19
|
+
from epstein_files.documents.document import INFO_PADDING, Document
|
|
15
20
|
from epstein_files.documents.email import Email
|
|
16
|
-
from epstein_files.documents.messenger_log import MessengerLog
|
|
17
|
-
from epstein_files.epstein_files import EpsteinFiles, count_by_month
|
|
18
21
|
from epstein_files.util.constant.html import *
|
|
19
22
|
from epstein_files.util.constant.names import *
|
|
20
|
-
from epstein_files.util.constant.
|
|
21
|
-
from epstein_files.util.data import dict_sets_to_lists
|
|
23
|
+
from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, TEXT_MSGS_HTML_PATH, make_clean
|
|
22
24
|
from epstein_files.util.env import args, specified_names
|
|
23
|
-
from epstein_files.util.file_helper import
|
|
25
|
+
from epstein_files.util.file_helper import coerce_file_path, extract_file_id
|
|
24
26
|
from epstein_files.util.logging import logger
|
|
25
|
-
from epstein_files.util.
|
|
27
|
+
from epstein_files.util.output import print_emails, print_json_metadata, print_json_stats, print_text_messages, write_urls
|
|
28
|
+
from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
|
|
26
29
|
from epstein_files.util.timer import Timer
|
|
27
30
|
|
|
28
|
-
PRINT_COLOR_KEY_EVERY_N_EMAILS = 150
|
|
29
|
-
|
|
30
|
-
# Order matters. Default names to print emails for.
|
|
31
|
-
DEFAULT_EMAILERS = [
|
|
32
|
-
JEREMY_RUBIN,
|
|
33
|
-
AL_SECKEL,
|
|
34
|
-
JOI_ITO,
|
|
35
|
-
JABOR_Y,
|
|
36
|
-
STEVEN_SINOFSKY,
|
|
37
|
-
DANIEL_SIAD,
|
|
38
|
-
JEAN_LUC_BRUNEL,
|
|
39
|
-
STEVEN_HOFFENBERG,
|
|
40
|
-
EHUD_BARAK,
|
|
41
|
-
MARTIN_NOWAK,
|
|
42
|
-
MASHA_DROKOVA,
|
|
43
|
-
RENATA_BOLOTOVA,
|
|
44
|
-
STEVE_BANNON,
|
|
45
|
-
OLIVIER_COLOM,
|
|
46
|
-
BORIS_NIKOLIC,
|
|
47
|
-
PRINCE_ANDREW,
|
|
48
|
-
JIDE_ZEITLIN,
|
|
49
|
-
DAVID_STERN,
|
|
50
|
-
MOHAMED_WAHEED_HASSAN,
|
|
51
|
-
JENNIFER_JACQUET,
|
|
52
|
-
None,
|
|
53
|
-
]
|
|
54
|
-
|
|
55
|
-
# Order matters. Default names to print tables w/email subject, timestamp, etc for.
|
|
56
|
-
# TODO: get rid of this
|
|
57
|
-
DEFAULT_EMAILER_TABLES: list[str | None] = [
|
|
58
|
-
GHISLAINE_MAXWELL,
|
|
59
|
-
LEON_BLACK,
|
|
60
|
-
LANDON_THOMAS,
|
|
61
|
-
KATHRYN_RUEMMLER,
|
|
62
|
-
DARREN_INDYKE,
|
|
63
|
-
RICHARD_KAHN,
|
|
64
|
-
TYLER_SHEARS,
|
|
65
|
-
SULTAN_BIN_SULAYEM,
|
|
66
|
-
DEEPAK_CHOPRA,
|
|
67
|
-
ARIANE_DE_ROTHSCHILD,
|
|
68
|
-
TOM_PRITZKER,
|
|
69
|
-
]
|
|
70
|
-
|
|
71
|
-
if len(set(DEFAULT_EMAILERS).intersection(set(DEFAULT_EMAILER_TABLES))) > 0:
|
|
72
|
-
raise RuntimeError(f"Some names appear in both DEFAULT_EMAILERS and DEFAULT_EMAILER_TABLES")
|
|
73
|
-
|
|
74
31
|
|
|
75
32
|
def generate_html() -> None:
|
|
76
33
|
if args.make_clean:
|
|
@@ -81,15 +38,7 @@ def generate_html() -> None:
|
|
|
81
38
|
epstein_files = EpsteinFiles.get_files(timer)
|
|
82
39
|
|
|
83
40
|
if args.json_metadata:
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
if args.build:
|
|
87
|
-
with open(JSON_METADATA_PATH, 'w') as f:
|
|
88
|
-
f.write(json_str)
|
|
89
|
-
timer.print_at_checkpoint(f"Wrote {file_size_str(JSON_METADATA_PATH)} to '{JSON_METADATA_PATH}'")
|
|
90
|
-
else:
|
|
91
|
-
console.print_json(json_str, indent=4, sort_keys=True)
|
|
92
|
-
|
|
41
|
+
print_json_metadata(epstein_files)
|
|
93
42
|
exit()
|
|
94
43
|
|
|
95
44
|
print_header(epstein_files)
|
|
@@ -98,11 +47,11 @@ def generate_html() -> None:
|
|
|
98
47
|
exit()
|
|
99
48
|
|
|
100
49
|
if args.output_texts:
|
|
101
|
-
|
|
50
|
+
print_text_messages(epstein_files)
|
|
102
51
|
timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)} text message logs')
|
|
103
52
|
|
|
104
53
|
if args.output_emails:
|
|
105
|
-
emails_printed =
|
|
54
|
+
emails_printed = print_emails(epstein_files)
|
|
106
55
|
timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")
|
|
107
56
|
|
|
108
57
|
if args.output_other_files:
|
|
@@ -110,93 +59,79 @@ def generate_html() -> None:
|
|
|
110
59
|
timer.print_at_checkpoint(f"Printed {len(files_printed)} other files")
|
|
111
60
|
|
|
112
61
|
# Save output
|
|
113
|
-
write_html(
|
|
62
|
+
write_html(ALL_EMAILS_PATH if args.all_emails else TEXT_MSGS_HTML_PATH)
|
|
114
63
|
logger.warning(f"Total time: {timer.seconds_since_start_str()}")
|
|
115
64
|
|
|
116
65
|
# JSON stats (mostly used for building pytest checks)
|
|
117
66
|
if args.json_stats:
|
|
118
|
-
|
|
119
|
-
_print_json_stats(epstein_files)
|
|
67
|
+
print_json_stats(epstein_files)
|
|
120
68
|
|
|
121
69
|
|
|
122
|
-
def
|
|
123
|
-
"""
|
|
124
|
-
|
|
125
|
-
print_other_site_link(is_header=False)
|
|
70
|
+
def epstein_diff():
|
|
71
|
+
"""Diff the cleaned up text of two files."""
|
|
72
|
+
Document.diff_files(args.positional_args)
|
|
126
73
|
|
|
127
|
-
if len(specified_names) == 0:
|
|
128
|
-
epstein_files.print_emailer_counts_table()
|
|
129
74
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
75
|
+
def epstein_search():
|
|
76
|
+
"""Search the cleaned up text of the files."""
|
|
77
|
+
_assert_positional_args()
|
|
78
|
+
epstein_files = EpsteinFiles.get_files(use_pickled=True)
|
|
134
79
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
emailers_to_print = specified_names if specified_names else DEFAULT_EMAILERS
|
|
141
|
-
console.print('Email conversations grouped by counterparty can be found in the order listed below.')
|
|
142
|
-
print_numbered_list_of_emailers(emailers_to_print)
|
|
143
|
-
console.print("\nAfter that there's tables linking to (but not displaying) all known emails for each of these people:")
|
|
144
|
-
|
|
145
|
-
if len(specified_names) > 0:
|
|
146
|
-
print_numbered_list_of_emailers(DEFAULT_EMAILER_TABLES)
|
|
80
|
+
for search_term in args.positional_args:
|
|
81
|
+
temp_highlighter = build_highlighter(search_term)
|
|
82
|
+
search_results = epstein_files.docs_matching(search_term, specified_names)
|
|
83
|
+
console.line(2)
|
|
84
|
+
print_panel(f"Found {len(search_results)} documents matching '{search_term}'", padding=(0, 0, 0, 3))
|
|
147
85
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
already_printed_emails.extend(newly_printed_emails)
|
|
151
|
-
num_emails_printed_since_last_color_key += len(newly_printed_emails)
|
|
86
|
+
for search_result in search_results:
|
|
87
|
+
console.line()
|
|
152
88
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
num_emails_printed_since_last_color_key = 0
|
|
89
|
+
if args.whole_file:
|
|
90
|
+
if isinstance(search_result.document, Email):
|
|
91
|
+
search_result.document.truncation_allowed = False
|
|
157
92
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
93
|
+
console.print(search_result.document)
|
|
94
|
+
else:
|
|
95
|
+
console.print(search_result.document.description_panel())
|
|
161
96
|
|
|
162
|
-
|
|
163
|
-
|
|
97
|
+
for matching_line in search_result.lines:
|
|
98
|
+
line_txt = matching_line.__rich__()
|
|
99
|
+
console.print(Padding(temp_highlighter(line_txt), INFO_PADDING), style='gray37')
|
|
164
100
|
|
|
165
|
-
epstein_files.print_email_device_info()
|
|
166
101
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
102
|
+
def epstein_show():
|
|
103
|
+
"""Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
|
|
104
|
+
_assert_positional_args()
|
|
105
|
+
ids = [extract_file_id(arg) for arg in args.positional_args]
|
|
106
|
+
console.line()
|
|
171
107
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
108
|
+
if args.pickled:
|
|
109
|
+
epstein_files = EpsteinFiles.get_files(use_pickled=True)
|
|
110
|
+
docs = epstein_files.get_documents_by_id(ids)
|
|
111
|
+
else:
|
|
112
|
+
raw_docs = [Document(coerce_file_path(id)) for id in ids]
|
|
113
|
+
docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
|
|
175
114
|
|
|
176
|
-
|
|
177
|
-
|
|
115
|
+
for doc in docs:
|
|
116
|
+
console.line()
|
|
117
|
+
console.print(doc)
|
|
178
118
|
|
|
119
|
+
if args.raw:
|
|
120
|
+
console.line()
|
|
121
|
+
console.print(Panel(f"*** {doc.url_slug} RAW ***", expand=False, style=doc._border_style()))
|
|
122
|
+
console.print(escape(doc.raw_text()))
|
|
179
123
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
log_files = epstein_files.imessage_logs_for(authors)
|
|
124
|
+
if isinstance(doc, Email):
|
|
125
|
+
console.line()
|
|
126
|
+
console.print(Panel(f"*** {doc.url_slug} actual_text ***", expand=False, style=doc._border_style()))
|
|
127
|
+
console.print(escape(doc._actual_text()))
|
|
185
128
|
|
|
186
|
-
for log_file in log_files:
|
|
187
|
-
console.print(Padding(log_file))
|
|
188
|
-
console.line(2)
|
|
189
129
|
|
|
190
|
-
|
|
130
|
+
def epstein_dump_urls() -> None:
|
|
131
|
+
write_urls()
|
|
191
132
|
|
|
192
133
|
|
|
193
|
-
def
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
print_json(f"{EMAIL_CLASS} Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
|
|
198
|
-
print_json("Email signature_substitution_countss", epstein_files.email_signature_substitution_counts(), skip_falsey=True)
|
|
199
|
-
print_json("email_author_device_signatures", dict_sets_to_lists(epstein_files.email_authors_to_device_signatures))
|
|
200
|
-
print_json("email_sent_from_devices", dict_sets_to_lists(epstein_files.email_device_signatures_to_authors))
|
|
201
|
-
print_json("email_unknown_recipient_file_ids", epstein_files.email_unknown_recipient_file_ids())
|
|
202
|
-
print_json("count_by_month", count_by_month(epstein_files.all_documents()))
|
|
134
|
+
def _assert_positional_args():
|
|
135
|
+
if not args.positional_args:
|
|
136
|
+
console.print(f"\n ERROR: No positional args!\n", style='red1')
|
|
137
|
+
exit(1)
|
|
@@ -15,7 +15,7 @@ from epstein_files.util.constant.names import *
|
|
|
15
15
|
from epstein_files.util.constant.strings import *
|
|
16
16
|
from epstein_files.util.constant.urls import *
|
|
17
17
|
from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
|
|
18
|
-
from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize,
|
|
18
|
+
from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_falsey
|
|
19
19
|
from epstein_files.util.doc_cfg import EmailCfg, DocCfg, Metadata, TextCfg
|
|
20
20
|
from epstein_files.util.env import args
|
|
21
21
|
from epstein_files.util.file_helper import (DOCS_DIR, file_stem_for_id, extract_file_id, file_size,
|
|
@@ -159,7 +159,7 @@ class Document:
|
|
|
159
159
|
if hint_msg:
|
|
160
160
|
hints.append(highlighter(Text(hint_msg, style='white dim italic')))
|
|
161
161
|
|
|
162
|
-
return
|
|
162
|
+
return without_falsey(hints)
|
|
163
163
|
|
|
164
164
|
def info_txt(self) -> Text | None:
|
|
165
165
|
"""Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
|
|
@@ -255,7 +255,11 @@ class Document:
|
|
|
255
255
|
txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
|
|
256
256
|
|
|
257
257
|
txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
|
|
258
|
-
txt.append(", ").append(key_value_txt('lines',
|
|
258
|
+
txt.append(", ").append(key_value_txt('lines', self.num_lines))
|
|
259
|
+
|
|
260
|
+
if self.config and self.config.dupe_of_id:
|
|
261
|
+
txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.dupe_of_id, style='magenta')))
|
|
262
|
+
|
|
259
263
|
return txt
|
|
260
264
|
|
|
261
265
|
def top_lines(self, n: int = 10) -> str:
|
|
@@ -352,6 +356,11 @@ class Document:
|
|
|
352
356
|
for f in tmpfiles:
|
|
353
357
|
f.unlink()
|
|
354
358
|
|
|
359
|
+
@staticmethod
|
|
360
|
+
def known_author_count(docs: Sequence['Document']) -> int:
|
|
361
|
+
"""Count of how many Document objects have an author attribution."""
|
|
362
|
+
return len([doc for doc in docs if doc.author])
|
|
363
|
+
|
|
355
364
|
@staticmethod
|
|
356
365
|
def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
|
|
357
366
|
return sorted(docs, key=lambda doc: doc.sort_key())
|
epstein_files/documents/email.py
CHANGED
|
@@ -30,7 +30,6 @@ BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communicati
|
|
|
30
30
|
DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
|
|
31
31
|
LINK_LINE_REGEX = re.compile(f"^(> )?htt")
|
|
32
32
|
QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
|
|
33
|
-
REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + ['********************************']
|
|
34
33
|
REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
|
|
35
34
|
|
|
36
35
|
BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
|
|
@@ -39,10 +38,16 @@ TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")
|
|
|
39
38
|
|
|
40
39
|
SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
|
|
41
40
|
REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
|
|
41
|
+
IS_JUNK_MAIL = 'is_junk_mail'
|
|
42
42
|
MAX_CHARS_TO_PRINT = 4000
|
|
43
43
|
MAX_NUM_HEADER_LINES = 14
|
|
44
44
|
MAX_QUOTED_REPLIES = 2
|
|
45
45
|
|
|
46
|
+
REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
|
|
47
|
+
'********************************',
|
|
48
|
+
'Begin forwarded message',
|
|
49
|
+
]
|
|
50
|
+
|
|
46
51
|
OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
47
52
|
re.compile(r'grnail\.com'): 'gmail.com',
|
|
48
53
|
re.compile(r"^(From|To)(: )?[_1.]{5,}", re.MULTILINE): rf"\1: {REDACTED}", # Redacted email addresses
|
|
@@ -119,6 +124,7 @@ EMAIL_SIGNATURE_REGEXES = {
|
|
|
119
124
|
# Invalid for links to EpsteinWeb
|
|
120
125
|
JUNK_EMAILERS = [
|
|
121
126
|
'asmallworld@travel.asmallworld.net',
|
|
127
|
+
"digest-noreply@quora.com",
|
|
122
128
|
'editorialstaff@flipboard.com',
|
|
123
129
|
'How To Academy',
|
|
124
130
|
'Jokeland',
|
|
@@ -126,9 +132,13 @@ JUNK_EMAILERS = [
|
|
|
126
132
|
'Saved by Internet Explorer 11',
|
|
127
133
|
]
|
|
128
134
|
|
|
129
|
-
|
|
130
|
-
|
|
135
|
+
MAILING_LISTS = [
|
|
136
|
+
INTELLIGENCE_SQUARED,
|
|
131
137
|
'middle.east.update@hotmail.com',
|
|
138
|
+
]
|
|
139
|
+
|
|
140
|
+
TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
|
|
141
|
+
'Alan S Halperin',
|
|
132
142
|
'Mitchell Bard',
|
|
133
143
|
'Skip Rimer',
|
|
134
144
|
]
|
|
@@ -281,7 +291,7 @@ SELF_EMAILS_FILE_IDS = [
|
|
|
281
291
|
]
|
|
282
292
|
|
|
283
293
|
METADATA_FIELDS = [
|
|
284
|
-
|
|
294
|
+
IS_JUNK_MAIL,
|
|
285
295
|
'recipients',
|
|
286
296
|
'sent_from_device',
|
|
287
297
|
]
|
|
@@ -294,7 +304,6 @@ class Email(Communication):
|
|
|
294
304
|
actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
|
|
295
305
|
config (EmailCfg | None) - manual config for this email (if it exists)
|
|
296
306
|
header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
|
|
297
|
-
is_junk_mail (bool) - True if this is junk mail
|
|
298
307
|
recipients (list[str | None]) - who this email was sent to
|
|
299
308
|
sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
|
|
300
309
|
signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
|
|
@@ -302,17 +311,16 @@ class Email(Communication):
|
|
|
302
311
|
actual_text: str = field(init=False)
|
|
303
312
|
config: EmailCfg | None = None
|
|
304
313
|
header: EmailHeader = field(init=False)
|
|
305
|
-
is_junk_mail: bool = False
|
|
306
314
|
recipients: list[str | None] = field(default_factory=list)
|
|
307
315
|
sent_from_device: str | None = None
|
|
308
316
|
signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
|
|
317
|
+
truncation_allowed: bool = True
|
|
309
318
|
|
|
310
319
|
# For logging how many headers we prettified while printing, kind of janky
|
|
311
320
|
rewritten_header_ids: ClassVar[set[str]] = set([])
|
|
312
321
|
|
|
313
322
|
def __post_init__(self):
|
|
314
323
|
super().__post_init__()
|
|
315
|
-
self.is_junk_mail = self.author in JUNK_EMAILERS
|
|
316
324
|
|
|
317
325
|
if self.config and self.config.recipients:
|
|
318
326
|
self.recipients = cast(list[str | None], self.config.recipients)
|
|
@@ -331,9 +339,17 @@ class Email(Communication):
|
|
|
331
339
|
txt = Text("OCR text of email from ", style='grey46').append(self.author_txt).append(' to ')
|
|
332
340
|
return txt.append(self._recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))
|
|
333
341
|
|
|
342
|
+
def is_fwded_article(self) -> bool:
|
|
343
|
+
return bool(self.config and self.config.is_fwded_article)
|
|
344
|
+
|
|
345
|
+
def is_junk_mail(self) -> bool:
|
|
346
|
+
return self.author in JUNK_EMAILERS or self.author in MAILING_LISTS
|
|
347
|
+
|
|
334
348
|
def metadata(self) -> Metadata:
|
|
349
|
+
local_metadata = asdict(self)
|
|
350
|
+
local_metadata[IS_JUNK_MAIL] = self.is_junk_mail()
|
|
335
351
|
metadata = super().metadata()
|
|
336
|
-
metadata.update({k: v for k, v in
|
|
352
|
+
metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
|
|
337
353
|
return metadata
|
|
338
354
|
|
|
339
355
|
def subject(self) -> str:
|
|
@@ -352,17 +368,18 @@ class Email(Communication):
|
|
|
352
368
|
"""The text that comes before likely quoted replies and forwards etc."""
|
|
353
369
|
if self.config and self.config.actual_text is not None:
|
|
354
370
|
return self.config.actual_text
|
|
371
|
+
|
|
372
|
+
text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
|
|
373
|
+
|
|
374
|
+
if self.config and self.config.fwded_text_after:
|
|
375
|
+
return text.split(self.config.fwded_text_after)[0].strip()
|
|
355
376
|
elif self.header.num_header_rows == 0:
|
|
356
377
|
return self.text
|
|
357
378
|
|
|
358
|
-
text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
|
|
359
379
|
reply_text_match = REPLY_TEXT_REGEX.search(text)
|
|
360
380
|
# logger.info(f"Raw text:\n" + self.top_lines(20) + '\n\n')
|
|
361
381
|
# logger.info(f"With header removed:\n" + text[0:500] + '\n\n')
|
|
362
382
|
|
|
363
|
-
if self.file_id in ['024624']: # This email starts with "On September 14th"
|
|
364
|
-
return text.split('On Tue, May 14')[0].strip()
|
|
365
|
-
|
|
366
383
|
if reply_text_match:
|
|
367
384
|
actual_num_chars = len(reply_text_match.group(1))
|
|
368
385
|
actual_text_pct = f"{(100 * float(actual_num_chars) / len(text)):.1f}%"
|
|
@@ -555,6 +572,9 @@ class Email(Communication):
|
|
|
555
572
|
self._merge_lines(3, 5)
|
|
556
573
|
elif self.file_id == '028931':
|
|
557
574
|
self._merge_lines(3, 6)
|
|
575
|
+
elif self.file_id == '013415':
|
|
576
|
+
for _i in range(2):
|
|
577
|
+
self._merge_lines(4)
|
|
558
578
|
elif self.file_id in ['033568']:
|
|
559
579
|
for _i in range(5):
|
|
560
580
|
self._merge_lines(5)
|
|
@@ -637,7 +657,7 @@ class Email(Communication):
|
|
|
637
657
|
num_chars = quote_cutoff
|
|
638
658
|
|
|
639
659
|
# Truncate long emails but leave a note explaining what happened w/link to source document
|
|
640
|
-
if len(text) > num_chars:
|
|
660
|
+
if len(text) > num_chars and self.truncation_allowed:
|
|
641
661
|
text = text[0:num_chars]
|
|
642
662
|
doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
|
|
643
663
|
trim_note = f"<...trimmed to {num_chars} characters of {self.length}, read the rest at {doc_link_markup}...>"
|
|
@@ -4,7 +4,7 @@ from datetime import datetime
|
|
|
4
4
|
|
|
5
5
|
from rich.text import Text
|
|
6
6
|
|
|
7
|
-
from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, STEVE_BANNON, UNKNOWN
|
|
7
|
+
from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, CELINA_DUBIN, EVA, STEVE_BANNON, UNKNOWN
|
|
8
8
|
from epstein_files.util.data import extract_last_name
|
|
9
9
|
from epstein_files.util.highlighted_group import get_style_for_name
|
|
10
10
|
from epstein_files.util.logging import logger
|
|
@@ -19,17 +19,18 @@ DISPLAY_LAST_NAME_ONLY = [
|
|
|
19
19
|
STEVE_BANNON,
|
|
20
20
|
]
|
|
21
21
|
|
|
22
|
-
|
|
23
|
-
'+
|
|
24
|
-
'+
|
|
25
|
-
'+
|
|
26
|
-
|
|
22
|
+
PHONE_NUMBER_MAPPING = {
|
|
23
|
+
'+19174393646': ANTHONY_SCARAMUCCI,
|
|
24
|
+
'+13109906526': STEVE_BANNON,
|
|
25
|
+
'+16463880059': EVA,
|
|
26
|
+
'+13108737937': CELINA_DUBIN,
|
|
27
|
+
'+13108802851': STEVE_BANNON,
|
|
28
|
+
|
|
29
|
+
}
|
|
27
30
|
|
|
28
31
|
TEXTER_MAPPING = {
|
|
29
32
|
'e:': JEFFREY_EPSTEIN,
|
|
30
33
|
'e:jeeitunes@gmail.com': JEFFREY_EPSTEIN,
|
|
31
|
-
'+19174393646': ANTHONY_SCARAMUCCI,
|
|
32
|
-
'+13109906526': STEVE_BANNON,
|
|
33
34
|
}
|
|
34
35
|
|
|
35
36
|
|
|
@@ -37,7 +38,7 @@ TEXTER_MAPPING = {
|
|
|
37
38
|
class TextMessage:
|
|
38
39
|
"""Class representing a single iMessage text message."""
|
|
39
40
|
author: str | None
|
|
40
|
-
author_str: str =
|
|
41
|
+
author_str: str | None = None
|
|
41
42
|
id_confirmed: bool = False
|
|
42
43
|
text: str
|
|
43
44
|
timestamp_str: str
|
|
@@ -47,14 +48,10 @@ class TextMessage:
|
|
|
47
48
|
|
|
48
49
|
if self.author is None:
|
|
49
50
|
self.author_str = UNKNOWN
|
|
50
|
-
elif self.author in UNKNOWN_TEXTERS:
|
|
51
|
-
logger.warning(f"Bad text from '{self.author}': \"{self.text}\"")
|
|
52
|
-
self.author_str = self.author
|
|
53
|
-
self.author = None # TODO: this shouldn't be happening; we still know the author...
|
|
54
51
|
elif self.author in DISPLAY_LAST_NAME_ONLY:
|
|
55
52
|
self.author_str = extract_last_name(self.author)
|
|
56
53
|
else:
|
|
57
|
-
self.author_str = self.author
|
|
54
|
+
self.author_str = self.author_str or self.author
|
|
58
55
|
|
|
59
56
|
if not self.id_confirmed and self.author is not None and self.author != JEFFREY_EPSTEIN:
|
|
60
57
|
self.author_str = self.author + ' (?)'
|
|
@@ -87,7 +84,6 @@ class TextMessage:
|
|
|
87
84
|
return msg_txt
|
|
88
85
|
|
|
89
86
|
def __rich__(self) -> Text:
|
|
90
|
-
# TODO: Workaround for phone numbers that sucks
|
|
91
87
|
author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
|
|
92
88
|
author_txt = Text(self.author_str, style=author_style)
|
|
93
89
|
timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_STYLE).append(' ')
|
|
@@ -44,17 +44,8 @@ class MessengerLog(Communication):
|
|
|
44
44
|
|
|
45
45
|
def messages(self) -> list[TextMessage]:
|
|
46
46
|
"""Lazily evaluated accessor for self._messages."""
|
|
47
|
-
if
|
|
48
|
-
self._messages = [
|
|
49
|
-
TextMessage(
|
|
50
|
-
# If the Sender: is redacted that means it's from self.author
|
|
51
|
-
author=REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip()) or self.author,
|
|
52
|
-
id_confirmed=not self.is_attribution_uncertain(),
|
|
53
|
-
text=match.group(4).strip(),
|
|
54
|
-
timestamp_str=match.group(2).strip(),
|
|
55
|
-
)
|
|
56
|
-
for match in MSG_REGEX.finditer(self.text)
|
|
57
|
-
]
|
|
47
|
+
if not self._messages:
|
|
48
|
+
self._messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
|
|
58
49
|
|
|
59
50
|
return self._messages
|
|
60
51
|
|
|
@@ -70,6 +61,19 @@ class MessengerLog(Communication):
|
|
|
70
61
|
def _border_style(self) -> str:
|
|
71
62
|
return self.author_style
|
|
72
63
|
|
|
64
|
+
def _build_message(self, match: re.Match) -> TextMessage:
|
|
65
|
+
"""Turn a regex match into a TextMessage."""
|
|
66
|
+
author_str = REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip())
|
|
67
|
+
|
|
68
|
+
# If the Sender: is redacted that means it's from self.author
|
|
69
|
+
return TextMessage(
|
|
70
|
+
author=self.author if (author_str.startswith('+') or not author_str) else author_str,
|
|
71
|
+
author_str=author_str if author_str.startswith('+') else None, # Preserve phone numbers
|
|
72
|
+
id_confirmed=not self.is_attribution_uncertain(),
|
|
73
|
+
text=match.group(4).strip(),
|
|
74
|
+
timestamp_str=match.group(2).strip(),
|
|
75
|
+
)
|
|
76
|
+
|
|
73
77
|
def _extract_timestamp(self) -> datetime:
|
|
74
78
|
for match in MSG_REGEX.finditer(self.text):
|
|
75
79
|
timestamp_str = match.group(2).strip()
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import logging
|
|
3
3
|
import warnings
|
|
4
|
-
from dataclasses import dataclass
|
|
4
|
+
from dataclasses import asdict, dataclass
|
|
5
5
|
from datetime import datetime
|
|
6
6
|
|
|
7
7
|
import datefinder
|
|
@@ -15,7 +15,7 @@ from rich.text import Text
|
|
|
15
15
|
from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, WHITESPACE_REGEX, Document
|
|
16
16
|
from epstein_files.util.constant.strings import *
|
|
17
17
|
from epstein_files.util.constants import *
|
|
18
|
-
from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg
|
|
18
|
+
from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg, Metadata
|
|
19
19
|
from epstein_files.util.data import escape_single_quotes, remove_timezone, uniquify
|
|
20
20
|
from epstein_files.util.file_helper import FILENAME_LENGTH
|
|
21
21
|
from epstein_files.util.env import args
|
|
@@ -83,11 +83,10 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
|
|
|
83
83
|
NOBEL_CHARITABLE_TRUST,
|
|
84
84
|
'Nautilus',
|
|
85
85
|
'New Yorker',
|
|
86
|
-
|
|
87
|
-
NYT_COLUMN,
|
|
86
|
+
NYT,
|
|
88
87
|
PALM_BEACH_CODE_ENFORCEMENT,
|
|
89
|
-
|
|
90
|
-
|
|
88
|
+
PALM_BEACH_DAILY_NEWS,
|
|
89
|
+
PALM_BEACH_POST,
|
|
91
90
|
PALM_BEACH_TSV,
|
|
92
91
|
PALM_BEACH_WATER_COMMITTEE,
|
|
93
92
|
PAUL_KRASSNER,
|
|
@@ -102,6 +101,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
|
|
|
102
101
|
SHIMON_POST_ARTICLE,
|
|
103
102
|
SINGLE_PAGE,
|
|
104
103
|
STACEY_PLASKETT,
|
|
104
|
+
'Tatler',
|
|
105
105
|
TERJE_ROD_LARSEN,
|
|
106
106
|
TEXT_OF_US_LAW,
|
|
107
107
|
TRANSLATION,
|
|
@@ -113,7 +113,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
|
|
|
113
113
|
'U.S. News',
|
|
114
114
|
'US Office',
|
|
115
115
|
'Vanity Fair',
|
|
116
|
-
|
|
116
|
+
VI_DAILY_NEWS,
|
|
117
117
|
WAPO,
|
|
118
118
|
]
|
|
119
119
|
|
|
@@ -127,7 +127,7 @@ class OtherFile(Document):
|
|
|
127
127
|
|
|
128
128
|
if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
|
|
129
129
|
self.log(f"Creating synthetic config for VI Daily News article...", logging.INFO)
|
|
130
|
-
self.config = DocCfg(id=self.file_id,
|
|
130
|
+
self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
|
|
131
131
|
|
|
132
132
|
def category(self) -> str | None:
|
|
133
133
|
return self.config and self.config.category
|
|
@@ -175,6 +175,11 @@ class OtherFile(Document):
|
|
|
175
175
|
|
|
176
176
|
return True
|
|
177
177
|
|
|
178
|
+
def metadata(self) -> Metadata:
|
|
179
|
+
metadata = super().metadata()
|
|
180
|
+
metadata['is_interesting'] = self.is_interesting()
|
|
181
|
+
return metadata
|
|
182
|
+
|
|
178
183
|
def preview_text(self) -> str:
|
|
179
184
|
return WHITESPACE_REGEX.sub(' ', self.text)[0:PREVIEW_CHARS]
|
|
180
185
|
|