epstein-files 1.1.5__tar.gz → 1.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {epstein_files-1.1.5 → epstein_files-1.2.1}/PKG-INFO +6 -2
- {epstein_files-1.1.5 → epstein_files-1.2.1}/README.md +4 -1
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/__init__.py +12 -21
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/communication.py +0 -3
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/document.py +68 -21
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/email.py +54 -70
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/emails/email_header.py +14 -4
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/imessage/text_message.py +5 -4
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/messenger_log.py +7 -7
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/other_file.py +16 -34
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/epstein_files.py +133 -141
- epstein_files-1.2.1/epstein_files/person.py +324 -0
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/constant/names.py +46 -15
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/constant/output_files.py +1 -0
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/constant/strings.py +3 -3
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/constant/urls.py +15 -2
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/constants.py +75 -21
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/data.py +1 -20
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/doc_cfg.py +27 -17
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/env.py +5 -3
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/highlighted_group.py +248 -203
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/logging.py +1 -1
- epstein_files-1.2.1/epstein_files/util/output.py +306 -0
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/rich.py +20 -35
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/timer.py +14 -0
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/word_count.py +1 -1
- {epstein_files-1.1.5 → epstein_files-1.2.1}/pyproject.toml +2 -1
- epstein_files-1.1.5/epstein_files/util/output.py +0 -350
- {epstein_files-1.1.5 → epstein_files-1.2.1}/LICENSE +0 -0
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/json_file.py +0 -0
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/constant/common_words.py +0 -0
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/constant/html.py +0 -0
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/file_helper.py +0 -0
- {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/search_result.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: epstein-files
|
|
3
|
-
Version: 1.1
|
|
3
|
+
Version: 1.2.1
|
|
4
4
|
Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
|
|
5
5
|
Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
|
|
6
6
|
License: GPL-3.0-or-later
|
|
@@ -17,6 +17,7 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.10
|
|
18
18
|
Classifier: Programming Language :: Python :: 3.12
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Requires-Dist: cairosvg (>=2.8.2,<3.0.0)
|
|
20
21
|
Requires-Dist: datefinder (>=0.7.3,<0.8.0)
|
|
21
22
|
Requires-Dist: inflection (>=0.5.1,<0.6.0)
|
|
22
23
|
Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
|
|
@@ -31,7 +32,7 @@ Project-URL: TextMessages, https://michelcrypt4d4mus.github.io/epstein_text_mess
|
|
|
31
32
|
Project-URL: WordCounts, https://michelcrypt4d4mus.github.io/epstein_text_messages/communication_word_count_epstein_files_nov_2025.html
|
|
32
33
|
Description-Content-Type: text/markdown
|
|
33
34
|
|
|
34
|
-
#
|
|
35
|
+
# Color Highlighted Epstein Emails and Text Messages
|
|
35
36
|
|
|
36
37
|

|
|
37
38
|
|
|
@@ -119,3 +120,6 @@ for file in epstein_files.other_files:
|
|
|
119
120
|
do_stuff(file)
|
|
120
121
|
```
|
|
121
122
|
|
|
123
|
+
# Everyone Who Sent or Received an Email in the November Document Dump
|
|
124
|
+

|
|
125
|
+
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
#
|
|
1
|
+
# Color Highlighted Epstein Emails and Text Messages
|
|
2
2
|
|
|
3
3
|

|
|
4
4
|
|
|
@@ -85,3 +85,6 @@ for json_file in epstein_files.json_files:
|
|
|
85
85
|
for file in epstein_files.other_files:
|
|
86
86
|
do_stuff(file)
|
|
87
87
|
```
|
|
88
|
+
|
|
89
|
+
# Everyone Who Sent or Received an Email in the November Document Dump
|
|
90
|
+

|
|
@@ -21,7 +21,8 @@ from epstein_files.util.env import args
|
|
|
21
21
|
from epstein_files.util.file_helper import coerce_file_path, extract_file_id
|
|
22
22
|
from epstein_files.util.logging import exit_with_error, logger
|
|
23
23
|
from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
|
|
24
|
-
print_other_files_section, print_text_messages_section, print_email_timeline,
|
|
24
|
+
print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
|
|
25
|
+
print_json_metadata, write_urls)
|
|
25
26
|
from epstein_files.util.rich import (build_highlighter, console, print_color_key, print_title_page_header,
|
|
26
27
|
print_title_page_tables, print_subtitle_panel, write_html)
|
|
27
28
|
from epstein_files.util.timer import Timer
|
|
@@ -37,7 +38,10 @@ def generate_html() -> None:
|
|
|
37
38
|
timer = Timer()
|
|
38
39
|
epstein_files = EpsteinFiles.get_files(timer)
|
|
39
40
|
|
|
40
|
-
if args.
|
|
41
|
+
if args.emailers_info:
|
|
42
|
+
print_emailers_info(epstein_files)
|
|
43
|
+
exit()
|
|
44
|
+
elif args.json_metadata:
|
|
41
45
|
print_json_metadata(epstein_files)
|
|
42
46
|
exit()
|
|
43
47
|
elif args.json_files:
|
|
@@ -55,25 +59,19 @@ def generate_html() -> None:
|
|
|
55
59
|
exit()
|
|
56
60
|
|
|
57
61
|
if args.output_texts:
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
timer.print_at_checkpoint(f'Printed {len(imessage_logs)} text message log files')
|
|
62
|
+
printed_logs = print_text_messages_section(epstein_files)
|
|
63
|
+
timer.log_section_complete('MessengerLog', epstein_files.imessage_logs, printed_logs)
|
|
61
64
|
|
|
62
65
|
if args.output_emails:
|
|
63
|
-
|
|
64
|
-
timer.
|
|
66
|
+
printed_emails = print_emails_section(epstein_files)
|
|
67
|
+
timer.log_section_complete('Email', epstein_files.emails, printed_emails)
|
|
65
68
|
elif args.email_timeline:
|
|
66
69
|
print_email_timeline(epstein_files)
|
|
67
70
|
timer.print_at_checkpoint(f"Printed chronological emails table")
|
|
68
71
|
|
|
69
72
|
if args.output_other:
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
else:
|
|
73
|
-
files = [f for f in epstein_files.other_files if args.all_other_files or f.is_interesting()]
|
|
74
|
-
|
|
75
|
-
print_other_files_section(files, epstein_files)
|
|
76
|
-
timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")
|
|
73
|
+
printed_files = print_other_files_section(epstein_files)
|
|
74
|
+
timer.log_section_complete('OtherFile', epstein_files.other_files, printed_files)
|
|
77
75
|
|
|
78
76
|
write_html(args.build)
|
|
79
77
|
logger.warning(f"Total time: {timer.seconds_since_start_str()}")
|
|
@@ -90,7 +88,6 @@ def epstein_diff():
|
|
|
90
88
|
|
|
91
89
|
def epstein_search():
|
|
92
90
|
"""Search the cleaned up text of the files."""
|
|
93
|
-
_assert_positional_args()
|
|
94
91
|
epstein_files = EpsteinFiles.get_files()
|
|
95
92
|
|
|
96
93
|
for search_term in args.positional_args:
|
|
@@ -113,7 +110,6 @@ def epstein_search():
|
|
|
113
110
|
|
|
114
111
|
def epstein_show():
|
|
115
112
|
"""Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
|
|
116
|
-
_assert_positional_args()
|
|
117
113
|
raw_docs: list[Document] = []
|
|
118
114
|
console.line()
|
|
119
115
|
|
|
@@ -138,8 +134,3 @@ def epstein_show():
|
|
|
138
134
|
|
|
139
135
|
def epstein_word_count() -> None:
|
|
140
136
|
write_word_counts_html()
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
def _assert_positional_args():
|
|
144
|
-
if not args.positional_args:
|
|
145
|
-
exit_with_error(f"No positional args provided!\n")
|
|
@@ -34,9 +34,6 @@ class Communication(Document):
|
|
|
34
34
|
"""Overrides super() method to apply self.author_style."""
|
|
35
35
|
return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
|
|
36
36
|
|
|
37
|
-
def is_attribution_uncertain(self) -> bool:
|
|
38
|
-
return bool(self.config and self.config.is_attribution_uncertain)
|
|
39
|
-
|
|
40
37
|
def summary(self) -> Text:
|
|
41
38
|
return self._summary().append(CLOSE_PROPERTIES_CHAR)
|
|
42
39
|
|
|
@@ -11,17 +11,19 @@ from rich.console import Console, ConsoleOptions, Group, RenderResult
|
|
|
11
11
|
from rich.padding import Padding
|
|
12
12
|
from rich.panel import Panel
|
|
13
13
|
from rich.text import Text
|
|
14
|
+
from rich.table import Table
|
|
14
15
|
|
|
15
16
|
from epstein_files.util.constant.names import *
|
|
16
17
|
from epstein_files.util.constant.strings import *
|
|
17
18
|
from epstein_files.util.constant.urls import *
|
|
18
19
|
from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
|
|
19
|
-
from epstein_files.util.data import collapse_newlines, date_str, patternize,
|
|
20
|
+
from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time, without_falsey
|
|
20
21
|
from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
|
|
21
22
|
from epstein_files.util.env import DOCS_DIR, args
|
|
22
|
-
from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, is_local_extract_file
|
|
23
|
+
from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, file_size_to_str, is_local_extract_file
|
|
23
24
|
from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
|
|
24
|
-
from epstein_files.util.rich import INFO_STYLE,
|
|
25
|
+
from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table, console,
|
|
26
|
+
highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
|
|
25
27
|
from epstein_files.util.search_result import MatchedLine
|
|
26
28
|
|
|
27
29
|
ALT_LINK_STYLE = 'white dim'
|
|
@@ -55,6 +57,14 @@ OCR_REPAIRS = {
|
|
|
55
57
|
'Nil Priell': 'Nili Priell',
|
|
56
58
|
}
|
|
57
59
|
|
|
60
|
+
SUMMARY_TABLE_COLS: list[str | dict] = [
|
|
61
|
+
'Count',
|
|
62
|
+
{'name': 'Has Author', 'style': 'honeydew2'},
|
|
63
|
+
{'name': 'No Author', 'style': 'wheat4'},
|
|
64
|
+
{'name': 'Uncertain Author', 'style': 'royal_blue1 dim'},
|
|
65
|
+
{'name': 'Size', 'justify': 'right', 'style': 'dim'},
|
|
66
|
+
]
|
|
67
|
+
|
|
58
68
|
|
|
59
69
|
@dataclass
|
|
60
70
|
class Document:
|
|
@@ -63,7 +73,7 @@ class Document:
|
|
|
63
73
|
|
|
64
74
|
Attributes:
|
|
65
75
|
file_path (Path): Local path to file
|
|
66
|
-
author (
|
|
76
|
+
author (Name): Who is responsible for the text in the file
|
|
67
77
|
config (DocCfg): Information about this fil
|
|
68
78
|
file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
|
|
69
79
|
filename (str): File's basename
|
|
@@ -74,7 +84,7 @@ class Document:
|
|
|
74
84
|
"""
|
|
75
85
|
file_path: Path
|
|
76
86
|
# Optional fields
|
|
77
|
-
author:
|
|
87
|
+
author: Name = None
|
|
78
88
|
config: EmailCfg | DocCfg | TextCfg | None = None
|
|
79
89
|
file_id: str = field(init=False)
|
|
80
90
|
filename: str = field(init=False)
|
|
@@ -121,6 +131,10 @@ class Document:
|
|
|
121
131
|
txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
|
|
122
132
|
return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))
|
|
123
133
|
|
|
134
|
+
def duplicate_of_id(self) -> str | None:
|
|
135
|
+
if self.config and self.config.duplicate_of_id:
|
|
136
|
+
return self.config.duplicate_of_id
|
|
137
|
+
|
|
124
138
|
def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
125
139
|
return self.external_link(epsteinify_doc_url, style, link_txt)
|
|
126
140
|
|
|
@@ -177,8 +191,11 @@ class Document:
|
|
|
177
191
|
"""Secondary info about this file (description recipients, etc). Overload in subclasses."""
|
|
178
192
|
return None
|
|
179
193
|
|
|
194
|
+
def is_attribution_uncertain(self) -> bool:
|
|
195
|
+
return bool(self.config and self.config.is_attribution_uncertain)
|
|
196
|
+
|
|
180
197
|
def is_duplicate(self) -> bool:
|
|
181
|
-
return bool(self.
|
|
198
|
+
return bool(self.duplicate_of_id())
|
|
182
199
|
|
|
183
200
|
def is_local_extract_file(self) -> bool:
|
|
184
201
|
"""True if extracted from other file (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
|
|
@@ -236,17 +253,6 @@ class Document:
|
|
|
236
253
|
|
|
237
254
|
return text
|
|
238
255
|
|
|
239
|
-
def sort_key(self) -> tuple[datetime, str, int]:
|
|
240
|
-
"""Sort by timestamp, file_id, then whether or not it's a duplicate file."""
|
|
241
|
-
if self.is_duplicate():
|
|
242
|
-
sort_id = self.config.duplicate_of_id
|
|
243
|
-
dupe_idx = 1
|
|
244
|
-
else:
|
|
245
|
-
sort_id = self.file_id
|
|
246
|
-
dupe_idx = 0
|
|
247
|
-
|
|
248
|
-
return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
|
|
249
|
-
|
|
250
256
|
def source_file_id(self) -> str:
|
|
251
257
|
"""Strip off the _1, _2, etc. suffixes for extracted documents."""
|
|
252
258
|
return self.file_id[0:6]
|
|
@@ -257,7 +263,7 @@ class Document:
|
|
|
257
263
|
txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)
|
|
258
264
|
|
|
259
265
|
if self.timestamp:
|
|
260
|
-
timestamp_str =
|
|
266
|
+
timestamp_str = remove_zero_time(self.timestamp).replace('T', ' ')
|
|
261
267
|
txt.append(' (', style=SYMBOL_STYLE)
|
|
262
268
|
txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
|
|
263
269
|
|
|
@@ -278,6 +284,17 @@ class Document:
|
|
|
278
284
|
|
|
279
285
|
return Panel(Group(*sentences), border_style=self._class_style(), expand=False)
|
|
280
286
|
|
|
287
|
+
def timestamp_sort_key(self) -> tuple[datetime, str, int]:
|
|
288
|
+
"""Sort by timestamp, file_id, then whether or not it's a duplicate file."""
|
|
289
|
+
if self.is_duplicate():
|
|
290
|
+
sort_id = self.config.duplicate_of_id
|
|
291
|
+
dupe_idx = 1
|
|
292
|
+
else:
|
|
293
|
+
sort_id = self.file_id
|
|
294
|
+
dupe_idx = 0
|
|
295
|
+
|
|
296
|
+
return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
|
|
297
|
+
|
|
281
298
|
def top_lines(self, n: int = 10) -> str:
|
|
282
299
|
"""First n lines."""
|
|
283
300
|
return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
|
|
@@ -357,6 +374,32 @@ class Document:
|
|
|
357
374
|
def __str__(self) -> str:
|
|
358
375
|
return self.summary().plain
|
|
359
376
|
|
|
377
|
+
@classmethod
|
|
378
|
+
def file_info_table(cls, title: str, first_col_name: str) -> Table:
|
|
379
|
+
"""Empty table with appropriate cols for summarizing groups of files."""
|
|
380
|
+
table = build_table(title)
|
|
381
|
+
cols = [{'name': first_col_name, 'min_width': 14}] + SUMMARY_TABLE_COLS
|
|
382
|
+
add_cols_to_table(table, cols, 'right')
|
|
383
|
+
return table
|
|
384
|
+
|
|
385
|
+
@classmethod
|
|
386
|
+
def files_info(cls, files: Sequence['Document'], is_author_na: bool = False) -> dict[str, str | Text]:
|
|
387
|
+
"""Summary info about a group of files."""
|
|
388
|
+
file_count = len(files)
|
|
389
|
+
author_count = cls.known_author_count(files)
|
|
390
|
+
|
|
391
|
+
return {
|
|
392
|
+
'count': str(file_count),
|
|
393
|
+
'author_count': NA_TXT if is_author_na else str(author_count),
|
|
394
|
+
'no_author_count': NA_TXT if is_author_na else str(file_count - author_count),
|
|
395
|
+
'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain()])),
|
|
396
|
+
'bytes': file_size_to_str(sum([f.file_size() for f in files])),
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
@classmethod
|
|
400
|
+
def files_info_row(cls, files: Sequence['Document'], author_na: bool = False) -> Sequence[str | Text]:
|
|
401
|
+
return [v for v in cls.files_info(files, author_na).values()]
|
|
402
|
+
|
|
360
403
|
@staticmethod
|
|
361
404
|
def diff_files(files: list[str]) -> None:
|
|
362
405
|
"""Diff the contents of two Documents after all cleanup, BOM removal, etc."""
|
|
@@ -394,14 +437,18 @@ class Document:
|
|
|
394
437
|
|
|
395
438
|
@staticmethod
|
|
396
439
|
def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
|
|
397
|
-
return sorted(docs, key=lambda doc: doc.
|
|
440
|
+
return sorted(docs, key=lambda doc: doc.timestamp_sort_key())
|
|
398
441
|
|
|
399
|
-
@
|
|
400
|
-
def uniquify(
|
|
442
|
+
@staticmethod
|
|
443
|
+
def uniquify(documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
|
|
401
444
|
"""Uniquify by file_id."""
|
|
402
445
|
id_map = {doc.file_id: doc for doc in documents}
|
|
403
446
|
return [doc for doc in id_map.values()]
|
|
404
447
|
|
|
448
|
+
@staticmethod
|
|
449
|
+
def without_dupes(docs: Sequence['DocumentType']) -> list['DocumentType']:
|
|
450
|
+
return [doc for doc in docs if not doc.is_duplicate()]
|
|
451
|
+
|
|
405
452
|
|
|
406
453
|
DocumentType = TypeVar('DocumentType', bound=Document)
|
|
407
454
|
|
|
@@ -20,7 +20,7 @@ from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAI
|
|
|
20
20
|
from epstein_files.util.constant.names import *
|
|
21
21
|
from epstein_files.util.constant.strings import REDACTED
|
|
22
22
|
from epstein_files.util.constants import *
|
|
23
|
-
from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes,
|
|
23
|
+
from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes,
|
|
24
24
|
flatten, listify, remove_timezone, uniquify)
|
|
25
25
|
from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
26
26
|
from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
|
|
@@ -32,7 +32,7 @@ BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE
|
|
|
32
32
|
BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
|
|
33
33
|
DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
|
|
34
34
|
LINK_LINE_REGEX = re.compile(f"^(> )?htt")
|
|
35
|
-
QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote
|
|
35
|
+
QUOTED_REPLY_LINE_REGEX = re.compile(r'(\nFrom:(.*)|wrote:)\n', re.IGNORECASE)
|
|
36
36
|
REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
|
|
37
37
|
|
|
38
38
|
BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
|
|
@@ -55,6 +55,7 @@ REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
|
|
|
55
55
|
|
|
56
56
|
OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
57
57
|
re.compile(r'grnail\.com'): 'gmail.com',
|
|
58
|
+
'Newsmax. corn': 'Newsmax.com',
|
|
58
59
|
re.compile(r"^(From|To)(: )?[_1.]{5,}", re.MULTILINE): rf"\1: {REDACTED}", # Redacted email addresses
|
|
59
60
|
# These 3 must come in this order!
|
|
60
61
|
re.compile(r'([/vkT]|Ai|li|(I|7)v)rote:'): 'wrote:',
|
|
@@ -79,6 +80,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
|
79
80
|
'twitter glhsummers': 'twitter @lhsummers',
|
|
80
81
|
re.compile(r"twitter\.com[i/][lI]krauss[1lt]"): "twitter.com/lkrauss1",
|
|
81
82
|
re.compile(r'from my BlackBerry[0°] wireless device'): 'from my BlackBerry® wireless device',
|
|
83
|
+
re.compile(r'^INW$', re.MULTILINE): REDACTED,
|
|
82
84
|
# links
|
|
83
85
|
'Imps ://': 'https://',
|
|
84
86
|
re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
|
|
@@ -112,7 +114,7 @@ EMAIL_SIGNATURE_REGEXES = {
|
|
|
112
114
|
DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
|
|
113
115
|
DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
|
|
114
116
|
JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
|
|
115
|
-
JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*", re.IGNORECASE),
|
|
117
|
+
JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*(\nTel:.*)?(\nEmail:.*)?", re.IGNORECASE),
|
|
116
118
|
KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
|
|
117
119
|
LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
|
|
118
120
|
LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
|
|
@@ -127,14 +129,6 @@ EMAIL_SIGNATURE_REGEXES = {
|
|
|
127
129
|
UNKNOWN: re.compile(r"(This message is directed to and is for the use of the above-noted addressee only.*\nhereon\.)", re.DOTALL),
|
|
128
130
|
}
|
|
129
131
|
|
|
130
|
-
EMAIL_TABLE_COLS = [
|
|
131
|
-
{'name': 'Sent At', 'justify': 'left', 'style': TIMESTAMP_DIM},
|
|
132
|
-
{'name': 'From', 'justify': 'left', 'max_width': 20},
|
|
133
|
-
{'name': 'To', 'justify': 'left', 'max_width': 22},
|
|
134
|
-
{'name': 'Length', 'justify': 'right', 'style': 'wheat4'},
|
|
135
|
-
{'name': 'Subject', 'justify': 'left', 'min_width': 35, 'style': 'honeydew2'},
|
|
136
|
-
]
|
|
137
|
-
|
|
138
132
|
MAILING_LISTS = [
|
|
139
133
|
CAROLYN_RANGEL,
|
|
140
134
|
INTELLIGENCE_SQUARED,
|
|
@@ -142,10 +136,13 @@ MAILING_LISTS = [
|
|
|
142
136
|
JP_MORGAN_USGIO,
|
|
143
137
|
]
|
|
144
138
|
|
|
145
|
-
|
|
139
|
+
BBC_LISTS = JUNK_EMAILERS + MAILING_LISTS
|
|
140
|
+
|
|
141
|
+
TRUNCATE_ALL_EMAILS_FROM = BBC_LISTS + [
|
|
146
142
|
'Alan S Halperin',
|
|
147
143
|
'Mitchell Bard',
|
|
148
144
|
'Skip Rimer',
|
|
145
|
+
'Steven Victor MD',
|
|
149
146
|
]
|
|
150
147
|
|
|
151
148
|
TRUNCATION_LENGTHS = {
|
|
@@ -253,58 +250,15 @@ TRUNCATE_TERMS = [
|
|
|
253
250
|
'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
|
|
254
251
|
]
|
|
255
252
|
|
|
256
|
-
# Some Paul Krassner emails have a ton of CCed parties we don't care about
|
|
257
|
-
KRASSNER_RECIPIENTS = uniquify(flatten([ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']]))
|
|
258
|
-
|
|
259
|
-
# No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
|
|
260
|
-
USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIPIENTS + [
|
|
261
|
-
'Alan Dlugash', # CCed with Richard Kahn
|
|
262
|
-
'Alan Rogers', # Random CC
|
|
263
|
-
'Andrew Friendly', # Presumably some relation of Kelly Friendly
|
|
264
|
-
'BS Stern', # A random fwd of email we have
|
|
265
|
-
'Cheryl Kleen', # Single email from Anne Boyles, displayed under Anne Boyles
|
|
266
|
-
'Connie Zaguirre', # Random CC
|
|
267
|
-
'Dan Fleuette', # CC from sean bannon
|
|
268
|
-
'Danny Goldberg', # Random Paul Krassner emails
|
|
269
|
-
GERALD_LEFCOURT, # Single CC
|
|
270
|
-
GORDON_GETTY, # Random CC
|
|
271
|
-
JEFF_FULLER, # Random Jean Luc Brunel CC
|
|
272
|
-
'Jojo Fontanilla', # Random CC
|
|
273
|
-
'Joseph Vinciguerra', # Random CC
|
|
274
|
-
'Larry Cohen', # Random Bill Gates CC
|
|
275
|
-
'Lyn Fontanilla', # Random CC
|
|
276
|
-
'Mark Albert', # Random CC
|
|
277
|
-
'Matthew Schafer', # Random CC
|
|
278
|
-
MICHAEL_BUCHHOLTZ, # Terry Kafka CC
|
|
279
|
-
'Nancy Dahl', # covered by Lawrence Krauss (her husband)
|
|
280
|
-
'Michael Simmons', # Random CC
|
|
281
|
-
'Nancy Portland', # Lawrence Krauss CC
|
|
282
|
-
'Oliver Goodenough', # Robert Trivers CC
|
|
283
|
-
'Peter Aldhous', # Lawrence Krauss CC
|
|
284
|
-
'Players2', # Hoffenberg CC
|
|
285
|
-
'Sam Harris', # Lawrence Krauss CC
|
|
286
|
-
SAMUEL_LEFF, # Random CC
|
|
287
|
-
'Sean T Lehane', # Random CC
|
|
288
|
-
'Stephen Rubin', # Random CC
|
|
289
|
-
'Tim Kane', # Random CC
|
|
290
|
-
'Travis Pangburn', # Random CC
|
|
291
|
-
'Vahe Stepanian', # Random CC
|
|
292
|
-
# Ross Gow BCC
|
|
293
|
-
'david.brown@thetimes.co.uk',
|
|
294
|
-
'io-anne.pugh@bbc.co.uk',
|
|
295
|
-
'martin.robinson@mailonline.co.uk',
|
|
296
|
-
'nick.alwav@bbc.co.uk'
|
|
297
|
-
'nick.sommerlad@mirror.co.uk',
|
|
298
|
-
'p.peachev@independent.co.uk',
|
|
299
|
-
]
|
|
300
|
-
|
|
301
253
|
METADATA_FIELDS = [
|
|
302
254
|
'is_junk_mail',
|
|
255
|
+
'is_mailing_list',
|
|
303
256
|
'recipients',
|
|
304
257
|
'sent_from_device',
|
|
305
258
|
'subject',
|
|
306
259
|
]
|
|
307
260
|
|
|
261
|
+
# Note the line repair happens *after* 'Importance: High' is removed
|
|
308
262
|
LINE_REPAIR_MERGES = {
|
|
309
263
|
'017523': 4,
|
|
310
264
|
'019407': [2, 4],
|
|
@@ -312,10 +266,14 @@ LINE_REPAIR_MERGES = {
|
|
|
312
266
|
'022673': 9,
|
|
313
267
|
'022684': 9,
|
|
314
268
|
'022695': 4,
|
|
269
|
+
'029773': [2, 5],
|
|
315
270
|
'023067': 3,
|
|
316
271
|
'025790': 2,
|
|
272
|
+
'029841': 3,
|
|
317
273
|
'026345': 3,
|
|
318
274
|
'026609': 4,
|
|
275
|
+
'033299': 3,
|
|
276
|
+
'026829': 3,
|
|
319
277
|
'026924': [2, 4],
|
|
320
278
|
'028931': [3, 6],
|
|
321
279
|
'029154': [2, 5],
|
|
@@ -326,6 +284,7 @@ LINE_REPAIR_MERGES = {
|
|
|
326
284
|
'029501': 2,
|
|
327
285
|
'029835': [2, 4],
|
|
328
286
|
'029889': 2,
|
|
287
|
+
'029545': [3, 5],
|
|
329
288
|
'029976': 3,
|
|
330
289
|
'030299': [7, 10],
|
|
331
290
|
'030381': [2, 4],
|
|
@@ -359,14 +318,14 @@ class Email(Communication):
|
|
|
359
318
|
actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
|
|
360
319
|
config (EmailCfg | None) - manual config for this email (if it exists)
|
|
361
320
|
header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
|
|
362
|
-
recipients (list[
|
|
321
|
+
recipients (list[Name]) - who this email was sent to
|
|
363
322
|
sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
|
|
364
323
|
signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
|
|
365
324
|
"""
|
|
366
325
|
actual_text: str = field(init=False)
|
|
367
326
|
config: EmailCfg | None = None
|
|
368
327
|
header: EmailHeader = field(init=False)
|
|
369
|
-
recipients: list[
|
|
328
|
+
recipients: list[Name] = field(default_factory=list)
|
|
370
329
|
sent_from_device: str | None = None
|
|
371
330
|
signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
|
|
372
331
|
|
|
@@ -394,7 +353,7 @@ class Email(Communication):
|
|
|
394
353
|
self.recipients.extend(self._extract_emailer_names(recipient))
|
|
395
354
|
|
|
396
355
|
# Assume mailing list emails are to Epstein
|
|
397
|
-
if self.author in
|
|
356
|
+
if self.author in BBC_LISTS and (self.is_note_to_self() or not self.recipients):
|
|
398
357
|
self.recipients = [JEFFREY_EPSTEIN]
|
|
399
358
|
|
|
400
359
|
# Remove self CCs but preserve self emails
|
|
@@ -423,7 +382,10 @@ class Email(Communication):
|
|
|
423
382
|
return bool(self.config and self.config.is_fwded_article)
|
|
424
383
|
|
|
425
384
|
def is_junk_mail(self) -> bool:
|
|
426
|
-
return self.author in JUNK_EMAILERS
|
|
385
|
+
return self.author in JUNK_EMAILERS
|
|
386
|
+
|
|
387
|
+
def is_mailing_list(self) -> bool:
|
|
388
|
+
return self.author in MAILING_LISTS or self.is_junk_mail()
|
|
427
389
|
|
|
428
390
|
def is_note_to_self(self) -> bool:
|
|
429
391
|
return self.recipients == [self.author]
|
|
@@ -431,6 +393,7 @@ class Email(Communication):
|
|
|
431
393
|
def metadata(self) -> Metadata:
|
|
432
394
|
local_metadata = asdict(self)
|
|
433
395
|
local_metadata['is_junk_mail'] = self.is_junk_mail()
|
|
396
|
+
local_metadata['is_mailing_list'] = self.is_junk_mail()
|
|
434
397
|
local_metadata['subject'] = self.subject() or None
|
|
435
398
|
metadata = super().metadata()
|
|
436
399
|
metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
|
|
@@ -473,9 +436,9 @@ class Email(Communication):
|
|
|
473
436
|
elif self.header.num_header_rows == 0:
|
|
474
437
|
return self.text
|
|
475
438
|
|
|
476
|
-
reply_text_match = REPLY_TEXT_REGEX.search(text)
|
|
477
439
|
self.log_top_lines(20, "Raw text:", logging.DEBUG)
|
|
478
440
|
self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
|
|
441
|
+
reply_text_match = REPLY_TEXT_REGEX.search(text)
|
|
479
442
|
|
|
480
443
|
if reply_text_match:
|
|
481
444
|
actual_num_chars = len(reply_text_match.group(1))
|
|
@@ -550,6 +513,8 @@ class Email(Communication):
|
|
|
550
513
|
self.log_top_lines(msg='No email header match found!', level=log_level)
|
|
551
514
|
self.header = EmailHeader(field_names=[])
|
|
552
515
|
|
|
516
|
+
logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
|
|
517
|
+
|
|
553
518
|
def _extract_timestamp(self) -> datetime:
|
|
554
519
|
if self.config and self.config.timestamp:
|
|
555
520
|
return self.config.timestamp
|
|
@@ -585,9 +550,15 @@ class Email(Communication):
|
|
|
585
550
|
|
|
586
551
|
def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
|
|
587
552
|
"""Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
|
|
588
|
-
|
|
553
|
+
if text is None:
|
|
554
|
+
header_offset = len(self.header.header_chars)
|
|
555
|
+
text = self.text[header_offset:]
|
|
556
|
+
else:
|
|
557
|
+
header_offset = 0
|
|
558
|
+
|
|
559
|
+
for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text)):
|
|
589
560
|
if i >= n:
|
|
590
|
-
return match.end() - 1
|
|
561
|
+
return match.end() + header_offset - 1
|
|
591
562
|
|
|
592
563
|
def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
|
|
593
564
|
"""Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
|
|
@@ -674,6 +645,9 @@ class Email(Communication):
|
|
|
674
645
|
elif self.file_id in ['025329']:
|
|
675
646
|
for _i in range(9):
|
|
676
647
|
self._merge_lines(2)
|
|
648
|
+
elif self.file_id in ['025812']:
|
|
649
|
+
for _i in range(2):
|
|
650
|
+
self._merge_lines(3)
|
|
677
651
|
elif self.file_id == '014860':
|
|
678
652
|
self._merge_lines(3)
|
|
679
653
|
self._merge_lines(4)
|
|
@@ -839,19 +813,29 @@ class Email(Communication):
|
|
|
839
813
|
self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
|
|
840
814
|
|
|
841
815
|
@staticmethod
|
|
842
|
-
def build_emails_table(emails: list['Email'],
|
|
816
|
+
def build_emails_table(emails: list['Email'], name: Name = '', title: str = '', show_length: bool = False) -> Table:
|
|
843
817
|
"""Turn a set of Emails into a Table."""
|
|
844
|
-
if title and
|
|
818
|
+
if title and name:
|
|
845
819
|
raise ValueError(f"Can't provide both 'author' and 'title' args")
|
|
846
|
-
elif
|
|
820
|
+
elif name == '' and title == '':
|
|
847
821
|
raise ValueError(f"Must provide either 'author' or 'title' arg")
|
|
848
822
|
|
|
849
|
-
author_style = get_style_for_name(
|
|
850
|
-
link_style = author_style if
|
|
823
|
+
author_style = get_style_for_name(name, allow_bold=False)
|
|
824
|
+
link_style = author_style if name else ARCHIVE_LINK_COLOR
|
|
825
|
+
min_width = len(name or UNKNOWN)
|
|
826
|
+
max_width = max(20, min_width)
|
|
827
|
+
|
|
828
|
+
columns = [
|
|
829
|
+
{'name': 'Sent At', 'justify': 'left', 'style': TIMESTAMP_DIM},
|
|
830
|
+
{'name': 'From', 'justify': 'left', 'min_width': min_width, 'max_width': max_width},
|
|
831
|
+
{'name': 'To', 'justify': 'left', 'min_width': min_width, 'max_width': max_width + 2},
|
|
832
|
+
{'name': 'Length', 'justify': 'right', 'style': 'wheat4'},
|
|
833
|
+
{'name': 'Subject', 'justify': 'left', 'min_width': 35, 'style': 'honeydew2'},
|
|
834
|
+
]
|
|
851
835
|
|
|
852
836
|
table = build_table(
|
|
853
837
|
title or None,
|
|
854
|
-
cols=[col for col in
|
|
838
|
+
cols=[col for col in columns if show_length or col['name'] not in ['Length']],
|
|
855
839
|
border_style=DEFAULT_TABLE_KWARGS['border_style'] if title else author_style,
|
|
856
840
|
header_style="bold",
|
|
857
841
|
highlight=True,
|
|
@@ -8,13 +8,12 @@ from epstein_files.util.doc_cfg import EmailCfg
|
|
|
8
8
|
from epstein_files.util.logging import logger
|
|
9
9
|
from epstein_files.util.rich import UNKNOWN
|
|
10
10
|
|
|
11
|
-
FIELD_NAMES = ['
|
|
12
|
-
NON_HEADER_FIELDS = ['field_names', 'num_header_rows', 'was_initially_empty']
|
|
11
|
+
FIELD_NAMES = ['Date', 'From', 'Sent', 'Subject']
|
|
13
12
|
ON_BEHALF_OF = 'on behalf of'
|
|
14
13
|
TO_FIELDS = ['bcc', 'cc', 'to']
|
|
15
14
|
EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
|
|
16
15
|
|
|
17
|
-
HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
|
|
16
|
+
HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments|Classification|Flag):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
|
|
18
17
|
EMAIL_SIMPLE_HEADER_REGEX = re.compile(rf'^{HEADER_REGEX_STR}')
|
|
19
18
|
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX = re.compile(HEADER_REGEX_STR)
|
|
20
19
|
EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTALL) # Match up to the next email header section
|
|
@@ -28,10 +27,18 @@ CONFIGURED_ACTUAL_TEXTS = [
|
|
|
28
27
|
if isinstance(cfg, EmailCfg) and cfg.actual_text is not None
|
|
29
28
|
]
|
|
30
29
|
|
|
30
|
+
NON_HEADER_FIELDS = [
|
|
31
|
+
'field_names',
|
|
32
|
+
'header_chars',
|
|
33
|
+
'num_header_rows',
|
|
34
|
+
'was_initially_empty',
|
|
35
|
+
]
|
|
36
|
+
|
|
31
37
|
|
|
32
38
|
@dataclass(kw_only=True)
|
|
33
39
|
class EmailHeader:
|
|
34
40
|
field_names: list[str] # Order is same as the order header fields appear in the email file text
|
|
41
|
+
header_chars: str = ''
|
|
35
42
|
num_header_rows: int = field(init=False)
|
|
36
43
|
was_initially_empty: bool = False
|
|
37
44
|
|
|
@@ -41,6 +48,8 @@ class EmailHeader:
|
|
|
41
48
|
subject: str | None = None
|
|
42
49
|
bcc: list[str] | None = None
|
|
43
50
|
cc: list[str] | None = None
|
|
51
|
+
classification: str | None = None
|
|
52
|
+
flag: str | None = None
|
|
44
53
|
importance: str | None = None
|
|
45
54
|
attachments: str | None = None
|
|
46
55
|
to: list[str] | None = None
|
|
@@ -99,6 +108,7 @@ class EmailHeader:
|
|
|
99
108
|
setattr(self, field_name, value)
|
|
100
109
|
|
|
101
110
|
self.num_header_rows = len(self.field_names) + num_headers
|
|
111
|
+
self.header_chars = '\n'.join(email_lines[0:self.num_header_rows])
|
|
102
112
|
log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
|
|
103
113
|
logger.debug(f"{log_msg}{self}\n\nTop lines:\n\n%s", '\n'.join(email_lines[0:(num_headers + 1) * 2]))
|
|
104
114
|
|
|
@@ -161,7 +171,7 @@ class EmailHeader:
|
|
|
161
171
|
if should_log_header:
|
|
162
172
|
logger.debug(f"Header being parsed was this:\n\n{header}\n")
|
|
163
173
|
|
|
164
|
-
return
|
|
174
|
+
return cls(field_names=field_names, header_chars=header, **kw_args)
|
|
165
175
|
|
|
166
176
|
@staticmethod
|
|
167
177
|
def cleanup_str(_str: str) -> str:
|