epstein-files 1.0.12__py3-none-any.whl → 1.0.14__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
- epstein_files/__init__.py +11 -6
- epstein_files/documents/communication.py +2 -2
- epstein_files/documents/document.py +60 -50
- epstein_files/documents/email.py +40 -34
- epstein_files/documents/imessage/text_message.py +4 -4
- epstein_files/documents/json_file.py +9 -3
- epstein_files/documents/messenger_log.py +22 -19
- epstein_files/documents/other_file.py +50 -71
- epstein_files/epstein_files.py +108 -71
- epstein_files/util/constant/names.py +5 -3
- epstein_files/util/constant/strings.py +1 -1
- epstein_files/util/constant/urls.py +13 -8
- epstein_files/util/constants.py +66 -46
- epstein_files/util/data.py +3 -1
- epstein_files/util/doc_cfg.py +9 -9
- epstein_files/util/env.py +2 -5
- epstein_files/util/highlighted_group.py +25 -31
- epstein_files/util/output.py +15 -30
- epstein_files/util/rich.py +40 -31
- epstein_files/util/word_count.py +1 -1
- {epstein_files-1.0.12.dist-info → epstein_files-1.0.14.dist-info}/METADATA +10 -3
- epstein_files-1.0.14.dist-info/RECORD +33 -0
- epstein_files-1.0.12.dist-info/RECORD +0 -33
- {epstein_files-1.0.12.dist-info → epstein_files-1.0.14.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.12.dist-info → epstein_files-1.0.14.dist-info}/WHEEL +0 -0
- {epstein_files-1.0.12.dist-info → epstein_files-1.0.14.dist-info}/entry_points.txt +0 -0
epstein_files/__init__.py
CHANGED
@@ -21,7 +21,7 @@ from epstein_files.util.env import args, specified_names
 from epstein_files.util.file_helper import coerce_file_path, extract_file_id
 from epstein_files.util.logging import logger
 from epstein_files.util.output import (print_emails, print_json_files, print_json_stats,
-
+                                        write_json_metadata, write_urls)
 from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
 from epstein_files.util.timer import Timer
 from epstein_files.util.word_count import write_word_counts_html
@@ -49,7 +49,7 @@ def generate_html() -> None:
         exit()

     if args.output_texts:
-
+        epstein_files.print_text_messages_section()
         timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)} text message logs')

     if args.output_emails:
@@ -57,8 +57,13 @@ def generate_html() -> None:
         timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")

     if args.output_other:
-
-
+        if args.uninteresting:
+            files = [f for f in epstein_files.other_files if not f.is_interesting()]
+        else:
+            files = [f for f in epstein_files.other_files if args.all_other_files or f.is_interesting()]
+
+        epstein_files.print_other_files_section(files)
+        timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")

     # Save output
     write_html(ALL_EMAILS_PATH if args.all_emails else TEXT_MSGS_HTML_PATH)
@@ -90,7 +95,7 @@ def epstein_search():

     if args.whole_file:
         if isinstance(search_result.document, Email):
-            search_result.document.
+            search_result.document._truncation_allowed = False

         console.print(search_result.document)
     else:
@@ -111,7 +116,7 @@ def epstein_show():

     for doc in docs:
         if isinstance(doc, Email):
-            doc.
+            doc._truncation_allowed = False

         console.print('\n', doc, '\n')

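
The new branch in generate_html() treats --uninteresting as an inverse of the usual is_interesting() filter rather than a superset of it. A minimal standalone sketch of the selection logic (the uninteresting and all_other_files flag names come from the diff; the OtherFile stub and argparse wiring here are illustrative assumptions, not the package's real classes):

from argparse import Namespace
from dataclasses import dataclass

@dataclass
class OtherFile:
    """Stand-in with just enough surface to show the filter."""
    name: str
    interesting: bool

    def is_interesting(self) -> bool:
        return self.interesting

def select_other_files(files: list[OtherFile], args: Namespace) -> list[OtherFile]:
    """Mirror of the branch added to generate_html(): --uninteresting flips the filter."""
    if args.uninteresting:
        return [f for f in files if not f.is_interesting()]
    return [f for f in files if args.all_other_files or f.is_interesting()]

files = [OtherFile('a.txt', True), OtherFile('b.txt', False)]
print([f.name for f in select_other_files(files, Namespace(uninteresting=False, all_other_files=False))])  # ['a.txt']
print([f.name for f in select_other_files(files, Namespace(uninteresting=True, all_other_files=False))])   # ['b.txt']
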
epstein_files/documents/communication.py
CHANGED
@@ -34,9 +34,9 @@ class Communication(Document):
     def is_attribution_uncertain(self) -> bool:
        return bool(self.config and self.config.is_attribution_uncertain)

-    def
+    def external_links_txt(self, _style: str = '', include_alt_links: bool = True) -> Text:
         """Overrides super() method to apply self.author_style."""
-        return super().
+        return super().external_links_txt(self.author_style, include_alt_links=include_alt_links)

     def summary(self) -> Text:
         return self._summary().append(CLOSE_PROPERTIES_CHAR)

epstein_files/documents/document.py
CHANGED
@@ -5,7 +5,7 @@ from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from pathlib import Path
 from subprocess import run
-from typing import ClassVar, Sequence, TypeVar
+from typing import Callable, ClassVar, Sequence, TypeVar

 from rich.console import Console, ConsoleOptions, Group, RenderResult
 from rich.padding import Padding
@@ -16,15 +16,15 @@ from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
 from epstein_files.util.constant.urls import *
 from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
-from epstein_files.util.data import collapse_newlines, date_str, patternize,
+from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time_from_timestamp_str, without_falsey
 from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
 from epstein_files.util.env import DOCS_DIR, args
-from epstein_files.util.file_helper import
-    file_size_str, is_local_extract_file)
+from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, is_local_extract_file
 from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
-from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, key_value_txt, link_text_obj
+from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize
 from epstein_files.util.search_result import MatchedLine

+ALT_LINK_STYLE = 'white dim'
 CLOSE_PROPERTIES_CHAR = ']'
 HOUSE_OVERSIGHT = HOUSE_OVERSIGHT_PREFIX.replace('_', ' ').strip()
 INFO_INDENT = 2
@@ -46,7 +46,6 @@ FILENAME_MATCH_STYLES = [
 METADATA_FIELDS = [
     'author',
     'file_id',
-    'num_lines',
     'timestamp'
 ]

@@ -68,7 +67,6 @@ class Document:
         config (DocCfg): Information about this fil
         file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
         filename (str): File's basename
-        length (int): Number of characters in the file after all the cleanup
         lines (str): Number of lines in the file after all the cleanup
         text (str): Contents of the file
         timestamp (datetime | None): When the file was originally created
@@ -80,12 +78,10 @@ class Document:
     config: EmailCfg | DocCfg | TextCfg | None = None
     file_id: str = field(init=False)
     filename: str = field(init=False)
-
-    lines: list[str] = field(init=False)
-    num_lines: int = field(init=False)
+    lines: list[str] = field(default_factory=list)
     text: str = ''
     timestamp: datetime | None = None
-    url_slug: str =
+    url_slug: str = ''

     # Class variables
     include_description_in_summary_panel: ClassVar[bool] = False
@@ -94,12 +90,13 @@ class Document:
     def __post_init__(self):
         self.filename = self.file_path.name
         self.file_id = extract_file_id(self.filename)
+        # config and url_slug could have been pre-set in Email
         self.config = self.config or deepcopy(ALL_FILE_CONFIGS.get(self.file_id))
+        self.url_slug = self.url_slug or self.filename.split('.')[0]

-        if
-            self.
+        if not self.text:
+            self._load_file()

-        self._set_computed_fields(text=self.text or self._load_file())
         self._repair()
         self._extract_author()
         self.timestamp = self._extract_timestamp()
@@ -114,47 +111,49 @@ class Document:

     def duplicate_file_txt(self) -> Text:
         """If the file is a dupe make a nice message to explain what file it's a duplicate of."""
-        if not self.
+        if not self.is_duplicate():
             raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")

         txt = Text(f"Not showing ", style=INFO_STYLE).append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
         txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
-        return txt.append(epstein_media_doc_link_txt(self.config.
+        return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))

     def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
-
-        return link_text_obj(epsteinify_doc_url(self.url_slug), link_txt or self.file_path.stem, style)
+        return self.external_link(epsteinify_doc_url, style, link_txt)

     def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
-
-        return link_text_obj(epstein_media_doc_url(self.url_slug), link_txt or self.file_path.stem, style)
+        return self.external_link(epstein_media_doc_url, style, link_txt)

     def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
-
-        return link_text_obj(epstein_web_doc_url(self.url_slug), link_txt or self.file_path.stem, style)
+        return self.external_link(epstein_web_doc_url, style, link_txt)

-    def
-
-        txt = Text('', style='white' if include_alt_link else ARCHIVE_LINK_COLOR)
+    def rollcall_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
+        return self.external_link(rollcall_doc_url, style, link_txt)

-
-
+    def external_link(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
+        return link_text_obj(fxn(self.url_slug), link_txt or self.file_path.stem, style)

-
-
-
-        else:
-            txt.append(self.epstein_media_link(style=style))
+    def external_links_txt(self, style: str = '', include_alt_links: bool = False) -> Text:
+        """Returns colored links to epstein.media and alternates in a Text object."""
+        links = [self.epstein_media_link(style=style)]

-
-
-
+        if include_alt_links:
+            links.append(self.epsteinify_link(style=ALT_LINK_STYLE, link_txt=EPSTEINIFY))
+            links.append(self.epstein_web_link(style=ALT_LINK_STYLE, link_txt=EPSTEIN_WEB))

-
+        if self._class_name() == 'Email':
+            links.append(self.rollcall_link(style=ALT_LINK_STYLE, link_txt=ROLLCALL))
+
+        links = [links[0]] + [parenthesize(link) for link in links[1:]]
+        base_txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
+        return base_txt.append(join_texts(links))
+
+    def file_id_debug_info(self) -> str:
+        return ', '.join([f"{prop}={getattr(self, prop)}" for prop in ['file_id', 'filename', 'url_slug']])

     def file_info_panel(self) -> Group:
         """Panel with filename linking to raw file plus any additional info about the file."""
-        panel = Panel(self.
+        panel = Panel(self.external_links_txt(include_alt_links=True), border_style=self._border_style(), expand=False)
         padded_info = [Padding(sentence, INFO_PADDING) for sentence in self.info()]
         return Group(*([panel] + padded_info))

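
The Document changes above collapse four near-identical link helpers into one external_link() dispatcher that takes a URL-builder callable. A standalone sketch of that pattern (the build_*_url functions and the Doc stub are made-up stand-ins for the package's epstein_media_doc_url() and friends, not its real API):

from dataclasses import dataclass
from typing import Callable

def build_media_url(slug: str) -> str:   # stand-in for epstein_media_doc_url()
    return f"https://example.media/docs/{slug}"

def build_web_url(slug: str) -> str:     # stand-in for epstein_web_doc_url()
    return f"https://example.org/docs/{slug}"

@dataclass
class Doc:
    url_slug: str

    def external_link(self, url_builder: Callable[[str], str], label: str | None = None) -> str:
        """Single place that turns a URL-builder callable into a rendered link."""
        return f"{label or self.url_slug}: {url_builder(self.url_slug)}"

    # The per-site helpers collapse to one-liners, mirroring the diff above.
    def media_link(self) -> str:
        return self.external_link(build_media_url)

    def web_link(self) -> str:
        return self.external_link(build_web_url, label='alt')

print(Doc('HOUSE_OVERSIGHT_012345').media_link())
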
@@ -176,12 +175,15 @@ class Document:
         return None

     def is_duplicate(self) -> bool:
-        return bool(self.config and self.config.
+        return bool(self.config and self.config.duplicate_of_id)

     def is_local_extract_file(self) -> bool:
         """True if file created by extracting text from a court doc (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
         return is_local_extract_file(self.filename)

+    def length(self) -> int:
+        return len(self.text)
+
     def log(self, msg: str, level: int = logging.INFO):
         """Log with filename as a prefix."""
         logger.log(level, f"{self.file_path.stem} {msg}")
@@ -202,17 +204,21 @@ class Document:
         metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
         metadata['bytes'] = self.file_size()
         metadata['filename'] = f"{self.url_slug}.txt"
+        metadata['num_lines'] = self.num_lines()
         metadata['type'] = self._class_name()

         if self.is_local_extract_file():
             metadata['extracted_file'] = {
-                'explanation': '
+                'explanation': 'manually extracted from one of the other files',
                 'extracted_from': self.url_slug + '.txt',
                 'url': extracted_file_url(self.filename),
             }

         return metadata

+    def num_lines(self) -> int:
+        return len(self.lines)
+
     def raw_text(self) -> str:
         with open(self.file_path) as f:
             return f.read()
@@ -229,7 +235,7 @@ class Document:

     def sort_key(self) -> tuple[datetime, str, int]:
         if self.is_duplicate():
-            sort_id = self.config.
+            sort_id = self.config.duplicate_of_id
             dupe_idx = 1
         else:
             sort_id = self.file_id
@@ -243,15 +249,15 @@ class Document:
         txt.append(f" {self.url_slug}", style=FILENAME_STYLE)

         if self.timestamp:
-            timestamp_str =
+            timestamp_str = remove_zero_time_from_timestamp_str(self.timestamp).replace('T', ' ')
             txt.append(' (', style=SYMBOL_STYLE)
             txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)

         txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
-        txt.append(", ").append(key_value_txt('lines', self.num_lines))
+        txt.append(", ").append(key_value_txt('lines', self.num_lines()))

-        if self.config and self.config.
-            txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.
+        if self.config and self.config.duplicate_of_id:
+            txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='magenta')))

         return txt

@@ -290,13 +296,19 @@ class Document:
         """Should be implemented in subclasses."""
         pass

-    def _load_file(self) ->
+    def _load_file(self) -> None:
         """Remove BOM and HOUSE OVERSIGHT lines, strip whitespace."""
         text = self.raw_text()
         text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text # remove BOM
         text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
-
-
+
+        lines = [
+            line.strip() if self.strip_whitespace else line for line in text.split('\n')
+            if not line.startswith(HOUSE_OVERSIGHT)
+        ]
+
+        self.text = collapse_newlines('\n'.join(lines))
+        self.lines = self.text.split('\n')

     def _repair(self) -> None:
         """Can optionally be overloaded in subclasses to further improve self.text."""

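
The rewritten _load_file() above folds what _set_computed_fields() used to do into a single pass: strip the BOM, drop the HOUSE OVERSIGHT stamp lines, optionally strip per-line whitespace, then collapse runs of blank lines. A rough standalone sketch of that cleanup order (the regex-based collapse_newlines here is an assumption about what the util helper does, and clean_document_text is a hypothetical name):

import re

HOUSE_OVERSIGHT = 'HOUSE OVERSIGHT'

def collapse_newlines(text: str) -> str:
    """Assumed behavior of the collapse_newlines util: squeeze 3+ consecutive newlines down to 2."""
    return re.sub(r'\n{3,}', '\n\n', text)

def clean_document_text(raw: str, strip_whitespace: bool = True) -> tuple[str, list[str]]:
    text = raw[1:] if raw.startswith('\ufeff') else raw          # remove BOM
    lines = [
        line.strip() if strip_whitespace else line
        for line in text.strip().split('\n')
        if not line.startswith(HOUSE_OVERSIGHT)                  # drop the Bates-style stamp lines
    ]
    cleaned = collapse_newlines('\n'.join(lines))
    return cleaned, cleaned.split('\n')

text, lines = clean_document_text('\ufeffHOUSE OVERSIGHT 012345\nHello\n\n\n\nworld\n')
print(len(lines))  # 3
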
@@ -313,9 +325,7 @@ class Document:
         else:
             raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (neither was)")

-        self.length = len(self.text)
         self.lines = [line.strip() if self.strip_whitespace else line for line in self.text.split('\n')]
-        self.num_lines = len(self.lines)

     def _write_clean_text(self, output_path: Path) -> None:
         """Write self.text to 'output_path'. Used only for diffing files."""
@@ -328,7 +338,7 @@ class Document:
         with open(output_path, 'w') as f:
             f.write(self.text)

-        logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
+        logger.warning(f"Wrote {self.length()} chars of cleaned {self.filename} to {output_path}.")

     def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
         yield self.file_info_panel()

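
With 1.0.14, length and num_lines stop being dataclass fields set once at init and become methods computed from the current text and lines, so the later _repair() pass cannot leave stale counts behind. A tiny illustration of the difference (a stand-in class, not the real Document):

from dataclasses import dataclass, field

@dataclass
class Doc:
    text: str = ''
    lines: list[str] = field(default_factory=list)

    def length(self) -> int:      # recomputed on every call, never stale
        return len(self.text)

    def num_lines(self) -> int:
        return len(self.lines)

doc = Doc(text='one\ntwo', lines=['one', 'two'])
doc.text += '\nthree'             # a later repair pass mutates the text
doc.lines = doc.text.split('\n')
print(doc.length(), doc.num_lines())  # 13 3
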
epstein_files/documents/email.py
CHANGED
@@ -131,13 +131,12 @@ JUNK_EMAILERS = [
     'editorialstaff@flipboard.com',
     'How To Academy',
     'Jokeland',
-    JP_MORGAN_USGIO,
-    'Saved by Internet Explorer 11',
 ]

 MAILING_LISTS = [
     INTELLIGENCE_SQUARED,
     'middle.east.update@hotmail.com',
+    JP_MORGAN_USGIO,
 ]

 TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
@@ -274,11 +273,9 @@ USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIP
     'Michael Simmons', # Random CC
     'Nancy Portland', # Lawrence Krauss CC
     'Oliver Goodenough', # Robert Trivers CC
-    'Owen Blicksilver', # Landon Thomas CC
     'Peter Aldhous', # Lawrence Krauss CC
     'Sam Harris', # Lawrence Krauss CC
     SAMUEL_LEFF, # Random CC
-    "Saved by Internet Explorer 11",
     'Sean T Lehane', # Random CC
     'Stephen Rubin', # Random CC
     'Tim Kane', # Random CC
@@ -319,7 +316,7 @@ class Email(Communication):
     recipients: list[str | None] = field(default_factory=list)
     sent_from_device: str | None = None
     signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
-
+    _truncation_allowed: bool = True # Hacky way to get __rich_console__() not to truncate in epstein_show script

     # For logging how many headers we prettified while printing, kind of janky
     rewritten_header_ids: ClassVar[set[str]] = set([])
@@ -340,10 +337,10 @@ class Email(Communication):

         try:
             if self.config and self.config.recipients:
-                self.recipients =
+                self.recipients = self.config.recipients
             else:
                 for recipient in self.header.recipients():
-                    self.recipients.extend(self.
+                    self.recipients.extend(self._emailer_names(recipient))
         except Exception as e:
             console.print_exception()
             console.line(2)
@@ -358,8 +355,12 @@ class Email(Communication):
         self.actual_text = self._actual_text()
         self.sent_from_device = self._sent_from_device()

+    def attachments(self) -> list[str]:
+        return (self.header.attachments or '').split(';')
+
     def info_txt(self) -> Text:
-
+        email_type = 'fwded article' if self.is_fwded_article() else 'email'
+        txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt).append(' to ')
         return txt.append(self._recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))

     def is_fwded_article(self) -> bool:
@@ -401,8 +402,8 @@ class Email(Communication):
             return self.text

         reply_text_match = REPLY_TEXT_REGEX.search(text)
-
-
+        self.log_top_lines(20, "Raw text:", logging.DEBUG)
+        self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)

         if reply_text_match:
             actual_num_chars = len(reply_text_match.group(1))
@@ -438,12 +439,32 @@ class Email(Communication):

         return style.replace('bold', '').strip()

+    def _emailer_names(self, emailer_str: str) -> list[str]:
+        """Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
+        emailer_str = EmailHeader.cleanup_str(emailer_str)
+
+        if len(emailer_str) == 0:
+            return []
+
+        names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
+
+        if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
+            if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
+                logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
+            else:
+                logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
+
+            return names_found
+
+        names_found = names_found or [emailer_str]
+        return [_reverse_first_and_last_names(name) for name in names_found]
+
     def _extract_author(self) -> None:
         self._extract_header()
         super()._extract_author()

         if not self.author and self.header.author:
-            authors = self.
+            authors = self._emailer_names(self.header.author)
             self.author = authors[0] if (len(authors) > 0 and authors[0]) else None

     def _extract_header(self) -> None:

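
_emailer_names() (the old _get_names(), renamed and moved up in the class) matches a table of per-person regexes against a From:/To: header string and falls back to the cleaned string itself when nothing matches. A simplified standalone sketch of that lookup, with made-up patterns standing in for the package's EMAILER_REGEXES table and without the bad-emailer logging branch:

import re

# Hypothetical stand-in for EMAILER_REGEXES: canonical name -> pattern matching header variants
EMAILER_REGEXES = {
    'Jane Doe': re.compile(r'jane\.?doe|doe,\s*jane', re.IGNORECASE),
    'John Smith': re.compile(r'john\s+smith', re.IGNORECASE),
}

def emailer_names(emailer_str: str) -> list[str]:
    """Return canonical names found in an author/recipient field, else the cleaned string itself."""
    emailer_str = emailer_str.strip().strip('"<>')

    if not emailer_str:
        return []

    names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
    return names_found or [emailer_str]

print(emailer_names('Doe, Jane <jane.doe@example.com>'))  # ['Jane Doe']
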
@@ -493,26 +514,6 @@ class Email(Communication):

         raise RuntimeError(f"No timestamp found in '{self.file_path.name}' top lines:\n{searchable_text}")

-    def _get_names(self, emailer_str: str) -> list[str]:
-        """Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
-        emailer_str = EmailHeader.cleanup_str(emailer_str)
-
-        if len(emailer_str) == 0:
-            return []
-
-        names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
-
-        if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
-            if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
-                logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
-            else:
-                logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
-
-            return names_found
-
-        names_found = names_found or [emailer_str]
-        return [_reverse_first_and_last_names(name) for name in names_found]
-
     def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
         """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
         for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
@@ -584,7 +585,7 @@ class Email(Communication):
             self._merge_lines(2, 5)
         elif self.file_id in ['029498', '031428']:
             self._merge_lines(2, 4)
-        elif self.file_id in ['029976', '023067']:
+        elif self.file_id in ['029976', '023067', '033576']:
             self._merge_lines(3) # Merge 4th and 5th rows
         elif self.file_id in '026609 029402 032405 022695'.split():
             self._merge_lines(4) # Merge 5th and 6th rows
@@ -609,6 +610,8 @@ class Email(Communication):
             self._merge_lines(7, 9)
         elif self.file_id == '030299':
             self._merge_lines(7, 10)
+        elif self.file_id in ['022673', '022684']:
+            self._merge_lines(9)
         elif self.file_id == '014860':
             self._merge_lines(3)
             self._merge_lines(4)
@@ -680,6 +683,9 @@ class Email(Communication):
         if extracted_from_description:
             extracted_description = f"{APPEARS_IN} {extracted_from_description}"

+            if isinstance(extracted_from_doc_cfg, EmailCfg):
+                extracted_description += ' email'
+
         if self.config.description:
             self.warn(f"Overwriting description '{self.config.description}' with extract description '{self.config.description}'")

@@ -705,10 +711,10 @@ class Email(Communication):
             num_chars = quote_cutoff

         # Truncate long emails but leave a note explaining what happened w/link to source document
-        if len(text) > num_chars and self.
+        if len(text) > num_chars and self._truncation_allowed:
             text = text[0:num_chars]
             doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
-            trim_note = f"<...trimmed to {num_chars} characters of {self.length}, read the rest at {doc_link_markup}...>"
+            trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
             trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))

         # Rewrite broken headers where the values are on separate lines from the field names

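
The truncation block now checks the per-instance _truncation_allowed switch that the epstein_show and epstein_search entry points flip off, and self.length becomes a method call. A rough sketch of that guard-plus-footer behavior (render_email and its arguments are hypothetical simplifications of the real __rich_console__ path):

def render_email(text: str, max_chars: int, truncation_allowed: bool = True, source_url: str = '') -> str:
    """Truncate long bodies unless truncation was disabled, appending a note about where the rest lives."""
    if truncation_allowed and len(text) > max_chars:
        note = f"<...trimmed to {max_chars} characters of {len(text)}, read the rest at {source_url}...>"
        return text[:max_chars] + '\n' + note
    return text

body = 'x' * 5000
print(render_email(body, 100, source_url='https://example.media/docs/some_slug').splitlines()[-1])
print(len(render_email(body, 100, truncation_allowed=False)))  # 5000
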
epstein_files/documents/imessage/text_message.py
CHANGED
@@ -5,6 +5,7 @@ from datetime import datetime
 from rich.text import Text

 from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN
+from epstein_files.util.constant.strings import TIMESTAMP_DIM
 from epstein_files.util.data import extract_last_name
 from epstein_files.util.highlighted_group import get_style_for_name
 from epstein_files.util.logging import logger
@@ -12,7 +13,6 @@ from epstein_files.util.rich import TEXT_LINK, highlighter

 MSG_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
 PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
-TIMESTAMP_STYLE = 'turquoise4 dim'

 DISPLAY_LAST_NAME_ONLY = [
     JEFFREY_EPSTEIN,
@@ -29,7 +29,7 @@ TEXTER_MAPPING = {
 class TextMessage:
     """Class representing a single iMessage text message."""
     author: str | None
-    author_str: str
+    author_str: str = ''
     id_confirmed: bool = False
     text: str
     timestamp_str: str
@@ -37,7 +37,7 @@ class TextMessage:
     def __post_init__(self):
         self.author = TEXTER_MAPPING.get(self.author or UNKNOWN, self.author)

-        if self.author
+        if not self.author:
             self.author_str = UNKNOWN
         elif self.author in DISPLAY_LAST_NAME_ONLY and not self.author_str:
             self.author_str = extract_last_name(self.author)
@@ -77,5 +77,5 @@ class TextMessage:
     def __rich__(self) -> Text:
         author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
         author_txt = Text(self.author_str, style=author_style)
-        timestamp_txt = Text(f"[{self.timestamp_str}]", style=
+        timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_DIM).append(' ')
         return Text('').append(timestamp_txt).append(author_txt).append(': ', style='dim').append(self._message())

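
The __rich__() change swaps the module-local TIMESTAMP_STYLE constant for the shared TIMESTAMP_DIM and appends the separating space explicitly. A small sketch of composing that line with rich's Text (the style strings, including the TIMESTAMP_DIM value, are assumptions rather than the package's constants):

from rich.console import Console
from rich.text import Text

TIMESTAMP_DIM = 'turquoise4 dim'  # assumed value, mirroring the removed TIMESTAMP_STYLE

def message_line(timestamp_str: str, author: str, body: str) -> Text:
    timestamp_txt = Text(f"[{timestamp_str}]", style=TIMESTAMP_DIM).append(' ')
    author_txt = Text(author, style='bold cyan')
    return Text('').append(timestamp_txt).append(author_txt).append(': ', style='dim').append(body)

Console().print(message_line('01/02/19 3:04:05 PM', 'Jeffrey Epstein', 'call me'))
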
epstein_files/documents/json_file.py
CHANGED
@@ -6,10 +6,12 @@ from typing import ClassVar

 from rich.text import Text

-from epstein_files.documents.other_file import OtherFile
+from epstein_files.documents.other_file import Metadata, OtherFile
 from epstein_files.util.constant.strings import JSON
 from epstein_files.util.rich import INFO_STYLE

+DESCRIPTION = "JSON data containing preview info for links sent in a messaging app like iMessage"
+
 TEXT_FIELDS = [
     'caption',
     'standard',
@@ -23,7 +25,6 @@ TEXT_FIELDS = [
 @dataclass
 class JsonFile(OtherFile):
     """File containing JSON data."""
-
     include_description_in_summary_panel: ClassVar[bool] = False
     strip_whitespace: ClassVar[bool] = False

@@ -39,7 +40,7 @@ class JsonFile(OtherFile):
         return JSON

     def info_txt(self) -> Text | None:
-        return Text(
+        return Text(DESCRIPTION, style=INFO_STYLE)

     def is_interesting(self):
         return False
@@ -48,5 +49,10 @@ class JsonFile(OtherFile):
         with open(self.file_path, encoding='utf-8-sig') as f:
             return json.load(f)

+    def metadata(self) -> Metadata:
+        metadata = super().metadata()
+        metadata['description'] = DESCRIPTION
+        return metadata
+
     def json_str(self) -> str:
         return json.dumps(self.json_data(), indent=4)
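
JsonFile now stamps its module-level DESCRIPTION into the metadata dict built by its parent class. A minimal sketch of that override pattern (BaseFile and JsonPreviewFile are stand-ins, not the package's OtherFile and JsonFile):

from typing import Any

Metadata = dict[str, Any]

class BaseFile:
    def metadata(self) -> Metadata:
        return {'type': type(self).__name__}

class JsonPreviewFile(BaseFile):
    DESCRIPTION = 'JSON data containing preview info for links sent in a messaging app'

    def metadata(self) -> Metadata:
        metadata = super().metadata()          # extend, don't replace, the parent-built dict
        metadata['description'] = self.DESCRIPTION
        return metadata

print(JsonPreviewFile().metadata())  # {'type': 'JsonPreviewFile', 'description': 'JSON data ...'}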