epstein-files 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +31 -18
- epstein_files/documents/communication.py +9 -5
- epstein_files/documents/document.py +225 -136
- epstein_files/documents/doj_file.py +242 -0
- epstein_files/documents/doj_files/full_text.py +166 -0
- epstein_files/documents/email.py +138 -163
- epstein_files/documents/emails/email_header.py +21 -11
- epstein_files/documents/emails/emailers.py +223 -0
- epstein_files/documents/imessage/text_message.py +2 -3
- epstein_files/documents/json_file.py +18 -14
- epstein_files/documents/messenger_log.py +23 -39
- epstein_files/documents/other_file.py +48 -44
- epstein_files/epstein_files.py +54 -33
- epstein_files/person.py +142 -110
- epstein_files/util/constant/names.py +29 -6
- epstein_files/util/constant/output_files.py +2 -0
- epstein_files/util/constant/strings.py +12 -6
- epstein_files/util/constant/urls.py +17 -0
- epstein_files/util/constants.py +101 -174
- epstein_files/util/data.py +2 -0
- epstein_files/util/doc_cfg.py +20 -15
- epstein_files/util/env.py +24 -16
- epstein_files/util/file_helper.py +28 -6
- epstein_files/util/helpers/debugging_helper.py +13 -0
- epstein_files/util/helpers/env_helpers.py +21 -0
- epstein_files/util/highlighted_group.py +57 -16
- epstein_files/util/layout/left_bar_panel.py +26 -0
- epstein_files/util/logging.py +28 -13
- epstein_files/util/output.py +33 -10
- epstein_files/util/rich.py +28 -2
- epstein_files/util/word_count.py +7 -7
- {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/METADATA +14 -1
- epstein_files-1.5.0.dist-info/RECORD +40 -0
- epstein_files-1.4.1.dist-info/RECORD +0 -34
- {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
- {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
- {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +0 -0
|
@@ -5,7 +5,7 @@ from dataclasses import asdict, dataclass, field
|
|
|
5
5
|
from datetime import datetime
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from subprocess import run
|
|
8
|
-
from typing import Callable, ClassVar, Sequence, TypeVar
|
|
8
|
+
from typing import Callable, ClassVar, Self, Sequence, TypeVar
|
|
9
9
|
|
|
10
10
|
from rich.console import Console, ConsoleOptions, Group, RenderResult
|
|
11
11
|
from rich.padding import Padding
|
|
@@ -13,17 +13,19 @@ from rich.panel import Panel
|
|
|
13
13
|
from rich.text import Text
|
|
14
14
|
from rich.table import Table
|
|
15
15
|
|
|
16
|
+
from epstein_files.documents.emails.email_header import DETECT_EMAIL_REGEX
|
|
16
17
|
from epstein_files.util.constant.names import *
|
|
17
18
|
from epstein_files.util.constant.strings import *
|
|
18
19
|
from epstein_files.util.constant.urls import *
|
|
19
|
-
from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
|
|
20
|
+
from epstein_files.util.constants import ALL_FILE_CONFIGS, DOJ_FILE_STEM_REGEX, FALLBACK_TIMESTAMP
|
|
20
21
|
from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time, without_falsey
|
|
21
22
|
from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
|
|
22
|
-
from epstein_files.util.env import DOCS_DIR
|
|
23
|
-
from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str,
|
|
23
|
+
from epstein_files.util.env import DOCS_DIR
|
|
24
|
+
from epstein_files.util.file_helper import (coerce_file_path, extract_file_id, file_size, file_size_str,
|
|
25
|
+
file_size_to_str, is_local_extract_file)
|
|
24
26
|
from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
|
|
25
|
-
from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table,
|
|
26
|
-
highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
|
|
27
|
+
from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table,
|
|
28
|
+
console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
|
|
27
29
|
from epstein_files.util.search_result import MatchedLine
|
|
28
30
|
|
|
29
31
|
ALT_LINK_STYLE = 'white dim'
|
|
@@ -33,11 +35,9 @@ INFO_INDENT = 2
|
|
|
33
35
|
INFO_PADDING = (0, 0, 0, INFO_INDENT)
|
|
34
36
|
MAX_TOP_LINES_LEN = 4000 # Only for logging
|
|
35
37
|
MIN_DOCUMENT_ID = 10477
|
|
36
|
-
WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
|
|
37
38
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
MAX_TIMESTAMP = datetime(2020, 1, 1)
|
|
39
|
+
DOJ_DATASET_ID_REGEX = re.compile(r"(?:epstein_dataset_|DataSet )(\d+)")
|
|
40
|
+
WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
|
|
41
41
|
|
|
42
42
|
FILENAME_MATCH_STYLES = [
|
|
43
43
|
'dark_green',
|
|
@@ -74,7 +74,8 @@ class Document:
|
|
|
74
74
|
Attributes:
|
|
75
75
|
file_path (Path): Local path to file
|
|
76
76
|
author (Name): Who is responsible for the text in the file
|
|
77
|
-
config (DocCfg):
|
|
77
|
+
config (DocCfg): Preconfigured information about this file
|
|
78
|
+
doj_2026_dataset_id (int, optional): Only set for files that came from the DOJ website.
|
|
78
79
|
file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
|
|
79
80
|
filename (str): File's basename
|
|
80
81
|
lines (list[str]): Lines of the file's text after all the cleanup
|
|
@@ -86,6 +87,7 @@ class Document:
|
|
|
86
87
|
# Optional fields
|
|
87
88
|
author: Name = None
|
|
88
89
|
config: EmailCfg | DocCfg | TextCfg | None = None
|
|
90
|
+
doj_2026_dataset_id: int | None = None
|
|
89
91
|
file_id: str = field(init=False)
|
|
90
92
|
filename: str = field(init=False)
|
|
91
93
|
lines: list[str] = field(default_factory=list)
|
|
@@ -97,140 +99,117 @@ class Document:
|
|
|
97
99
|
include_description_in_summary_panel: ClassVar[bool] = False
|
|
98
100
|
strip_whitespace: ClassVar[bool] = True # Overridden in JsonFile
|
|
99
101
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
self.filename = self.file_path.name
|
|
105
|
-
self.file_id = extract_file_id(self.filename)
|
|
106
|
-
# config and url_slug could have been pre-set in Email
|
|
107
|
-
self.config = self.config or deepcopy(ALL_FILE_CONFIGS.get(self.file_id))
|
|
108
|
-
self.url_slug = self.url_slug or self.filename.split('.')[0]
|
|
109
|
-
|
|
110
|
-
if not self.text:
|
|
111
|
-
self._load_file()
|
|
112
|
-
|
|
113
|
-
self._repair()
|
|
114
|
-
self._extract_author()
|
|
115
|
-
self.timestamp = self._extract_timestamp()
|
|
102
|
+
@property
|
|
103
|
+
def border_style(self) -> str:
|
|
104
|
+
"""Should be overridden in subclasses."""
|
|
105
|
+
return 'white'
|
|
116
106
|
|
|
107
|
+
@property
|
|
117
108
|
def config_description(self) -> str | None:
|
|
118
|
-
"""Overloaded in OtherFile."""
|
|
119
109
|
if self.config and self.config.description:
|
|
120
110
|
return f"({self.config.description})"
|
|
121
111
|
|
|
112
|
+
@property
|
|
113
|
+
def config_timestamp(self) -> datetime | None:
|
|
114
|
+
"""Configured timestamp, if any."""
|
|
115
|
+
return self.config.timestamp if self.config and self.config.timestamp else None
|
|
116
|
+
|
|
117
|
+
@property
|
|
122
118
|
def date_str(self) -> str | None:
|
|
123
119
|
return date_str(self.timestamp)
|
|
124
120
|
|
|
121
|
+
@property
|
|
125
122
|
def duplicate_file_txt(self) -> Text:
|
|
126
123
|
"""If the file is a dupe make a nice message to explain what file it's a duplicate of."""
|
|
127
|
-
if not self.is_duplicate
|
|
124
|
+
if not self.is_duplicate:
|
|
128
125
|
raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")
|
|
129
126
|
|
|
130
127
|
txt = Text(f"Not showing ", style=INFO_STYLE).append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
|
|
131
128
|
txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
|
|
132
129
|
return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))
|
|
133
130
|
|
|
131
|
+
@property
|
|
134
132
|
def duplicate_of_id(self) -> str | None:
|
|
135
133
|
if self.config and self.config.duplicate_of_id:
|
|
136
134
|
return self.config.duplicate_of_id
|
|
137
135
|
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
return self.external_link(epstein_web_doc_url, style, link_txt)
|
|
146
|
-
|
|
147
|
-
def rollcall_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
148
|
-
return self.external_link(rollcall_doc_url, style, link_txt)
|
|
149
|
-
|
|
150
|
-
def external_link(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
151
|
-
return link_text_obj(fxn(self.url_slug), link_txt or self.file_path.stem, style)
|
|
152
|
-
|
|
153
|
-
def external_links_txt(self, style: str = '', include_alt_links: bool = False) -> Text:
|
|
154
|
-
"""Returns colored links to epstein.media and alternates in a Text object."""
|
|
155
|
-
links = [self.epstein_media_link(style=style)]
|
|
156
|
-
|
|
157
|
-
if include_alt_links:
|
|
158
|
-
links.append(self.epsteinify_link(style=ALT_LINK_STYLE, link_txt=EPSTEINIFY))
|
|
159
|
-
links.append(self.epstein_web_link(style=ALT_LINK_STYLE, link_txt=EPSTEIN_WEB))
|
|
160
|
-
|
|
161
|
-
if self._class_name() == 'Email':
|
|
162
|
-
links.append(self.rollcall_link(style=ALT_LINK_STYLE, link_txt=ROLLCALL))
|
|
163
|
-
|
|
164
|
-
links = [links[0]] + [parenthesize(link) for link in links[1:]]
|
|
165
|
-
base_txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
|
|
166
|
-
return base_txt.append(join_texts(links))
|
|
136
|
+
@property
|
|
137
|
+
def external_url(self) -> str:
|
|
138
|
+
"""The primary external URL to use when linking to this document's source."""
|
|
139
|
+
if self.is_doj_file and self.doj_2026_dataset_id:
|
|
140
|
+
return doj_2026_file_url(self.doj_2026_dataset_id, self.url_slug)
|
|
141
|
+
else:
|
|
142
|
+
return epstein_media_doc_url(self.url_slug)
|
|
167
143
|
|
|
144
|
+
@property
|
|
168
145
|
def file_id_debug_info(self) -> str:
|
|
169
146
|
return ', '.join([f"{prop}={getattr(self, prop)}" for prop in ['file_id', 'filename', 'url_slug']])
|
|
170
147
|
|
|
171
|
-
|
|
172
|
-
"""Panel with filename linking to raw file plus any additional info about the file."""
|
|
173
|
-
panel = Panel(self.external_links_txt(include_alt_links=True), border_style=self._border_style(), expand=False)
|
|
174
|
-
padded_info = [Padding(sentence, INFO_PADDING) for sentence in self.info()]
|
|
175
|
-
return Group(*([panel] + padded_info))
|
|
176
|
-
|
|
148
|
+
@property
|
|
177
149
|
def file_size(self) -> int:
|
|
178
150
|
return file_size(self.file_path)
|
|
179
151
|
|
|
152
|
+
@property
|
|
180
153
|
def file_size_str(self, decimal_places: int | None = None) -> str:
|
|
181
154
|
return file_size_str(self.file_path, decimal_places)
|
|
182
155
|
|
|
156
|
+
@property
|
|
183
157
|
def info(self) -> list[Text]:
|
|
184
158
|
"""0 to 2 sentences: the `info_txt` (if any) followed by the configured description (if any)."""
|
|
185
159
|
return without_falsey([
|
|
186
|
-
self.info_txt
|
|
187
|
-
highlighter(Text(self.config_description
|
|
160
|
+
self.info_txt,
|
|
161
|
+
highlighter(Text(self.config_description, style=INFO_STYLE)) if self.config_description else None
|
|
188
162
|
])
|
|
189
163
|
|
|
164
|
+
@property
|
|
190
165
|
def info_txt(self) -> Text | None:
|
|
191
166
|
"""Secondary info about this file (description, recipients, etc.). Override in subclasses."""
|
|
192
167
|
return None
|
|
193
168
|
|
|
169
|
+
@property
|
|
194
170
|
def is_attribution_uncertain(self) -> bool:
|
|
195
171
|
return bool(self.config and self.config.is_attribution_uncertain)
|
|
196
172
|
|
|
173
|
+
@property
|
|
174
|
+
def is_doj_file(self) -> bool:
|
|
175
|
+
return bool(DOJ_FILE_STEM_REGEX.match(self.file_id))
|
|
176
|
+
|
|
177
|
+
@property
|
|
197
178
|
def is_duplicate(self) -> bool:
|
|
198
|
-
return bool(self.duplicate_of_id
|
|
179
|
+
return bool(self.duplicate_of_id)
|
|
199
180
|
|
|
181
|
+
@property
|
|
182
|
+
def is_empty(self) -> bool:
|
|
183
|
+
return len(self.text.strip()) < 20
|
|
184
|
+
|
|
185
|
+
@property
|
|
200
186
|
def is_interesting(self) -> bool:
|
|
201
187
|
return bool(self.config and self.config.is_interesting)
|
|
202
188
|
|
|
189
|
+
@property
|
|
203
190
|
def is_local_extract_file(self) -> bool:
|
|
204
191
|
"""True if extracted from other file (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
|
|
205
192
|
return is_local_extract_file(self.filename)
|
|
206
193
|
|
|
194
|
+
@property
|
|
207
195
|
def length(self) -> int:
|
|
208
196
|
return len(self.text)
|
|
209
197
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
|
|
215
|
-
"""Log first 'n' lines of self.text at 'level'. 'msg' can be optionally provided."""
|
|
216
|
-
separator = '\n\n' if '\n' in msg else '. '
|
|
217
|
-
msg = (msg + separator) if msg else ''
|
|
218
|
-
self.log(f"{msg}First {n} lines:\n\n{self.top_lines(n)}\n", level)
|
|
219
|
-
|
|
220
|
-
def matching_lines(self, _pattern: re.Pattern | str) -> list[MatchedLine]:
|
|
221
|
-
"""Return lines matching a regex as colored list[Text]."""
|
|
222
|
-
pattern = patternize(_pattern)
|
|
223
|
-
return [MatchedLine(line, i) for i, line in enumerate(self.lines) if pattern.search(line)]
|
|
198
|
+
@property
|
|
199
|
+
def local_path_and_url(self) -> Text:
|
|
200
|
+
"""Text obj with local path and URL."""
|
|
201
|
+
return Text(f"{self.file_id} URL: {self.external_url}\n{self.file_id} Local path: '{self.file_path}'")
|
|
224
202
|
|
|
203
|
+
@property
|
|
225
204
|
def metadata(self) -> Metadata:
|
|
226
|
-
metadata = self.config.metadata
|
|
205
|
+
metadata = self.config.metadata if self.config else {}
|
|
227
206
|
metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
|
|
228
|
-
metadata['bytes'] = self.file_size
|
|
207
|
+
metadata['bytes'] = self.file_size
|
|
229
208
|
metadata['filename'] = f"{self.url_slug}.txt"
|
|
230
|
-
metadata['num_lines'] = self.num_lines
|
|
231
|
-
metadata['type'] = self._class_name
|
|
209
|
+
metadata['num_lines'] = self.num_lines
|
|
210
|
+
metadata['type'] = self._class_name
|
|
232
211
|
|
|
233
|
-
if self.is_local_extract_file
|
|
212
|
+
if self.is_local_extract_file:
|
|
234
213
|
metadata['extracted_file'] = {
|
|
235
214
|
'explanation': 'manually extracted from one of the other files',
|
|
236
215
|
'extracted_from': self.url_slug + '.txt',
|
|
@@ -239,10 +218,141 @@ class Document:
|
|
|
239
218
|
|
|
240
219
|
return metadata
|
|
241
220
|
|
|
221
|
+
@property
|
|
242
222
|
def num_lines(self) -> int:
|
|
243
223
|
return len(self.lines)
|
|
244
224
|
|
|
225
|
+
@property
|
|
226
|
+
def panel_title_timestamp(self) -> str | None:
|
|
227
|
+
"""String placed in the `title` of the enclosing `Panel` when printing this document's text."""
|
|
228
|
+
if (self.timestamp or FALLBACK_TIMESTAMP) == FALLBACK_TIMESTAMP:
|
|
229
|
+
return None
|
|
230
|
+
|
|
231
|
+
prefix = '' if self.config and self.config.timestamp else 'inferred '
|
|
232
|
+
return f"{prefix}timestamp: {remove_zero_time(self.timestamp)}"
|
|
233
|
+
|
|
234
|
+
@property
|
|
235
|
+
def summary_panel(self) -> Panel:
|
|
236
|
+
"""Panelized summary() plus (when enabled for the class) the info sentences, used in search results."""
|
|
237
|
+
sentences = [self.summary()]
|
|
238
|
+
|
|
239
|
+
if self.include_description_in_summary_panel:
|
|
240
|
+
sentences += [Text('', style='italic').append(h) for h in self.info]
|
|
241
|
+
|
|
242
|
+
return Panel(Group(*sentences), border_style=self._class_style, expand=False)
|
|
243
|
+
|
|
244
|
+
@property
|
|
245
|
+
def timestamp_sort_key(self) -> tuple[datetime, str, int]:
|
|
246
|
+
"""Sort by timestamp, file_id, then whether or not it's a duplicate file."""
|
|
247
|
+
if self.duplicate_of_id:
|
|
248
|
+
sort_id = self.duplicate_of_id
|
|
249
|
+
dupe_idx = 1
|
|
250
|
+
else:
|
|
251
|
+
sort_id = self.file_id
|
|
252
|
+
dupe_idx = 0
|
|
253
|
+
|
|
254
|
+
return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
|
|
255
|
+
|
|
256
|
+
@property
|
|
257
|
+
def _class_name(self) -> str:
|
|
258
|
+
"""Annoying workaround for circular import issues and isinstance()."""
|
|
259
|
+
return str(type(self).__name__)
|
|
260
|
+
|
|
261
|
+
@property
|
|
262
|
+
def _class_style(self) -> str:
|
|
263
|
+
return DOC_TYPE_STYLES[self._class_name]
|
|
264
|
+
|
|
265
|
+
def __post_init__(self):
|
|
266
|
+
if not self.file_path.exists():
|
|
267
|
+
raise FileNotFoundError(f"File '{self.file_path.name}' does not exist!")
|
|
268
|
+
|
|
269
|
+
self.filename = self.file_path.name
|
|
270
|
+
self.file_id = extract_file_id(self.filename)
|
|
271
|
+
# config and url_slug could have been pre-set in Email
|
|
272
|
+
self.config = self.config or deepcopy(ALL_FILE_CONFIGS.get(self.file_id))
|
|
273
|
+
self.url_slug = self.url_slug or self.filename.split('.')[0]
|
|
274
|
+
|
|
275
|
+
# Extract the DOJ dataset ID from the path
|
|
276
|
+
if self.is_doj_file:
|
|
277
|
+
if (data_set_match := DOJ_DATASET_ID_REGEX.search(str(self.file_path))):
|
|
278
|
+
self.doj_2026_dataset_id = int(data_set_match.group(1))
|
|
279
|
+
logger.info(f"Extracted data set ID {self.doj_2026_dataset_id} for {self.url_slug}")
|
|
280
|
+
else:
|
|
281
|
+
self.warn(f"Couldn't find a data set ID in path '{self.file_path}'! Cannot create valid links.")
|
|
282
|
+
|
|
283
|
+
self.text = self.text or self._load_file()
|
|
284
|
+
self._set_computed_fields(text=self.text)
|
|
285
|
+
self._repair()
|
|
286
|
+
self._extract_author()
|
|
287
|
+
self.timestamp = self.config_timestamp or self._extract_timestamp()
|
|
288
|
+
|
|
289
|
+
@classmethod
|
|
290
|
+
def from_file_id(cls, file_id: str | int) -> Self:
|
|
291
|
+
"""Alternate constructor that finds the file path automatically and builds a `Document`."""
|
|
292
|
+
return cls(coerce_file_path(file_id))
|
|
293
|
+
|
|
294
|
+
def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
295
|
+
return self.external_link(epsteinify_doc_url, style, link_txt)
|
|
296
|
+
|
|
297
|
+
def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
298
|
+
return self.external_link(epstein_media_doc_url, style, link_txt)
|
|
299
|
+
|
|
300
|
+
def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
301
|
+
return self.external_link(epstein_web_doc_url, style, link_txt)
|
|
302
|
+
|
|
303
|
+
def rollcall_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
304
|
+
return self.external_link(rollcall_doc_url, style, link_txt)
|
|
305
|
+
|
|
306
|
+
def external_link(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
307
|
+
return link_text_obj(fxn(self.url_slug), link_txt or self.file_path.stem, style)
|
|
308
|
+
|
|
309
|
+
def external_links_txt(self, style: str = '', include_alt_links: bool = False) -> Text:
|
|
310
|
+
"""Returns colored links to this document's primary external URL and alternate sites in a Text object."""
|
|
311
|
+
links = [link_text_obj(self.external_url, self.url_slug, style=style)]
|
|
312
|
+
|
|
313
|
+
if include_alt_links:
|
|
314
|
+
if self.doj_2026_dataset_id:
|
|
315
|
+
jmail_url = jmail_doj_2026_file_url(self.doj_2026_dataset_id, self.file_id)
|
|
316
|
+
jmail_link = link_text_obj(jmail_url, JMAIL, style=f"{style} dim" if style else ARCHIVE_LINK_COLOR)
|
|
317
|
+
links.append(jmail_link)
|
|
318
|
+
else:
|
|
319
|
+
links.append(self.epsteinify_link(style=ALT_LINK_STYLE, link_txt=EPSTEINIFY))
|
|
320
|
+
links.append(self.epstein_web_link(style=ALT_LINK_STYLE, link_txt=EPSTEIN_WEB))
|
|
321
|
+
|
|
322
|
+
if self._class_name == 'Email':
|
|
323
|
+
links.append(self.rollcall_link(style=ALT_LINK_STYLE, link_txt=ROLLCALL))
|
|
324
|
+
|
|
325
|
+
links = [links[0]] + [parenthesize(link) for link in links[1:]]
|
|
326
|
+
base_txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
|
|
327
|
+
return base_txt.append(join_texts(links))
|
|
328
|
+
|
|
329
|
+
def file_info_panel(self) -> Group:
|
|
330
|
+
"""Panel with filename linking to raw file plus any additional info about the file."""
|
|
331
|
+
panel = Panel(self.external_links_txt(include_alt_links=True), border_style=self.border_style, expand=False)
|
|
332
|
+
padded_info = [Padding(sentence, INFO_PADDING) for sentence in self.info]
|
|
333
|
+
return Group(*([panel] + padded_info))
|
|
334
|
+
|
|
335
|
+
def log(self, msg: str, level: int = logging.INFO):
|
|
336
|
+
"""Log a message with this document's filename as a prefix."""
|
|
337
|
+
logger.log(level, f"{self.file_path.stem} {msg}")
|
|
338
|
+
|
|
339
|
+
def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
|
|
340
|
+
"""Log first 'n' lines of self.text at 'level'. 'msg' can be optionally provided."""
|
|
341
|
+
separator = '\n\n' if '\n' in msg else '. '
|
|
342
|
+
msg = (msg + separator) if msg else ''
|
|
343
|
+
self.log(f"{msg}First {n} lines:\n\n{self.top_lines(n)}\n", level)
|
|
344
|
+
|
|
345
|
+
def matching_lines(self, _pattern: re.Pattern | str) -> list[MatchedLine]:
|
|
346
|
+
"""Return lines matching a regex as colored list[Text]."""
|
|
347
|
+
pattern = patternize(_pattern)
|
|
348
|
+
return [MatchedLine(line, i) for i, line in enumerate(self.lines) if pattern.search(line)]
|
|
349
|
+
|
|
350
|
+
def printable_document(self) -> Self:
|
|
351
|
+
"""Overridden by `DojFile` to convert some files to `Email` objects."""
|
|
352
|
+
return self
|
|
353
|
+
|
|
245
354
|
def raw_text(self) -> str:
|
|
355
|
+
"""Reload the raw data from the underlying file and return it."""
|
|
246
356
|
with open(self.file_path) as f:
|
|
247
357
|
return f.read()
|
|
248
358
|
|
|
@@ -256,13 +366,9 @@ class Document:
|
|
|
256
366
|
|
|
257
367
|
return text
|
|
258
368
|
|
|
259
|
-
def source_file_id(self) -> str:
|
|
260
|
-
"""Strip off the _1, _2, etc. suffixes for extracted documents."""
|
|
261
|
-
return self.file_id[0:6]
|
|
262
|
-
|
|
263
369
|
def summary(self) -> Text:
|
|
264
|
-
"""Summary of this file for logging.
|
|
265
|
-
txt = Text('').append(self._class_name
|
|
370
|
+
"""Summary of this file for logging. Subclasses should extend with a method that closes the open '['."""
|
|
371
|
+
txt = Text('').append(self._class_name, style=self._class_style)
|
|
266
372
|
txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)
|
|
267
373
|
|
|
268
374
|
if self.timestamp:
|
|
@@ -270,52 +376,22 @@ class Document:
|
|
|
270
376
|
txt.append(' (', style=SYMBOL_STYLE)
|
|
271
377
|
txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
|
|
272
378
|
|
|
273
|
-
txt.append(' [').append(key_value_txt('size', Text(str(self.length
|
|
274
|
-
txt.append(", ").append(key_value_txt('lines', self.num_lines
|
|
379
|
+
txt.append(' [').append(key_value_txt('size', Text(str(self.length), style='aquamarine1')))
|
|
380
|
+
txt.append(", ").append(key_value_txt('lines', self.num_lines))
|
|
275
381
|
|
|
276
382
|
if self.config and self.config.duplicate_of_id:
|
|
277
383
|
txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='cyan dim')))
|
|
278
384
|
|
|
279
385
|
return txt
|
|
280
386
|
|
|
281
|
-
def summary_panel(self) -> Panel:
|
|
282
|
-
"""Panelized description() with info_txt(), used in search results."""
|
|
283
|
-
sentences = [self.summary()]
|
|
284
|
-
|
|
285
|
-
if self.include_description_in_summary_panel:
|
|
286
|
-
sentences += [Text('', style='italic').append(h) for h in self.info()]
|
|
287
|
-
|
|
288
|
-
return Panel(Group(*sentences), border_style=self._class_style(), expand=False)
|
|
289
|
-
|
|
290
|
-
def timestamp_sort_key(self) -> tuple[datetime, str, int]:
|
|
291
|
-
"""Sort by timestamp, file_id, then whether or not it's a duplicate file."""
|
|
292
|
-
if self.is_duplicate():
|
|
293
|
-
sort_id = self.config.duplicate_of_id
|
|
294
|
-
dupe_idx = 1
|
|
295
|
-
else:
|
|
296
|
-
sort_id = self.file_id
|
|
297
|
-
dupe_idx = 0
|
|
298
|
-
|
|
299
|
-
return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
|
|
300
|
-
|
|
301
387
|
def top_lines(self, n: int = 10) -> str:
|
|
302
388
|
"""First n lines."""
|
|
303
389
|
return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
|
|
304
390
|
|
|
305
391
|
def warn(self, msg: str) -> None:
|
|
392
|
+
"""Print a warning message prefixed by info about this `Document`."""
|
|
306
393
|
self.log(msg, level=logging.WARNING)
|
|
307
394
|
|
|
308
|
-
def _border_style(self) -> str:
|
|
309
|
-
"""Should be overloaded in subclasses."""
|
|
310
|
-
return 'white'
|
|
311
|
-
|
|
312
|
-
def _class_name(self) -> str:
|
|
313
|
-
"""Annoying workaround for circular import issues and isinstance()."""
|
|
314
|
-
return str(type(self).__name__)
|
|
315
|
-
|
|
316
|
-
def _class_style(self) -> str:
|
|
317
|
-
return DOC_TYPE_STYLES[self._class_name()]
|
|
318
|
-
|
|
319
395
|
def _extract_author(self) -> None:
|
|
320
396
|
"""Get author from config. Extended in Email subclass to also check headers."""
|
|
321
397
|
if self.config and self.config.author:
|
|
@@ -325,7 +401,7 @@ class Document:
|
|
|
325
401
|
"""Should be implemented in subclasses."""
|
|
326
402
|
pass
|
|
327
403
|
|
|
328
|
-
def _load_file(self) ->
|
|
404
|
+
def _load_file(self) -> str:
|
|
329
405
|
"""Remove BOM and HOUSE OVERSIGHT / EFTA lines, strip whitespace, and return the cleaned text."""
|
|
330
406
|
text = self.raw_text()
|
|
331
407
|
text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text # remove BOM
|
|
@@ -333,11 +409,10 @@ class Document:
|
|
|
333
409
|
|
|
334
410
|
lines = [
|
|
335
411
|
line.strip() if self.strip_whitespace else line for line in text.split('\n')
|
|
336
|
-
if not line.startswith(HOUSE_OVERSIGHT)
|
|
412
|
+
if not (line.startswith(HOUSE_OVERSIGHT) or line.startswith('EFTA'))
|
|
337
413
|
]
|
|
338
414
|
|
|
339
|
-
|
|
340
|
-
self.lines = self.text.split('\n')
|
|
415
|
+
return collapse_newlines('\n'.join(lines))
|
|
341
416
|
|
|
342
417
|
def _repair(self) -> None:
|
|
343
418
|
"""Can optionally be overridden in subclasses to further improve self.text."""
|
|
@@ -367,11 +442,20 @@ class Document:
|
|
|
367
442
|
with open(output_path, 'w') as f:
|
|
368
443
|
f.write(self.text)
|
|
369
444
|
|
|
370
|
-
logger.warning(f"Wrote {self.length
|
|
445
|
+
logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
|
|
371
446
|
|
|
372
447
|
def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
|
|
448
|
+
"""Default `Document` renderer (Email and MessengerLog override this)."""
|
|
373
449
|
yield self.file_info_panel()
|
|
374
|
-
|
|
450
|
+
|
|
451
|
+
text_panel = Panel(
|
|
452
|
+
highlighter(self.text),
|
|
453
|
+
border_style=self.border_style,
|
|
454
|
+
expand=False,
|
|
455
|
+
title=f"({self.panel_title_timestamp})",
|
|
456
|
+
title_align='right',
|
|
457
|
+
)
|
|
458
|
+
|
|
375
459
|
yield Padding(text_panel, (0, 0, 1, INFO_INDENT))
|
|
376
460
|
|
|
377
461
|
def __str__(self) -> str:
|
|
@@ -395,8 +479,8 @@ class Document:
|
|
|
395
479
|
'count': str(file_count),
|
|
396
480
|
'author_count': NA_TXT if is_author_na else str(author_count),
|
|
397
481
|
'no_author_count': NA_TXT if is_author_na else str(file_count - author_count),
|
|
398
|
-
'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain
|
|
399
|
-
'bytes': file_size_to_str(sum([f.file_size
|
|
482
|
+
'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain])),
|
|
483
|
+
'bytes': file_size_to_str(sum([f.file_size for f in files])),
|
|
400
484
|
}
|
|
401
485
|
|
|
402
486
|
@classmethod
|
|
@@ -433,6 +517,11 @@ class Document:
|
|
|
433
517
|
for f in tmpfiles:
|
|
434
518
|
f.unlink()
|
|
435
519
|
|
|
520
|
+
@staticmethod
|
|
521
|
+
def is_email(doc: 'Document') -> bool:
|
|
522
|
+
search_area = doc.text[0:5000] # Limit search area to avoid pointless scans of huge files
|
|
523
|
+
return isinstance(doc.config, EmailCfg) or bool(DETECT_EMAIL_REGEX.match(search_area) and doc.config is None)
|
|
524
|
+
|
|
436
525
|
@staticmethod
|
|
437
526
|
def known_author_count(docs: Sequence['Document']) -> int:
|
|
438
527
|
"""Count of how many Document objects have an author attribution."""
|
|
@@ -444,11 +533,11 @@ class Document:
|
|
|
444
533
|
|
|
445
534
|
@staticmethod
|
|
446
535
|
def sort_by_length(docs: Sequence['DocumentType']) -> list['DocumentType']:
|
|
447
|
-
return sorted(docs, key=lambda d: d.file_size
|
|
536
|
+
return sorted(docs, key=lambda d: d.file_size, reverse=True)
|
|
448
537
|
|
|
449
538
|
@staticmethod
|
|
450
539
|
def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
|
|
451
|
-
return sorted(docs, key=lambda doc: doc.timestamp_sort_key
|
|
540
|
+
return sorted(docs, key=lambda doc: doc.timestamp_sort_key)
|
|
452
541
|
|
|
453
542
|
@staticmethod
|
|
454
543
|
def uniquify(documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
|
|
@@ -458,7 +547,7 @@ class Document:
|
|
|
458
547
|
|
|
459
548
|
@staticmethod
|
|
460
549
|
def without_dupes(docs: Sequence['DocumentType']) -> list['DocumentType']:
|
|
461
|
-
return [doc for doc in docs if not doc.is_duplicate
|
|
550
|
+
return [doc for doc in docs if not doc.is_duplicate]
|
|
462
551
|
|
|
463
552
|
|
|
464
553
|
DocumentType = TypeVar('DocumentType', bound=Document)
|