epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37):
  1. epstein_files/__init__.py +55 -23
  2. epstein_files/documents/communication.py +9 -5
  3. epstein_files/documents/document.py +231 -135
  4. epstein_files/documents/doj_file.py +242 -0
  5. epstein_files/documents/doj_files/full_text.py +166 -0
  6. epstein_files/documents/email.py +289 -232
  7. epstein_files/documents/emails/email_header.py +35 -16
  8. epstein_files/documents/emails/emailers.py +223 -0
  9. epstein_files/documents/imessage/text_message.py +2 -3
  10. epstein_files/documents/json_file.py +18 -14
  11. epstein_files/documents/messenger_log.py +23 -39
  12. epstein_files/documents/other_file.py +54 -48
  13. epstein_files/epstein_files.py +65 -29
  14. epstein_files/person.py +151 -94
  15. epstein_files/util/constant/names.py +37 -10
  16. epstein_files/util/constant/output_files.py +2 -0
  17. epstein_files/util/constant/strings.py +14 -7
  18. epstein_files/util/constant/urls.py +17 -0
  19. epstein_files/util/constants.py +556 -391
  20. epstein_files/util/data.py +2 -0
  21. epstein_files/util/doc_cfg.py +44 -33
  22. epstein_files/util/env.py +34 -19
  23. epstein_files/util/file_helper.py +30 -6
  24. epstein_files/util/helpers/debugging_helper.py +13 -0
  25. epstein_files/util/helpers/env_helpers.py +21 -0
  26. epstein_files/util/highlighted_group.py +121 -37
  27. epstein_files/util/layout/left_bar_panel.py +26 -0
  28. epstein_files/util/logging.py +28 -13
  29. epstein_files/util/output.py +49 -40
  30. epstein_files/util/rich.py +30 -3
  31. epstein_files/util/word_count.py +7 -7
  32. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/METADATA +16 -3
  33. epstein_files-1.5.0.dist-info/RECORD +40 -0
  34. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +1 -1
  35. epstein_files-1.2.5.dist-info/RECORD +0 -34
  36. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
  37. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
@@ -5,7 +5,7 @@ from dataclasses import asdict, dataclass, field
5
5
  from datetime import datetime
6
6
  from pathlib import Path
7
7
  from subprocess import run
8
- from typing import Callable, ClassVar, Sequence, TypeVar
8
+ from typing import Callable, ClassVar, Self, Sequence, TypeVar
9
9
 
10
10
  from rich.console import Console, ConsoleOptions, Group, RenderResult
11
11
  from rich.padding import Padding
@@ -13,17 +13,19 @@ from rich.panel import Panel
13
13
  from rich.text import Text
14
14
  from rich.table import Table
15
15
 
16
+ from epstein_files.documents.emails.email_header import DETECT_EMAIL_REGEX
16
17
  from epstein_files.util.constant.names import *
17
18
  from epstein_files.util.constant.strings import *
18
19
  from epstein_files.util.constant.urls import *
19
- from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
20
+ from epstein_files.util.constants import ALL_FILE_CONFIGS, DOJ_FILE_STEM_REGEX, FALLBACK_TIMESTAMP
20
21
  from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time, without_falsey
21
22
  from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
22
- from epstein_files.util.env import DOCS_DIR, args
23
- from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, file_size_to_str, is_local_extract_file
23
+ from epstein_files.util.env import DOCS_DIR
24
+ from epstein_files.util.file_helper import (coerce_file_path, extract_file_id, file_size, file_size_str,
25
+ file_size_to_str, is_local_extract_file)
24
26
  from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
25
- from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table, console,
26
- highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
27
+ from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table,
28
+ console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
27
29
  from epstein_files.util.search_result import MatchedLine
28
30
 
29
31
  ALT_LINK_STYLE = 'white dim'
@@ -33,11 +35,9 @@ INFO_INDENT = 2
33
35
  INFO_PADDING = (0, 0, 0, INFO_INDENT)
34
36
  MAX_TOP_LINES_LEN = 4000 # Only for logging
35
37
  MIN_DOCUMENT_ID = 10477
36
- WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
37
38
 
38
- MIN_TIMESTAMP = datetime(1991, 1, 1)
39
- MID_TIMESTAMP = datetime(2007, 1, 1)
40
- MAX_TIMESTAMP = datetime(2020, 1, 1)
39
+ DOJ_DATASET_ID_REGEX = re.compile(r"(?:epstein_dataset_|DataSet )(\d+)")
40
+ WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
41
41
 
42
42
  FILENAME_MATCH_STYLES = [
43
43
  'dark_green',
@@ -74,7 +74,8 @@ class Document:
74
74
  Attributes:
75
75
  file_path (Path): Local path to file
76
76
  author (Name): Who is responsible for the text in the file
77
- config (DocCfg): Information about this fil
77
+ config (DocCfg): Preconfigured information about this file
78
+ doj_2026_dataset_id (int, optional): Only set for files that came from the DOJ website.
78
79
  file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
79
80
  filename (str): File's basename
80
81
  lines (str): Number of lines in the file after all the cleanup
@@ -86,6 +87,7 @@ class Document:
86
87
  # Optional fields
87
88
  author: Name = None
88
89
  config: EmailCfg | DocCfg | TextCfg | None = None
90
+ doj_2026_dataset_id: int | None = None
89
91
  file_id: str = field(init=False)
90
92
  filename: str = field(init=False)
91
93
  lines: list[str] = field(default_factory=list)
@@ -97,137 +99,117 @@ class Document:
97
99
  include_description_in_summary_panel: ClassVar[bool] = False
98
100
  strip_whitespace: ClassVar[bool] = True # Overridden in JsonFile
99
101
 
100
- def __post_init__(self):
101
- if not self.file_path.exists():
102
- raise FileNotFoundError(f"File '{self.file_path.name}' does not exist!")
103
-
104
- self.filename = self.file_path.name
105
- self.file_id = extract_file_id(self.filename)
106
- # config and url_slug could have been pre-set in Email
107
- self.config = self.config or deepcopy(ALL_FILE_CONFIGS.get(self.file_id))
108
- self.url_slug = self.url_slug or self.filename.split('.')[0]
109
-
110
- if not self.text:
111
- self._load_file()
112
-
113
- self._repair()
114
- self._extract_author()
115
- self.timestamp = self._extract_timestamp()
102
+ @property
103
+ def border_style(self) -> str:
104
+ """Should be overloaded in subclasses."""
105
+ return 'white'
116
106
 
107
+ @property
117
108
  def config_description(self) -> str | None:
118
- """Overloaded in OtherFile."""
119
109
  if self.config and self.config.description:
120
110
  return f"({self.config.description})"
121
111
 
112
+ @property
113
+ def config_timestamp(self) -> datetime | None:
114
+ """Configured timestamp, if any."""
115
+ return self.config.timestamp if self.config and self.config.timestamp else None
116
+
117
+ @property
122
118
  def date_str(self) -> str | None:
123
119
  return date_str(self.timestamp)
124
120
 
121
+ @property
125
122
  def duplicate_file_txt(self) -> Text:
126
123
  """If the file is a dupe make a nice message to explain what file it's a duplicate of."""
127
- if not self.is_duplicate():
124
+ if not self.is_duplicate:
128
125
  raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")
129
126
 
130
127
  txt = Text(f"Not showing ", style=INFO_STYLE).append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
131
128
  txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
132
129
  return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))
133
130
 
131
+ @property
134
132
  def duplicate_of_id(self) -> str | None:
135
133
  if self.config and self.config.duplicate_of_id:
136
134
  return self.config.duplicate_of_id
137
135
 
138
- def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
139
- return self.external_link(epsteinify_doc_url, style, link_txt)
140
-
141
- def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
142
- return self.external_link(epstein_media_doc_url, style, link_txt)
143
-
144
- def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
145
- return self.external_link(epstein_web_doc_url, style, link_txt)
146
-
147
- def rollcall_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
148
- return self.external_link(rollcall_doc_url, style, link_txt)
149
-
150
- def external_link(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
151
- return link_text_obj(fxn(self.url_slug), link_txt or self.file_path.stem, style)
152
-
153
- def external_links_txt(self, style: str = '', include_alt_links: bool = False) -> Text:
154
- """Returns colored links to epstein.media and alternates in a Text object."""
155
- links = [self.epstein_media_link(style=style)]
156
-
157
- if include_alt_links:
158
- links.append(self.epsteinify_link(style=ALT_LINK_STYLE, link_txt=EPSTEINIFY))
159
- links.append(self.epstein_web_link(style=ALT_LINK_STYLE, link_txt=EPSTEIN_WEB))
160
-
161
- if self._class_name() == 'Email':
162
- links.append(self.rollcall_link(style=ALT_LINK_STYLE, link_txt=ROLLCALL))
163
-
164
- links = [links[0]] + [parenthesize(link) for link in links[1:]]
165
- base_txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
166
- return base_txt.append(join_texts(links))
136
+ @property
137
+ def external_url(self) -> str:
138
+ """The primary external URL to use when linking to this document's source."""
139
+ if self.is_doj_file and self.doj_2026_dataset_id:
140
+ return doj_2026_file_url(self.doj_2026_dataset_id, self.url_slug)
141
+ else:
142
+ return epstein_media_doc_url(self.url_slug)
167
143
 
144
+ @property
168
145
  def file_id_debug_info(self) -> str:
169
146
  return ', '.join([f"{prop}={getattr(self, prop)}" for prop in ['file_id', 'filename', 'url_slug']])
170
147
 
171
- def file_info_panel(self) -> Group:
172
- """Panel with filename linking to raw file plus any additional info about the file."""
173
- panel = Panel(self.external_links_txt(include_alt_links=True), border_style=self._border_style(), expand=False)
174
- padded_info = [Padding(sentence, INFO_PADDING) for sentence in self.info()]
175
- return Group(*([panel] + padded_info))
176
-
148
+ @property
177
149
  def file_size(self) -> int:
178
150
  return file_size(self.file_path)
179
151
 
152
+ @property
180
153
  def file_size_str(self, decimal_places: int | None = None) -> str:
181
154
  return file_size_str(self.file_path, decimal_places)
182
155
 
156
+ @property
183
157
  def info(self) -> list[Text]:
184
158
  """0 to 2 sentences containing the info_txt() as well as any configured description."""
185
159
  return without_falsey([
186
- self.info_txt(),
187
- highlighter(Text(self.config_description(), style=INFO_STYLE)) if self.config_description() else None
160
+ self.info_txt,
161
+ highlighter(Text(self.config_description, style=INFO_STYLE)) if self.config_description else None
188
162
  ])
189
163
 
164
+ @property
190
165
  def info_txt(self) -> Text | None:
191
166
  """Secondary info about this file (description recipients, etc). Overload in subclasses."""
192
167
  return None
193
168
 
169
+ @property
194
170
  def is_attribution_uncertain(self) -> bool:
195
171
  return bool(self.config and self.config.is_attribution_uncertain)
196
172
 
173
+ @property
174
+ def is_doj_file(self) -> bool:
175
+ return bool(DOJ_FILE_STEM_REGEX.match(self.file_id))
176
+
177
+ @property
197
178
  def is_duplicate(self) -> bool:
198
- return bool(self.duplicate_of_id())
179
+ return bool(self.duplicate_of_id)
180
+
181
+ @property
182
+ def is_empty(self) -> bool:
183
+ return len(self.text.strip()) < 20
199
184
 
185
+ @property
186
+ def is_interesting(self) -> bool:
187
+ return bool(self.config and self.config.is_interesting)
188
+
189
+ @property
200
190
  def is_local_extract_file(self) -> bool:
201
191
  """True if extracted from other file (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
202
192
  return is_local_extract_file(self.filename)
203
193
 
194
+ @property
204
195
  def length(self) -> int:
205
196
  return len(self.text)
206
197
 
207
- def log(self, msg: str, level: int = logging.INFO):
208
- """Log with filename as a prefix."""
209
- logger.log(level, f"{self.file_path.stem} {msg}")
210
-
211
- def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
212
- """Log first 'n' lines of self.text at 'level'. 'msg' can be optionally provided."""
213
- separator = '\n\n' if '\n' in msg else '. '
214
- msg = (msg + separator) if msg else ''
215
- self.log(f"{msg}First {n} lines:\n\n{self.top_lines(n)}\n", level)
216
-
217
- def matching_lines(self, _pattern: re.Pattern | str) -> list[MatchedLine]:
218
- """Return lines matching a regex as colored list[Text]."""
219
- pattern = patternize(_pattern)
220
- return [MatchedLine(line, i) for i, line in enumerate(self.lines) if pattern.search(line)]
198
+ @property
199
+ def local_path_and_url(self) -> Text:
200
+ """Text obj with local path and URL."""
201
+ return Text(f"{self.file_id} URL: {self.external_url}\n{self.file_id} Local path: '{self.file_path}'")
221
202
 
203
+ @property
222
204
  def metadata(self) -> Metadata:
223
- metadata = self.config.metadata() if self.config else {}
205
+ metadata = self.config.metadata if self.config else {}
224
206
  metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
225
- metadata['bytes'] = self.file_size()
207
+ metadata['bytes'] = self.file_size
226
208
  metadata['filename'] = f"{self.url_slug}.txt"
227
- metadata['num_lines'] = self.num_lines()
228
- metadata['type'] = self._class_name()
209
+ metadata['num_lines'] = self.num_lines
210
+ metadata['type'] = self._class_name
229
211
 
230
- if self.is_local_extract_file():
212
+ if self.is_local_extract_file:
231
213
  metadata['extracted_file'] = {
232
214
  'explanation': 'manually extracted from one of the other files',
233
215
  'extracted_from': self.url_slug + '.txt',
@@ -236,10 +218,141 @@ class Document:
236
218
 
237
219
  return metadata
238
220
 
221
+ @property
239
222
  def num_lines(self) -> int:
240
223
  return len(self.lines)
241
224
 
225
+ @property
226
+ def panel_title_timestamp(self) -> str | None:
227
+ """String placed in the `title` of the enclosing `Panel` when printing this document's text."""
228
+ if (self.timestamp or FALLBACK_TIMESTAMP) == FALLBACK_TIMESTAMP:
229
+ return None
230
+
231
+ prefix = '' if self.config and self.config.timestamp else 'inferred '
232
+ return f"{prefix}timestamp: {remove_zero_time(self.timestamp)}"
233
+
234
+ @property
235
+ def summary_panel(self) -> Panel:
236
+ """Panelized description() with info_txt(), used in search results."""
237
+ sentences = [self.summary()]
238
+
239
+ if self.include_description_in_summary_panel:
240
+ sentences += [Text('', style='italic').append(h) for h in self.info]
241
+
242
+ return Panel(Group(*sentences), border_style=self._class_style, expand=False)
243
+
244
+ @property
245
+ def timestamp_sort_key(self) -> tuple[datetime, str, int]:
246
+ """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
247
+ if self.duplicate_of_id:
248
+ sort_id = self.duplicate_of_id
249
+ dupe_idx = 1
250
+ else:
251
+ sort_id = self.file_id
252
+ dupe_idx = 0
253
+
254
+ return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
255
+
256
+ @property
257
+ def _class_name(self) -> str:
258
+ """Annoying workaround for circular import issues and isinstance()."""
259
+ return str(type(self).__name__)
260
+
261
+ @property
262
+ def _class_style(self) -> str:
263
+ return DOC_TYPE_STYLES[self._class_name]
264
+
265
+ def __post_init__(self):
266
+ if not self.file_path.exists():
267
+ raise FileNotFoundError(f"File '{self.file_path.name}' does not exist!")
268
+
269
+ self.filename = self.file_path.name
270
+ self.file_id = extract_file_id(self.filename)
271
+ # config and url_slug could have been pre-set in Email
272
+ self.config = self.config or deepcopy(ALL_FILE_CONFIGS.get(self.file_id))
273
+ self.url_slug = self.url_slug or self.filename.split('.')[0]
274
+
275
+ # Extract the DOJ dataset ID from the path
276
+ if self.is_doj_file:
277
+ if (data_set_match := DOJ_DATASET_ID_REGEX.search(str(self.file_path))):
278
+ self.doj_2026_dataset_id = int(data_set_match.group(1))
279
+ logger.info(f"Extracted data set ID {self.doj_2026_dataset_id} for {self.url_slug}")
280
+ else:
281
+ self.warn(f"Couldn't find a data set ID in path '{self.file_path}'! Cannot create valid links.")
282
+
283
+ self.text = self.text or self._load_file()
284
+ self._set_computed_fields(text=self.text)
285
+ self._repair()
286
+ self._extract_author()
287
+ self.timestamp = self.config_timestamp or self._extract_timestamp()
288
+
289
+ @classmethod
290
+ def from_file_id(cls, file_id: str | int) -> Self:
291
+ """Alternate constructor that finds the file path automatically and builds a `Document`."""
292
+ return cls(coerce_file_path(file_id))
293
+
294
+ def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
295
+ return self.external_link(epsteinify_doc_url, style, link_txt)
296
+
297
+ def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
298
+ return self.external_link(epstein_media_doc_url, style, link_txt)
299
+
300
+ def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
301
+ return self.external_link(epstein_web_doc_url, style, link_txt)
302
+
303
+ def rollcall_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
304
+ return self.external_link(rollcall_doc_url, style, link_txt)
305
+
306
+ def external_link(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
307
+ return link_text_obj(fxn(self.url_slug), link_txt or self.file_path.stem, style)
308
+
309
+ def external_links_txt(self, style: str = '', include_alt_links: bool = False) -> Text:
310
+ """Returns colored links to epstein.media and alternates in a Text object."""
311
+ links = [link_text_obj(self.external_url, self.url_slug, style=style)]
312
+
313
+ if include_alt_links:
314
+ if self.doj_2026_dataset_id:
315
+ jmail_url = jmail_doj_2026_file_url(self.doj_2026_dataset_id, self.file_id)
316
+ jmail_link = link_text_obj(jmail_url, JMAIL, style=f"{style} dim" if style else ARCHIVE_LINK_COLOR)
317
+ links.append(jmail_link)
318
+ else:
319
+ links.append(self.epsteinify_link(style=ALT_LINK_STYLE, link_txt=EPSTEINIFY))
320
+ links.append(self.epstein_web_link(style=ALT_LINK_STYLE, link_txt=EPSTEIN_WEB))
321
+
322
+ if self._class_name == 'Email':
323
+ links.append(self.rollcall_link(style=ALT_LINK_STYLE, link_txt=ROLLCALL))
324
+
325
+ links = [links[0]] + [parenthesize(link) for link in links[1:]]
326
+ base_txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
327
+ return base_txt.append(join_texts(links))
328
+
329
+ def file_info_panel(self) -> Group:
330
+ """Panel with filename linking to raw file plus any additional info about the file."""
331
+ panel = Panel(self.external_links_txt(include_alt_links=True), border_style=self.border_style, expand=False)
332
+ padded_info = [Padding(sentence, INFO_PADDING) for sentence in self.info]
333
+ return Group(*([panel] + padded_info))
334
+
335
+ def log(self, msg: str, level: int = logging.INFO):
336
+ """Log a message with with this document's filename as a prefix."""
337
+ logger.log(level, f"{self.file_path.stem} {msg}")
338
+
339
+ def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
340
+ """Log first 'n' lines of self.text at 'level'. 'msg' can be optionally provided."""
341
+ separator = '\n\n' if '\n' in msg else '. '
342
+ msg = (msg + separator) if msg else ''
343
+ self.log(f"{msg}First {n} lines:\n\n{self.top_lines(n)}\n", level)
344
+
345
+ def matching_lines(self, _pattern: re.Pattern | str) -> list[MatchedLine]:
346
+ """Return lines matching a regex as colored list[Text]."""
347
+ pattern = patternize(_pattern)
348
+ return [MatchedLine(line, i) for i, line in enumerate(self.lines) if pattern.search(line)]
349
+
350
+ def printable_document(self) -> Self:
351
+ """Overloaded by `DojFile` to convert some files to `Email` objects."""
352
+ return self
353
+
242
354
  def raw_text(self) -> str:
355
+ """Reload the raw data from the underlying file and return it."""
243
356
  with open(self.file_path) as f:
244
357
  return f.read()
245
358
 
@@ -253,13 +366,9 @@ class Document:
253
366
 
254
367
  return text
255
368
 
256
- def source_file_id(self) -> str:
257
- """Strip off the _1, _2, etc. suffixes for extracted documents."""
258
- return self.file_id[0:6]
259
-
260
369
  def summary(self) -> Text:
261
- """Summary of this file for logging. Brackets are left open for subclasses to add stuff."""
262
- txt = Text('').append(self._class_name(), style=self._class_style())
370
+ """Summary of this file for logging. Subclasses should extend with a method that closes the open '['."""
371
+ txt = Text('').append(self._class_name, style=self._class_style)
263
372
  txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)
264
373
 
265
374
  if self.timestamp:
@@ -267,52 +376,22 @@ class Document:
267
376
  txt.append(' (', style=SYMBOL_STYLE)
268
377
  txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
269
378
 
270
- txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(0), style='aquamarine1')))
271
- txt.append(", ").append(key_value_txt('lines', self.num_lines()))
379
+ txt.append(' [').append(key_value_txt('size', Text(str(self.length), style='aquamarine1')))
380
+ txt.append(", ").append(key_value_txt('lines', self.num_lines))
272
381
 
273
382
  if self.config and self.config.duplicate_of_id:
274
383
  txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='cyan dim')))
275
384
 
276
385
  return txt
277
386
 
278
- def summary_panel(self) -> Panel:
279
- """Panelized description() with info_txt(), used in search results."""
280
- sentences = [self.summary()]
281
-
282
- if self.include_description_in_summary_panel:
283
- sentences += [Text('', style='italic').append(h) for h in self.info()]
284
-
285
- return Panel(Group(*sentences), border_style=self._class_style(), expand=False)
286
-
287
- def timestamp_sort_key(self) -> tuple[datetime, str, int]:
288
- """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
289
- if self.is_duplicate():
290
- sort_id = self.config.duplicate_of_id
291
- dupe_idx = 1
292
- else:
293
- sort_id = self.file_id
294
- dupe_idx = 0
295
-
296
- return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
297
-
298
387
  def top_lines(self, n: int = 10) -> str:
299
388
  """First n lines."""
300
389
  return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
301
390
 
302
391
  def warn(self, msg: str) -> None:
392
+ """Print a warning message prefixed by info about this `Document`."""
303
393
  self.log(msg, level=logging.WARNING)
304
394
 
305
- def _border_style(self) -> str:
306
- """Should be overloaded in subclasses."""
307
- return 'white'
308
-
309
- def _class_name(self) -> str:
310
- """Annoying workaround for circular import issues and isinstance()."""
311
- return str(type(self).__name__)
312
-
313
- def _class_style(self) -> str:
314
- return DOC_TYPE_STYLES[self._class_name()]
315
-
316
395
  def _extract_author(self) -> None:
317
396
  """Get author from config. Extended in Email subclass to also check headers."""
318
397
  if self.config and self.config.author:
@@ -322,7 +401,7 @@ class Document:
322
401
  """Should be implemented in subclasses."""
323
402
  pass
324
403
 
325
- def _load_file(self) -> None:
404
+ def _load_file(self) -> str:
326
405
  """Remove BOM and HOUSE OVERSIGHT lines, strip whitespace."""
327
406
  text = self.raw_text()
328
407
  text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text # remove BOM
@@ -330,11 +409,10 @@ class Document:
330
409
 
331
410
  lines = [
332
411
  line.strip() if self.strip_whitespace else line for line in text.split('\n')
333
- if not line.startswith(HOUSE_OVERSIGHT)
412
+ if not (line.startswith(HOUSE_OVERSIGHT) or line.startswith('EFTA'))
334
413
  ]
335
414
 
336
- self.text = collapse_newlines('\n'.join(lines))
337
- self.lines = self.text.split('\n')
415
+ return collapse_newlines('\n'.join(lines))
338
416
 
339
417
  def _repair(self) -> None:
340
418
  """Can optionally be overloaded in subclasses to further improve self.text."""
@@ -364,11 +442,20 @@ class Document:
364
442
  with open(output_path, 'w') as f:
365
443
  f.write(self.text)
366
444
 
367
- logger.warning(f"Wrote {self.length()} chars of cleaned {self.filename} to {output_path}.")
445
+ logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
368
446
 
369
447
  def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
448
+ """Default `Document` renderer (Email and MessengerLog override this)."""
370
449
  yield self.file_info_panel()
371
- text_panel = Panel(highlighter(self.text), border_style=self._border_style(), expand=False)
450
+
451
+ text_panel = Panel(
452
+ highlighter(self.text),
453
+ border_style=self.border_style,
454
+ expand=False,
455
+ title=f"({self.panel_title_timestamp})",
456
+ title_align='right',
457
+ )
458
+
372
459
  yield Padding(text_panel, (0, 0, 1, INFO_INDENT))
373
460
 
374
461
  def __str__(self) -> str:
@@ -392,8 +479,8 @@ class Document:
392
479
  'count': str(file_count),
393
480
  'author_count': NA_TXT if is_author_na else str(author_count),
394
481
  'no_author_count': NA_TXT if is_author_na else str(file_count - author_count),
395
- 'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain()])),
396
- 'bytes': file_size_to_str(sum([f.file_size() for f in files])),
482
+ 'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain])),
483
+ 'bytes': file_size_to_str(sum([f.file_size for f in files])),
397
484
  }
398
485
 
399
486
  @classmethod
@@ -430,6 +517,11 @@ class Document:
430
517
  for f in tmpfiles:
431
518
  f.unlink()
432
519
 
520
+ @staticmethod
521
+ def is_email(doc: 'Document') -> bool:
522
+ search_area = doc.text[0:5000] # Limit search area to avoid pointless scans of huge files
523
+ return isinstance(doc.config, EmailCfg) or bool(DETECT_EMAIL_REGEX.match(search_area) and doc.config is None)
524
+
433
525
  @staticmethod
434
526
  def known_author_count(docs: Sequence['Document']) -> int:
435
527
  """Count of how many Document objects have an author attribution."""
@@ -439,9 +531,13 @@ class Document:
439
531
  def sort_by_id(docs: Sequence['DocumentType']) -> list['DocumentType']:
440
532
  return sorted(docs, key=lambda d: d.file_id)
441
533
 
534
+ @staticmethod
535
+ def sort_by_length(docs: Sequence['DocumentType']) -> list['DocumentType']:
536
+ return sorted(docs, key=lambda d: d.file_size, reverse=True)
537
+
442
538
  @staticmethod
443
539
  def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
444
- return sorted(docs, key=lambda doc: doc.timestamp_sort_key())
540
+ return sorted(docs, key=lambda doc: doc.timestamp_sort_key)
445
541
 
446
542
  @staticmethod
447
543
  def uniquify(documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
@@ -451,7 +547,7 @@ class Document:
451
547
 
452
548
  @staticmethod
453
549
  def without_dupes(docs: Sequence['DocumentType']) -> list['DocumentType']:
454
- return [doc for doc in docs if not doc.is_duplicate()]
550
+ return [doc for doc in docs if not doc.is_duplicate]
455
551
 
456
552
 
457
553
  DocumentType = TypeVar('DocumentType', bound=Document)