epstein-files 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. epstein_files/__init__.py +31 -18
  2. epstein_files/documents/communication.py +9 -5
  3. epstein_files/documents/document.py +225 -136
  4. epstein_files/documents/doj_file.py +242 -0
  5. epstein_files/documents/doj_files/full_text.py +166 -0
  6. epstein_files/documents/email.py +138 -163
  7. epstein_files/documents/emails/email_header.py +21 -11
  8. epstein_files/documents/emails/emailers.py +223 -0
  9. epstein_files/documents/imessage/text_message.py +2 -3
  10. epstein_files/documents/json_file.py +18 -14
  11. epstein_files/documents/messenger_log.py +23 -39
  12. epstein_files/documents/other_file.py +48 -44
  13. epstein_files/epstein_files.py +54 -33
  14. epstein_files/person.py +142 -110
  15. epstein_files/util/constant/names.py +29 -6
  16. epstein_files/util/constant/output_files.py +2 -0
  17. epstein_files/util/constant/strings.py +12 -6
  18. epstein_files/util/constant/urls.py +17 -0
  19. epstein_files/util/constants.py +101 -174
  20. epstein_files/util/data.py +2 -0
  21. epstein_files/util/doc_cfg.py +20 -15
  22. epstein_files/util/env.py +24 -16
  23. epstein_files/util/file_helper.py +28 -6
  24. epstein_files/util/helpers/debugging_helper.py +13 -0
  25. epstein_files/util/helpers/env_helpers.py +21 -0
  26. epstein_files/util/highlighted_group.py +57 -16
  27. epstein_files/util/layout/left_bar_panel.py +26 -0
  28. epstein_files/util/logging.py +28 -13
  29. epstein_files/util/output.py +33 -10
  30. epstein_files/util/rich.py +28 -2
  31. epstein_files/util/word_count.py +7 -7
  32. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/METADATA +14 -1
  33. epstein_files-1.5.0.dist-info/RECORD +40 -0
  34. epstein_files-1.4.1.dist-info/RECORD +0 -34
  35. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
  36. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
  37. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +0 -0
@@ -5,7 +5,7 @@ from dataclasses import asdict, dataclass, field
5
5
  from datetime import datetime
6
6
  from pathlib import Path
7
7
  from subprocess import run
8
- from typing import Callable, ClassVar, Sequence, TypeVar
8
+ from typing import Callable, ClassVar, Self, Sequence, TypeVar
9
9
 
10
10
  from rich.console import Console, ConsoleOptions, Group, RenderResult
11
11
  from rich.padding import Padding
@@ -13,17 +13,19 @@ from rich.panel import Panel
13
13
  from rich.text import Text
14
14
  from rich.table import Table
15
15
 
16
+ from epstein_files.documents.emails.email_header import DETECT_EMAIL_REGEX
16
17
  from epstein_files.util.constant.names import *
17
18
  from epstein_files.util.constant.strings import *
18
19
  from epstein_files.util.constant.urls import *
19
- from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
20
+ from epstein_files.util.constants import ALL_FILE_CONFIGS, DOJ_FILE_STEM_REGEX, FALLBACK_TIMESTAMP
20
21
  from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time, without_falsey
21
22
  from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
22
- from epstein_files.util.env import DOCS_DIR, args
23
- from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, file_size_to_str, is_local_extract_file
23
+ from epstein_files.util.env import DOCS_DIR
24
+ from epstein_files.util.file_helper import (coerce_file_path, extract_file_id, file_size, file_size_str,
25
+ file_size_to_str, is_local_extract_file)
24
26
  from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
25
- from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table, console,
26
- highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
27
+ from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table,
28
+ console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
27
29
  from epstein_files.util.search_result import MatchedLine
28
30
 
29
31
  ALT_LINK_STYLE = 'white dim'
@@ -33,11 +35,9 @@ INFO_INDENT = 2
33
35
  INFO_PADDING = (0, 0, 0, INFO_INDENT)
34
36
  MAX_TOP_LINES_LEN = 4000 # Only for logging
35
37
  MIN_DOCUMENT_ID = 10477
36
- WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
37
38
 
38
- MIN_TIMESTAMP = datetime(1991, 1, 1)
39
- MID_TIMESTAMP = datetime(2007, 1, 1)
40
- MAX_TIMESTAMP = datetime(2020, 1, 1)
39
+ DOJ_DATASET_ID_REGEX = re.compile(r"(?:epstein_dataset_|DataSet )(\d+)")
40
+ WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
41
41
 
42
42
  FILENAME_MATCH_STYLES = [
43
43
  'dark_green',
@@ -74,7 +74,8 @@ class Document:
74
74
  Attributes:
75
75
  file_path (Path): Local path to file
76
76
  author (Name): Who is responsible for the text in the file
77
- config (DocCfg): Information about this fil
77
+ config (DocCfg): Preconfigured information about this file
78
+ doj_2026_dataset_id (int, optional): Only set for files that came from the DOJ website.
78
79
  file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
79
80
  filename (str): File's basename
80
81
  lines (list[str]): Lines of text in the file after all the cleanup
@@ -86,6 +87,7 @@ class Document:
86
87
  # Optional fields
87
88
  author: Name = None
88
89
  config: EmailCfg | DocCfg | TextCfg | None = None
90
+ doj_2026_dataset_id: int | None = None
89
91
  file_id: str = field(init=False)
90
92
  filename: str = field(init=False)
91
93
  lines: list[str] = field(default_factory=list)
@@ -97,140 +99,117 @@ class Document:
97
99
  include_description_in_summary_panel: ClassVar[bool] = False
98
100
  strip_whitespace: ClassVar[bool] = True # Overridden in JsonFile
99
101
 
100
- def __post_init__(self):
101
- if not self.file_path.exists():
102
- raise FileNotFoundError(f"File '{self.file_path.name}' does not exist!")
103
-
104
- self.filename = self.file_path.name
105
- self.file_id = extract_file_id(self.filename)
106
- # config and url_slug could have been pre-set in Email
107
- self.config = self.config or deepcopy(ALL_FILE_CONFIGS.get(self.file_id))
108
- self.url_slug = self.url_slug or self.filename.split('.')[0]
109
-
110
- if not self.text:
111
- self._load_file()
112
-
113
- self._repair()
114
- self._extract_author()
115
- self.timestamp = self._extract_timestamp()
102
+ @property
103
+ def border_style(self) -> str:
104
+ """Should be overloaded in subclasses."""
105
+ return 'white'
116
106
 
107
+ @property
117
108
  def config_description(self) -> str | None:
118
- """Overloaded in OtherFile."""
119
109
  if self.config and self.config.description:
120
110
  return f"({self.config.description})"
121
111
 
112
+ @property
113
+ def config_timestamp(self) -> datetime | None:
114
+ """Configured timestamp, if any."""
115
+ return self.config.timestamp if self.config and self.config.timestamp else None
116
+
117
+ @property
122
118
  def date_str(self) -> str | None:
123
119
  return date_str(self.timestamp)
124
120
 
121
+ @property
125
122
  def duplicate_file_txt(self) -> Text:
126
123
  """If the file is a dupe make a nice message to explain what file it's a duplicate of."""
127
- if not self.is_duplicate():
124
+ if not self.is_duplicate:
128
125
  raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")
129
126
 
130
127
  txt = Text(f"Not showing ", style=INFO_STYLE).append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
131
128
  txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
132
129
  return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))
133
130
 
131
+ @property
134
132
  def duplicate_of_id(self) -> str | None:
135
133
  if self.config and self.config.duplicate_of_id:
136
134
  return self.config.duplicate_of_id
137
135
 
138
- def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
139
- return self.external_link(epsteinify_doc_url, style, link_txt)
140
-
141
- def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
142
- return self.external_link(epstein_media_doc_url, style, link_txt)
143
-
144
- def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
145
- return self.external_link(epstein_web_doc_url, style, link_txt)
146
-
147
- def rollcall_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
148
- return self.external_link(rollcall_doc_url, style, link_txt)
149
-
150
- def external_link(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
151
- return link_text_obj(fxn(self.url_slug), link_txt or self.file_path.stem, style)
152
-
153
- def external_links_txt(self, style: str = '', include_alt_links: bool = False) -> Text:
154
- """Returns colored links to epstein.media and alternates in a Text object."""
155
- links = [self.epstein_media_link(style=style)]
156
-
157
- if include_alt_links:
158
- links.append(self.epsteinify_link(style=ALT_LINK_STYLE, link_txt=EPSTEINIFY))
159
- links.append(self.epstein_web_link(style=ALT_LINK_STYLE, link_txt=EPSTEIN_WEB))
160
-
161
- if self._class_name() == 'Email':
162
- links.append(self.rollcall_link(style=ALT_LINK_STYLE, link_txt=ROLLCALL))
163
-
164
- links = [links[0]] + [parenthesize(link) for link in links[1:]]
165
- base_txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
166
- return base_txt.append(join_texts(links))
136
+ @property
137
+ def external_url(self) -> str:
138
+ """The primary external URL to use when linking to this document's source."""
139
+ if self.is_doj_file and self.doj_2026_dataset_id:
140
+ return doj_2026_file_url(self.doj_2026_dataset_id, self.url_slug)
141
+ else:
142
+ return epstein_media_doc_url(self.url_slug)
167
143
 
144
+ @property
168
145
  def file_id_debug_info(self) -> str:
169
146
  return ', '.join([f"{prop}={getattr(self, prop)}" for prop in ['file_id', 'filename', 'url_slug']])
170
147
 
171
- def file_info_panel(self) -> Group:
172
- """Panel with filename linking to raw file plus any additional info about the file."""
173
- panel = Panel(self.external_links_txt(include_alt_links=True), border_style=self._border_style(), expand=False)
174
- padded_info = [Padding(sentence, INFO_PADDING) for sentence in self.info()]
175
- return Group(*([panel] + padded_info))
176
-
148
+ @property
177
149
  def file_size(self) -> int:
178
150
  return file_size(self.file_path)
179
151
 
152
+ @property
180
153
  def file_size_str(self, decimal_places: int | None = None) -> str:
181
154
  return file_size_str(self.file_path, decimal_places)
182
155
 
156
+ @property
183
157
  def info(self) -> list[Text]:
184
158
  """0 to 2 sentences containing the info_txt() as well as any configured description."""
185
159
  return without_falsey([
186
- self.info_txt(),
187
- highlighter(Text(self.config_description(), style=INFO_STYLE)) if self.config_description() else None
160
+ self.info_txt,
161
+ highlighter(Text(self.config_description, style=INFO_STYLE)) if self.config_description else None
188
162
  ])
189
163
 
164
+ @property
190
165
  def info_txt(self) -> Text | None:
191
166
  """Secondary info about this file (description recipients, etc). Overload in subclasses."""
192
167
  return None
193
168
 
169
+ @property
194
170
  def is_attribution_uncertain(self) -> bool:
195
171
  return bool(self.config and self.config.is_attribution_uncertain)
196
172
 
173
+ @property
174
+ def is_doj_file(self) -> bool:
175
+ return bool(DOJ_FILE_STEM_REGEX.match(self.file_id))
176
+
177
+ @property
197
178
  def is_duplicate(self) -> bool:
198
- return bool(self.duplicate_of_id())
179
+ return bool(self.duplicate_of_id)
199
180
 
181
+ @property
182
+ def is_empty(self) -> bool:
183
+ return len(self.text.strip()) < 20
184
+
185
+ @property
200
186
  def is_interesting(self) -> bool:
201
187
  return bool(self.config and self.config.is_interesting)
202
188
 
189
+ @property
203
190
  def is_local_extract_file(self) -> bool:
204
191
  """True if extracted from other file (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
205
192
  return is_local_extract_file(self.filename)
206
193
 
194
+ @property
207
195
  def length(self) -> int:
208
196
  return len(self.text)
209
197
 
210
- def log(self, msg: str, level: int = logging.INFO):
211
- """Log with filename as a prefix."""
212
- logger.log(level, f"{self.file_path.stem} {msg}")
213
-
214
- def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
215
- """Log first 'n' lines of self.text at 'level'. 'msg' can be optionally provided."""
216
- separator = '\n\n' if '\n' in msg else '. '
217
- msg = (msg + separator) if msg else ''
218
- self.log(f"{msg}First {n} lines:\n\n{self.top_lines(n)}\n", level)
219
-
220
- def matching_lines(self, _pattern: re.Pattern | str) -> list[MatchedLine]:
221
- """Return lines matching a regex as colored list[Text]."""
222
- pattern = patternize(_pattern)
223
- return [MatchedLine(line, i) for i, line in enumerate(self.lines) if pattern.search(line)]
198
+ @property
199
+ def local_path_and_url(self) -> Text:
200
+ """Text obj with local path and URL."""
201
+ return Text(f"{self.file_id} URL: {self.external_url}\n{self.file_id} Local path: '{self.file_path}'")
224
202
 
203
+ @property
225
204
  def metadata(self) -> Metadata:
226
- metadata = self.config.metadata() if self.config else {}
205
+ metadata = self.config.metadata if self.config else {}
227
206
  metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
228
- metadata['bytes'] = self.file_size()
207
+ metadata['bytes'] = self.file_size
229
208
  metadata['filename'] = f"{self.url_slug}.txt"
230
- metadata['num_lines'] = self.num_lines()
231
- metadata['type'] = self._class_name()
209
+ metadata['num_lines'] = self.num_lines
210
+ metadata['type'] = self._class_name
232
211
 
233
- if self.is_local_extract_file():
212
+ if self.is_local_extract_file:
234
213
  metadata['extracted_file'] = {
235
214
  'explanation': 'manually extracted from one of the other files',
236
215
  'extracted_from': self.url_slug + '.txt',
@@ -239,10 +218,141 @@ class Document:
239
218
 
240
219
  return metadata
241
220
 
221
+ @property
242
222
  def num_lines(self) -> int:
243
223
  return len(self.lines)
244
224
 
225
+ @property
226
+ def panel_title_timestamp(self) -> str | None:
227
+ """String placed in the `title` of the enclosing `Panel` when printing this document's text."""
228
+ if (self.timestamp or FALLBACK_TIMESTAMP) == FALLBACK_TIMESTAMP:
229
+ return None
230
+
231
+ prefix = '' if self.config and self.config.timestamp else 'inferred '
232
+ return f"{prefix}timestamp: {remove_zero_time(self.timestamp)}"
233
+
234
+ @property
235
+ def summary_panel(self) -> Panel:
236
+ """Panelized summary() plus, if `include_description_in_summary_panel` is set, the `info` sentences; used in search results."""
237
+ sentences = [self.summary()]
238
+
239
+ if self.include_description_in_summary_panel:
240
+ sentences += [Text('', style='italic').append(h) for h in self.info]
241
+
242
+ return Panel(Group(*sentences), border_style=self._class_style, expand=False)
243
+
244
+ @property
245
+ def timestamp_sort_key(self) -> tuple[datetime, str, int]:
246
+ """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
247
+ if self.duplicate_of_id:
248
+ sort_id = self.duplicate_of_id
249
+ dupe_idx = 1
250
+ else:
251
+ sort_id = self.file_id
252
+ dupe_idx = 0
253
+
254
+ return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
255
+
256
+ @property
257
+ def _class_name(self) -> str:
258
+ """Annoying workaround for circular import issues and isinstance()."""
259
+ return str(type(self).__name__)
260
+
261
+ @property
262
+ def _class_style(self) -> str:
263
+ return DOC_TYPE_STYLES[self._class_name]
264
+
265
+ def __post_init__(self):
266
+ if not self.file_path.exists():
267
+ raise FileNotFoundError(f"File '{self.file_path.name}' does not exist!")
268
+
269
+ self.filename = self.file_path.name
270
+ self.file_id = extract_file_id(self.filename)
271
+ # config and url_slug could have been pre-set in Email
272
+ self.config = self.config or deepcopy(ALL_FILE_CONFIGS.get(self.file_id))
273
+ self.url_slug = self.url_slug or self.filename.split('.')[0]
274
+
275
+ # Extract the DOJ dataset ID from the path
276
+ if self.is_doj_file:
277
+ if (data_set_match := DOJ_DATASET_ID_REGEX.search(str(self.file_path))):
278
+ self.doj_2026_dataset_id = int(data_set_match.group(1))
279
+ logger.info(f"Extracted data set ID {self.doj_2026_dataset_id} for {self.url_slug}")
280
+ else:
281
+ self.warn(f"Couldn't find a data set ID in path '{self.file_path}'! Cannot create valid links.")
282
+
283
+ self.text = self.text or self._load_file()
284
+ self._set_computed_fields(text=self.text)
285
+ self._repair()
286
+ self._extract_author()
287
+ self.timestamp = self.config_timestamp or self._extract_timestamp()
288
+
289
+ @classmethod
290
+ def from_file_id(cls, file_id: str | int) -> Self:
291
+ """Alternate constructor that finds the file path automatically and builds a `Document`."""
292
+ return cls(coerce_file_path(file_id))
293
+
294
+ def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
295
+ return self.external_link(epsteinify_doc_url, style, link_txt)
296
+
297
+ def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
298
+ return self.external_link(epstein_media_doc_url, style, link_txt)
299
+
300
+ def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
301
+ return self.external_link(epstein_web_doc_url, style, link_txt)
302
+
303
+ def rollcall_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
304
+ return self.external_link(rollcall_doc_url, style, link_txt)
305
+
306
+ def external_link(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
307
+ return link_text_obj(fxn(self.url_slug), link_txt or self.file_path.stem, style)
308
+
309
+ def external_links_txt(self, style: str = '', include_alt_links: bool = False) -> Text:
310
+ """Returns colored links to this document's primary external URL and any alternates in a Text object."""
311
+ links = [link_text_obj(self.external_url, self.url_slug, style=style)]
312
+
313
+ if include_alt_links:
314
+ if self.doj_2026_dataset_id:
315
+ jmail_url = jmail_doj_2026_file_url(self.doj_2026_dataset_id, self.file_id)
316
+ jmail_link = link_text_obj(jmail_url, JMAIL, style=f"{style} dim" if style else ARCHIVE_LINK_COLOR)
317
+ links.append(jmail_link)
318
+ else:
319
+ links.append(self.epsteinify_link(style=ALT_LINK_STYLE, link_txt=EPSTEINIFY))
320
+ links.append(self.epstein_web_link(style=ALT_LINK_STYLE, link_txt=EPSTEIN_WEB))
321
+
322
+ if self._class_name == 'Email':
323
+ links.append(self.rollcall_link(style=ALT_LINK_STYLE, link_txt=ROLLCALL))
324
+
325
+ links = [links[0]] + [parenthesize(link) for link in links[1:]]
326
+ base_txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
327
+ return base_txt.append(join_texts(links))
328
+
329
+ def file_info_panel(self) -> Group:
330
+ """Panel with filename linking to raw file plus any additional info about the file."""
331
+ panel = Panel(self.external_links_txt(include_alt_links=True), border_style=self.border_style, expand=False)
332
+ padded_info = [Padding(sentence, INFO_PADDING) for sentence in self.info]
333
+ return Group(*([panel] + padded_info))
334
+
335
+ def log(self, msg: str, level: int = logging.INFO):
336
+ """Log a message with with this document's filename as a prefix."""
337
+ logger.log(level, f"{self.file_path.stem} {msg}")
338
+
339
+ def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
340
+ """Log first 'n' lines of self.text at 'level'. 'msg' can be optionally provided."""
341
+ separator = '\n\n' if '\n' in msg else '. '
342
+ msg = (msg + separator) if msg else ''
343
+ self.log(f"{msg}First {n} lines:\n\n{self.top_lines(n)}\n", level)
344
+
345
+ def matching_lines(self, _pattern: re.Pattern | str) -> list[MatchedLine]:
346
+ """Return the lines matching a regex as a list of `MatchedLine` objects."""
347
+ pattern = patternize(_pattern)
348
+ return [MatchedLine(line, i) for i, line in enumerate(self.lines) if pattern.search(line)]
349
+
350
+ def printable_document(self) -> Self:
351
+ """Overloaded by `DojFile` to convert some files to `Email` objects."""
352
+ return self
353
+
245
354
  def raw_text(self) -> str:
355
+ """Reload the raw data from the underlying file and return it."""
246
356
  with open(self.file_path) as f:
247
357
  return f.read()
248
358
 
@@ -256,13 +366,9 @@ class Document:
256
366
 
257
367
  return text
258
368
 
259
- def source_file_id(self) -> str:
260
- """Strip off the _1, _2, etc. suffixes for extracted documents."""
261
- return self.file_id[0:6]
262
-
263
369
  def summary(self) -> Text:
264
- """Summary of this file for logging. Brackets are left open for subclasses to add stuff."""
265
- txt = Text('').append(self._class_name(), style=self._class_style())
370
+ """Summary of this file for logging. Subclasses should extend with a method that closes the open '['."""
371
+ txt = Text('').append(self._class_name, style=self._class_style)
266
372
  txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)
267
373
 
268
374
  if self.timestamp:
@@ -270,52 +376,22 @@ class Document:
270
376
  txt.append(' (', style=SYMBOL_STYLE)
271
377
  txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
272
378
 
273
- txt.append(' [').append(key_value_txt('size', Text(str(self.length()), style='aquamarine1')))
274
- txt.append(", ").append(key_value_txt('lines', self.num_lines()))
379
+ txt.append(' [').append(key_value_txt('size', Text(str(self.length), style='aquamarine1')))
380
+ txt.append(", ").append(key_value_txt('lines', self.num_lines))
275
381
 
276
382
  if self.config and self.config.duplicate_of_id:
277
383
  txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='cyan dim')))
278
384
 
279
385
  return txt
280
386
 
281
- def summary_panel(self) -> Panel:
282
- """Panelized description() with info_txt(), used in search results."""
283
- sentences = [self.summary()]
284
-
285
- if self.include_description_in_summary_panel:
286
- sentences += [Text('', style='italic').append(h) for h in self.info()]
287
-
288
- return Panel(Group(*sentences), border_style=self._class_style(), expand=False)
289
-
290
- def timestamp_sort_key(self) -> tuple[datetime, str, int]:
291
- """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
292
- if self.is_duplicate():
293
- sort_id = self.config.duplicate_of_id
294
- dupe_idx = 1
295
- else:
296
- sort_id = self.file_id
297
- dupe_idx = 0
298
-
299
- return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
300
-
301
387
  def top_lines(self, n: int = 10) -> str:
302
388
  """First n lines."""
303
389
  return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
304
390
 
305
391
  def warn(self, msg: str) -> None:
392
+ """Print a warning message prefixed by info about this `Document`."""
306
393
  self.log(msg, level=logging.WARNING)
307
394
 
308
- def _border_style(self) -> str:
309
- """Should be overloaded in subclasses."""
310
- return 'white'
311
-
312
- def _class_name(self) -> str:
313
- """Annoying workaround for circular import issues and isinstance()."""
314
- return str(type(self).__name__)
315
-
316
- def _class_style(self) -> str:
317
- return DOC_TYPE_STYLES[self._class_name()]
318
-
319
395
  def _extract_author(self) -> None:
320
396
  """Get author from config. Extended in Email subclass to also check headers."""
321
397
  if self.config and self.config.author:
@@ -325,7 +401,7 @@ class Document:
325
401
  """Should be implemented in subclasses."""
326
402
  pass
327
403
 
328
- def _load_file(self) -> None:
404
+ def _load_file(self) -> str:
329
405
  """Load the file, remove the BOM and HOUSE OVERSIGHT / EFTA lines, strip whitespace, and return the cleaned text."""
330
406
  text = self.raw_text()
331
407
  text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text # remove BOM
@@ -333,11 +409,10 @@ class Document:
333
409
 
334
410
  lines = [
335
411
  line.strip() if self.strip_whitespace else line for line in text.split('\n')
336
- if not line.startswith(HOUSE_OVERSIGHT)
412
+ if not (line.startswith(HOUSE_OVERSIGHT) or line.startswith('EFTA'))
337
413
  ]
338
414
 
339
- self.text = collapse_newlines('\n'.join(lines))
340
- self.lines = self.text.split('\n')
415
+ return collapse_newlines('\n'.join(lines))
341
416
 
342
417
  def _repair(self) -> None:
343
418
  """Can optionally be overloaded in subclasses to further improve self.text."""
@@ -367,11 +442,20 @@ class Document:
367
442
  with open(output_path, 'w') as f:
368
443
  f.write(self.text)
369
444
 
370
- logger.warning(f"Wrote {self.length()} chars of cleaned {self.filename} to {output_path}.")
445
+ logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
371
446
 
372
447
  def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
448
+ """Default `Document` renderer (Email and MessengerLog override this)."""
373
449
  yield self.file_info_panel()
374
- text_panel = Panel(highlighter(self.text), border_style=self._border_style(), expand=False)
450
+
451
+ text_panel = Panel(
452
+ highlighter(self.text),
453
+ border_style=self.border_style,
454
+ expand=False,
455
+ title=f"({self.panel_title_timestamp})",
456
+ title_align='right',
457
+ )
458
+
375
459
  yield Padding(text_panel, (0, 0, 1, INFO_INDENT))
376
460
 
377
461
  def __str__(self) -> str:
@@ -395,8 +479,8 @@ class Document:
395
479
  'count': str(file_count),
396
480
  'author_count': NA_TXT if is_author_na else str(author_count),
397
481
  'no_author_count': NA_TXT if is_author_na else str(file_count - author_count),
398
- 'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain()])),
399
- 'bytes': file_size_to_str(sum([f.file_size() for f in files])),
482
+ 'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain])),
483
+ 'bytes': file_size_to_str(sum([f.file_size for f in files])),
400
484
  }
401
485
 
402
486
  @classmethod
@@ -433,6 +517,11 @@ class Document:
433
517
  for f in tmpfiles:
434
518
  f.unlink()
435
519
 
520
+ @staticmethod
521
+ def is_email(doc: 'Document') -> bool:
522
+ search_area = doc.text[0:5000] # Limit search area to avoid pointless scans of huge files
523
+ return isinstance(doc.config, EmailCfg) or bool(DETECT_EMAIL_REGEX.match(search_area) and doc.config is None)
524
+
436
525
  @staticmethod
437
526
  def known_author_count(docs: Sequence['Document']) -> int:
438
527
  """Count of how many Document objects have an author attribution."""
@@ -444,11 +533,11 @@ class Document:
444
533
 
445
534
  @staticmethod
446
535
  def sort_by_length(docs: Sequence['DocumentType']) -> list['DocumentType']:
447
- return sorted(docs, key=lambda d: d.file_size(), reverse=True)
536
+ return sorted(docs, key=lambda d: d.file_size, reverse=True)
448
537
 
449
538
  @staticmethod
450
539
  def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
451
- return sorted(docs, key=lambda doc: doc.timestamp_sort_key())
540
+ return sorted(docs, key=lambda doc: doc.timestamp_sort_key)
452
541
 
453
542
  @staticmethod
454
543
  def uniquify(documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
@@ -458,7 +547,7 @@ class Document:
458
547
 
459
548
  @staticmethod
460
549
  def without_dupes(docs: Sequence['DocumentType']) -> list['DocumentType']:
461
- return [doc for doc in docs if not doc.is_duplicate()]
550
+ return [doc for doc in docs if not doc.is_duplicate]
462
551
 
463
552
 
464
553
  DocumentType = TypeVar('DocumentType', bound=Document)