PyPI - epstein-files - Versions diffs - 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl - Mend

epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

epstein_files/__init__.py +55 -23
epstein_files/documents/communication.py +9 -5
epstein_files/documents/document.py +231 -135
epstein_files/documents/doj_file.py +242 -0
epstein_files/documents/doj_files/full_text.py +166 -0
epstein_files/documents/email.py +289 -232
epstein_files/documents/emails/email_header.py +35 -16
epstein_files/documents/emails/emailers.py +223 -0
epstein_files/documents/imessage/text_message.py +2 -3
epstein_files/documents/json_file.py +18 -14
epstein_files/documents/messenger_log.py +23 -39
epstein_files/documents/other_file.py +54 -48
epstein_files/epstein_files.py +65 -29
epstein_files/person.py +151 -94
epstein_files/util/constant/names.py +37 -10
epstein_files/util/constant/output_files.py +2 -0
epstein_files/util/constant/strings.py +14 -7
epstein_files/util/constant/urls.py +17 -0
epstein_files/util/constants.py +556 -391
epstein_files/util/data.py +2 -0
epstein_files/util/doc_cfg.py +44 -33
epstein_files/util/env.py +34 -19
epstein_files/util/file_helper.py +30 -6
epstein_files/util/helpers/debugging_helper.py +13 -0
epstein_files/util/helpers/env_helpers.py +21 -0
epstein_files/util/highlighted_group.py +121 -37
epstein_files/util/layout/left_bar_panel.py +26 -0
epstein_files/util/logging.py +28 -13
epstein_files/util/output.py +49 -40
epstein_files/util/rich.py +30 -3
epstein_files/util/word_count.py +7 -7
{epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/METADATA +16 -3
epstein_files-1.5.0.dist-info/RECORD +40 -0
{epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +1 -1
epstein_files-1.2.5.dist-info/RECORD +0 -34
{epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
{epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0

epstein_files/documents/document.py CHANGED Viewed

@@ -5,7 +5,7 @@ from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from pathlib import Path
 from subprocess import run
-from typing import Callable, ClassVar, Sequence, TypeVar
+from typing import Callable, ClassVar, Self, Sequence, TypeVar
 from rich.console import Console, ConsoleOptions, Group, RenderResult
 from rich.padding import Padding
@@ -13,17 +13,19 @@ from rich.panel import Panel
 from rich.text import Text
 from rich.table import Table
+from epstein_files.documents.emails.email_header import DETECT_EMAIL_REGEX
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
 from epstein_files.util.constant.urls import *
-from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
+from epstein_files.util.constants import ALL_FILE_CONFIGS, DOJ_FILE_STEM_REGEX, FALLBACK_TIMESTAMP
 from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time, without_falsey
 from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
-from epstein_files.util.env import DOCS_DIR, args
-from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, file_size_to_str, is_local_extract_file
+from epstein_files.util.env import DOCS_DIR
+from epstein_files.util.file_helper import (coerce_file_path, extract_file_id, file_size, file_size_str,
+     file_size_to_str, is_local_extract_file)
 from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
-from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table, console,
-     highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
+from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table,
+     console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
 from epstein_files.util.search_result import MatchedLine
 ALT_LINK_STYLE = 'white dim'
@@ -33,11 +35,9 @@ INFO_INDENT = 2
 INFO_PADDING = (0, 0, 0, INFO_INDENT)
 MAX_TOP_LINES_LEN = 4000  # Only for logging
 MIN_DOCUMENT_ID = 10477
-WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
-MIN_TIMESTAMP = datetime(1991, 1, 1)
-MID_TIMESTAMP = datetime(2007, 1, 1)
-MAX_TIMESTAMP = datetime(2020, 1, 1)
+DOJ_DATASET_ID_REGEX = re.compile(r"(?:epstein_dataset_|DataSet )(\d+)")
+WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
 FILENAME_MATCH_STYLES = [
     'dark_green',
@@ -74,7 +74,8 @@ class Document:
     Attributes:
         file_path (Path): Local path to file
         author (Name): Who is responsible for the text in the file
-        config (DocCfg): Information about this fil
+        config (DocCfg): Preconfigured information about this file
+        doj_2026_dataset_id (int, optional): Only set for files that came from the DOJ website.
         file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
         filename (str): File's basename
         lines (str): Number of lines in the file after all the cleanup
@@ -86,6 +87,7 @@ class Document:
     # Optional fields
     author: Name = None
     config: EmailCfg | DocCfg | TextCfg | None = None
+    doj_2026_dataset_id: int | None = None
     file_id: str = field(init=False)
     filename: str = field(init=False)
     lines: list[str] = field(default_factory=list)
@@ -97,137 +99,117 @@ class Document:
     include_description_in_summary_panel: ClassVar[bool] = False
     strip_whitespace: ClassVar[bool] = True  # Overridden in JsonFile
-    def __post_init__(self):
-        if not self.file_path.exists():
-            raise FileNotFoundError(f"File '{self.file_path.name}' does not exist!")
-        self.filename = self.file_path.name
-        self.file_id = extract_file_id(self.filename)
-        # config and url_slug could have been pre-set in Email
-        self.config = self.config or deepcopy(ALL_FILE_CONFIGS.get(self.file_id))
-        self.url_slug = self.url_slug or self.filename.split('.')[0]
-        if not self.text:
-            self._load_file()
-        self._repair()
-        self._extract_author()
-        self.timestamp = self._extract_timestamp()
+    @property
+    def border_style(self) -> str:
+        """Should be overloaded in subclasses."""
+        return 'white'
+    @property
     def config_description(self) -> str | None:
-        """Overloaded in OtherFile."""
         if self.config and self.config.description:
             return f"({self.config.description})"
+    @property
+    def config_timestamp(self) -> datetime | None:
+        """Configured timestamp, if any."""
+        return self.config.timestamp if self.config and self.config.timestamp else None
+    @property
     def date_str(self) -> str | None:
         return date_str(self.timestamp)
+    @property
     def duplicate_file_txt(self) -> Text:
         """If the file is a dupe make a nice message to explain what file it's a duplicate of."""
-        if not self.is_duplicate():
+        if not self.is_duplicate:
             raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")
         txt = Text(f"Not showing ", style=INFO_STYLE).append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
         txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
         return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))
+    @property
     def duplicate_of_id(self) -> str | None:
         if self.config and self.config.duplicate_of_id:
             return self.config.duplicate_of_id
-    def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
-        return self.external_link(epsteinify_doc_url, style, link_txt)
-    def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
-        return self.external_link(epstein_media_doc_url, style, link_txt)
-    def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
-        return self.external_link(epstein_web_doc_url, style, link_txt)
-    def rollcall_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
-        return self.external_link(rollcall_doc_url, style, link_txt)
-    def external_link(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
-        return link_text_obj(fxn(self.url_slug), link_txt or self.file_path.stem, style)
-    def external_links_txt(self, style: str = '', include_alt_links: bool = False) -> Text:
-        """Returns colored links to epstein.media and alternates in a Text object."""
-        links = [self.epstein_media_link(style=style)]
-        if include_alt_links:
-            links.append(self.epsteinify_link(style=ALT_LINK_STYLE, link_txt=EPSTEINIFY))
-            links.append(self.epstein_web_link(style=ALT_LINK_STYLE, link_txt=EPSTEIN_WEB))
-            if self._class_name() == 'Email':
-                links.append(self.rollcall_link(style=ALT_LINK_STYLE, link_txt=ROLLCALL))
-        links = [links[0]] + [parenthesize(link) for link in links[1:]]
-        base_txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
-        return base_txt.append(join_texts(links))
+    @property
+    def external_url(self) -> str:
+        """The primary external URL to use when linking to this document's source."""
+        if self.is_doj_file and self.doj_2026_dataset_id:
+            return doj_2026_file_url(self.doj_2026_dataset_id, self.url_slug)
+        else:
+            return epstein_media_doc_url(self.url_slug)
+    @property
     def file_id_debug_info(self) -> str:
         return ', '.join([f"{prop}={getattr(self, prop)}" for prop in ['file_id', 'filename', 'url_slug']])
-    def file_info_panel(self) -> Group:
-        """Panel with filename linking to raw file plus any additional info about the file."""
-        panel = Panel(self.external_links_txt(include_alt_links=True), border_style=self._border_style(), expand=False)
-        padded_info = [Padding(sentence, INFO_PADDING) for sentence in self.info()]
-        return Group(*([panel] + padded_info))
+    @property
     def file_size(self) -> int:
         return file_size(self.file_path)
+    @property
     def file_size_str(self, decimal_places: int | None = None) -> str:
         return file_size_str(self.file_path, decimal_places)
+    @property
     def info(self) -> list[Text]:
         """0 to 2 sentences containing the info_txt() as well as any configured description."""
         return without_falsey([
-            self.info_txt(),
-            highlighter(Text(self.config_description(), style=INFO_STYLE)) if self.config_description() else None
+            self.info_txt,
+            highlighter(Text(self.config_description, style=INFO_STYLE)) if self.config_description else None
         ])
+    @property
     def info_txt(self) -> Text | None:
         """Secondary info about this file (description recipients, etc). Overload in subclasses."""
         return None
+    @property
     def is_attribution_uncertain(self) -> bool:
         return bool(self.config and self.config.is_attribution_uncertain)
+    @property
+    def is_doj_file(self) -> bool:
+        return bool(DOJ_FILE_STEM_REGEX.match(self.file_id))
+    @property
     def is_duplicate(self) -> bool:
-        return bool(self.duplicate_of_id())
+        return bool(self.duplicate_of_id)
+    @property
+    def is_empty(self) -> bool:
+        return len(self.text.strip()) < 20
+    @property
+    def is_interesting(self) -> bool:
+        return bool(self.config and self.config.is_interesting)
+    @property
     def is_local_extract_file(self) -> bool:
         """True if extracted from other file (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
         return is_local_extract_file(self.filename)
+    @property
     def length(self) -> int:
         return len(self.text)
-    def log(self, msg: str, level: int = logging.INFO):
-        """Log with filename as a prefix."""
-        logger.log(level, f"{self.file_path.stem} {msg}")
-    def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
-        """Log first 'n' lines of self.text at 'level'. 'msg' can be optionally provided."""
-        separator = '\n\n' if '\n' in msg else '. '
-        msg = (msg + separator) if msg else ''
-        self.log(f"{msg}First {n} lines:\n\n{self.top_lines(n)}\n", level)
-    def matching_lines(self, _pattern: re.Pattern | str) -> list[MatchedLine]:
-        """Return lines matching a regex as colored list[Text]."""
-        pattern = patternize(_pattern)
-        return [MatchedLine(line, i) for i, line in enumerate(self.lines) if pattern.search(line)]
+    @property
+    def local_path_and_url(self) -> Text:
+        """Text obj with local path and URL."""
+        return Text(f"{self.file_id} URL:         {self.external_url}\n{self.file_id} Local path: '{self.file_path}'")
+    @property
     def metadata(self) -> Metadata:
-        metadata = self.config.metadata() if self.config else {}
+        metadata = self.config.metadata if self.config else {}
         metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
-        metadata['bytes'] = self.file_size()
+        metadata['bytes'] = self.file_size
         metadata['filename'] = f"{self.url_slug}.txt"
-        metadata['num_lines'] = self.num_lines()
-        metadata['type'] = self._class_name()
+        metadata['num_lines'] = self.num_lines
+        metadata['type'] = self._class_name
-        if self.is_local_extract_file():
+        if self.is_local_extract_file:
             metadata['extracted_file'] = {
                 'explanation': 'manually extracted from one of the other files',
                 'extracted_from': self.url_slug + '.txt',
@@ -236,10 +218,141 @@ class Document:
         return metadata
+    @property
     def num_lines(self) -> int:
         return len(self.lines)
+    @property
+    def panel_title_timestamp(self) -> str | None:
+        """String placed in the `title` of the enclosing `Panel` when printing this document's text."""
+        if (self.timestamp or FALLBACK_TIMESTAMP) == FALLBACK_TIMESTAMP:
+            return None
+        prefix = '' if self.config and self.config.timestamp else 'inferred '
+        return f"{prefix}timestamp: {remove_zero_time(self.timestamp)}"
+    @property
+    def summary_panel(self) -> Panel:
+        """Panelized description() with info_txt(), used in search results."""
+        sentences = [self.summary()]
+        if self.include_description_in_summary_panel:
+            sentences += [Text('', style='italic').append(h) for h in self.info]
+        return Panel(Group(*sentences), border_style=self._class_style, expand=False)
+    @property
+    def timestamp_sort_key(self) -> tuple[datetime, str, int]:
+        """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
+        if self.duplicate_of_id:
+            sort_id = self.duplicate_of_id
+            dupe_idx = 1
+        else:
+            sort_id = self.file_id
+            dupe_idx = 0
+        return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
+    @property
+    def _class_name(self) -> str:
+        """Annoying workaround for circular import issues and isinstance()."""
+        return str(type(self).__name__)
+    @property
+    def _class_style(self) -> str:
+        return DOC_TYPE_STYLES[self._class_name]
+    def __post_init__(self):
+        if not self.file_path.exists():
+            raise FileNotFoundError(f"File '{self.file_path.name}' does not exist!")
+        self.filename = self.file_path.name
+        self.file_id = extract_file_id(self.filename)
+        # config and url_slug could have been pre-set in Email
+        self.config = self.config or deepcopy(ALL_FILE_CONFIGS.get(self.file_id))
+        self.url_slug = self.url_slug or self.filename.split('.')[0]
+        # Extract the DOJ dataset ID from the path
+        if self.is_doj_file:
+            if (data_set_match := DOJ_DATASET_ID_REGEX.search(str(self.file_path))):
+                self.doj_2026_dataset_id = int(data_set_match.group(1))
+                logger.info(f"Extracted data set ID {self.doj_2026_dataset_id} for {self.url_slug}")
+            else:
+                self.warn(f"Couldn't find a data set ID in path '{self.file_path}'! Cannot create valid links.")
+        self.text = self.text or self._load_file()
+        self._set_computed_fields(text=self.text)
+        self._repair()
+        self._extract_author()
+        self.timestamp = self.config_timestamp or self._extract_timestamp()
+    @classmethod
+    def from_file_id(cls, file_id: str | int) -> Self:
+        """Alternate constructor that finds the file path automatically and builds a `Document`."""
+        return cls(coerce_file_path(file_id))
+    def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
+        return self.external_link(epsteinify_doc_url, style, link_txt)
+    def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
+        return self.external_link(epstein_media_doc_url, style, link_txt)
+    def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
+        return self.external_link(epstein_web_doc_url, style, link_txt)
+    def rollcall_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
+        return self.external_link(rollcall_doc_url, style, link_txt)
+    def external_link(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
+        return link_text_obj(fxn(self.url_slug), link_txt or self.file_path.stem, style)
+    def external_links_txt(self, style: str = '', include_alt_links: bool = False) -> Text:
+        """Returns colored links to epstein.media and alternates in a Text object."""
+        links = [link_text_obj(self.external_url, self.url_slug, style=style)]
+        if include_alt_links:
+            if self.doj_2026_dataset_id:
+                jmail_url = jmail_doj_2026_file_url(self.doj_2026_dataset_id, self.file_id)
+                jmail_link = link_text_obj(jmail_url, JMAIL, style=f"{style} dim" if style else ARCHIVE_LINK_COLOR)
+                links.append(jmail_link)
+            else:
+                links.append(self.epsteinify_link(style=ALT_LINK_STYLE, link_txt=EPSTEINIFY))
+                links.append(self.epstein_web_link(style=ALT_LINK_STYLE, link_txt=EPSTEIN_WEB))
+                if self._class_name == 'Email':
+                    links.append(self.rollcall_link(style=ALT_LINK_STYLE, link_txt=ROLLCALL))
+        links = [links[0]] + [parenthesize(link) for link in links[1:]]
+        base_txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
+        return base_txt.append(join_texts(links))
+    def file_info_panel(self) -> Group:
+        """Panel with filename linking to raw file plus any additional info about the file."""
+        panel = Panel(self.external_links_txt(include_alt_links=True), border_style=self.border_style, expand=False)
+        padded_info = [Padding(sentence, INFO_PADDING) for sentence in self.info]
+        return Group(*([panel] + padded_info))
+    def log(self, msg: str, level: int = logging.INFO):
+        """Log a message with with this document's filename as a prefix."""
+        logger.log(level, f"{self.file_path.stem} {msg}")
+    def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
+        """Log first 'n' lines of self.text at 'level'. 'msg' can be optionally provided."""
+        separator = '\n\n' if '\n' in msg else '. '
+        msg = (msg + separator) if msg else ''
+        self.log(f"{msg}First {n} lines:\n\n{self.top_lines(n)}\n", level)
+    def matching_lines(self, _pattern: re.Pattern | str) -> list[MatchedLine]:
+        """Return lines matching a regex as colored list[Text]."""
+        pattern = patternize(_pattern)
+        return [MatchedLine(line, i) for i, line in enumerate(self.lines) if pattern.search(line)]
+    def printable_document(self) -> Self:
+        """Overloaded by `DojFile` to convert some files to `Email` objects."""
+        return self
     def raw_text(self) -> str:
+        """Reload the raw data from the underlying file and return it."""
         with open(self.file_path) as f:
             return f.read()
@@ -253,13 +366,9 @@ class Document:
         return text
-    def source_file_id(self) -> str:
-        """Strip off the _1, _2, etc. suffixes for extracted documents."""
-        return self.file_id[0:6]
     def summary(self) -> Text:
-        """Summary of this file for logging. Brackets are left open for subclasses to add stuff."""
-        txt = Text('').append(self._class_name(), style=self._class_style())
+        """Summary of this file for logging. Subclasses should extend with a method that closes the open '['."""
+        txt = Text('').append(self._class_name, style=self._class_style)
         txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)
         if self.timestamp:
@@ -267,52 +376,22 @@ class Document:
             txt.append(' (', style=SYMBOL_STYLE)
             txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
-        txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(0), style='aquamarine1')))
-        txt.append(", ").append(key_value_txt('lines', self.num_lines()))
+        txt.append(' [').append(key_value_txt('size', Text(str(self.length), style='aquamarine1')))
+        txt.append(", ").append(key_value_txt('lines', self.num_lines))
         if self.config and self.config.duplicate_of_id:
             txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='cyan dim')))
         return txt
-    def summary_panel(self) -> Panel:
-        """Panelized description() with info_txt(), used in search results."""
-        sentences = [self.summary()]
-        if self.include_description_in_summary_panel:
-            sentences += [Text('', style='italic').append(h) for h in self.info()]
-        return Panel(Group(*sentences), border_style=self._class_style(), expand=False)
-    def timestamp_sort_key(self) -> tuple[datetime, str, int]:
-        """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
-        if self.is_duplicate():
-            sort_id = self.config.duplicate_of_id
-            dupe_idx = 1
-        else:
-            sort_id = self.file_id
-            dupe_idx = 0
-        return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
     def top_lines(self, n: int = 10) -> str:
         """First n lines."""
         return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
     def warn(self, msg: str) -> None:
+        """Print a warning message prefixed by info about this `Document`."""
         self.log(msg, level=logging.WARNING)
-    def _border_style(self) -> str:
-        """Should be overloaded in subclasses."""
-        return 'white'
-    def _class_name(self) -> str:
-        """Annoying workaround for circular import issues and isinstance()."""
-        return str(type(self).__name__)
-    def _class_style(self) -> str:
-        return DOC_TYPE_STYLES[self._class_name()]
     def _extract_author(self) -> None:
         """Get author from config. Extended in Email subclass to also check headers."""
         if self.config and self.config.author:
@@ -322,7 +401,7 @@ class Document:
         """Should be implemented in subclasses."""
         pass
-    def _load_file(self) -> None:
+    def _load_file(self) -> str:
         """Remove BOM and HOUSE OVERSIGHT lines, strip whitespace."""
         text = self.raw_text()
         text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text  # remove BOM
@@ -330,11 +409,10 @@ class Document:
         lines = [
             line.strip() if self.strip_whitespace else line for line in text.split('\n')
-            if not line.startswith(HOUSE_OVERSIGHT)
+            if not (line.startswith(HOUSE_OVERSIGHT) or line.startswith('EFTA'))
         ]
-        self.text = collapse_newlines('\n'.join(lines))
-        self.lines = self.text.split('\n')
+        return collapse_newlines('\n'.join(lines))
     def _repair(self) -> None:
         """Can optionally be overloaded in subclasses to further improve self.text."""
@@ -364,11 +442,20 @@ class Document:
         with open(output_path, 'w') as f:
             f.write(self.text)
-        logger.warning(f"Wrote {self.length()} chars of cleaned {self.filename} to {output_path}.")
+        logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
     def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
+        """Default `Document` renderer (Email and MessengerLog override this)."""
         yield self.file_info_panel()
-        text_panel = Panel(highlighter(self.text), border_style=self._border_style(), expand=False)
+        text_panel = Panel(
+            highlighter(self.text),
+            border_style=self.border_style,
+            expand=False,
+            title=f"({self.panel_title_timestamp})",
+            title_align='right',
+        )
         yield Padding(text_panel, (0, 0, 1, INFO_INDENT))
     def __str__(self) -> str:
@@ -392,8 +479,8 @@ class Document:
             'count': str(file_count),
             'author_count': NA_TXT if is_author_na else str(author_count),
             'no_author_count': NA_TXT if is_author_na else str(file_count - author_count),
-            'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain()])),
-            'bytes': file_size_to_str(sum([f.file_size() for f in files])),
+            'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain])),
+            'bytes': file_size_to_str(sum([f.file_size for f in files])),
         }
     @classmethod
@@ -430,6 +517,11 @@ class Document:
         for f in tmpfiles:
             f.unlink()
+    @staticmethod
+    def is_email(doc: 'Document') -> bool:
+        search_area = doc.text[0:5000]  # Limit search area to avoid pointless scans of huge files
+        return isinstance(doc.config, EmailCfg) or bool(DETECT_EMAIL_REGEX.match(search_area) and doc.config is None)
     @staticmethod
     def known_author_count(docs: Sequence['Document']) -> int:
         """Count of how many Document objects have an author attribution."""
@@ -439,9 +531,13 @@ class Document:
     def sort_by_id(docs: Sequence['DocumentType']) -> list['DocumentType']:
         return sorted(docs, key=lambda d: d.file_id)
+    @staticmethod
+    def sort_by_length(docs: Sequence['DocumentType']) -> list['DocumentType']:
+        return sorted(docs, key=lambda d: d.file_size, reverse=True)
     @staticmethod
     def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
-        return sorted(docs, key=lambda doc: doc.timestamp_sort_key())
+        return sorted(docs, key=lambda doc: doc.timestamp_sort_key)
     @staticmethod
     def uniquify(documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
@@ -451,7 +547,7 @@ class Document:
     @staticmethod
     def without_dupes(docs: Sequence['DocumentType']) -> list['DocumentType']:
-        return [doc for doc in docs if not doc.is_duplicate()]
+        return [doc for doc in docs if not doc.is_duplicate]
 DocumentType = TypeVar('DocumentType', bound=Document)

epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl

epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl