PyPI - epstein-files - Versions diffs - 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl - Mend

epstein-files 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

epstein_files/__init__.py +75 -135
epstein_files/documents/communication.py +9 -9
epstein_files/documents/document.py +115 -87
epstein_files/documents/email.py +154 -85
epstein_files/documents/emails/email_header.py +7 -6
epstein_files/documents/imessage/text_message.py +3 -2
epstein_files/documents/json_file.py +17 -0
epstein_files/documents/messenger_log.py +62 -3
epstein_files/documents/other_file.py +165 -17
epstein_files/epstein_files.py +128 -169
epstein_files/util/constant/names.py +8 -1
epstein_files/util/constant/output_files.py +29 -0
epstein_files/util/constant/strings.py +27 -0
epstein_files/util/constant/urls.py +25 -9
epstein_files/util/constants.py +1018 -1045
epstein_files/util/data.py +20 -55
epstein_files/util/{file_cfg.py → doc_cfg.py} +121 -43
epstein_files/util/env.py +19 -20
epstein_files/util/file_helper.py +38 -21
epstein_files/util/highlighted_group.py +229 -177
epstein_files/util/logging.py +63 -0
epstein_files/util/output.py +180 -0
epstein_files/util/rich.py +29 -17
epstein_files/util/search_result.py +14 -6
epstein_files/util/timer.py +24 -0
epstein_files/util/word_count.py +2 -1
{epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/METADATA +20 -4
epstein_files-1.0.2.dist-info/RECORD +33 -0
epstein_files-1.0.2.dist-info/entry_points.txt +7 -0
epstein_files-1.0.0.dist-info/RECORD +0 -28
{epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/LICENSE +0 -0
{epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/WHEEL +0 -0

epstein_files/documents/document.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import logging
 import re
-from dataclasses import dataclass, field
+from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from pathlib import Path
 from subprocess import run
@@ -14,33 +14,28 @@ from rich.text import Text
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
 from epstein_files.util.constant.urls import *
-from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP, VI_DAILY_NEWS_ARTICLE
-from epstein_files.util.file_cfg import FileCfg, MessageCfg
-from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize
-from epstein_files.util.env import args, logger
-from epstein_files.util.file_helper import DOCS_DIR, file_stem_for_id, extract_file_id, file_size_str, is_local_extract_file
-from epstein_files.util.rich import SYMBOL_STYLE, console, highlighter, key_value_txt, logger, link_text_obj
+from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
+from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_nones
+from epstein_files.util.doc_cfg import EmailCfg, DocCfg, Metadata, TextCfg
+from epstein_files.util.env import args
+from epstein_files.util.file_helper import (DOCS_DIR, file_stem_for_id, extract_file_id, file_size,
+     file_size_str, is_local_extract_file)
+from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
+from epstein_files.util.rich import SYMBOL_STYLE, console, highlighter, key_value_txt, link_text_obj
+from epstein_files.util.search_result import MatchedLine
-WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
+CLOSE_PROPERTIES_CHAR = ']'
 HOUSE_OVERSIGHT = HOUSE_OVERSIGHT_PREFIX.replace('_', ' ').strip()
-MIN_DOCUMENT_ID = 10477
 INFO_INDENT = 2
 INFO_PADDING = (0, 0, 0, INFO_INDENT)
+MAX_TOP_LINES_LEN = 4000  # Only for logging
+MIN_DOCUMENT_ID = 10477
+LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
+WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
-CLOSE_PROPERTIES_CHAR = ']'
-MAX_EXTRACTED_TIMESTAMPS = 6
 MIN_TIMESTAMP = datetime(1991, 1, 1)
 MID_TIMESTAMP = datetime(2007, 1, 1)
 MAX_TIMESTAMP = datetime(2020, 1, 1)
-VI_DAILY_NEWS_REGEX = re.compile(r'virgin\s*is[kl][ai]nds\s*daily\s*news', re.IGNORECASE)
-DOC_TYPE_STYLES = {
-    DOCUMENT_CLASS: 'grey69',
-    EMAIL_CLASS: 'sea_green2',
-    JSON_FILE_CLASS: 'sandy_brown',
-    MESSENGER_LOG_CLASS: 'cyan',
-    OTHER_FILE_CLASS: 'grey69',
-}
 FILENAME_MATCH_STYLES = [
     'dark_green',
@@ -48,6 +43,13 @@ FILENAME_MATCH_STYLES = [
     'spring_green4',
 ]
+METADATA_FIELDS = [
+    'author',
+    'file_id',
+    'num_lines',
+    'timestamp'
+]
 OCR_REPAIRS = {
     re.compile(r'\.corn\b'): '.com',
     re.compile('ln(adequate|dyke)'): r'In\1',
@@ -61,7 +63,7 @@ class Document:
     file_path: Path
     # Optional fields
     author: str | None = None
-    config: FileCfg | MessageCfg | None = None
+    config: EmailCfg | DocCfg | TextCfg | None = None
     file_id: str = field(init=False)
     filename: str = field(init=False)
     is_duplicate: bool = False
@@ -72,8 +74,8 @@ class Document:
     timestamp: datetime | None = None
     url_slug: str = field(init=False)  # e.g. 'HOUSE_OVERSIGHT_123456
-    # Class variable; only used to cycle color of output when using lines_match()
-    file_matching_idx: ClassVar[int] = 0
+    # Class variable overridden in JsonFile
+    strip_whitespace: ClassVar[bool] = True
     def __post_init__(self):
         self.filename = self.file_path.name
@@ -82,12 +84,12 @@ class Document:
         self.is_duplicate = bool(self.config.dupe_of_id) if self.config else False
         if self.is_local_extract_file():
-            self.url_slug = file_stem_for_id(self.file_id)
+            self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
             cfg_type = type(self.config).__name__ if self.config else None
             # Coerce FileConfig for court docs etc. to MessageCfg for email files extracted from that document
-            if self.document_type() == EMAIL_CLASS and self.config and cfg_type != MessageCfg.__name__:
-                self.config = MessageCfg.from_file_cfg(self.config)
+            if self.class_name() == EMAIL_CLASS and self.config and cfg_type != EmailCfg.__name__:
+                self.config = EmailCfg.from_doc_cfg(self.config)
         else:
             self.url_slug = self.file_path.stem
@@ -96,41 +98,30 @@ class Document:
         self._extract_author()
         self.timestamp = self._extract_timestamp()
+    def class_name(self) -> str:
+        """Annoying workaround for circular import issues and isinstance()."""
+        return str(type(self).__name__)
     def configured_description(self) -> str | None:
-        return self.config.description if self.config else None
+        """Overloaded in OtherFile."""
+        if self.config and self.config.description:
+            return f"({self.config.description})"
     def date_str(self) -> str | None:
         return date_str(self.timestamp)
-    def description(self) -> Text:
-        """Mostly for logging. Brackets are left open for subclasses to add stuff."""
-        txt = Text('').append(self.url_slug, style='magenta')
-        txt.append(f' {self.document_type()}', style=self.document_type_style())
-        if self.timestamp:
-            txt.append(' (', style=SYMBOL_STYLE)
-            txt.append(f"{iso_timestamp(self.timestamp)}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
-        txt.append(" [").append(key_value_txt('num_lines', Text(f"{self.num_lines}", style='cyan')))
-        txt.append(', ').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
-        return txt
     def description_panel(self, include_hints: bool = False) -> Panel:
         """Panelized description() with info_txt(), used in search results."""
         hints = [Text('', style='italic').append(h) for h in (self.hints() if include_hints else [])]
-        return Panel(Group(*([self.description()] + hints)), border_style=self.document_type_style(), expand=False)
-    def document_type(self) -> str:
-        """Annoying workaround for circular import issues and isinstance()."""
-        return str(type(self).__name__)
+        return Panel(Group(*([self.summary()] + hints)), border_style=self.document_type_style(), expand=False)
     def document_type_style(self) -> str:
-        return DOC_TYPE_STYLES[self.document_type()]
+        return DOC_TYPE_STYLES[self.class_name()]
     def duplicate_file_txt(self) -> Text:
         """If the file is a dupe make a nice message to explain what file it's a duplicate of."""
         if not self.config or not self.config.dupe_of_id:
-            raise RuntimeError(f"duplicate_file_txt() called on {self.description()} but not a dupe! config:\n\n{self.config}")
+            raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")
         txt = Text(f"Not showing ", style='white dim italic').append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
         txt.append(f" because it's {self.config.duplicate_reason()} ")
@@ -154,6 +145,9 @@ class Document:
         hints = [Padding(hint, INFO_PADDING) for hint in self.hints()]
         return Group(*([panel] + hints))
+    def file_size(self) -> int:
+        return file_size(self.file_path)
     def file_size_str(self) -> str:
         return file_size_str(self.file_path)
@@ -162,16 +156,10 @@ class Document:
         hints = listify(self.info_txt())
         hint_msg = self.configured_description()
-        if self.document_type() == OTHER_FILE_CLASS:
-            if not hint_msg and VI_DAILY_NEWS_REGEX.search(self.text):
-                hint_msg = VI_DAILY_NEWS_ARTICLE
-        elif hint_msg:
-            hint_msg = f"({hint_msg})"
         if hint_msg:
             hints.append(highlighter(Text(hint_msg, style='white dim italic')))
-        return hints
+        return without_nones(hints)
     def info_txt(self) -> Text | None:
         """Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
@@ -181,32 +169,42 @@ class Document:
         """True if file created by extracting text from a court doc (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
         return is_local_extract_file(self.filename)
-    def lines_matching_txt(self, _pattern: re.Pattern | str) -> list[Text]:
-        """Return lines matching a regex as colored list[Text]."""
-        pattern = patternize(_pattern)
-        matched_lines = [line for line in self.lines if pattern.search(line)]
-        if len(matched_lines) == 0:
-            return []
-        file_style = FILENAME_MATCH_STYLES[type(self).file_matching_idx % len(FILENAME_MATCH_STYLES)]
-        type(self).file_matching_idx += 1
-        return [
-            Text('').append(self.file_path.name, style=file_style).append(':').append(line)
-            for line in matched_lines
-        ]
     def log(self, msg: str, level: int = logging.WARNING):
-        """Log with [file_id] as a prefix."""
-        logger.log(level, f"[{self.file_id}] {msg}")
+        """Log with filename as a prefix."""
+        logger.log(level, f"{self.url_slug} {msg}")
     def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
         """Log first 'n' lines of self.text at 'level'. 'msg' can be optionally provided."""
         separator = '\n\n' if '\n' in msg else '. '
-        msg = f"{msg + separator if msg else ''}Top lines of '{self.filename}' ({self.num_lines} lines):"
+        msg = (msg + separator) if msg else ''
+        msg = f"{self.filename}: {msg}First {n} lines:"
         logger.log(level, f"{msg}\n\n{self.top_lines(n)}\n")
+    def matching_lines(self, _pattern: re.Pattern | str) -> list[MatchedLine]:
+        """Return lines matching a regex as colored list[Text]."""
+        pattern = patternize(_pattern)
+        return [MatchedLine(line, i) for i, line in enumerate(self.lines) if pattern.search(line)]
+    def metadata(self) -> Metadata:
+        metadata = self.config.metadata() if self.config else {}
+        metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
+        metadata['bytes'] = self.file_size()
+        metadata['filename'] = f"{self.url_slug}.txt"
+        metadata['type'] = self.class_name()
+        if self.is_local_extract_file():
+            metadata['extracted_file'] = {
+                'explanation': 'This file was extracted from a court filing, not distributed directly. A copy can be found on github.',
+                'extracted_from_file': self.url_slug + '.txt',
+                'extracted_file_url': extracted_file_url(self.filename),
+            }
+        return metadata
+    def raw_text(self) -> str:
+        with open(self.file_path) as f:
+            return f.read()
     def raw_document_link_txt(self, style: str = '', include_alt_link: bool = False) -> Text:
         """Returns colored links to epstein.media and and epsteinweb in a Text object."""
         txt = Text('', style='white' if include_alt_link else ARCHIVE_LINK_COLOR)
@@ -215,11 +213,13 @@ class Document:
             txt.append(self.epstein_web_link(style=style))
             if include_alt_link:
+                txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
                 txt.append(' (').append(self.epstein_media_link(style='white dim', link_txt=EPSTEIN_MEDIA)).append(')')
         else:
             txt.append(self.epstein_media_link(style=style))
             if include_alt_link:
+                txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
                 txt.append(' (').append(self.epstein_web_link(style='white dim', link_txt=EPSTEIN_WEB)).append(')')
         return txt
@@ -234,8 +234,36 @@ class Document:
         return text
+    def sort_key(self) -> tuple[datetime, str, int]:
+        if self.config and self.config.dupe_of_id:
+            sort_id = self.config.dupe_of_id
+            dupe_idx = 1
+        else:
+            sort_id = self.file_id
+            dupe_idx = 0
+        return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
+    def summary(self) -> Text:
+        """Summary of this file for logging. Brackets are left open for subclasses to add stuff."""
+        txt = Text('').append(self.class_name(), style=self.document_type_style())
+        txt.append(f" {self.url_slug}", style=FILENAME_STYLE)
+        if self.timestamp:
+            timestamp_str = iso_timestamp(self.timestamp).removesuffix(' 00:00:00')
+            txt.append(' (', style=SYMBOL_STYLE)
+            txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
+        txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
+        txt.append(", ").append(key_value_txt('lines', self.num_lines))
+        if self.config and self.config.dupe_of_id:
+            txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.dupe_of_id, style='magenta')))
+        return txt
     def top_lines(self, n: int = 10) -> str:
-        return '\n'.join(self.lines[0:n])
+        return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
     def _border_style(self) -> str:
         """Should be overloaded in subclasses."""
@@ -250,21 +278,20 @@ class Document:
         """Should be implemented in subclasses."""
         pass
-    def _load_file(self):
+    def _load_file(self) -> str:
         """Remove BOM and HOUSE OVERSIGHT lines, strip whitespace."""
-        with open(self.file_path) as f:
-            text = f.read()
-            text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text  # remove BOM
-            text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
-            lines = [l.strip() for l in text.split('\n') if not l.startswith(HOUSE_OVERSIGHT)]
-            lines = lines[1:] if (len(lines) > 1 and lines[0] == '>>') else lines
-            return collapse_newlines('\n'.join(lines))
+        text = self.raw_text()
+        text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text  # remove BOM
+        text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
+        lines = [l.strip() for l in text.split('\n') if not l.startswith(HOUSE_OVERSIGHT)]
+        return collapse_newlines('\n'.join(lines))
     def _repair(self) -> None:
-        """Can optionally be overloaded in subclasses."""
+        """Can optionally be overloaded in subclasses to further improve self.text."""
         pass
     def _set_computed_fields(self, lines: list[str] | None = None, text: str | None = None) -> None:
+        """Sets all fields derived from self.text based on either 'lines' or 'text' arg."""
         if (lines and text):
             raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (got both)")
         elif lines is not None:
@@ -275,7 +302,7 @@ class Document:
             raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (neither was)")
         self.length = len(self.text)
-        self.lines = [line.strip() for line in self.text.split('\n')]
+        self.lines = [line.strip() if self.strip_whitespace else line for line in self.text.split('\n')]
         self.num_lines = len(self.lines)
     def _write_clean_text(self, output_path: Path) -> None:
@@ -291,16 +318,17 @@ class Document:
         logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
-    def __rich_console__(self, _console: Console, _options: ConsoleOptions) -> RenderResult:
+    def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
         yield self.file_info_panel()
         text_panel = Panel(highlighter(self.text), border_style=self._border_style(), expand=False)
         yield Padding(text_panel, (0, 0, 1, INFO_INDENT))
     def __str__(self) -> str:
-        return self.description().plain
+        return self.summary().plain
     @staticmethod
     def diff_files(files: list[str]) -> None:
+        """Diff the contents of two Documents after all cleanup, BOM removal, etc."""
         if len(files) != 2:
             raise RuntimeError('Need 2 files')
         elif files[0] == files[1]:
@@ -330,7 +358,7 @@ class Document:
     @staticmethod
     def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
-        return sorted(docs, key=lambda doc: [doc.timestamp or FALLBACK_TIMESTAMP, doc.file_id])
+        return sorted(docs, key=lambda doc: doc.sort_key())
     @classmethod
     def uniquify(cls, documents: Sequence['DocumentType']) -> Sequence['DocumentType']:

epstein-files 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

epstein-files 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl