PyPI - epstein-files - Versions diffs - 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl - Mend

epstein-files 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

epstein_files/__init__.py +59 -51
epstein_files/documents/communication.py +9 -9
epstein_files/documents/document.py +111 -87
epstein_files/documents/email.py +154 -85
epstein_files/documents/emails/email_header.py +7 -6
epstein_files/documents/imessage/text_message.py +3 -2
epstein_files/documents/json_file.py +17 -0
epstein_files/documents/messenger_log.py +62 -3
epstein_files/documents/other_file.py +165 -17
epstein_files/epstein_files.py +100 -143
epstein_files/util/constant/names.py +6 -0
epstein_files/util/constant/strings.py +27 -0
epstein_files/util/constant/urls.py +22 -9
epstein_files/util/constants.py +968 -1015
epstein_files/util/data.py +14 -28
epstein_files/util/{file_cfg.py → doc_cfg.py} +120 -34
epstein_files/util/env.py +16 -18
epstein_files/util/file_helper.py +56 -17
epstein_files/util/highlighted_group.py +227 -175
epstein_files/util/logging.py +57 -0
epstein_files/util/rich.py +18 -13
epstein_files/util/search_result.py +14 -6
epstein_files/util/timer.py +24 -0
epstein_files/util/word_count.py +2 -1
{epstein_files-1.0.0.dist-info → epstein_files-1.0.1.dist-info}/METADATA +3 -2
epstein_files-1.0.1.dist-info/RECORD +30 -0
epstein_files-1.0.0.dist-info/RECORD +0 -28
{epstein_files-1.0.0.dist-info → epstein_files-1.0.1.dist-info}/LICENSE +0 -0
{epstein_files-1.0.0.dist-info → epstein_files-1.0.1.dist-info}/WHEEL +0 -0

epstein_files/__init__.py CHANGED Viewed

@@ -13,19 +13,22 @@ load_dotenv()
 from rich.padding import Padding
 from epstein_files.documents.email import Email
+from epstein_files.documents.messenger_log import  MessengerLog
 from epstein_files.epstein_files import EpsteinFiles, count_by_month
 from epstein_files.util.constant.html import *
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import EMAIL_CLASS, MESSENGER_LOG_CLASS
-from epstein_files.util.data import Timer, dict_sets_to_lists, flatten
-from epstein_files.util.env import specified_names, args
-from epstein_files.util.file_helper import GH_PAGES_HTML_PATH
+from epstein_files.util.data import dict_sets_to_lists
+from epstein_files.util.env import args, specified_names
+from epstein_files.util.file_helper import GH_PAGES_HTML_PATH, JSON_METADATA_PATH, make_clean
+from epstein_files.util.logging import logger
 from epstein_files.util.rich import *
+from epstein_files.util.timer import Timer
 PRINT_COLOR_KEY_EVERY_N_EMAILS = 150
-# Order matters (will be order of output)
-PEOPLE_WHOSE_EMAILS_SHOULD_BE_PRINTED: list[str | None] = [
+# Order matters. Default names to print emails for.
+DEFAULT_EMAILERS = [
     JEREMY_RUBIN,
     AL_SECKEL,
     JOI_ITO,
@@ -49,8 +52,9 @@ PEOPLE_WHOSE_EMAILS_SHOULD_BE_PRINTED: list[str | None] = [
     None,
 ]
-# Order matters (will be order of output)
-PEOPLE_WHOSE_EMAILS_SHOULD_BE_TABLES: list[str | None] = [
+# Order matters. Default names to print tables w/email subject, timestamp, etc for.
+# TODO: get rid of this
+DEFAULT_EMAILER_TABLES: list[str | None] = [
     GHISLAINE_MAXWELL,
     LEON_BLACK,
     LANDON_THOMAS,
@@ -64,42 +68,58 @@ PEOPLE_WHOSE_EMAILS_SHOULD_BE_TABLES: list[str | None] = [
     TOM_PRITZKER,
 ]
+if len(set(DEFAULT_EMAILERS).intersection(set(DEFAULT_EMAILER_TABLES))) > 0:
+    raise RuntimeError(f"Some names appear in both DEFAULT_EMAILERS and DEFAULT_EMAILER_TABLES")
 def generate_html() -> None:
+    if args.make_clean:
+        make_clean()
+        exit()
     timer = Timer()
     epstein_files = EpsteinFiles.get_files(timer)
+    if args.json_metadata:
+        json_str = epstein_files.json_metadata()
+        if args.build:
+            with open(JSON_METADATA_PATH, 'w') as f:
+                f.write(json_str)
+                timer.print_at_checkpoint(f"Wrote {file_size_str(JSON_METADATA_PATH)} to '{JSON_METADATA_PATH}'")
+        else:
+            console.print_json(json_str, indent=4, sort_keys=True)
+        exit()
     print_header(epstein_files)
     if args.colors_only:
         exit()
-    # Text messages section
     if args.output_texts:
-        print_text_messages(epstein_files)
-        timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs):,} text message logs')
+        _print_text_messages(epstein_files)
+        timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)} text message logs')
-    # Emails section
     if args.output_emails:
-        emails_printed = print_emails(epstein_files)
+        emails_printed = _print_emails(epstein_files)
         timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")
     if args.output_other_files:
-        epstein_files.print_other_files_table()
-        timer.print_at_checkpoint(f"Printed {len(epstein_files.other_files):,} other files")
-    else:
-        logger.warning(f"Skipping other files section...")
+        files_printed = epstein_files.print_other_files_table()
+        timer.print_at_checkpoint(f"Printed {len(files_printed)} other files")
     # Save output
     write_html(GH_PAGES_HTML_PATH)
-    logger.warning(f"Total time: {timer.seconds_since_start()}")
+    logger.warning(f"Total time: {timer.seconds_since_start_str()}")
     # JSON stats (mostly used for building pytest checks)
     if args.json_stats:
         console.line(5)
-        print_json_stats(epstein_files)
+        _print_json_stats(epstein_files)
-def print_emails(epstein_files: EpsteinFiles) -> int:
+def _print_emails(epstein_files: EpsteinFiles) -> int:
     """Returns number of emails printed."""
     print_section_header(('Selections from ' if not args.all_emails else '') + 'His Emails')
     print_other_site_link(is_header=False)
@@ -109,7 +129,7 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
     emailers_to_print: list[str | None]
     emailer_tables: list[str | None] = []
-    emails_that_were_printed: list[Email] = []
+    already_printed_emails: list[Email] = []
     num_emails_printed_since_last_color_key = 0
     if args.all_emails:
@@ -117,26 +137,17 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
         emailers_to_print = sorted(epstein_files.all_emailers(), key=lambda e: epstein_files.earliest_email_at(e))
         print_numbered_list_of_emailers(emailers_to_print, epstein_files)
     else:
-        if len(specified_names) > 0:
-            emailers_to_print = specified_names
-        else:
-            emailers_to_print = PEOPLE_WHOSE_EMAILS_SHOULD_BE_PRINTED
+        emailers_to_print = specified_names if specified_names else DEFAULT_EMAILERS
         console.print('Email conversations grouped by counterparty can be found in the order listed below.')
         print_numbered_list_of_emailers(emailers_to_print)
         console.print("\nAfter that there's tables linking to (but not displaying) all known emails for each of these people:")
         if len(specified_names) > 0:
-            if args.all_email_tables:
-                emailer_tables = sorted(epstein_files.all_emailers(), key=lambda e: epstein_files.earliest_email_at(e))
-            else:
-                emailer_tables = PEOPLE_WHOSE_EMAILS_SHOULD_BE_TABLES
-            print_numbered_list_of_emailers(emailer_tables)
+            print_numbered_list_of_emailers(DEFAULT_EMAILER_TABLES)
     for author in emailers_to_print:
         newly_printed_emails = epstein_files.print_emails_for(author)
-        emails_that_were_printed.extend(newly_printed_emails)
+        already_printed_emails.extend(newly_printed_emails)
         num_emails_printed_since_last_color_key += len(newly_printed_emails)
         # Print color key every once in a while
@@ -144,36 +155,33 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
             print_color_key()
             num_emails_printed_since_last_color_key = 0
-    if len(emailer_tables) > 0 and len(specified_names) == 0:
-        print_author_header(f"Email Tables for {len(emailer_tables)} Other People", 'white')
+    if not specified_names:
+        if not args.all_emails:
+            print_author_header(f"Email Tables for {len(emailer_tables)} Other People", 'white')
-        for name in emailer_tables:
-            epstein_files.print_emails_table_for(name)
+            for name in DEFAULT_EMAILER_TABLES:
+                epstein_files.print_emails_table_for(name)
-    if len(specified_names) == 0:
         epstein_files.print_email_device_info()
-    logger.warning(f"Rewrote {len(Email.rewritten_header_ids)} headers of {len(epstein_files.emails)} emails")
+    # Check that all emails were actually printed
     if args.all_emails:
-        email_ids_that_were_printed = set([email.file_id for email in emails_that_were_printed])
-        logger.warning(f"Printed {len(emails_that_were_printed)} emails of {len(email_ids_that_were_printed)} unique file IDs.")
+        email_ids_that_were_printed = set([email.file_id for email in already_printed_emails])
+        logger.warning(f"Printed {len(already_printed_emails)} emails of {len(email_ids_that_were_printed)} unique file IDs.")
         for email in epstein_files.emails:
             if email.file_id not in email_ids_that_were_printed and not email.is_duplicate:
-                logger.warning(f"Failed to print {email.description()}")
+                logger.warning(f"Failed to print {email.summary()}")
-    return len(emails_that_were_printed)
+    logger.warning(f"Rewrote {len(Email.rewritten_header_ids)} headers of {len(epstein_files.emails)} emails")
+    return len(already_printed_emails)
-def print_text_messages(epstein_files: EpsteinFiles) -> None:
+def _print_text_messages(epstein_files: EpsteinFiles) -> None:
     print_section_header('Text Messages')
     print_centered("(conversations are sorted chronologically based on timestamp of first message)\n", style='gray30')
-    if len(specified_names) == 0:
-        log_files = epstein_files.imessage_logs
-    else:
-        log_files = flatten([epstein_files.imessage_logs_for(name) for name in specified_names])
+    authors: list[str | None] = specified_names if specified_names else [JEFFREY_EPSTEIN]
+    log_files = epstein_files.imessage_logs_for(authors)
     for log_file in log_files:
         console.print(Padding(log_file))
@@ -182,9 +190,9 @@ def print_text_messages(epstein_files: EpsteinFiles) -> None:
     epstein_files.print_imessage_summary()
-def print_json_stats(epstein_files: EpsteinFiles) -> None:
+def _print_json_stats(epstein_files: EpsteinFiles) -> None:
     console.print(Panel('JSON Stats Dump', expand=True, style='reverse bold'), '\n')
-    print_json(f"{MESSENGER_LOG_CLASS} Sender Counts", epstein_files.imessage_sender_counts(), skip_falsey=True)
+    print_json(f"{MESSENGER_LOG_CLASS} Sender Counts", MessengerLog.count_authors(epstein_files.imessage_logs), skip_falsey=True)
     print_json(f"{EMAIL_CLASS} Author Counts", epstein_files.email_author_counts, skip_falsey=True)
     print_json(f"{EMAIL_CLASS} Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
     print_json("Email signature_substitution_countss", epstein_files.email_signature_substitution_counts(), skip_falsey=True)

epstein_files/documents/communication.py CHANGED Viewed

@@ -8,7 +8,7 @@ from rich.text import Text
 from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, Document
 from epstein_files.util.constant.names import UNKNOWN
 from epstein_files.util.constants import FALLBACK_TIMESTAMP
-from epstein_files.util.file_cfg import MessageCfg
+from epstein_files.util.doc_cfg import CommunicationCfg
 from epstein_files.util.highlighted_group import get_style_for_name
 from epstein_files.util.rich import key_value_txt
@@ -20,7 +20,7 @@ class Communication(Document):
     """Superclass for Email and MessengerLog."""
     author_style: str = 'white'
     author_txt: Text = field(init=False)
-    config: MessageCfg | None = None
+    config: CommunicationCfg | None = None
     timestamp: datetime = FALLBACK_TIMESTAMP  # TODO this default sucks (though it never happens)
     def __post_init__(self):
@@ -31,22 +31,22 @@ class Communication(Document):
     def author_or_unknown(self) -> str:
         return self.author or UNKNOWN
-    def description(self) -> Text:
-        return self._description().append(CLOSE_PROPERTIES_CHAR)
-    def is_attribution_uncertain(self) -> bool | None:
-        return self.config and self.config.is_attribution_uncertain
+    def is_attribution_uncertain(self) -> bool:
+        return bool(self.config and self.config.is_attribution_uncertain)
     def raw_document_link_txt(self, _style: str = '', include_alt_link: bool = True) -> Text:
         """Overrides super() method to apply self.author_style."""
         return super().raw_document_link_txt(self.author_style, include_alt_link=include_alt_link)
+    def summary(self) -> Text:
+        return self._summary().append(CLOSE_PROPERTIES_CHAR)
     def timestamp_without_seconds(self) -> str:
         return TIMESTAMP_SECONDS_REGEX.sub('', str(self.timestamp))
-    def _description(self) -> Text:
+    def _summary(self) -> Text:
         """One line summary mostly for logging."""
-        txt = super().description().append(', ')
+        txt = super().summary().append(', ')
         return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style)))

epstein_files/documents/document.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import logging
 import re
-from dataclasses import dataclass, field
+from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from pathlib import Path
 from subprocess import run
@@ -14,33 +14,28 @@ from rich.text import Text
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
 from epstein_files.util.constant.urls import *
-from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP, VI_DAILY_NEWS_ARTICLE
-from epstein_files.util.file_cfg import FileCfg, MessageCfg
-from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize
-from epstein_files.util.env import args, logger
-from epstein_files.util.file_helper import DOCS_DIR, file_stem_for_id, extract_file_id, file_size_str, is_local_extract_file
-from epstein_files.util.rich import SYMBOL_STYLE, console, highlighter, key_value_txt, logger, link_text_obj
+from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
+from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_nones
+from epstein_files.util.doc_cfg import EmailCfg, DocCfg, Metadata, TextCfg
+from epstein_files.util.env import args
+from epstein_files.util.file_helper import (DOCS_DIR, file_stem_for_id, extract_file_id, file_size,
+     file_size_str, is_local_extract_file)
+from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
+from epstein_files.util.rich import SYMBOL_STYLE, console, highlighter, key_value_txt, link_text_obj
+from epstein_files.util.search_result import MatchedLine
-WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
+CLOSE_PROPERTIES_CHAR = ']'
 HOUSE_OVERSIGHT = HOUSE_OVERSIGHT_PREFIX.replace('_', ' ').strip()
-MIN_DOCUMENT_ID = 10477
 INFO_INDENT = 2
 INFO_PADDING = (0, 0, 0, INFO_INDENT)
+MAX_TOP_LINES_LEN = 4000  # Only for logging
+MIN_DOCUMENT_ID = 10477
+LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
+WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
-CLOSE_PROPERTIES_CHAR = ']'
-MAX_EXTRACTED_TIMESTAMPS = 6
 MIN_TIMESTAMP = datetime(1991, 1, 1)
 MID_TIMESTAMP = datetime(2007, 1, 1)
 MAX_TIMESTAMP = datetime(2020, 1, 1)
-VI_DAILY_NEWS_REGEX = re.compile(r'virgin\s*is[kl][ai]nds\s*daily\s*news', re.IGNORECASE)
-DOC_TYPE_STYLES = {
-    DOCUMENT_CLASS: 'grey69',
-    EMAIL_CLASS: 'sea_green2',
-    JSON_FILE_CLASS: 'sandy_brown',
-    MESSENGER_LOG_CLASS: 'cyan',
-    OTHER_FILE_CLASS: 'grey69',
-}
 FILENAME_MATCH_STYLES = [
     'dark_green',
@@ -48,6 +43,13 @@ FILENAME_MATCH_STYLES = [
     'spring_green4',
 ]
+METADATA_FIELDS = [
+    'author',
+    'file_id',
+    'num_lines',
+    'timestamp'
+]
 OCR_REPAIRS = {
     re.compile(r'\.corn\b'): '.com',
     re.compile('ln(adequate|dyke)'): r'In\1',
@@ -61,7 +63,7 @@ class Document:
     file_path: Path
     # Optional fields
     author: str | None = None
-    config: FileCfg | MessageCfg | None = None
+    config: EmailCfg | DocCfg | TextCfg | None = None
     file_id: str = field(init=False)
     filename: str = field(init=False)
     is_duplicate: bool = False
@@ -72,8 +74,8 @@ class Document:
     timestamp: datetime | None = None
     url_slug: str = field(init=False)  # e.g. 'HOUSE_OVERSIGHT_123456
-    # Class variable; only used to cycle color of output when using lines_match()
-    file_matching_idx: ClassVar[int] = 0
+    # Class variable overridden in JsonFile
+    strip_whitespace: ClassVar[bool] = True
     def __post_init__(self):
         self.filename = self.file_path.name
@@ -82,12 +84,12 @@ class Document:
         self.is_duplicate = bool(self.config.dupe_of_id) if self.config else False
         if self.is_local_extract_file():
-            self.url_slug = file_stem_for_id(self.file_id)
+            self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
             cfg_type = type(self.config).__name__ if self.config else None
             # Coerce FileConfig for court docs etc. to MessageCfg for email files extracted from that document
-            if self.document_type() == EMAIL_CLASS and self.config and cfg_type != MessageCfg.__name__:
-                self.config = MessageCfg.from_file_cfg(self.config)
+            if self.class_name() == EMAIL_CLASS and self.config and cfg_type != EmailCfg.__name__:
+                self.config = EmailCfg.from_doc_cfg(self.config)
         else:
             self.url_slug = self.file_path.stem
@@ -96,41 +98,30 @@ class Document:
         self._extract_author()
         self.timestamp = self._extract_timestamp()
+    def class_name(self) -> str:
+        """Annoying workaround for circular import issues and isinstance()."""
+        return str(type(self).__name__)
     def configured_description(self) -> str | None:
-        return self.config.description if self.config else None
+        """Overloaded in OtherFile."""
+        if self.config and self.config.description:
+            return f"({self.config.description})"
     def date_str(self) -> str | None:
         return date_str(self.timestamp)
-    def description(self) -> Text:
-        """Mostly for logging. Brackets are left open for subclasses to add stuff."""
-        txt = Text('').append(self.url_slug, style='magenta')
-        txt.append(f' {self.document_type()}', style=self.document_type_style())
-        if self.timestamp:
-            txt.append(' (', style=SYMBOL_STYLE)
-            txt.append(f"{iso_timestamp(self.timestamp)}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
-        txt.append(" [").append(key_value_txt('num_lines', Text(f"{self.num_lines}", style='cyan')))
-        txt.append(', ').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
-        return txt
     def description_panel(self, include_hints: bool = False) -> Panel:
         """Panelized description() with info_txt(), used in search results."""
         hints = [Text('', style='italic').append(h) for h in (self.hints() if include_hints else [])]
-        return Panel(Group(*([self.description()] + hints)), border_style=self.document_type_style(), expand=False)
-    def document_type(self) -> str:
-        """Annoying workaround for circular import issues and isinstance()."""
-        return str(type(self).__name__)
+        return Panel(Group(*([self.summary()] + hints)), border_style=self.document_type_style(), expand=False)
     def document_type_style(self) -> str:
-        return DOC_TYPE_STYLES[self.document_type()]
+        return DOC_TYPE_STYLES[self.class_name()]
     def duplicate_file_txt(self) -> Text:
         """If the file is a dupe make a nice message to explain what file it's a duplicate of."""
         if not self.config or not self.config.dupe_of_id:
-            raise RuntimeError(f"duplicate_file_txt() called on {self.description()} but not a dupe! config:\n\n{self.config}")
+            raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")
         txt = Text(f"Not showing ", style='white dim italic').append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
         txt.append(f" because it's {self.config.duplicate_reason()} ")
@@ -154,6 +145,9 @@ class Document:
         hints = [Padding(hint, INFO_PADDING) for hint in self.hints()]
         return Group(*([panel] + hints))
+    def file_size(self) -> int:
+        return file_size(self.file_path)
     def file_size_str(self) -> str:
         return file_size_str(self.file_path)
@@ -162,16 +156,10 @@ class Document:
         hints = listify(self.info_txt())
         hint_msg = self.configured_description()
-        if self.document_type() == OTHER_FILE_CLASS:
-            if not hint_msg and VI_DAILY_NEWS_REGEX.search(self.text):
-                hint_msg = VI_DAILY_NEWS_ARTICLE
-        elif hint_msg:
-            hint_msg = f"({hint_msg})"
         if hint_msg:
             hints.append(highlighter(Text(hint_msg, style='white dim italic')))
-        return hints
+        return without_nones(hints)
     def info_txt(self) -> Text | None:
         """Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
@@ -181,32 +169,42 @@ class Document:
         """True if file created by extracting text from a court doc (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
         return is_local_extract_file(self.filename)
-    def lines_matching_txt(self, _pattern: re.Pattern | str) -> list[Text]:
-        """Return lines matching a regex as colored list[Text]."""
-        pattern = patternize(_pattern)
-        matched_lines = [line for line in self.lines if pattern.search(line)]
-        if len(matched_lines) == 0:
-            return []
-        file_style = FILENAME_MATCH_STYLES[type(self).file_matching_idx % len(FILENAME_MATCH_STYLES)]
-        type(self).file_matching_idx += 1
-        return [
-            Text('').append(self.file_path.name, style=file_style).append(':').append(line)
-            for line in matched_lines
-        ]
     def log(self, msg: str, level: int = logging.WARNING):
-        """Log with [file_id] as a prefix."""
-        logger.log(level, f"[{self.file_id}] {msg}")
+        """Log with filename as a prefix."""
+        logger.log(level, f"{self.url_slug} {msg}")
     def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
         """Log first 'n' lines of self.text at 'level'. 'msg' can be optionally provided."""
         separator = '\n\n' if '\n' in msg else '. '
-        msg = f"{msg + separator if msg else ''}Top lines of '{self.filename}' ({self.num_lines} lines):"
+        msg = (msg + separator) if msg else ''
+        msg = f"{self.filename}: {msg}First {n} lines:"
         logger.log(level, f"{msg}\n\n{self.top_lines(n)}\n")
+    def matching_lines(self, _pattern: re.Pattern | str) -> list[MatchedLine]:
+        """Return lines matching a regex as colored list[Text]."""
+        pattern = patternize(_pattern)
+        return [MatchedLine(line, i) for i, line in enumerate(self.lines) if pattern.search(line)]
+    def metadata(self) -> Metadata:
+        metadata = self.config.metadata() if self.config else {}
+        metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
+        metadata['bytes'] = self.file_size()
+        metadata['filename'] = f"{self.url_slug}.txt"
+        metadata['type'] = self.class_name()
+        if self.is_local_extract_file():
+            metadata['extracted_file'] = {
+                'explanation': 'This file was extracted from a court filing, not distributed directly. A copy can be found on github.',
+                'extracted_from_file': self.url_slug + '.txt',
+                'extracted_file_url': extracted_file_url(self.filename),
+            }
+        return metadata
+    def raw_text(self) -> str:
+        with open(self.file_path) as f:
+            return f.read()
     def raw_document_link_txt(self, style: str = '', include_alt_link: bool = False) -> Text:
         """Returns colored links to epstein.media and and epsteinweb in a Text object."""
         txt = Text('', style='white' if include_alt_link else ARCHIVE_LINK_COLOR)
@@ -215,11 +213,13 @@ class Document:
             txt.append(self.epstein_web_link(style=style))
             if include_alt_link:
+                txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
                 txt.append(' (').append(self.epstein_media_link(style='white dim', link_txt=EPSTEIN_MEDIA)).append(')')
         else:
             txt.append(self.epstein_media_link(style=style))
             if include_alt_link:
+                txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
                 txt.append(' (').append(self.epstein_web_link(style='white dim', link_txt=EPSTEIN_WEB)).append(')')
         return txt
@@ -234,8 +234,32 @@ class Document:
         return text
+    def sort_key(self) -> tuple[datetime, str, int]:
+        if self.config and self.config.dupe_of_id:
+            sort_id = self.config.dupe_of_id
+            dupe_idx = 1
+        else:
+            sort_id = self.file_id
+            dupe_idx = 0
+        return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
+    def summary(self) -> Text:
+        """Summary of this file for logging. Brackets are left open for subclasses to add stuff."""
+        txt = Text('').append(self.class_name(), style=self.document_type_style())
+        txt.append(f" {self.url_slug}", style=FILENAME_STYLE)
+        if self.timestamp:
+            timestamp_str = iso_timestamp(self.timestamp).removesuffix(' 00:00:00')
+            txt.append(' (', style=SYMBOL_STYLE)
+            txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
+        txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
+        txt.append(", ").append(key_value_txt('lines', Text(f"{self.num_lines}", style='cyan')))
+        return txt
     def top_lines(self, n: int = 10) -> str:
-        return '\n'.join(self.lines[0:n])
+        return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
     def _border_style(self) -> str:
         """Should be overloaded in subclasses."""
@@ -250,21 +274,20 @@ class Document:
         """Should be implemented in subclasses."""
         pass
-    def _load_file(self):
+    def _load_file(self) -> str:
         """Remove BOM and HOUSE OVERSIGHT lines, strip whitespace."""
-        with open(self.file_path) as f:
-            text = f.read()
-            text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text  # remove BOM
-            text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
-            lines = [l.strip() for l in text.split('\n') if not l.startswith(HOUSE_OVERSIGHT)]
-            lines = lines[1:] if (len(lines) > 1 and lines[0] == '>>') else lines
-            return collapse_newlines('\n'.join(lines))
+        text = self.raw_text()
+        text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text  # remove BOM
+        text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
+        lines = [l.strip() for l in text.split('\n') if not l.startswith(HOUSE_OVERSIGHT)]
+        return collapse_newlines('\n'.join(lines))
     def _repair(self) -> None:
-        """Can optionally be overloaded in subclasses."""
+        """Can optionally be overloaded in subclasses to further improve self.text."""
         pass
     def _set_computed_fields(self, lines: list[str] | None = None, text: str | None = None) -> None:
+        """Sets all fields derived from self.text based on either 'lines' or 'text' arg."""
         if (lines and text):
             raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (got both)")
         elif lines is not None:
@@ -275,7 +298,7 @@ class Document:
             raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (neither was)")
         self.length = len(self.text)
-        self.lines = [line.strip() for line in self.text.split('\n')]
+        self.lines = [line.strip() if self.strip_whitespace else line for line in self.text.split('\n')]
         self.num_lines = len(self.lines)
     def _write_clean_text(self, output_path: Path) -> None:
@@ -291,16 +314,17 @@ class Document:
         logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
-    def __rich_console__(self, _console: Console, _options: ConsoleOptions) -> RenderResult:
+    def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
         yield self.file_info_panel()
         text_panel = Panel(highlighter(self.text), border_style=self._border_style(), expand=False)
         yield Padding(text_panel, (0, 0, 1, INFO_INDENT))
     def __str__(self) -> str:
-        return self.description().plain
+        return self.summary().plain
     @staticmethod
     def diff_files(files: list[str]) -> None:
+        """Diff the contents of two Documents after all cleanup, BOM removal, etc."""
         if len(files) != 2:
             raise RuntimeError('Need 2 files')
         elif files[0] == files[1]:
@@ -330,7 +354,7 @@ class Document:
     @staticmethod
     def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
-        return sorted(docs, key=lambda doc: [doc.timestamp or FALLBACK_TIMESTAMP, doc.file_id])
+        return sorted(docs, key=lambda doc: doc.sort_key())
     @classmethod
     def uniquify(cls, documents: Sequence['DocumentType']) -> Sequence['DocumentType']:

epstein-files 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

epstein-files 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl