PyPI - epstein-files - Versions diffs - 1.2.0__py3-none-any.whl → 1.2.5__py3-none-any.whl - Mend

epstein-files 1.2.0__py3-none-any.whl → 1.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

epstein_files/__init__.py +42 -30
epstein_files/documents/communication.py +0 -3
epstein_files/documents/document.py +66 -19
epstein_files/documents/email.py +203 -208
epstein_files/documents/emails/email_header.py +10 -2
epstein_files/documents/imessage/text_message.py +3 -2
epstein_files/documents/other_file.py +16 -34
epstein_files/epstein_files.py +24 -35
epstein_files/person.py +67 -73
epstein_files/util/constant/names.py +21 -12
epstein_files/util/constant/output_files.py +8 -5
epstein_files/util/constant/strings.py +2 -2
epstein_files/util/constant/urls.py +14 -2
epstein_files/util/constants.py +38 -12
epstein_files/util/data.py +2 -1
epstein_files/util/doc_cfg.py +3 -3
epstein_files/util/env.py +10 -7
epstein_files/util/highlighted_group.py +366 -202
epstein_files/util/logging.py +1 -1
epstein_files/util/output.py +54 -21
epstein_files/util/rich.py +21 -16
epstein_files/util/timer.py +14 -0
epstein_files/util/word_count.py +1 -1
{epstein_files-1.2.0.dist-info → epstein_files-1.2.5.dist-info}/METADATA +5 -2
epstein_files-1.2.5.dist-info/RECORD +34 -0
epstein_files-1.2.0.dist-info/RECORD +0 -34
{epstein_files-1.2.0.dist-info → epstein_files-1.2.5.dist-info}/LICENSE +0 -0
{epstein_files-1.2.0.dist-info → epstein_files-1.2.5.dist-info}/WHEEL +0 -0
{epstein_files-1.2.0.dist-info → epstein_files-1.2.5.dist-info}/entry_points.txt +0 -0

epstein_files/__init__.py CHANGED Viewed

@@ -16,14 +16,18 @@ from rich.text import Text
 from epstein_files.epstein_files import EpsteinFiles, document_cls
 from epstein_files.documents.document import INFO_PADDING, Document
 from epstein_files.documents.email import Email
+from epstein_files.documents.messenger_log import MessengerLog
+from epstein_files.documents.other_file import OtherFile
 from epstein_files.util.constant.output_files import make_clean
+from epstein_files.util.constant.strings import ID_REGEX
+from epstein_files.util.data import flatten
 from epstein_files.util.env import args
 from epstein_files.util.file_helper import coerce_file_path, extract_file_id
 from epstein_files.util.logging import exit_with_error, logger
 from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
-     print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info_png,
+     print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
      print_json_metadata, write_urls)
-from epstein_files.util.rich import (build_highlighter, console, print_color_key, print_title_page_header,
+from epstein_files.util.rich import (build_highlighter, console, highlighter, print_color_key, print_title_page_header,
      print_title_page_tables, print_subtitle_panel, write_html)
 from epstein_files.util.timer import Timer
 from epstein_files.util.word_count import write_word_counts_html
@@ -38,15 +42,15 @@ def generate_html() -> None:
     timer = Timer()
     epstein_files = EpsteinFiles.get_files(timer)
-    if args.json_metadata:
+    if args.emailers_info:
+        print_emailers_info(epstein_files)
+        exit()
+    elif args.json_metadata:
         print_json_metadata(epstein_files)
         exit()
     elif args.json_files:
         print_json_files(epstein_files)
         exit()
-    elif args.emailers_info_png:
-        print_emailers_info_png(epstein_files)
-        exit()
     print_title_page_header()
@@ -59,29 +63,26 @@ def generate_html() -> None:
         exit()
     if args.output_texts:
-        imessage_logs = [log for log in epstein_files.imessage_logs if not args.names or log.author in args.names]
-        print_text_messages_section(imessage_logs)
-        timer.print_at_checkpoint(f'Printed {len(imessage_logs)} text message log files')
+        printed_logs = print_text_messages_section(epstein_files)
+        timer.log_section_complete('MessengerLog', epstein_files.imessage_logs, printed_logs)
     if args.output_emails:
-        emails_that_were_printed = print_emails_section(epstein_files)
-        timer.print_at_checkpoint(f"Printed {len(emails_that_were_printed):,} emails")
+        printed_emails = print_emails_section(epstein_files)
+        timer.log_section_complete('Email', epstein_files.emails, printed_emails)
     elif args.email_timeline:
         print_email_timeline(epstein_files)
         timer.print_at_checkpoint(f"Printed chronological emails table")
     if args.output_other:
-        if args.uninteresting:
-            files = [f for f in epstein_files.other_files if not f.is_interesting()]
-        else:
-            files = [f for f in epstein_files.other_files if args.all_other_files or f.is_interesting()]
-        print_other_files_section(files, epstein_files)
-        timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")
+        printed_files = print_other_files_section(epstein_files)
+        timer.log_section_complete('OtherFile', epstein_files.other_files, printed_files)
     write_html(args.build)
     logger.warning(f"Total time: {timer.seconds_since_start_str()}")
+    if args.debug:
+        highlighter.print_highlight_counts(console)
     # JSON stats (mostly used for building pytest checks)
     if args.json_stats:
         print_json_stats(epstein_files)
@@ -94,36 +95,52 @@ def epstein_diff():
 def epstein_search():
     """Search the cleaned up text of the files."""
-    _assert_positional_args()
     epstein_files = EpsteinFiles.get_files()
+    if ID_REGEX.match(args.positional_args[0]):
+        logger.warning(f"'{args.positional_args[0]}' seems to be an ID, running epstein_show instead...")
+        epstein_show()
+        return
     for search_term in args.positional_args:
         temp_highlighter = build_highlighter(search_term)
         search_results = epstein_files.docs_matching(search_term, args.names)
         print_subtitle_panel(f"Found {len(search_results)} documents matching '{search_term}'")
         for search_result in search_results:
-            console.line()
+            document = search_result.document
+            if (isinstance(document, Email) and not args.output_emails) \
+                    or (isinstance(document, OtherFile) and not args.output_other) \
+                    or (isinstance(document, MessengerLog) and not args.output_texts):
+                document.warn(f"{type(document).__name__} Skipping search result...")
+                continue
             if args.whole_file:
-                console.print(search_result.document)
+                console.print(document)
             else:
-                console.print(search_result.document.summary_panel())
+                console.print(document.summary_panel())
                 for matching_line in search_result.lines:
                     line_txt = matching_line.__rich__()
                     console.print(Padding(temp_highlighter(line_txt), INFO_PADDING), style='gray37')
+            console.line()
 def epstein_show():
     """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
-    _assert_positional_args()
     raw_docs: list[Document] = []
     console.line()
     try:
-        ids = [extract_file_id(arg) for arg in args.positional_args]
-        raw_docs = [Document(coerce_file_path(id)) for id in ids]
+        if args.names:
+            people = EpsteinFiles.get_files().person_objs(args.names)
+            raw_docs = [doc for doc in flatten([p.emails for p in people])]
+        else:
+            ids = [extract_file_id(arg) for arg in args.positional_args]
+            raw_docs = [Document(coerce_file_path(id)) for id in ids]
         docs = Document.sort_by_timestamp([document_cls(doc)(doc.file_path) for doc in raw_docs])
     except Exception as e:
         exit_with_error(str(e))
@@ -142,8 +159,3 @@ def epstein_show():
 def epstein_word_count() -> None:
     write_word_counts_html()
-def _assert_positional_args():
-    if not args.positional_args:
-        exit_with_error(f"No positional args provided!\n")

epstein_files/documents/communication.py CHANGED Viewed

@@ -34,9 +34,6 @@ class Communication(Document):
         """Overrides super() method to apply self.author_style."""
         return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
-    def is_attribution_uncertain(self) -> bool:
-        return bool(self.config and self.config.is_attribution_uncertain)
     def summary(self) -> Text:
         return self._summary().append(CLOSE_PROPERTIES_CHAR)

epstein_files/documents/document.py CHANGED Viewed

@@ -11,17 +11,19 @@ from rich.console import Console, ConsoleOptions, Group, RenderResult
 from rich.padding import Padding
 from rich.panel import Panel
 from rich.text import Text
+from rich.table import Table
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
 from epstein_files.util.constant.urls import *
 from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
-from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time_from_timestamp_str, without_falsey
+from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time, without_falsey
 from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
 from epstein_files.util.env import DOCS_DIR, args
-from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, is_local_extract_file
+from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, file_size_to_str, is_local_extract_file
 from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
-from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize
+from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table, console,
+     highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
 from epstein_files.util.search_result import MatchedLine
 ALT_LINK_STYLE = 'white dim'
@@ -55,6 +57,14 @@ OCR_REPAIRS = {
     'Nil Priell': 'Nili Priell',
 }
+SUMMARY_TABLE_COLS: list[str | dict] = [
+    'Count',
+    {'name': 'Has Author', 'style': 'honeydew2'},
+    {'name': 'No Author', 'style': 'wheat4'},
+    {'name': 'Uncertain Author', 'style': 'royal_blue1 dim'},
+    {'name': 'Size', 'justify': 'right', 'style': 'dim'},
+]
 @dataclass
 class Document:
@@ -181,6 +191,9 @@ class Document:
         """Secondary info about this file (description recipients, etc). Overload in subclasses."""
         return None
+    def is_attribution_uncertain(self) -> bool:
+        return bool(self.config and self.config.is_attribution_uncertain)
     def is_duplicate(self) -> bool:
         return bool(self.duplicate_of_id())
@@ -240,17 +253,6 @@ class Document:
         return text
-    def sort_key(self) -> tuple[datetime, str, int]:
-        """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
-        if self.is_duplicate():
-            sort_id = self.config.duplicate_of_id
-            dupe_idx = 1
-        else:
-            sort_id = self.file_id
-            dupe_idx = 0
-        return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
     def source_file_id(self) -> str:
         """Strip off the _1, _2, etc. suffixes for extracted documents."""
         return self.file_id[0:6]
@@ -261,7 +263,7 @@ class Document:
         txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)
         if self.timestamp:
-            timestamp_str = remove_zero_time_from_timestamp_str(self.timestamp).replace('T', ' ')
+            timestamp_str = remove_zero_time(self.timestamp).replace('T', ' ')
             txt.append(' (', style=SYMBOL_STYLE)
             txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
@@ -269,7 +271,7 @@ class Document:
         txt.append(", ").append(key_value_txt('lines', self.num_lines()))
         if self.config and self.config.duplicate_of_id:
-            txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='magenta')))
+            txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='cyan dim')))
         return txt
@@ -282,6 +284,17 @@ class Document:
         return Panel(Group(*sentences), border_style=self._class_style(), expand=False)
+    def timestamp_sort_key(self) -> tuple[datetime, str, int]:
+        """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
+        if self.is_duplicate():
+            sort_id = self.config.duplicate_of_id
+            dupe_idx = 1
+        else:
+            sort_id = self.file_id
+            dupe_idx = 0
+        return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
     def top_lines(self, n: int = 10) -> str:
         """First n lines."""
         return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
@@ -361,6 +374,32 @@ class Document:
     def __str__(self) -> str:
         return self.summary().plain
+    @classmethod
+    def file_info_table(cls, title: str, first_col_name: str) -> Table:
+        """Empty table with appropriate cols for summarizing groups of files."""
+        table = build_table(title)
+        cols = [{'name': first_col_name, 'min_width': 14}] + SUMMARY_TABLE_COLS
+        add_cols_to_table(table, cols, 'right')
+        return table
+    @classmethod
+    def files_info(cls, files: Sequence['Document'], is_author_na: bool = False) -> dict[str, str | Text]:
+        """Summary info about a group of files."""
+        file_count = len(files)
+        author_count = cls.known_author_count(files)
+        return {
+            'count': str(file_count),
+            'author_count': NA_TXT if is_author_na else str(author_count),
+            'no_author_count': NA_TXT if is_author_na else str(file_count - author_count),
+            'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain()])),
+            'bytes': file_size_to_str(sum([f.file_size() for f in files])),
+        }
+    @classmethod
+    def files_info_row(cls, files: Sequence['Document'], author_na: bool = False) -> Sequence[str | Text]:
+        return [v for v in cls.files_info(files, author_na).values()]
     @staticmethod
     def diff_files(files: list[str]) -> None:
         """Diff the contents of two Documents after all cleanup, BOM removal, etc."""
@@ -396,16 +435,24 @@ class Document:
         """Count of how many Document objects have an author attribution."""
         return len([doc for doc in docs if doc.author])
+    @staticmethod
+    def sort_by_id(docs: Sequence['DocumentType']) -> list['DocumentType']:
+        return sorted(docs, key=lambda d: d.file_id)
     @staticmethod
     def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
-        return sorted(docs, key=lambda doc: doc.sort_key())
+        return sorted(docs, key=lambda doc: doc.timestamp_sort_key())
-    @classmethod
-    def uniquify(cls, documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
+    @staticmethod
+    def uniquify(documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
         """Uniquify by file_id."""
         id_map = {doc.file_id: doc for doc in documents}
         return [doc for doc in id_map.values()]
+    @staticmethod
+    def without_dupes(docs: Sequence['DocumentType']) -> list['DocumentType']:
+        return [doc for doc in docs if not doc.is_duplicate()]
 DocumentType = TypeVar('DocumentType', bound=Document)

epstein-files 1.2.0__py3-none-any.whl → 1.2.5__py3-none-any.whl

epstein-files 1.2.0__py3-none-any.whl → 1.2.5__py3-none-any.whl