PyPI - epstein-files - Versions diffs - 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl - Mend

epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

epstein_files/__init__.py +55 -23
epstein_files/documents/communication.py +9 -5
epstein_files/documents/document.py +231 -135
epstein_files/documents/doj_file.py +242 -0
epstein_files/documents/doj_files/full_text.py +166 -0
epstein_files/documents/email.py +289 -232
epstein_files/documents/emails/email_header.py +35 -16
epstein_files/documents/emails/emailers.py +223 -0
epstein_files/documents/imessage/text_message.py +2 -3
epstein_files/documents/json_file.py +18 -14
epstein_files/documents/messenger_log.py +23 -39
epstein_files/documents/other_file.py +54 -48
epstein_files/epstein_files.py +65 -29
epstein_files/person.py +151 -94
epstein_files/util/constant/names.py +37 -10
epstein_files/util/constant/output_files.py +2 -0
epstein_files/util/constant/strings.py +14 -7
epstein_files/util/constant/urls.py +17 -0
epstein_files/util/constants.py +556 -391
epstein_files/util/data.py +2 -0
epstein_files/util/doc_cfg.py +44 -33
epstein_files/util/env.py +34 -19
epstein_files/util/file_helper.py +30 -6
epstein_files/util/helpers/debugging_helper.py +13 -0
epstein_files/util/helpers/env_helpers.py +21 -0
epstein_files/util/highlighted_group.py +121 -37
epstein_files/util/layout/left_bar_panel.py +26 -0
epstein_files/util/logging.py +28 -13
epstein_files/util/output.py +49 -40
epstein_files/util/rich.py +30 -3
epstein_files/util/word_count.py +7 -7
{epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/METADATA +16 -3
epstein_files-1.5.0.dist-info/RECORD +40 -0
{epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +1 -1
epstein_files-1.2.5.dist-info/RECORD +0 -34
{epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
{epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0

epstein_files/__init__.py CHANGED Viewed

@@ -4,6 +4,7 @@ Reformat Epstein text message files for readability and count email senders.
     Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT epstein_generate'
 """
+import re
 from sys import exit
 from dotenv import load_dotenv
@@ -15,20 +16,21 @@ from rich.text import Text
 from epstein_files.epstein_files import EpsteinFiles, document_cls
 from epstein_files.documents.document import INFO_PADDING, Document
+from epstein_files.documents.doj_file import DojFile
 from epstein_files.documents.email import Email
 from epstein_files.documents.messenger_log import MessengerLog
 from epstein_files.documents.other_file import OtherFile
 from epstein_files.util.constant.output_files import make_clean
-from epstein_files.util.constant.strings import ID_REGEX
+from epstein_files.util.constant.strings import HOUSE_OVERSIGHT_NOV_2025_ID_REGEX
 from epstein_files.util.data import flatten
 from epstein_files.util.env import args
 from epstein_files.util.file_helper import coerce_file_path, extract_file_id
 from epstein_files.util.logging import exit_with_error, logger
-from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
+from epstein_files.util.output import (print_doj_files, print_emails_section, print_json_files, print_stats,
      print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
      print_json_metadata, write_urls)
-from epstein_files.util.rich import (build_highlighter, console, highlighter, print_color_key, print_title_page_header,
-     print_title_page_tables, print_subtitle_panel, write_html)
+from epstein_files.util.rich import (build_highlighter, console, highlighter, print_color_key, print_json,
+     print_title_page_header, print_title_page_tables, print_subtitle_panel, write_html)
 from epstein_files.util.timer import Timer
 from epstein_files.util.word_count import write_word_counts_html
@@ -62,6 +64,10 @@ def generate_html() -> None:
     if args.colors_only:
         exit()
+    if args.output_doj_files:
+        printed_doj_files = print_doj_files(epstein_files)
+        timer.log_section_complete('DojFile', epstein_files.doj_files, printed_doj_files)
     if args.output_texts:
         printed_logs = print_text_messages_section(epstein_files)
         timer.log_section_complete('MessengerLog', epstein_files.imessage_logs, printed_logs)
@@ -83,9 +89,8 @@ def generate_html() -> None:
     if args.debug:
         highlighter.print_highlight_counts(console)
-    # JSON stats (mostly used for building pytest checks)
-    if args.json_stats:
-        print_json_stats(epstein_files)
+    if args.stats:
+        print_stats(epstein_files)  # Used for building pytest checks
 def epstein_diff():
@@ -93,11 +98,11 @@ def epstein_diff():
     Document.diff_files(args.positional_args)
-def epstein_search():
+def epstein_grep():
     """Search the cleaned up text of the files."""
     epstein_files = EpsteinFiles.get_files()
-    if ID_REGEX.match(args.positional_args[0]):
+    if HOUSE_OVERSIGHT_NOV_2025_ID_REGEX.match(args.positional_args[0]):
         logger.warning(f"'{args.positional_args[0]}' seems to be an ID, running epstein_show instead...")
         epstein_show()
         return
@@ -106,26 +111,41 @@ def epstein_search():
         temp_highlighter = build_highlighter(search_term)
         search_results = epstein_files.docs_matching(search_term, args.names)
         print_subtitle_panel(f"Found {len(search_results)} documents matching '{search_term}'")
+        last_document = None
         for search_result in search_results:
-            document = search_result.document
+            doc = search_result.document
+            lines = search_result.lines
-            if (isinstance(document, Email) and not args.output_emails) \
-                    or (isinstance(document, OtherFile) and not args.output_other) \
-                    or (isinstance(document, MessengerLog) and not args.output_texts):
-                document.warn(f"{type(document).__name__} Skipping search result...")
+            if (isinstance(doc, Email) and not args.output_emails) \
+                    or (isinstance(doc, (DojFile, OtherFile)) and not args.output_other) \
+                    or (isinstance(doc, MessengerLog) and not args.output_texts):
+                doc.log(f"{type(doc).__name__} Skipping search result...")
                 continue
+            elif isinstance(doc, Email) and args.email_body:
+                lines = [l for l in search_result.lines if l.line_number > doc.header.num_header_rows]
+                if not lines:
+                    doc.log(f"None of the matches for '{search_term}' seem to be in the body of the email")
+                    continue
+            if doc.is_duplicate:
+                if last_document and not last_document.is_duplicate:
+                    console.line()
-            if args.whole_file:
-                console.print(document)
+                last_document = doc
+                console.print(doc.duplicate_file_txt)
+            elif args.whole_file:
+                console.print(doc)
             else:
-                console.print(document.summary_panel())
+                console.print(doc.summary_panel)
-                for matching_line in search_result.lines:
+                for matching_line in lines:
                     line_txt = matching_line.__rich__()
                     console.print(Padding(temp_highlighter(line_txt), INFO_PADDING), style='gray37')
             console.line()
+            console.print(doc.local_path_and_url + '\n', style='dim')
 def epstein_show():
@@ -138,23 +158,35 @@ def epstein_show():
             people = EpsteinFiles.get_files().person_objs(args.names)
             raw_docs = [doc for doc in flatten([p.emails for p in people])]
         else:
-            ids = [extract_file_id(arg) for arg in args.positional_args]
-            raw_docs = [Document(coerce_file_path(id)) for id in ids]
+            ids = [extract_file_id(arg.strip().strip('_')) for arg in args.positional_args]
+            logger.info(f"extracted IDs: {ids}")
+            raw_docs = [Document.from_file_id(id) for id in ids]
+            logger.info(f"raw docs: {raw_docs}")
+        # Rebuild the Document objs so we can see result of latest processing
         docs = Document.sort_by_timestamp([document_cls(doc)(doc.file_path) for doc in raw_docs])
+        logger.info(f"Document types: {[doc._class_name for doc in docs]}")
     except Exception as e:
+        console.print_exception()
         exit_with_error(str(e))
     for doc in docs:
         console.print('\n', doc, '\n')
         if args.raw:
-            console.print(Panel(Text("RAW: ").append(doc.summary()), expand=False, style=doc._border_style()))
+            console.print(Panel(Text("RAW: ").append(doc.summary()), expand=False, style=doc.border_style))
             console.print(escape(doc.raw_text()), '\n')
             if isinstance(doc, Email):
-                console.print(Panel(Text("actual_text: ").append(doc.summary()), expand=False, style=doc._border_style()))
-                console.print(escape(doc._actual_text()), '\n')
+                console.print(Panel(Text("actual_text: ").append(doc.summary()), expand=False, style=doc.border_style))
+                console.print(escape(doc._extract_actual_text()), '\n')
+                metadata = doc.metadata
+                metadata['is_fwded_article'] = doc.is_fwded_article
+                metadata['is_word_count_worthy'] = doc.is_word_count_worthy
+                metadata['_is_first_for_user'] = doc._is_first_for_user
+                print_json(f"{doc.file_id} Metadata", metadata)
+        console.print(doc.local_path_and_url, style='dim')
 def epstein_word_count() -> None:

epstein_files/documents/communication.py CHANGED Viewed

@@ -21,26 +21,30 @@ class Communication(Document):
     config: CommunicationCfg | None = None
     timestamp: datetime = FALLBACK_TIMESTAMP  # TODO this default sucks (though it never happens)
+    @property
     def author_or_unknown(self) -> str:
         return self.author or UNKNOWN
+    @property
     def author_style(self) -> str:
         return get_style_for_name(self.author)
+    @property
     def author_txt(self) -> Text:
         return styled_name(self.author)
+    @property
+    def timestamp_without_seconds(self) -> str:
+        return TIMESTAMP_SECONDS_REGEX.sub('', str(self.timestamp))
     def external_links_txt(self, _style: str = '', include_alt_links: bool = True) -> Text:
         """Overrides super() method to apply self.author_style."""
-        return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
+        return super().external_links_txt(self.author_style, include_alt_links=include_alt_links)
     def summary(self) -> Text:
         return self._summary().append(CLOSE_PROPERTIES_CHAR)
-    def timestamp_without_seconds(self) -> str:
-        return TIMESTAMP_SECONDS_REGEX.sub('', str(self.timestamp))
     def _summary(self) -> Text:
         """One line summary mostly for logging."""
         txt = super().summary().append(', ')
-        return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style())))
+        return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown}'", style=self.author_style)))

epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl

epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl