epstein-files 1.0.10__py3-none-any.whl → 1.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,10 @@
 import re
 import logging
 import warnings
+from collections import defaultdict
 from dataclasses import asdict, dataclass
 from datetime import datetime
+from typing import ClassVar, Sequence
 
 import datefinder
 import dateutil
@@ -16,11 +18,11 @@ from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, WHITESPACE_R
 from epstein_files.util.constant.strings import *
 from epstein_files.util.constants import *
 from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg, Metadata
-from epstein_files.util.data import escape_single_quotes, remove_timezone, uniquify
-from epstein_files.util.file_helper import FILENAME_LENGTH
+from epstein_files.util.data import escape_single_quotes, remove_timezone, sort_dict, uniquify
+from epstein_files.util.file_helper import FILENAME_LENGTH, file_size_to_str
 from epstein_files.util.env import args
-from epstein_files.util.highlighted_group import get_style_for_category
-from epstein_files.util.rich import QUESTION_MARK_TXT, build_table, highlighter
+from epstein_files.util.highlighted_group import styled_category
+from epstein_files.util.rich import QUESTION_MARK_TXT, add_cols_to_table, build_table, highlighter
 from epstein_files.util.logging import logger
 
 MAX_DAYS_SPANNED_TO_BE_VALID = 10
@@ -38,14 +40,11 @@ UNINTERESTING_CATEGORES = [
     ARTS,
     BOOK,
     JUNK,
+    SKYPE_LOG,
     SPEECH,
 ]
 
-UNINTERESTING_IDS = [
-    '031794',
-]
-
-# OtherFiles whose description/hints match these prefixes are not displayed unless --all-other-files is used
+# OtherFiles whose descriptions/info match these prefixes are not displayed unless --all-other-files is used
 UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
     'article about',
     ARTICLE_DRAFT,
@@ -60,7 +59,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
     CVRA,
     DAILY_MAIL,
     DAILY_TELEGRAPH,
-    DAVID_SCHOEN_CVRA_LEXIS_SEARCH[0:-12], # Because date at end :(
+    CVRA_LEXIS_SEARCH[0:-12], # Because date at end :(
     DERSH_GIUFFRE_TWEET,
     'Financial Times',
     'Forbes',
@@ -78,8 +77,10 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
     LA_TIMES,
     'Litigation Daily',
     LAWRENCE_KRAUSS,
+    LAWRENCE_KRAUSS_ASU_ORIGINS,
     'MarketWatch',
     MARTIN_NOWAK,
+    'Morning News',
     NOBEL_CHARITABLE_TRUST,
     'Nautilus',
     'New Yorker',
@@ -122,24 +123,25 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
 class OtherFile(Document):
     """File that is not an email, an iMessage log, or JSON data."""
 
+    include_description_in_summary_panel: ClassVar[bool] = True
+
     def __post_init__(self):
         super().__post_init__()
 
         if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
-            self.log(f"Creating synthetic config for VI Daily News article...", logging.INFO)
+            self.log(f"Creating synthetic config for VI Daily News article...")
             self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
 
     def category(self) -> str | None:
         return self.config and self.config.category
 
-    def configured_description(self) -> str | None:
+    def category_txt(self) -> Text | None:
+        return styled_category(self.category() or UNKNOWN)
+
+    def config_description(self) -> str | None:
         """Overloads superclass method."""
         if self.config is not None:
-            return self.config.info_str()
-
-    def description_panel(self, include_hints=True) -> Panel:
-        """Panelized description() with info_txt(), used in search results."""
-        return super().description_panel(include_hints=include_hints)
+            return self.config.complete_description()
 
     def highlighted_preview_text(self) -> Text:
         try:
@@ -153,13 +155,11 @@ class OtherFile(Document):
 
     def is_interesting(self):
         """False for lame prefixes, duplicates, and other boring files."""
-        hints = self.hints()
+        info_sentences = self.info()
 
-        if self.is_duplicate:
+        if self.is_duplicate():
             return False
-        elif self.file_id in UNINTERESTING_IDS:
-            return False
-        elif len(hints) == 0:
+        elif len(info_sentences) == 0:
             return True
         elif self.config:
             if self.config.is_interesting:
@@ -170,7 +170,7 @@ class OtherFile(Document):
                 return False
 
         for prefix in UNINTERESTING_PREFIXES:
-            if hints[0].plain.startswith(prefix):
+            if info_sentences[0].plain.startswith(prefix):
                 return False
 
         return True
@@ -195,7 +195,6 @@ class OtherFile(Document):
         timestamps: list[datetime] = []
 
         with warnings.catch_warnings():
-            warnings.filterwarnings("ignore", module="datefinder")
             warnings.filterwarnings("ignore", module="dateutil")
 
             try:
@@ -208,11 +207,11 @@ class OtherFile(Document):
                     if len(timestamps) >= MAX_EXTRACTED_TIMESTAMPS:
                         break
             except ValueError as e:
-                logger.warning(f"Error while iterating through datefinder.find_dates(): {e}")
+                self.log(f"Error while iterating through datefinder.find_dates(): {e}", logging.WARNING)
 
         if len(timestamps) == 0:
-            if not self.is_duplicate and VAST_HOUSE not in self.text:
-                self.log_top_lines(15, msg=f"No timestamps found", level=logging.INFO)
+            if not (self.is_duplicate() or VAST_HOUSE in self.text):
+                self.log_top_lines(15, msg=f"No timestamps found")
 
             return None
         elif len(timestamps) == 1:
@@ -231,7 +230,7 @@ class OtherFile(Document):
         self.log_top_lines(15, msg=timestamps_log_msg, level=logging.DEBUG)
 
     @staticmethod
-    def build_table(docs: list['OtherFile']) -> Table:
+    def build_table(files: Sequence['OtherFile']) -> Table:
         """Build a table of OtherFile documents."""
         table = build_table(None, show_lines=True)
         table.add_column('File', justify='center', width=FILENAME_LENGTH)
@@ -240,31 +239,55 @@ class OtherFile(Document):
         table.add_column('Type', justify='center')
         table.add_column(FIRST_FEW_LINES, justify='left', style='pale_turquoise4')
 
-        for doc in docs:
-            link_and_info = [doc.raw_document_link_txt()]
-            category = doc.category()
-            date_str = doc.date_str()
+        for file in files:
+            link_and_info = [file.raw_document_link_txt()]
+            date_str = file.date_str()
 
-            if doc.is_duplicate:
-                preview_text = doc.duplicate_file_txt()
+            if file.is_duplicate():
+                preview_text = file.duplicate_file_txt()
                 row_style = ' dim'
             else:
-                link_and_info += doc.hints()
-                preview_text = doc.highlighted_preview_text()
+                link_and_info += file.info()
+                preview_text = file.highlighted_preview_text()
                 row_style = ''
 
-            if category:
-                category_txt = Text(category, get_style_for_category(category) or 'wheat4')
-            else:
-                category_txt = Text('')
-
             table.add_row(
                 Group(*link_and_info),
                 Text(date_str, style=TIMESTAMP_DIM) if date_str else QUESTION_MARK_TXT,
-                doc.file_size_str(),
-                category_txt,
+                file.file_size_str(),
+                file.category_txt(),
                 preview_text,
                 style=row_style
             )
 
         return table
+
+    @staticmethod
+    def count_by_category_table(files: Sequence['OtherFile']) -> Table:
+        counts = defaultdict(int)
+        category_bytes = defaultdict(int)
+
+        for file in files:
+            if file.category() is None:
+                logger.warning(f"file {file.file_id} has no category")
+
+            counts[file.category()] += 1
+            category_bytes[file.category()] += file.length
+
+        table = build_table('Other Files Summary')
+        add_cols_to_table(table, ['Category', 'Count', 'Has Author', 'No Author', 'Size'])
+        table.columns[-1].style = 'dim'
+
+        for (category, count) in sort_dict(counts):
+            category_files = [f for f in files if f.category() == category]
+            known_author_count = Document.known_author_count(category_files)
+
+            table.add_row(
+                styled_category(category or UNKNOWN),
+                str(count),
+                str(known_author_count),
+                str(count - known_author_count),
+                file_size_to_str(category_bytes[category]),
+            )
+
+        return table
@@ -23,12 +23,12 @@ from epstein_files.util.constant.strings import *
 from epstein_files.util.constant.urls import (EPSTEIN_MEDIA, EPSTEIN_WEB, JMAIL, epstein_media_person_url,
     epsteinify_name_url, epstein_web_person_url, search_jmail_url, search_twitter_url)
 from epstein_files.util.constants import *
-from epstein_files.util.data import dict_sets_to_lists, json_safe, sort_dict
+from epstein_files.util.data import dict_sets_to_lists, json_safe, listify, sort_dict
 from epstein_files.util.doc_cfg import EmailCfg, Metadata
-from epstein_files.util.env import args, logger
-from epstein_files.util.file_helper import DOCS_DIR, file_size_str
+from epstein_files.util.env import DOCS_DIR, args, logger
+from epstein_files.util.file_helper import file_size_str
 from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
-from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, TABLE_BORDER_STYLE, add_cols_to_table,
+from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, add_cols_to_table,
     build_table, console, highlighter, link_text_obj, link_markup, print_author_header, print_centered,
     print_other_site_link, print_panel, print_section_header, vertically_pad)
 from epstein_files.util.search_result import SearchResult
@@ -66,7 +66,7 @@ class EpsteinFiles:
 
     def __post_init__(self):
         """Iterate through files and build appropriate objects."""
-        self.all_files = [f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')]
+        self.all_files = sorted([f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')])
         documents = []
         file_type_count = defaultdict(int)
 
@@ -74,12 +74,15 @@ class EpsteinFiles:
         for file_arg in self.all_files:
             doc_timer = Timer(decimals=4)
             document = Document(file_arg)
+            cls = document_cls(document)
 
             if document.length == 0:
                 logger.warning(f"Skipping empty file: {document}]")
                 continue
+            elif args.skip_other_files and cls == OtherFile and file_type_count[cls.__name__] > 1:
+                logger.warning(f"Skipping {document.filename}...")
+                continue
 
-            cls = document_cls(document)
             documents.append(cls(file_arg, text=document.text))
             logger.info(str(documents[-1]))
             file_type_count[cls.__name__] += 1
@@ -186,7 +189,8 @@ class EpsteinFiles:
         else:
             return [e for e in self.emails if author in e.recipients]
 
-    def get_documents_by_id(self, file_ids: list[str]) -> list[Document]:
+    def get_documents_by_id(self, file_ids: str | list[str]) -> list[Document]:
+        file_ids = listify(file_ids)
         docs = [doc for doc in self.all_documents() if doc.file_id in file_ids]
 
         if len(docs) != len(file_ids):
@@ -223,7 +227,7 @@ class EpsteinFiles:
                 f"{len(docs):,}",
                 f"{known:,}" if known is not None else NA_TXT,
                 f"{len(docs) - known:,}" if known is not None else NA_TXT,
-                f"{len([d for d in docs if d.is_duplicate])}",
+                f"{len([d for d in docs if d.is_duplicate()])}",
             )
 
         add_row('iMessage Logs', self.imessage_logs)
@@ -237,7 +241,7 @@ class EpsteinFiles:
         """Print complete emails to or from a particular 'author'. Returns the Emails that were printed."""
         conversation_length = self.email_conversation_length_in_days(_author)
         emails = self.emails_for(_author)
-        unique_emails = [email for email in emails if not email.is_duplicate]
+        unique_emails = [email for email in emails if not email.is_duplicate()]
         author = _author or UNKNOWN
 
         print_author_header(
@@ -250,7 +254,7 @@ class EpsteinFiles:
         last_printed_email_was_duplicate = False
 
         for email in emails:
-            if email.is_duplicate:
+            if email.is_duplicate():
                 console.print(Padding(email.duplicate_file_txt().append('...'), (0, 0, 0, 4)))
                 last_printed_email_was_duplicate = True
             else:
@@ -263,7 +267,7 @@ class EpsteinFiles:
         return emails
 
     def print_emails_table_for(self, author: str | None) -> None:
-        emails = [email for email in self.emails_for(author) if not email.is_duplicate] # Remove dupes
+        emails = [email for email in self.emails_for(author) if not email.is_duplicate()] # Remove dupes
         console.print(Align.center(Email.build_table(emails, author)), '\n')
 
     def print_email_device_info(self) -> None:
@@ -272,7 +276,7 @@ class EpsteinFiles:
         console.print(_build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))
 
     def print_emailer_counts_table(self) -> None:
-        footer = f"Identified authors of {self.attributed_email_count():,} emails out of {len(self.emails):,}."
+        footer = f"Identified authors of {self.attributed_email_count():,} out of {len(self.emails):,} emails ."
         counts_table = build_table("Email Counts", caption=footer)
         add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_MEDIA, EPSTEIN_WEB, 'Twitter'])
 
@@ -303,7 +307,7 @@ class EpsteinFiles:
         text_summary_msg = f"\nDeanonymized {Document.known_author_count(self.imessage_logs)} of "
         text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files):,} files."
         console.print(text_summary_msg)
-        imessage_msg_count = sum([len(log.messages()) for log in self.imessage_logs])
+        imessage_msg_count = sum([len(log.messages) for log in self.imessage_logs])
         console.print(f"Found {imessage_msg_count} text messages in {len(self.imessage_logs)} iMessage log files.")
 
     def print_other_files_table(self) -> list[OtherFile]:
@@ -318,17 +322,18 @@ class EpsteinFiles:
         console.line(2)
 
         console.print(OtherFile.build_table(interesting_files))
+        console.print(Padding(OtherFile.count_by_category_table(interesting_files), (2, 0, 2, 2)))
         skipped_file_count = len(self.other_files) - len(interesting_files)
 
         if skipped_file_count > 0:
-            logger.warning(f"Skipped {skipped_file_count} uninteresting files...")
+            logger.warning(f"Skipped {skipped_file_count} uninteresting other files...")
 
         return interesting_files
 
     def _tally_email_data(self) -> None:
         """Tally up summary info about Email objects."""
         for email in self.emails:
-            if email.is_duplicate:
+            if email.is_duplicate():
                 continue
 
             self.email_author_counts[email.author] += 1
@@ -360,6 +365,8 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
 def document_cls(doc: Document) -> Type[Document]:
     search_area = doc.text[0:5000] # Limit search area to avoid pointless scans of huge files
 
+    if doc.length == 0:
+        return Document
     if doc.text[0] == '{':
         return JsonFile
     elif isinstance(doc.config, EmailCfg) or (DETECT_EMAIL_REGEX.match(search_area) and doc.config is None):
@@ -233,14 +233,14 @@ OTHER_NAMES = NAMES_TO_NOT_HIGHLIGHT + """
     ferguson flachsbart francis franco frank
     gardner gary geoff geoffrey gilbert gloria goldberg gonzalez gould graham greene guarino gwyneth
     hancock harold harrison harry helen hirsch hofstadter horowitz hussein
-    isaac isaacson
+    ian isaac isaacson
     jamie jane janet jason jen jim joe johnson jones josh julie justin
     karl kate kathy kelly kim kruger kyle
     leo leonard lenny leslie lieberman louis lynch lynn
     marcus marianne matt matthew melissa michele michelle moore moscowitz
     nicole nussbaum
     paulson philippe
-    rafael ray richardson rob robin ron rudolph ryan
+    rafael ray richard richardson rob robin ron rubin rudolph ryan
     sara sarah seligman serge sergey silverman sloman smith snowden sorkin steele stevie stewart
     ted theresa thompson tiffany timothy tony
     valeria