PyPI - epstein-files - Versions diffs - 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl - Mend

epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

epstein_files/__init__.py +55 -23
epstein_files/documents/communication.py +9 -5
epstein_files/documents/document.py +231 -135
epstein_files/documents/doj_file.py +242 -0
epstein_files/documents/doj_files/full_text.py +166 -0
epstein_files/documents/email.py +289 -232
epstein_files/documents/emails/email_header.py +35 -16
epstein_files/documents/emails/emailers.py +223 -0
epstein_files/documents/imessage/text_message.py +2 -3
epstein_files/documents/json_file.py +18 -14
epstein_files/documents/messenger_log.py +23 -39
epstein_files/documents/other_file.py +54 -48
epstein_files/epstein_files.py +65 -29
epstein_files/person.py +151 -94
epstein_files/util/constant/names.py +37 -10
epstein_files/util/constant/output_files.py +2 -0
epstein_files/util/constant/strings.py +14 -7
epstein_files/util/constant/urls.py +17 -0
epstein_files/util/constants.py +556 -391
epstein_files/util/data.py +2 -0
epstein_files/util/doc_cfg.py +44 -33
epstein_files/util/env.py +34 -19
epstein_files/util/file_helper.py +30 -6
epstein_files/util/helpers/debugging_helper.py +13 -0
epstein_files/util/helpers/env_helpers.py +21 -0
epstein_files/util/highlighted_group.py +121 -37
epstein_files/util/layout/left_bar_panel.py +26 -0
epstein_files/util/logging.py +28 -13
epstein_files/util/output.py +49 -40
epstein_files/util/rich.py +30 -3
epstein_files/util/word_count.py +7 -7
{epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/METADATA +16 -3
epstein_files-1.5.0.dist-info/RECORD +40 -0
{epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +1 -1
epstein_files-1.2.5.dist-info/RECORD +0 -34
{epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
{epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0

epstein_files/documents/other_file.py CHANGED Viewed

@@ -30,7 +30,6 @@ MAX_DAYS_SPANNED_TO_BE_VALID = 10
 MAX_EXTRACTED_TIMESTAMPS = 100
 MIN_TIMESTAMP = datetime(2000, 1, 1)
 MID_TIMESTAMP = datetime(2007, 1, 1)
-MAX_TIMESTAMP = datetime(2022, 12, 31)
 PREVIEW_CHARS = int(580 * (1 if args.all_other_files else 1.5))
 LOG_INDENT = '\n         '
 TIMESTAMP_LOG_INDENT = f'{LOG_INDENT}    '
@@ -93,40 +92,28 @@ class OtherFile(Document):
     """
     was_timestamp_extracted: bool = False
     include_description_in_summary_panel: ClassVar[bool] = True  # Class var for logging output
+    max_timestamp: ClassVar[datetime] = datetime(2022, 12, 31) # Overloaded in DojFile
-    def __post_init__(self):
-        super().__post_init__()
-        if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
-            self.log(f"Creating synthetic config for VI Daily News article...")
-            self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
+    @property
+    def config_description(self) -> str | None:
+        """Overloads superclass property."""
+        if self.config and self.config.description:
+            return self.config.complete_description
+    @property
     def category(self) -> str | None:
         return self.config and self.config.category
+    @property
     def category_txt(self) -> Text | None:
-        return styled_category(self.category())
-    def config_description(self) -> str | None:
-        """Overloads superclass method."""
-        if self.config is not None:
-            return self.config.complete_description()
-    def highlighted_preview_text(self) -> Text:
-        try:
-            return highlighter(escape(self.preview_text()))
-        except Exception as e:
-            logger.error(f"Failed to apply markup in string '{escape_single_quotes(self.preview_text())}'\n"
-                         f"Original string: '{escape_single_quotes(self.preview_text())}'\n"
-                         f"File: '{self.filename}'\n")
-            return Text(escape(self.preview_text()))
+        return styled_category(self.category)
-    def is_interesting(self):
-        """False for lame prefixes, duplicates, and other boring files."""
-        info_sentences = self.info()
+    @property
+    def is_interesting(self) -> bool:
+        """Overloaded. False for lame prefixes, duplicates, and other boring files."""
+        info_sentences = self.info
-        if self.is_duplicate():
+        if self.is_duplicate:
             return False
         elif len(info_sentences) == 0:
             return True
@@ -135,9 +122,9 @@ class OtherFile(Document):
                 return self.config.is_interesting
             elif self.config.author in INTERESTING_AUTHORS:
                 return True
-            elif self.category() == FINANCE and self.author is not None:
+            elif self.category == FINANCE and self.author is not None:
                 return False
-            elif self.category() in UNINTERESTING_CATEGORIES:
+            elif self.category in UNINTERESTING_CATEGORIES:
                 return False
         for prefix in UNINTERESTING_PREFIXES:
@@ -146,15 +133,33 @@ class OtherFile(Document):
         return True
+    @property
     def metadata(self) -> Metadata:
-        metadata = super().metadata()
-        metadata['is_interesting'] = self.is_interesting()
+        metadata = super().metadata
+        metadata['is_interesting'] = self.is_interesting
         if self.was_timestamp_extracted:
             metadata['was_timestamp_extracted'] = self.was_timestamp_extracted
         return metadata
+    def __post_init__(self):
+        super().__post_init__()
+        if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
+            self.log(f"Creating synthetic config for VI Daily News article...")
+            self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
+    def highlighted_preview_text(self) -> Text:
+        try:
+            return highlighter(escape(self.preview_text()))
+        except Exception as e:
+            logger.error(f"Failed to apply markup in string '{escape_single_quotes(self.preview_text())}'\n"
+                         f"Original string: '{escape_single_quotes(self.preview_text())}'\n"
+                         f"File: '{self.filename}'\n")
+            return Text(escape(self.preview_text()))
     def preview_text(self) -> str:
         return WHITESPACE_REGEX.sub(' ', self.text)[0:PREVIEW_CHARS]
@@ -164,9 +169,7 @@ class OtherFile(Document):
     def _extract_timestamp(self) -> datetime | None:
         """Return configured timestamp or value extracted by scanning text with datefinder."""
-        if self.config and self.config.timestamp:
-            return self.config.timestamp
-        elif self.config and any([s in (self.config_description() or '') for s in SKIP_TIMESTAMP_EXTRACT]):
+        if self.config and any([s in (self.config_description or '') for s in SKIP_TIMESTAMP_EXTRACT]):
             return None
         timestamps: list[datetime] = []
@@ -175,10 +178,11 @@ class OtherFile(Document):
             warnings.filterwarnings("ignore", module="dateutil")
             try:
-                for timestamp in datefinder.find_dates(self.text, strict=True):
+                # TODO: datefinder.find_dates() cannot find 08/29/2019 style e.g. in EFTA00005783 :(
+                for timestamp in datefinder.find_dates(self.text, strict=False):
                     timestamp = remove_timezone(timestamp)
-                    if MIN_TIMESTAMP < timestamp < MAX_TIMESTAMP:
+                    if MIN_TIMESTAMP < timestamp < self.max_timestamp:
                         timestamps.append(timestamp)
                     if len(timestamps) >= MAX_EXTRACTED_TIMESTAMPS:
@@ -187,7 +191,7 @@ class OtherFile(Document):
                 self.warn(f"Error while iterating through datefinder.find_dates(): {e}")
         if len(timestamps) == 0:
-            if not (self.is_duplicate() or VAST_HOUSE in self.text):
+            if not (self.is_duplicate or VAST_HOUSE in self.text):
                 self.log_top_lines(15, msg=f"No timestamps found")
             return None
@@ -210,9 +214,10 @@ class OtherFile(Document):
             self.log_top_lines(15, msg=timestamps_log_msg, level=logging.DEBUG)
     @classmethod
-    def files_preview_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
+    def files_preview_table(cls, files: Sequence['OtherFile'], title_pfx: str = '', title: str = '') -> Table:
         """Build a table of OtherFile documents."""
-        table = build_table(f'{title_pfx}Other Files Details in Chronological Order', show_lines=True)
+        title = title or f'{title_pfx}Other Files Details in Chronological Order'
+        table = build_table(title, show_lines=True, title_justify='left' if title else 'center')
         table.add_column('File', justify='center', width=FILENAME_LENGTH)
         table.add_column('Date', justify='center')
         table.add_column('Size', justify='right', style='dim')
@@ -221,21 +226,21 @@ class OtherFile(Document):
         for file in files:
             link_and_info = [file.external_links_txt()]
-            date_str = file.date_str()
+            date_str = file.date_str
-            if file.is_duplicate():
-                preview_text = file.duplicate_file_txt()
+            if file.is_duplicate:
+                preview_text = file.duplicate_file_txt
                 row_style = ' dim'
             else:
-                link_and_info += file.info()
+                link_and_info += file.info
                 preview_text = file.highlighted_preview_text()
                 row_style = ''
             table.add_row(
                 Group(*link_and_info),
                 Text(date_str, style=TIMESTAMP_STYLE) if date_str else QUESTION_MARKS_TXT,
-                file.file_size_str(),
-                file.category_txt(),
+                file.file_size_str,
+                file.category_txt,
                 preview_text,
                 style=row_style
             )
@@ -244,12 +249,13 @@ class OtherFile(Document):
     @classmethod
     def summary_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
-        categories = uniquify([f.category() for f in files])
-        categories = sorted(categories, key=lambda c: -len([f for f in files if f.category() == c]))
+        """Table showing file count by category."""
+        categories = uniquify([f.category for f in files])
+        categories = sorted(categories, key=lambda c: -len([f for f in files if f.category == c]))
         table = cls.file_info_table(f'{title_pfx}Other Files Summary', 'Category')
         for category in categories:
-            category_files = [f for f in files if f.category() == category]
+            category_files = [f for f in files if f.category == category]
             table.add_row(styled_category(category), *cls.files_info_row(category_files))
         table.columns = table.columns[:-2] + [table.columns[-1]]  # Removee unknown author col

epstein_files/epstein_files.py CHANGED Viewed

@@ -12,7 +12,8 @@ from typing import Sequence, Type, cast
 from rich.table import Table
 from epstein_files.documents.document import Document
-from epstein_files.documents.email import DETECT_EMAIL_REGEX, Email
+from epstein_files.documents.doj_file import DojFile
+from epstein_files.documents.email import Email
 from epstein_files.documents.json_file import JsonFile
 from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
 from epstein_files.documents.other_file import OtherFile
@@ -21,7 +22,7 @@ from epstein_files.util.constant.strings import *
 from epstein_files.util.constants import *
 from epstein_files.util.data import flatten, json_safe, listify, uniquify
 from epstein_files.util.doc_cfg import EmailCfg, Metadata
-from epstein_files.util.env import DOCS_DIR, args, logger
+from epstein_files.util.env import DOCS_DIR, DOJ_PDFS_20260130_DIR, args, logger
 from epstein_files.util.file_helper import file_size_str
 from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames
 from epstein_files.util.search_result import SearchResult
@@ -49,14 +50,28 @@ class EpsteinFiles:
     imessage_logs: list[MessengerLog] = field(default_factory=list)
     json_files: list[JsonFile] = field(default_factory=list)
     other_files: list[OtherFile] = field(default_factory=list)
+    doj_files: list[DojFile] = field(default_factory=list)
     timer: Timer = field(default_factory=lambda: Timer())
     uninteresting_ccs: list[Name] = field(default_factory=list)
+    @property
+    def all_documents(self) -> Sequence[Document]:
+        return self.imessage_logs + self.emails + self.other_files + self.doj_files
+    @property
+    def all_doj_files(self) -> Sequence[DojFile | Email]:
+        """All files with the filename EFTAXXXXXX."""
+        return [doc for doc in self.all_documents if doc.is_doj_file]
     def __post_init__(self):
         """Iterate through files and build appropriate objects."""
         self.all_files = sorted([f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')])
-        documents = []
-        file_type_count = defaultdict(int)  # Hack used by --skip-other-files option
+        if DOJ_PDFS_20260130_DIR:
+            self.all_files += sorted([f for f in DOJ_PDFS_20260130_DIR.glob('**/*.txt')])
+        docs = []
+        file_type_count = defaultdict(int)  # Hack used by --skip-other-files option to get a few files parsed before skipping the rest
         # Read through and classify all the files
         for file_arg in self.all_files:
@@ -64,26 +79,28 @@ class EpsteinFiles:
             document = Document(file_arg)
             cls = document_cls(document)
-            if document.length() == 0:
+            if document.length == 0:
                 logger.warning(f"Skipping empty file: {document}]")
                 continue
             elif args.skip_other_files and cls == OtherFile and file_type_count[cls.__name__] > 1:
                 document.log(f"Skipping OtherFile...")
                 continue
-            documents.append(cls(file_arg, lines=document.lines, text=document.text))
-            logger.info(str(documents[-1]))
+            docs.append(cls(file_arg, lines=document.lines, text=document.text).printable_document())
+            logger.info(str(docs[-1]))
             file_type_count[cls.__name__] += 1
             if doc_timer.seconds_since_start() > SLOW_FILE_SECONDS:
-                doc_timer.print_at_checkpoint(f"Slow file: {documents[-1]} processed")
+                doc_timer.print_at_checkpoint(f"Slow file: {docs[-1]} processed")
-        self.emails = Document.sort_by_timestamp([d for d in documents if isinstance(d, Email)])
-        self.imessage_logs = Document.sort_by_timestamp([d for d in documents if isinstance(d, MessengerLog)])
-        self.other_files = Document.sort_by_timestamp([d for d in documents if isinstance(d, (JsonFile, OtherFile))])
-        self.json_files = [doc for doc in self.other_files if isinstance(doc, JsonFile)]
+        self.doj_files = Document.sort_by_timestamp([d for d in docs if isinstance(d, DojFile)])
+        self.emails = Document.sort_by_timestamp([d for d in docs if isinstance(d, Email)])
+        self.imessage_logs = Document.sort_by_timestamp([d for d in docs if isinstance(d, MessengerLog)])
+        self.json_files = Document.sort_by_timestamp([d for d in docs if isinstance(d, JsonFile)])
+        self.other_files = Document.sort_by_timestamp([d for d in docs if isinstance(d, OtherFile) and not isinstance(d, DojFile)])
         self._set_uninteresting_ccs()
         self._copy_duplicate_email_properties()
+        self._find_email_attachments_and_set_is_first_for_user()
     @classmethod
     def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
@@ -110,19 +127,19 @@ class EpsteinFiles:
         timer.print_at_checkpoint(f'Processed {len(epstein_files.all_files):,} documents')
         return epstein_files
-    def all_documents(self) -> Sequence[Document]:
-        return self.imessage_logs + self.emails + self.other_files
     def docs_matching(self, pattern: re.Pattern | str, names: list[Name] | None = None) -> list[SearchResult]:
         """Find documents whose text matches a pattern (file_type and names args limit the documents searched)."""
         results: list[SearchResult] = []
-        for doc in self.all_documents():
+        for doc in self.all_documents:
             if names and doc.author not in names:
                 continue
             lines = doc.matching_lines(pattern)
+            if args.min_line_length:
+                lines = [line for line in lines if len(line.line) > args.min_line_length]
             if len(lines) > 0:
                 results.append(SearchResult(doc, lines))
@@ -136,15 +153,15 @@ class EpsteinFiles:
     def email_author_counts(self) -> dict[Name, int]:
         return {
-            person.name: len(person.unique_emails_by())
-            for person in self.emailers() if len(person.unique_emails_by()) > 0
+            person.name: len(person.unique_emails_by)
+            for person in self.emailers() if len(person.unique_emails_by) > 0
         }
     def email_authors_to_device_signatures(self) -> dict[str, set[str]]:
         signatures = defaultdict(set)
         for email in [e for e in self.non_duplicate_emails() if e.sent_from_device]:
-            signatures[email.author_or_unknown()].add(email.sent_from_device)
+            signatures[email.author_or_unknown].add(email.sent_from_device)
         return signatures
@@ -152,14 +169,14 @@ class EpsteinFiles:
         signatures = defaultdict(set)
         for email in [e for e in self.non_duplicate_emails() if e.sent_from_device]:
-            signatures[email.sent_from_device].add(email.author_or_unknown())
+            signatures[email.sent_from_device].add(email.author_or_unknown)
         return signatures
     def email_recipient_counts(self) -> dict[Name, int]:
         return {
-            person.name: len(person.unique_emails_to())
-            for person in self.emailers() if len(person.unique_emails_to()) > 0
+            person.name: len(person.unique_emails_to)
+            for person in self.emailers() if len(person.unique_emails_to) > 0
         }
     def email_signature_substitution_counts(self) -> dict[str, int]:
@@ -208,7 +225,7 @@ class EpsteinFiles:
     def for_ids(self, file_ids: str | list[str]) -> list[Document]:
         file_ids = listify(file_ids)
-        docs = [doc for doc in self.all_documents() if doc.file_id in file_ids]
+        docs = [doc for doc in (list(self.all_documents) + self.doj_files) if doc.file_id in file_ids]
         if len(docs) != len(file_ids):
             logger.warning(f"{len(file_ids)} file IDs provided but only {len(docs)} Epstein files found!")
@@ -251,7 +268,7 @@ class EpsteinFiles:
                 name=name,
                 emails=self.emails_for(name),
                 imessage_logs=self.imessage_logs_for(name),
-                is_uninteresting_cc=name in self.uninteresting_emailers(),
+                is_uninteresting=name in self.uninteresting_emailers(),
                 other_files=[f for f in self.other_files if name and name == f.author]
             )
             for name in names
@@ -276,13 +293,30 @@ class EpsteinFiles:
         return self._uninteresting_emailers
+    def _find_email_attachments_and_set_is_first_for_user(self) -> None:
+        for other_file in self.other_files:
+            if other_file.config and other_file.config.attached_to_email_id:
+                email = self.email_for_id(other_file.config.attached_to_email_id)
+                email.attached_docs.append(other_file)
+                if other_file.timestamp \
+                        and other_file.timestamp != email.timestamp \
+                        and not other_file.config_timestamp:
+                    other_file.warn(f"Overwriting '{other_file.timestamp}' with {email}'s timestamp {email.timestamp}")
+                other_file.timestamp = email.timestamp
+        for emailer in self.emailers():
+            first_email = emailer.emails[0]
+            first_email._is_first_for_user = True
     def _copy_duplicate_email_properties(self) -> None:
         """Ensure dupe emails have the properties of the emails they duplicate to capture any repairs, config etc."""
         for email in self.emails:
-            if not email.is_duplicate():
+            if not email.is_duplicate:
                 continue
-            original = self.email_for_id(email.duplicate_of_id())
+            original = self.email_for_id(email.duplicate_of_id)
             for field_name in DUPLICATE_PROPS_TO_COPY:
                 original_prop = getattr(original, field_name)
@@ -321,11 +355,13 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
 def document_cls(doc: Document) -> Type[Document]:
     search_area = doc.text[0:5000]  # Limit search area to avoid pointless scans of huge files
-    if doc.length() == 0:
+    if doc.length == 0:
         return Document
+    elif doc.is_doj_file:
+        return DojFile
     if doc.text[0] == '{':
         return JsonFile
-    elif isinstance(doc.config, EmailCfg) or (DETECT_EMAIL_REGEX.match(search_area) and doc.config is None):
+    elif Document.is_email(doc):  # TODO: right now we setup the DojFile which makes an Email obj only later at print time
         return Email
     elif MSG_REGEX.search(search_area):
         return MessengerLog
@@ -334,4 +370,4 @@ def document_cls(doc: Document) -> Type[Document]:
 def _sorted_metadata(docs: Sequence[Document]) -> list[Metadata]:
-    return [json_safe(d.metadata()) for d in Document.sort_by_id(docs)]
+    return [json_safe(d.metadata) for d in Document.sort_by_id(docs)]

epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl

epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl