epstein-files 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. epstein_files/__init__.py +75 -135
  2. epstein_files/documents/communication.py +9 -9
  3. epstein_files/documents/document.py +115 -87
  4. epstein_files/documents/email.py +154 -85
  5. epstein_files/documents/emails/email_header.py +7 -6
  6. epstein_files/documents/imessage/text_message.py +3 -2
  7. epstein_files/documents/json_file.py +17 -0
  8. epstein_files/documents/messenger_log.py +62 -3
  9. epstein_files/documents/other_file.py +165 -17
  10. epstein_files/epstein_files.py +128 -169
  11. epstein_files/util/constant/names.py +8 -1
  12. epstein_files/util/constant/output_files.py +29 -0
  13. epstein_files/util/constant/strings.py +27 -0
  14. epstein_files/util/constant/urls.py +25 -9
  15. epstein_files/util/constants.py +1018 -1045
  16. epstein_files/util/data.py +20 -55
  17. epstein_files/util/{file_cfg.py → doc_cfg.py} +121 -43
  18. epstein_files/util/env.py +19 -20
  19. epstein_files/util/file_helper.py +38 -21
  20. epstein_files/util/highlighted_group.py +229 -177
  21. epstein_files/util/logging.py +63 -0
  22. epstein_files/util/output.py +180 -0
  23. epstein_files/util/rich.py +29 -17
  24. epstein_files/util/search_result.py +14 -6
  25. epstein_files/util/timer.py +24 -0
  26. epstein_files/util/word_count.py +2 -1
  27. {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/METADATA +20 -4
  28. epstein_files-1.0.2.dist-info/RECORD +33 -0
  29. epstein_files-1.0.2.dist-info/entry_points.txt +7 -0
  30. epstein_files-1.0.0.dist-info/RECORD +0 -28
  31. {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/LICENSE +0 -0
  32. {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/WHEEL +0 -0
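
For orientation before the hunks below, here is a minimal sketch (not taken from the package's documentation) of how the 1.0.2 surface visible in this diff might be exercised. The module path for EpsteinFiles, the presence of a local document corpus, and the CLI argument defaults are all assumptions.

    # Illustrative sketch only -- inferred from the hunks below, not from package docs.
    # Assumes EpsteinFiles is importable from epstein_files.epstein_files and that the
    # corpus in DOCS_DIR (plus the package's CLI argument defaults) is available locally.
    from epstein_files.epstein_files import EpsteinFiles

    # New in 1.0.2: get_files() accepts use_pickled to load the gzipped pickle cache
    # at PICKLED_PATH instead of re-parsing every file (see the get_files() hunk below).
    files = EpsteinFiles.get_files(use_pickled=True)

    print(f"{len(files.emails):,} emails / {len(files.imessage_logs)} iMessage logs")
    print(files.json_metadata())  # also new in 1.0.2: JSON dump of per-document metadata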
@@ -1,14 +1,14 @@
  import gzip
+ import json
  import pickle
  import re
  from collections import defaultdict
  from dataclasses import dataclass, field
  from datetime import datetime
  from pathlib import Path
- from typing import Literal, Sequence
+ from typing import Sequence, Type

  from rich.align import Align
- from rich.console import Group
  from rich.padding import Padding
  from rich.table import Table
  from rich.text import Text
@@ -19,24 +19,26 @@ from epstein_files.documents.emails.email_header import AUTHOR
  from epstein_files.documents.json_file import JsonFile
  from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
  from epstein_files.documents.other_file import OtherFile
+ from epstein_files.util.constant.output_files import PICKLED_PATH
  from epstein_files.util.constant.strings import *
  from epstein_files.util.constant.urls import (EPSTEIN_WEB, JMAIL, epsteinify_name_url, epstein_web_person_url,
      search_jmail_url, search_twitter_url)
  from epstein_files.util.constants import *
- from epstein_files.util.data import Timer, dict_sets_to_lists, iso_timestamp, sort_dict
+ from epstein_files.util.data import dict_sets_to_lists, json_safe, sort_dict
+ from epstein_files.util.doc_cfg import EmailCfg
  from epstein_files.util.env import args, logger
- from epstein_files.util.file_cfg import MessageCfg
- from epstein_files.util.file_helper import DOCS_DIR, FILENAME_LENGTH, PICKLED_PATH, file_size_str
+ from epstein_files.util.file_helper import DOCS_DIR, file_size_str
  from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
- from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, QUESTION_MARK_TXT, add_cols_to_table, console,
-     highlighter, link_text_obj, link_markup, print_author_header, print_centered, print_other_site_link, print_panel,
+ from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, add_cols_to_table, console, highlighter,
+     link_text_obj, link_markup, print_author_header, print_centered, print_other_site_link, print_panel,
      print_section_header, vertically_pad)
  from epstein_files.util.search_result import SearchResult
+ from epstein_files.util.timer import Timer

  DEVICE_SIGNATURE = 'Device Signature'
- FIRST_FEW_LINES = 'First Few Lines'
  DEVICE_SIGNATURE_PADDING = (1, 0)
  NOT_INCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]
+ SLOW_FILE_SECONDS = 1.0

  INVALID_FOR_EPSTEIN_WEB = JUNK_EMAILERS + KRASSNER_RECIPIENTS + [
      'ACT for America',
@@ -53,70 +55,55 @@ class EpsteinFiles:
      imessage_logs: list[MessengerLog] = field(default_factory=list)
      json_files: list[JsonFile] = field(default_factory=list)
      other_files: list[OtherFile] = field(default_factory=list)
+     timer: Timer = field(default_factory=lambda: Timer())

      # Analytics / calculations
      email_author_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
      email_authors_to_device_signatures: dict[str, set] = field(default_factory=lambda: defaultdict(set))
      email_device_signatures_to_authors: dict[str, set] = field(default_factory=lambda: defaultdict(set))
      email_recipient_counts: dict[str | None, int] = field(default_factory=lambda: defaultdict(int))
-     _email_unknown_recipient_file_ids: set[str] = field(default_factory=set)
+     unknown_recipient_email_ids: set[str] = field(default_factory=set)

      def __post_init__(self):
+         """Iterate through files and build appropriate objects."""
          self.all_files = [f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')]
+         documents = []

          # Read through and classify all the files
          for file_arg in self.all_files:
-             logger.info(f"Scanning '{file_arg.name}'...")
+             doc_timer = Timer(decimals=4)
              document = Document(file_arg)

              if document.length == 0:
-                 logger.info(f"Skipping empty file {document.description().plain}")
-             elif document.text[0] == '{':
-                 # Handle JSON files
-                 self.json_files.append(JsonFile(file_arg, text=document.text))
-                 logger.info(self.json_files[-1].description().plain)
-             elif MSG_REGEX.search(document.text):
-                 # Handle iMessage log files
-                 self.imessage_logs.append(MessengerLog(file_arg, text=document.text))
-                 logger.info(self.imessage_logs[-1].description().plain)
-             elif DETECT_EMAIL_REGEX.match(document.text) or isinstance(document.config, MessageCfg):
-                 # Handle emails
-                 email = Email(file_arg, text=document.text)
-                 logger.info(email.description().plain)
-                 self.emails.append(email)
-                 self.email_author_counts[email.author] += 1
-
-                 if len(email.recipients) == 0:
-                     self._email_unknown_recipient_file_ids.add(email.file_id)
-                     self.email_recipient_counts[None] += 1
-                 else:
-                     for recipient in email.recipients:
-                         self.email_recipient_counts[recipient] += 1
-
-                 if email.sent_from_device:
-                     self.email_authors_to_device_signatures[email.author_or_unknown()].add(email.sent_from_device)
-                     self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())
-             else:
-                 # Handle OtherFiles
-                 self.other_files.append(OtherFile(file_arg, text=document.text))
-                 logger.info(self.other_files[-1].description().plain)
+                 logger.warning(f"Skipping empty file: {document}")
+                 continue
+
+             cls = document_cls(document)
+             documents.append(cls(file_arg, text=document.text))
+             logger.info(str(documents[-1]))

-         self.emails = Document.sort_by_timestamp(self.emails)
-         self.imessage_logs = Document.sort_by_timestamp(self.imessage_logs)
-         self.other_files = Document.sort_by_timestamp(self.other_files + self.json_files)
+             if doc_timer.seconds_since_start() > SLOW_FILE_SECONDS:
+                 doc_timer.print_at_checkpoint(f"Slow file: {documents[-1]} processed")
+
+         self.emails = Document.sort_by_timestamp([d for d in documents if isinstance(d, Email)])
+         self.imessage_logs = Document.sort_by_timestamp([d for d in documents if isinstance(d, MessengerLog)])
+         self.other_files = Document.sort_by_timestamp([d for d in documents if isinstance(d, (JsonFile, OtherFile))])
+         self.json_files = [doc for doc in self.other_files if isinstance(doc, JsonFile)]
+         self._tally_email_data()

      @classmethod
-     def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
+     def get_files(cls, timer: Timer | None = None, use_pickled: bool = False) -> 'EpsteinFiles':
          """Alternate constructor that reads/writes a pickled version of the data ('timer' arg is for logging)."""
          timer = timer or Timer()

-         if (args.pickled and PICKLED_PATH.exists()) and not args.overwrite_pickle:
+         if ((args.pickled or use_pickled) and PICKLED_PATH.exists()) and not args.overwrite_pickle:
              with gzip.open(PICKLED_PATH, 'rb') as file:
                  epstein_files = pickle.load(file)
              timer.print_at_checkpoint(f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})")
+             epstein_files.timer = timer
              return epstein_files

-         epstein_files = EpsteinFiles()
+         epstein_files = EpsteinFiles(timer=timer)

          if args.overwrite_pickle or not PICKLED_PATH.exists():
              with gzip.open(PICKLED_PATH, 'wb') as file:
@@ -141,18 +128,17 @@ class EpsteinFiles:
      def docs_matching(
          self,
          pattern: re.Pattern | str,
-         file_type: Literal['all', 'other'] = 'all',
          names: list[str | None] | None = None
      ) -> list[SearchResult]:
          """Find documents whose text matches a pattern (file_type and names args limit the documents searched)."""
          results: list[SearchResult] = []

-         for doc in (self.all_documents() if file_type == 'all' else self.other_files):
-             lines = doc.lines_matching_txt(pattern)
-
-             if names and ((not isinstance(doc, (Email, MessengerLog))) or doc.author not in names):
+         for doc in self.all_documents():
+             if names and doc.author not in names:
                  continue

+             lines = doc.matching_lines(pattern)
+
              if len(lines) > 0:
                  results.append(SearchResult(doc, lines))

@@ -178,7 +164,7 @@ class EpsteinFiles:
          return substitution_counts

      def email_unknown_recipient_file_ids(self) -> list[str]:
-         return sorted(list(self._email_unknown_recipient_file_ids))
+         return sorted(list(self.unknown_recipient_email_ids))

      def emails_by(self, author: str | None) -> list[Email]:
          return [e for e in self.emails if e.author == author]
@@ -198,48 +184,52 @@ class EpsteinFiles:
          else:
              return [e for e in self.emails if author in e.recipients]

-     def imessage_logs_for(self, author: str | None | list[str | None]) -> Sequence[MessengerLog]:
-         if author in [EVERYONE, JEFFREY_EPSTEIN]:
-             return self.imessage_logs
+     def get_documents_by_id(self, file_ids: list[str]) -> list[Document]:
+         docs = [doc for doc in self.all_documents() if doc.file_id in file_ids]
+
+         if len(docs) != len(file_ids):
+             logger.warning(f"{len(file_ids)} file IDs provided but only {len(docs)} Epstein files found!")

-         authors = author if isinstance(author, list) else [author]
-         return [log for log in self.imessage_logs if log.author in authors]
+         return docs
+
+     def imessage_logs_for(self, author: str | None | list[str | None]) -> Sequence[MessengerLog]:
+         return MessengerLog.logs_for(author, self.imessage_logs)

      def identified_imessage_log_count(self) -> int:
          return len([log for log in self.imessage_logs if log.author])

-     def imessage_sender_counts(self) -> dict[str | None, int]:
-         sender_counts: dict[str | None, int] = defaultdict(int)
+     def json_metadata(self) -> str:
+         metadata = {
+             EMAIL_CLASS: [json_safe(d.metadata()) for d in self.emails],
+             JSON_FILE_CLASS: [json_safe(d.metadata()) for d in self.json_files],
+             MESSENGER_LOG_CLASS: [json_safe(d.metadata()) for d in self.imessage_logs],
+             OTHER_FILE_CLASS: [json_safe(d.metadata()) for d in self.other_files if not isinstance(d, JsonFile)],
+         }

-         for message_log in self.imessage_logs:
-             for message in message_log.messages():
-                 sender_counts[message.author] += 1
+         return json.dumps(metadata, indent=4, sort_keys=True)

-         return sender_counts
+     def non_json_other_files(self) -> list[OtherFile]:
+         return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]

      def print_files_summary(self) -> None:
-         dupes = defaultdict(int)
-
-         for doc in self.all_documents():
-             if doc.is_duplicate:
-                 dupes[doc.document_type()] += 1
-
-         table = Table()
+         table = Table(title='Summary of Document Types')
          add_cols_to_table(table, ['File Type', 'Files', 'Author Known', 'Author Unknown', 'Duplicates'])

-         def add_row(label: str, docs: list, known: int | None = None, dupes: int | None = None):
+         def add_row(label: str, docs: list):
+             known = None if isinstance(docs[0], JsonFile) else len([d for d in docs if d.author])
+
              table.add_row(
                  label,
                  f"{len(docs):,}",
-                 f"{known:,}" if known else NA_TXT,
-                 f"{len(docs) - known:,}" if known else NA_TXT,
-                 f"{dupes:,}" if dupes else NA_TXT,
+                 f"{known:,}" if known is not None else NA_TXT,
+                 f"{len(docs) - known:,}" if known is not None else NA_TXT,
+                 f"{len([d for d in docs if d.is_duplicate])}",
              )

-         add_row('iMessage Logs', self.imessage_logs, self.identified_imessage_log_count())
-         add_row('Emails', self.emails, len([e for e in self.emails if e.author]), dupes[EMAIL_CLASS])
-         add_row('JSON Data', self.json_files, dupes=0)
-         add_row('Other', self.other_files, dupes=dupes[OTHER_FILE_CLASS])
+         add_row('iMessage Logs', self.imessage_logs)
+         add_row('Emails', self.emails)
+         add_row('JSON Data', self.json_files)
+         add_row('Other', self.non_json_other_files())
          console.print(Align.center(table))
          console.line()

@@ -247,10 +237,11 @@ class EpsteinFiles:
          """Print complete emails to or from a particular 'author'. Returns the Emails that were printed."""
          conversation_length = self.email_conversation_length_in_days(_author)
          emails = self.emails_for(_author)
+         unique_emails = [email for email in emails if not email.is_duplicate]
          author = _author or UNKNOWN

          print_author_header(
-             f"Found {len(emails)} {author} emails starting {emails[0].timestamp.date()} over {conversation_length:,} days",
+             f"Found {len(unique_emails)} {author} emails starting {emails[0].timestamp.date()} over {conversation_length:,} days",
              get_style_for_name(author),
              get_info_for_name(author)
          )
@@ -271,28 +262,9 @@ class EpsteinFiles:

          return emails

-     def print_emails_table_for(self, _author: str | None) -> None:
-         emails = [email for email in self.emails_for(_author) if not email.is_duplicate] # Remove dupes
-         author = _author or UNKNOWN
-
-         table = Table(
-             title=f"Emails to/from {author} starting {emails[0].timestamp.date()}",
-             border_style=get_style_for_name(author, allow_bold=False),
-             header_style="bold"
-         )
-
-         table.add_column('From', justify='left')
-         table.add_column('Timestamp', justify='center')
-         table.add_column('Subject', justify='left', style='honeydew2', min_width=60)
-
-         for email in emails:
-             table.add_row(
-                 email.author_txt,
-                 email.epstein_media_link(link_txt=email.timestamp_without_seconds()),
-                 highlighter(email.subject())
-             )
-
-         console.print(Align.center(table), '\n')
+     def print_emails_table_for(self, author: str | None) -> None:
+         emails = [email for email in self.emails_for(author) if not email.is_duplicate] # Remove dupes
+         console.print(Align.center(Email.build_table(emails, author)), '\n')

      def print_email_device_info(self) -> None:
          print_panel(f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown", padding=(4, 0, 0, 0), centered=True)
@@ -300,13 +272,13 @@ class EpsteinFiles:
          console.print(build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))

      def print_emailer_counts_table(self) -> None:
-         footer = f"Identified authors of {self.attributed_email_count()} emails out of {len(self.emails)} potential email files."
+         footer = f"Identified authors of {self.attributed_email_count():,} emails out of {len(self.emails):,}."
          counts_table = Table(title=f"Email Counts", caption=footer, header_style="bold")
          add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_WEB, 'Twitter'])

          emailer_counts = {
-             e: self.email_author_counts[e] + self.email_recipient_counts[e]
-             for e in self.all_emailers(True)
+             emailer: self.email_author_counts[emailer] + self.email_recipient_counts[emailer]
+             for emailer in self.all_emailers(True)
          }

          for p, count in sort_dict(emailer_counts):
@@ -326,76 +298,50 @@ class EpsteinFiles:

      def print_imessage_summary(self) -> None:
          """Print summary table and stats for text messages."""
-         counts_table = Table(title="Text Message Counts By Author", header_style="bold")
-         counts_table.add_column(AUTHOR.title(), justify='left', style="steel_blue bold", width=30)
-         counts_table.add_column('Files', justify='right', style='white')
-         counts_table.add_column("Msgs", justify='right')
-         counts_table.add_column('First Sent At', justify='center', highlight=True, width=21)
-         counts_table.add_column('Last Sent At', justify='center', style='wheat4', width=21)
-         counts_table.add_column('Days', justify='right', style='dim')
-
-         for name, count in sort_dict(self.imessage_sender_counts()):
-             logs = self.imessage_logs_for(name)
-             first_at = logs[0].first_message_at(name)
-             last_at = logs[-1].first_message_at(name)
-
-             counts_table.add_row(
-                 Text(name or UNKNOWN,
-                 get_style_for_name(name)),
-                 str(len(logs)),
-                 f"{count:,}",
-                 iso_timestamp(first_at),
-                 iso_timestamp(last_at),
-                 str((last_at - first_at).days + 1),
-             )
-
-         console.print(counts_table)
+         console.print(MessengerLog.summary_table(self.imessage_logs))
          text_summary_msg = f"\nDeanonymized {self.identified_imessage_log_count()} of "
-         text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files)} files."
+         text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files):,} files."
          console.print(text_summary_msg)
          imessage_msg_count = sum([len(log.messages()) for log in self.imessage_logs])
-         console.print(f"Found {imessage_msg_count} total text messages in {len(self.imessage_logs)} conversations.")
-         console.print(f"(Last deploy found 4668 messages in 77 conversations)", style='dim')
+         console.print(f"Found {imessage_msg_count} text messages in {len(self.imessage_logs)} iMessage log files.")

      def print_other_files_table(self) -> list[OtherFile]:
-         """Returns the OtherFiles that were interesting enough to print."""
+         """Returns the OtherFile objects that were interesting enough to print."""
          interesting_files = [doc for doc in self.other_files if args.all_other_files or doc.is_interesting()]
          header_pfx = '' if args.all_other_files else 'Selected '
          print_section_header(f"{FIRST_FEW_LINES} of {len(interesting_files)} {header_pfx}Files That Are Neither Emails Nor Text Msgs")

          if not args.all_other_files:
-             print_centered(f"(the other site is uncurated and has all {len(self.other_files)} unclassifiable files and all {len(self.emails):,} emails)", style='dim')
+             print_centered(f"(the other site is uncurated and has all {len(self.other_files)} unclassifiable files and {len(self.emails):,} emails)", style='dim')
              print_other_site_link(False)
              console.line(2)

-         table = Table(header_style='bold', show_lines=True)
-         table.add_column('File', justify='center', width=FILENAME_LENGTH)
-         table.add_column('Date', justify='center')
-         table.add_column('Length', justify='center')
-         table.add_column(FIRST_FEW_LINES, justify='left', style='pale_turquoise4')
+         console.print(OtherFile.build_table(interesting_files))
+         skipped_file_count = len(self.other_files) - len(interesting_files)

-         for doc in interesting_files:
-             link_and_info = [doc.raw_document_link_txt(), *doc.hints()]
-             date_str = doc.date_str()
+         if skipped_file_count > 0:
+             logger.warning(f"Skipped {skipped_file_count} uninteresting files...")

-             if doc.is_duplicate:
-                 preview_text = doc.duplicate_file_txt()
-                 row_style = ' dim'
-             else:
-                 preview_text = doc.highlighted_preview_text()
-                 row_style = ''
+         return interesting_files

-             table.add_row(
-                 Group(*link_and_info),
-                 Text(date_str, style=TIMESTAMP_DIM) if date_str else QUESTION_MARK_TXT,
-                 doc.file_size_str(),
-                 preview_text,
-                 style=row_style
-             )
+     def _tally_email_data(self) -> None:
+         """Tally up summary info about Email objects."""
+         for email in self.emails:
+             if email.is_duplicate:
+                 continue

-         console.print(table)
-         logger.warning(f"Skipped {len(self.other_files) - len(interesting_files)} uninteresting files...")
-         return interesting_files
+             self.email_author_counts[email.author] += 1
+
+             if len(email.recipients) == 0:
+                 self.unknown_recipient_email_ids.add(email.file_id)
+                 self.email_recipient_counts[None] += 1
+             else:
+                 for recipient in email.recipients:
+                     self.email_recipient_counts[recipient] += 1
+
+             if email.sent_from_device:
+                 self.email_authors_to_device_signatures[email.author_or_unknown()].add(email.sent_from_device)
+                 self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())


  def build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
@@ -413,18 +359,6 @@ def build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str]
      return Padding(table, DEVICE_SIGNATURE_PADDING)


- def is_ok_for_epstein_web(name: str | None) -> bool:
-     """Return True if it's likely that EpsteinWeb has a page for this name."""
-     if name is None or ' ' not in name:
-         return False
-     elif '@' in name or '/' in name or '??' in name:
-         return False
-     elif name in INVALID_FOR_EPSTEIN_WEB:
-         return False
-
-     return True
-
-
  def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
      counts: dict[str | None, int] = defaultdict(int)

@@ -435,3 +369,28 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
              counts[None] += 1

      return counts
+
+
+ def document_cls(document: Document) -> Type[Document]:
+     search_area = document.text[0:5000] # Limit search area to avoid pointless scans of huge files
+
+     if document.text[0] == '{':
+         return JsonFile
+     elif isinstance(document.config, EmailCfg) or DETECT_EMAIL_REGEX.match(search_area):
+         return Email
+     elif MSG_REGEX.search(search_area):
+         return MessengerLog
+     else:
+         return OtherFile
+
+
+ def is_ok_for_epstein_web(name: str | None) -> bool:
+     """Return True if it's likely that EpsteinWeb has a page for this name."""
+     if name is None or ' ' not in name:
+         return False
+     elif '@' in name or '/' in name or '??' in name:
+         return False
+     elif name in INVALID_FOR_EPSTEIN_WEB:
+         return False
+
+     return True
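
The new document_cls() dispatcher above replaces the 1.0.0 if/elif chain in __post_init__ and also reorders the checks (JSON, then email, then iMessage transcript, then fallback). The standalone sketch below mirrors that ordering with hypothetical stand-in regexes, since the package's DETECT_EMAIL_REGEX and MSG_REGEX are defined elsewhere and the document.config / EmailCfg branch is omitted.

    # Standalone sketch of the classification order used by document_cls() above.
    # EMAIL_REGEX_STANDIN and MSG_REGEX_STANDIN are hypothetical stand-ins for the
    # package's DETECT_EMAIL_REGEX and MSG_REGEX patterns.
    import re

    EMAIL_REGEX_STANDIN = re.compile(r'^(From|To|Subject):', re.MULTILINE)
    MSG_REGEX_STANDIN = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}', re.MULTILINE)

    def classify(text: str) -> str:
        search_area = text[0:5000]  # same cap applied above to avoid scanning huge files

        if text[0] == '{':
            return 'JsonFile'
        elif EMAIL_REGEX_STANDIN.match(search_area):
            return 'Email'
        elif MSG_REGEX_STANDIN.search(search_area):
            return 'MessengerLog'
        return 'OtherFile'

    assert classify('{"doc_id": 1}') == 'JsonFile'
    assert classify('From: someone@example.com\nSubject: hi') == 'Email'
    assert classify('2019-07-08 14:02 someone: hello') == 'MessengerLog'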
@@ -1,6 +1,5 @@
  from epstein_files.util.constant.strings import QUESTION_MARKS, remove_question_marks

-
  UNKNOWN = '(unknown)'

  # Texting Names
@@ -170,6 +169,7 @@ ZUBAIR_KHAN = 'Zubair Khan'

  # No communications but name is in the files
  BILL_GATES = 'Bill Gates'
+ DONALD_TRUMP = 'Donald Trump'
  ELON_MUSK = 'Elon Musk'
  HENRY_HOLT = 'Henry Holt' # Actually a company?
  IVANKA = 'Ivanka'
@@ -184,15 +184,22 @@ TULSI_GABBARD = 'Tulsi Gabbard'
  VIRGINIA_GIUFFRE = 'Virginia Giuffre'

  # Organizations
+ BOFA = 'BofA'
  CNN = 'CNN'
  DEUTSCHE_BANK = 'Deutsche Bank'
+ ELECTRON_CAPITAL_PARTNERS = 'Electron Capital Partners'
  GOLDMAN_SACHS = 'Goldman Sachs'
+ GOLDMAN_INVESTMENT_MGMT = f'{GOLDMAN_SACHS} Investment Management Division'
  HARVARD = 'Harvard'
  INSIGHTS_POD = f"InsightsPod" # Zubair bots
+ NEXT_MANAGEMENT = 'Next Management LLC'
  JP_MORGAN = 'JP Morgan'
  OSBORNE_LLP = f"{IAN_OSBORNE} & Partners LLP" # Ian Osborne's PR firm
+ TRUMP_ORG = 'Trump Organization'
+ UBS = 'UBS'

  # Locations
+ PALM_BEACH = 'Palm Beach'
  VIRGIN_ISLANDS = 'Virgin Islands'

  # First and last names that should be made part of a highlighting regex for emailers
@@ -0,0 +1,29 @@
+ from pathlib import Path
+
+ PICKLED_PATH = Path("the_epstein_files.pkl.gz")
+
+ EPSTEIN_FILES_NOV_2025 = 'epstein_files_nov_2025'
+ URLS_ENV = '.urls.env'
+
+ HTML_DIR = Path('docs')
+ ALL_EMAILS_PATH = HTML_DIR.joinpath(f'all_emails_{EPSTEIN_FILES_NOV_2025}.html')
+ JSON_METADATA_PATH = HTML_DIR.joinpath(f'file_metadata_{EPSTEIN_FILES_NOV_2025}.json')
+ TEXT_MSGS_HTML_PATH = HTML_DIR.joinpath('index.html')
+ WORD_COUNT_HTML_PATH = HTML_DIR.joinpath(f'communication_word_count_{EPSTEIN_FILES_NOV_2025}.html')
+ # EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
+
+ BUILD_ARTIFACTS = [
+     ALL_EMAILS_PATH,
+     # EPSTEIN_WORD_COUNT_HTML_PATH,
+     JSON_METADATA_PATH,
+     TEXT_MSGS_HTML_PATH,
+     WORD_COUNT_HTML_PATH,
+ ]
+
+
+ def make_clean() -> None:
+     """Delete all build artifacts."""
+     for build_file in BUILD_ARTIFACTS:
+         if build_file.exists():
+             print(f"Removing build file '{build_file}'...")
+             build_file.unlink()
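
The new output_files module above centralizes the build artifact paths. Assuming the installed package exposes the module at the path shown in the file listing, its make_clean() helper could be invoked directly, for example:

    # Deletes whichever of the BUILD_ARTIFACTS defined above currently exist.
    from epstein_files.util.constant.output_files import make_clean

    make_clean()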
@@ -9,6 +9,27 @@ JSON_FILE_CLASS = 'JsonFile'
  MESSENGER_LOG_CLASS = 'MessengerLog'
  OTHER_FILE_CLASS = 'OtherFile'

+ # categories
+ ACADEMIA = 'academia'
+ ARTS = 'arts'
+ ARTICLE = 'article'
+ BOOK = 'book'
+ BUSINESS = 'business'
+ CONFERENCE = 'conference'
+ ENTERTAINER = 'entertainer'
+ FINANCE = 'finance'
+ FLIGHT_LOGS = 'flight logs'
+ JOURNALIST = 'journalist'
+ JUNK = 'junk'
+ LEGAL = 'legal'
+ LOBBYIST = 'lobbyist'
+ POLITICS = 'politics'
+ PROPERTY = 'property'
+ PUBLICIST = 'publicist'
+ REPUTATION = 'reputation'
+ SOCIAL = 'social'
+ SPEECH = 'speech'
+
  # Publications
  BBC = 'BBC'
  BLOOMBERG = 'Bloomberg'
@@ -36,11 +57,17 @@ TIMESTAMP_DIM = f"turquoise4 dim"
  AUTHOR = 'author'
  DEFAULT = 'default'
  EVERYONE = 'everyone'
+ FIRST_FEW_LINES = 'First Few Lines'
  HOUSE_OVERSIGHT_PREFIX = 'HOUSE_OVERSIGHT_'
+ JSON = 'json'
  NA = 'n/a'
  REDACTED = '<REDACTED>'
  URL_SIGNIFIERS = ['gclid', 'htm', 'ref=', 'utm']
  QUESTION_MARKS = '(???)'
+
+ # Regexes
+ FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}(\d{{6}}(_\d{{1,2}})?)")
+ FILE_NAME_REGEX = re.compile(fr"{FILE_STEM_REGEX.pattern}(\.txt(\.json)?)?")
  QUESTION_MARKS_REGEX = re.compile(fr' {re.escape(QUESTION_MARKS)}$')

@@ -5,8 +5,9 @@ from typing import Literal
  from inflection import parameterize
  from rich.text import Text

+ from epstein_files.util.constant.output_files import *
  from epstein_files.util.constant.strings import EMAIL, TEXT_MESSAGE, SiteType
- from epstein_files.util.file_helper import coerce_file_stem, filename_for_id
+ from epstein_files.util.file_helper import coerce_file_stem

  # Style stuff
  ARCHIVE_LINK_COLOR = 'slate_blue3'
@@ -20,8 +21,29 @@ EPSTEIN_WEB = 'EpsteinWeb'
  EPSTEINIFY = 'epsteinify'
  JMAIL = 'Jmail'

- # URLs
- ATTRIBUTIONS_URL = 'https://github.com/michelcrypt4d4mus/epstein_text_messages/blob/master/epstein_files/util/constants.py'
+
+ # Deployment URLS
+ # NOTE: don't rename these variables without changing deploy.sh!
+ GH_PAGES_BASE_URL = 'https://michelcrypt4d4mus.github.io'
+ TEXT_MSGS_URL = f"{GH_PAGES_BASE_URL}/epstein_text_messages"
+ ALL_EMAILS_URL = f'{TEXT_MSGS_URL}/{ALL_EMAILS_PATH.name}'
+ JSON_METADATA_URL = f'{TEXT_MSGS_URL}/{JSON_METADATA_PATH.name}'
+ WORD_COUNT_URL = f'{TEXT_MSGS_URL}/{WORD_COUNT_HTML_PATH.name}'
+
+ SITE_URLS: dict[SiteType, str] = {
+     EMAIL: ALL_EMAILS_URL,
+     TEXT_MESSAGE: TEXT_MSGS_URL,
+ }
+
+ GH_PROJECT_URL = 'https://github.com/michelcrypt4d4mus/epstein_text_messages'
+ GH_MASTER_URL = f"{GH_PROJECT_URL}/blob/master"
+ ATTRIBUTIONS_URL = f'{GH_MASTER_URL}/epstein_files/util/constants.py'
+ EXTRACTS_BASE_URL = f'{GH_MASTER_URL}/emails_extracted_from_legal_filings'
+
+ extracted_file_url = lambda f: f"{EXTRACTS_BASE_URL}/{f}"
+
+
+ # External URLs
  COFFEEZILLA_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=061ce61c9e70bdfd'
  COURIER_NEWSROOM_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=092314e384a58618'
  EPSTEINIFY_URL = 'https://epsteinify.com'
@@ -31,12 +53,6 @@ JMAIL_URL = 'https://jmail.world'
  OVERSIGHT_REPUBLICANS_PRESSER_URL = 'https://oversight.house.gov/release/oversight-committee-releases-additional-epstein-estate-documents/'
  RAW_OVERSIGHT_DOCS_GOOGLE_DRIVE_URL = 'https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_'
  SUBSTACK_URL = 'https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great'
- WORD_COUNT_URL = 'https://michelcrypt4d4mus.github.io/epstein_text_messages/epstein_emails_word_count.html'
-
- SITE_URLS: dict[SiteType, str] = {
-     EMAIL: 'https://michelcrypt4d4mus.github.io/epstein_emails_house_oversight/',
-     TEXT_MESSAGE: 'https://michelcrypt4d4mus.github.io/epstein_text_messages/',
- }

  DOC_LINK_BASE_URLS: dict[ExternalSite, str] = {
      EPSTEIN_MEDIA: f"{EPSTEIN_MEDIA_URL}/files",