PyPI - epstein-files - Versions diffs - 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl - Mend

epstein-files 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

epstein_files/__init__.py +66 -131
epstein_files/documents/document.py +12 -3
epstein_files/documents/email.py +33 -13
epstein_files/documents/imessage/text_message.py +11 -15
epstein_files/documents/messenger_log.py +15 -11
epstein_files/documents/other_file.py +13 -8
epstein_files/epstein_files.py +51 -43
epstein_files/util/constant/names.py +21 -24
epstein_files/util/constant/output_files.py +29 -0
epstein_files/util/constant/strings.py +8 -2
epstein_files/util/constant/urls.py +11 -7
epstein_files/util/constants.py +325 -227
epstein_files/util/data.py +12 -33
epstein_files/util/doc_cfg.py +7 -14
epstein_files/util/env.py +5 -3
epstein_files/util/file_helper.py +0 -22
epstein_files/util/highlighted_group.py +31 -26
epstein_files/util/logging.py +7 -0
epstein_files/util/output.py +179 -0
epstein_files/util/rich.py +22 -10
{epstein_files-1.0.1.dist-info → epstein_files-1.0.3.dist-info}/METADATA +32 -7
epstein_files-1.0.3.dist-info/RECORD +33 -0
epstein_files-1.0.3.dist-info/entry_points.txt +7 -0
epstein_files-1.0.1.dist-info/RECORD +0 -30
{epstein_files-1.0.1.dist-info → epstein_files-1.0.3.dist-info}/LICENSE +0 -0
{epstein_files-1.0.1.dist-info → epstein_files-1.0.3.dist-info}/WHEEL +0 -0

epstein_files/__init__.py CHANGED Viewed

@@ -4,73 +4,30 @@ Reformat Epstein text message files for readability and count email senders.
 For use with iMessage log files from https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_
 Install: 'poetry install'
-    Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT ./generate.py'
+    Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT epstein_generate'
 """
 from sys import exit
 from dotenv import load_dotenv
 load_dotenv()
+from rich.markup import escape
 from rich.padding import Padding
+from rich.panel import Panel
+from epstein_files.epstein_files import EpsteinFiles, document_cls
+from epstein_files.documents.document import INFO_PADDING, Document
 from epstein_files.documents.email import Email
-from epstein_files.documents.messenger_log import  MessengerLog
-from epstein_files.epstein_files import EpsteinFiles, count_by_month
 from epstein_files.util.constant.html import *
 from epstein_files.util.constant.names import *
-from epstein_files.util.constant.strings import EMAIL_CLASS, MESSENGER_LOG_CLASS
-from epstein_files.util.data import dict_sets_to_lists
+from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, TEXT_MSGS_HTML_PATH, make_clean
 from epstein_files.util.env import args, specified_names
-from epstein_files.util.file_helper import GH_PAGES_HTML_PATH, JSON_METADATA_PATH, make_clean
+from epstein_files.util.file_helper import coerce_file_path, extract_file_id
 from epstein_files.util.logging import logger
-from epstein_files.util.rich import *
+from epstein_files.util.output import print_emails, print_json_metadata, print_json_stats, print_text_messages, write_urls
+from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
 from epstein_files.util.timer import Timer
-PRINT_COLOR_KEY_EVERY_N_EMAILS = 150
-# Order matters. Default names to print emails for.
-DEFAULT_EMAILERS = [
-    JEREMY_RUBIN,
-    AL_SECKEL,
-    JOI_ITO,
-    JABOR_Y,
-    STEVEN_SINOFSKY,
-    DANIEL_SIAD,
-    JEAN_LUC_BRUNEL,
-    STEVEN_HOFFENBERG,
-    EHUD_BARAK,
-    MARTIN_NOWAK,
-    MASHA_DROKOVA,
-    RENATA_BOLOTOVA,
-    STEVE_BANNON,
-    OLIVIER_COLOM,
-    BORIS_NIKOLIC,
-    PRINCE_ANDREW,
-    JIDE_ZEITLIN,
-    DAVID_STERN,
-    MOHAMED_WAHEED_HASSAN,
-    JENNIFER_JACQUET,
-    None,
-]
-# Order matters. Default names to print tables w/email subject, timestamp, etc for.
-# TODO: get rid of this
-DEFAULT_EMAILER_TABLES: list[str | None] = [
-    GHISLAINE_MAXWELL,
-    LEON_BLACK,
-    LANDON_THOMAS,
-    KATHRYN_RUEMMLER,
-    DARREN_INDYKE,
-    RICHARD_KAHN,
-    TYLER_SHEARS,
-    SULTAN_BIN_SULAYEM,
-    DEEPAK_CHOPRA,
-    ARIANE_DE_ROTHSCHILD,
-    TOM_PRITZKER,
-]
-if len(set(DEFAULT_EMAILERS).intersection(set(DEFAULT_EMAILER_TABLES))) > 0:
-    raise RuntimeError(f"Some names appear in both DEFAULT_EMAILERS and DEFAULT_EMAILER_TABLES")
 def generate_html() -> None:
     if args.make_clean:
@@ -81,15 +38,7 @@ def generate_html() -> None:
     epstein_files = EpsteinFiles.get_files(timer)
     if args.json_metadata:
-        json_str = epstein_files.json_metadata()
-        if args.build:
-            with open(JSON_METADATA_PATH, 'w') as f:
-                f.write(json_str)
-                timer.print_at_checkpoint(f"Wrote {file_size_str(JSON_METADATA_PATH)} to '{JSON_METADATA_PATH}'")
-        else:
-            console.print_json(json_str, indent=4, sort_keys=True)
+        print_json_metadata(epstein_files)
         exit()
     print_header(epstein_files)
@@ -98,11 +47,11 @@ def generate_html() -> None:
         exit()
     if args.output_texts:
-        _print_text_messages(epstein_files)
+        print_text_messages(epstein_files)
         timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)} text message logs')
     if args.output_emails:
-        emails_printed = _print_emails(epstein_files)
+        emails_printed = print_emails(epstein_files)
         timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")
     if args.output_other_files:
@@ -110,93 +59,79 @@ def generate_html() -> None:
         timer.print_at_checkpoint(f"Printed {len(files_printed)} other files")
     # Save output
-    write_html(GH_PAGES_HTML_PATH)
+    write_html(ALL_EMAILS_PATH if args.all_emails else TEXT_MSGS_HTML_PATH)
     logger.warning(f"Total time: {timer.seconds_since_start_str()}")
     # JSON stats (mostly used for building pytest checks)
     if args.json_stats:
-        console.line(5)
-        _print_json_stats(epstein_files)
+        print_json_stats(epstein_files)
-def _print_emails(epstein_files: EpsteinFiles) -> int:
-    """Returns number of emails printed."""
-    print_section_header(('Selections from ' if not args.all_emails else '') + 'His Emails')
-    print_other_site_link(is_header=False)
+def epstein_diff():
+    """Diff the cleaned up text of two files."""
+    Document.diff_files(args.positional_args)
-    if len(specified_names) == 0:
-        epstein_files.print_emailer_counts_table()
-    emailers_to_print: list[str | None]
-    emailer_tables: list[str | None] = []
-    already_printed_emails: list[Email] = []
-    num_emails_printed_since_last_color_key = 0
+def epstein_search():
+    """Search the cleaned up text of the files."""
+    _assert_positional_args()
+    epstein_files = EpsteinFiles.get_files(use_pickled=True)
-    if args.all_emails:
-        console.print('Email conversations are sorted chronologically based on time of the first email.')
-        emailers_to_print = sorted(epstein_files.all_emailers(), key=lambda e: epstein_files.earliest_email_at(e))
-        print_numbered_list_of_emailers(emailers_to_print, epstein_files)
-    else:
-        emailers_to_print = specified_names if specified_names else DEFAULT_EMAILERS
-        console.print('Email conversations grouped by counterparty can be found in the order listed below.')
-        print_numbered_list_of_emailers(emailers_to_print)
-        console.print("\nAfter that there's tables linking to (but not displaying) all known emails for each of these people:")
-        if len(specified_names) > 0:
-            print_numbered_list_of_emailers(DEFAULT_EMAILER_TABLES)
+    for search_term in args.positional_args:
+        temp_highlighter = build_highlighter(search_term)
+        search_results = epstein_files.docs_matching(search_term, specified_names)
+        console.line(2)
+        print_panel(f"Found {len(search_results)} documents matching '{search_term}'", padding=(0, 0, 0, 3))
-    for author in emailers_to_print:
-        newly_printed_emails = epstein_files.print_emails_for(author)
-        already_printed_emails.extend(newly_printed_emails)
-        num_emails_printed_since_last_color_key += len(newly_printed_emails)
+        for search_result in search_results:
+            console.line()
-        # Print color key every once in a while
-        if num_emails_printed_since_last_color_key > PRINT_COLOR_KEY_EVERY_N_EMAILS:
-            print_color_key()
-            num_emails_printed_since_last_color_key = 0
+            if args.whole_file:
+                if isinstance(search_result.document, Email):
+                    search_result.document.truncation_allowed = False
-    if not specified_names:
-        if not args.all_emails:
-            print_author_header(f"Email Tables for {len(emailer_tables)} Other People", 'white')
+                console.print(search_result.document)
+            else:
+                console.print(search_result.document.description_panel())
-            for name in DEFAULT_EMAILER_TABLES:
-                epstein_files.print_emails_table_for(name)
+                for matching_line in search_result.lines:
+                    line_txt = matching_line.__rich__()
+                    console.print(Padding(temp_highlighter(line_txt), INFO_PADDING), style='gray37')
-        epstein_files.print_email_device_info()
-    # Check that all emails were actually printed
-    if args.all_emails:
-        email_ids_that_were_printed = set([email.file_id for email in already_printed_emails])
-        logger.warning(f"Printed {len(already_printed_emails)} emails of {len(email_ids_that_were_printed)} unique file IDs.")
+def epstein_show():
+    """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
+    _assert_positional_args()
+    ids = [extract_file_id(arg) for arg in args.positional_args]
+    console.line()
-        for email in epstein_files.emails:
-            if email.file_id not in email_ids_that_were_printed and not email.is_duplicate:
-                logger.warning(f"Failed to print {email.summary()}")
+    if args.pickled:
+        epstein_files = EpsteinFiles.get_files(use_pickled=True)
+        docs = epstein_files.get_documents_by_id(ids)
+    else:
+        raw_docs = [Document(coerce_file_path(id)) for id in ids]
+        docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
-    logger.warning(f"Rewrote {len(Email.rewritten_header_ids)} headers of {len(epstein_files.emails)} emails")
-    return len(already_printed_emails)
+    for doc in docs:
+        console.line()
+        console.print(doc)
+        if args.raw:
+            console.line()
+            console.print(Panel(f"*** {doc.url_slug} RAW ***", expand=False, style=doc._border_style()))
+            console.print(escape(doc.raw_text()))
-def _print_text_messages(epstein_files: EpsteinFiles) -> None:
-    print_section_header('Text Messages')
-    print_centered("(conversations are sorted chronologically based on timestamp of first message)\n", style='gray30')
-    authors: list[str | None] = specified_names if specified_names else [JEFFREY_EPSTEIN]
-    log_files = epstein_files.imessage_logs_for(authors)
+            if isinstance(doc, Email):
+                console.line()
+                console.print(Panel(f"*** {doc.url_slug} actual_text ***", expand=False, style=doc._border_style()))
+                console.print(escape(doc._actual_text()))
-    for log_file in log_files:
-        console.print(Padding(log_file))
-        console.line(2)
-    epstein_files.print_imessage_summary()
+def epstein_dump_urls() -> None:
+    write_urls()
-def _print_json_stats(epstein_files: EpsteinFiles) -> None:
-    console.print(Panel('JSON Stats Dump', expand=True, style='reverse bold'), '\n')
-    print_json(f"{MESSENGER_LOG_CLASS} Sender Counts", MessengerLog.count_authors(epstein_files.imessage_logs), skip_falsey=True)
-    print_json(f"{EMAIL_CLASS} Author Counts", epstein_files.email_author_counts, skip_falsey=True)
-    print_json(f"{EMAIL_CLASS} Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
-    print_json("Email signature_substitution_countss", epstein_files.email_signature_substitution_counts(), skip_falsey=True)
-    print_json("email_author_device_signatures", dict_sets_to_lists(epstein_files.email_authors_to_device_signatures))
-    print_json("email_sent_from_devices", dict_sets_to_lists(epstein_files.email_device_signatures_to_authors))
-    print_json("email_unknown_recipient_file_ids", epstein_files.email_unknown_recipient_file_ids())
-    print_json("count_by_month", count_by_month(epstein_files.all_documents()))
+def _assert_positional_args():
+    if not args.positional_args:
+        console.print(f"\n  ERROR: No positional args!\n", style='red1')
+        exit(1)

epstein_files/documents/document.py CHANGED Viewed

@@ -15,7 +15,7 @@ from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
 from epstein_files.util.constant.urls import *
 from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
-from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_nones
+from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_falsey
 from epstein_files.util.doc_cfg import EmailCfg, DocCfg, Metadata, TextCfg
 from epstein_files.util.env import args
 from epstein_files.util.file_helper import (DOCS_DIR, file_stem_for_id, extract_file_id, file_size,
@@ -159,7 +159,7 @@ class Document:
         if hint_msg:
             hints.append(highlighter(Text(hint_msg, style='white dim italic')))
-        return without_nones(hints)
+        return without_falsey(hints)
     def info_txt(self) -> Text | None:
         """Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
@@ -255,7 +255,11 @@ class Document:
             txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
         txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
-        txt.append(", ").append(key_value_txt('lines', Text(f"{self.num_lines}", style='cyan')))
+        txt.append(", ").append(key_value_txt('lines', self.num_lines))
+        if self.config and self.config.dupe_of_id:
+            txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.dupe_of_id, style='magenta')))
         return txt
     def top_lines(self, n: int = 10) -> str:
@@ -352,6 +356,11 @@ class Document:
         for f in tmpfiles:
             f.unlink()
+    @staticmethod
+    def known_author_count(docs: Sequence['Document']) -> int:
+        """Count of how many Document objects have an author attribution."""
+        return len([doc for doc in docs if doc.author])
     @staticmethod
     def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
         return sorted(docs, key=lambda doc: doc.sort_key())

epstein_files/documents/email.py CHANGED Viewed

@@ -30,7 +30,6 @@ BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communicati
 DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
 LINK_LINE_REGEX = re.compile(f"^(> )?htt")
 QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
-REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + ['********************************']
 REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
 BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
@@ -39,10 +38,16 @@ TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")
 SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
 REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
+IS_JUNK_MAIL = 'is_junk_mail'
 MAX_CHARS_TO_PRINT = 4000
 MAX_NUM_HEADER_LINES = 14
 MAX_QUOTED_REPLIES = 2
+REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
+    '********************************',
+    'Begin forwarded message',
+]
 OCR_REPAIRS: dict[str | re.Pattern, str] = {
     re.compile(r'grnail\.com'): 'gmail.com',
     re.compile(r"^(From|To)(: )?[_1.]{5,}", re.MULTILINE): rf"\1: {REDACTED}",  # Redacted email addresses
@@ -119,6 +124,7 @@ EMAIL_SIGNATURE_REGEXES = {
 # Invalid for links to EpsteinWeb
 JUNK_EMAILERS = [
     'asmallworld@travel.asmallworld.net',
+    "digest-noreply@quora.com",
     'editorialstaff@flipboard.com',
     'How To Academy',
     'Jokeland',
@@ -126,9 +132,13 @@ JUNK_EMAILERS = [
     'Saved by Internet Explorer 11',
 ]
-TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + [
-    'Alan S Halperin',
+MAILING_LISTS = [
+    INTELLIGENCE_SQUARED,
     'middle.east.update@hotmail.com',
+]
+TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
+    'Alan S Halperin',
     'Mitchell Bard',
     'Skip Rimer',
 ]
@@ -281,7 +291,7 @@ SELF_EMAILS_FILE_IDS = [
 ]
 METADATA_FIELDS = [
-    'is_junk_mail',
+    IS_JUNK_MAIL,
     'recipients',
     'sent_from_device',
 ]
@@ -294,7 +304,6 @@ class Email(Communication):
         actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
         config (EmailCfg | None) - manual config for this email (if it exists)
         header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
-        is_junk_mail (bool) - True if this is junk mail
         recipients (list[str | None]) - who this email was sent to
         sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
         signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
@@ -302,17 +311,16 @@ class Email(Communication):
     actual_text: str = field(init=False)
     config: EmailCfg | None = None
     header: EmailHeader = field(init=False)
-    is_junk_mail: bool = False
     recipients: list[str | None] = field(default_factory=list)
     sent_from_device: str | None = None
     signature_substitution_counts: dict[str, int] = field(default_factory=dict)  # defaultdict breaks asdict :(
+    truncation_allowed: bool = True
     # For logging how many headers we prettified while printing, kind of janky
     rewritten_header_ids: ClassVar[set[str]] = set([])
     def __post_init__(self):
         super().__post_init__()
-        self.is_junk_mail = self.author in JUNK_EMAILERS
         if self.config and self.config.recipients:
             self.recipients = cast(list[str | None], self.config.recipients)
@@ -331,9 +339,17 @@ class Email(Communication):
         txt = Text("OCR text of email from ", style='grey46').append(self.author_txt).append(' to ')
         return txt.append(self._recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))
+    def is_fwded_article(self) -> bool:
+        return bool(self.config and self.config.is_fwded_article)
+    def is_junk_mail(self) -> bool:
+        return self.author in JUNK_EMAILERS or self.author in MAILING_LISTS
     def metadata(self) -> Metadata:
+        local_metadata = asdict(self)
+        local_metadata[IS_JUNK_MAIL] = self.is_junk_mail()
         metadata = super().metadata()
-        metadata.update({k: v for k, v in asdict(self).items() if v and k in METADATA_FIELDS})
+        metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
         return metadata
     def subject(self) -> str:
@@ -352,17 +368,18 @@ class Email(Communication):
         """The text that comes before likely quoted replies and forwards etc."""
         if self.config and self.config.actual_text is not None:
             return self.config.actual_text
+        text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
+        if self.config and self.config.fwded_text_after:
+            return text.split(self.config.fwded_text_after)[0].strip()
         elif self.header.num_header_rows == 0:
             return self.text
-        text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
         reply_text_match = REPLY_TEXT_REGEX.search(text)
         # logger.info(f"Raw text:\n" + self.top_lines(20) + '\n\n')
         # logger.info(f"With header removed:\n" + text[0:500] + '\n\n')
-        if self.file_id in ['024624']:  # This email starts with "On September 14th"
-            return text.split('On Tue, May 14')[0].strip()
         if reply_text_match:
             actual_num_chars = len(reply_text_match.group(1))
             actual_text_pct = f"{(100 * float(actual_num_chars) / len(text)):.1f}%"
@@ -555,6 +572,9 @@ class Email(Communication):
             self._merge_lines(3, 5)
         elif self.file_id == '028931':
             self._merge_lines(3, 6)
+        elif self.file_id == '013415':
+            for _i in range(2):
+                self._merge_lines(4)
         elif self.file_id in ['033568']:
             for _i in range(5):
                 self._merge_lines(5)
@@ -637,7 +657,7 @@ class Email(Communication):
             num_chars = quote_cutoff
         # Truncate long emails but leave a note explaining what happened w/link to source document
-        if len(text) > num_chars:
+        if len(text) > num_chars and self.truncation_allowed:
             text = text[0:num_chars]
             doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
             trim_note = f"<...trimmed to {num_chars} characters of {self.length}, read the rest at {doc_link_markup}...>"

epstein_files/documents/imessage/text_message.py CHANGED Viewed

@@ -4,7 +4,7 @@ from datetime import datetime
 from rich.text import Text
-from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, STEVE_BANNON, UNKNOWN
+from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, CELINA_DUBIN, EVA, STEVE_BANNON, UNKNOWN
 from epstein_files.util.data import extract_last_name
 from epstein_files.util.highlighted_group import get_style_for_name
 from epstein_files.util.logging import logger
@@ -19,17 +19,18 @@ DISPLAY_LAST_NAME_ONLY = [
     STEVE_BANNON,
 ]
-UNKNOWN_TEXTERS = [
-    '+16463880059',
-    '+13108737937',
-    '+13108802851',
-]
+PHONE_NUMBER_MAPPING = {
+    '+19174393646': ANTHONY_SCARAMUCCI,
+    '+13109906526': STEVE_BANNON,
+    '+16463880059': EVA,
+    '+13108737937': CELINA_DUBIN,
+    '+13108802851': STEVE_BANNON,
+}
 TEXTER_MAPPING = {
     'e:': JEFFREY_EPSTEIN,
     'e:jeeitunes@gmail.com': JEFFREY_EPSTEIN,
-    '+19174393646': ANTHONY_SCARAMUCCI,
-    '+13109906526': STEVE_BANNON,
 }
@@ -37,7 +38,7 @@ TEXTER_MAPPING = {
 class TextMessage:
     """Class representing a single iMessage text message."""
     author: str | None
-    author_str: str = field(init=False)
+    author_str: str | None = None
     id_confirmed: bool = False
     text: str
     timestamp_str: str
@@ -47,14 +48,10 @@ class TextMessage:
         if self.author is None:
             self.author_str = UNKNOWN
-        elif self.author in UNKNOWN_TEXTERS:
-            logger.warning(f"Bad text from '{self.author}': \"{self.text}\"")
-            self.author_str = self.author
-            self.author = None  # TODO: this shouldn't be happening; we still know the author...
         elif self.author in DISPLAY_LAST_NAME_ONLY:
             self.author_str = extract_last_name(self.author)
         else:
-            self.author_str = self.author
+            self.author_str = self.author_str or self.author
         if not self.id_confirmed and self.author is not None and self.author != JEFFREY_EPSTEIN:
             self.author_str = self.author + ' (?)'
@@ -87,7 +84,6 @@ class TextMessage:
         return msg_txt
     def __rich__(self) -> Text:
-        # TODO: Workaround for phone numbers that sucks
         author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
         author_txt = Text(self.author_str, style=author_style)
         timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_STYLE).append(' ')

epstein_files/documents/messenger_log.py CHANGED Viewed

@@ -44,17 +44,8 @@ class MessengerLog(Communication):
     def messages(self) -> list[TextMessage]:
         """Lazily evaluated accessor for self._messages."""
-        if len(self._messages) == 0:
-            self._messages = [
-                TextMessage(
-                    # If the Sender: is redacted that means it's from self.author
-                    author=REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip()) or self.author,
-                    id_confirmed=not self.is_attribution_uncertain(),
-                    text=match.group(4).strip(),
-                    timestamp_str=match.group(2).strip(),
-                )
-                for match in MSG_REGEX.finditer(self.text)
-            ]
+        if not self._messages:
+            self._messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
         return self._messages
@@ -70,6 +61,19 @@ class MessengerLog(Communication):
     def _border_style(self) -> str:
         return self.author_style
+    def _build_message(self, match: re.Match) -> TextMessage:
+        """Turn a regex match into a TextMessage."""
+        author_str = REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip())
+        # If the Sender: is redacted that means it's from self.author
+        return TextMessage(
+            author=self.author if (author_str.startswith('+') or not author_str) else author_str,
+            author_str=author_str if author_str.startswith('+') else None,  # Preserve phone numbers
+            id_confirmed=not self.is_attribution_uncertain(),
+            text=match.group(4).strip(),
+            timestamp_str=match.group(2).strip(),
+        )
     def _extract_timestamp(self) -> datetime:
         for match in MSG_REGEX.finditer(self.text):
             timestamp_str = match.group(2).strip()

epstein_files/documents/other_file.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import re
 import logging
 import warnings
-from dataclasses import dataclass
+from dataclasses import asdict, dataclass
 from datetime import datetime
 import datefinder
@@ -15,7 +15,7 @@ from rich.text import Text
 from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, WHITESPACE_REGEX, Document
 from epstein_files.util.constant.strings import *
 from epstein_files.util.constants import *
-from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg
+from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg, Metadata
 from epstein_files.util.data import escape_single_quotes, remove_timezone, uniquify
 from epstein_files.util.file_helper import FILENAME_LENGTH
 from epstein_files.util.env import args
@@ -83,11 +83,10 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
     NOBEL_CHARITABLE_TRUST,
     'Nautilus',
     'New Yorker',
-    NYT_ARTICLE,
-    NYT_COLUMN,
+    NYT,
     PALM_BEACH_CODE_ENFORCEMENT,
-    PALM_BEACH_DAILY_ARTICLE,
-    PALM_BEACH_POST_ARTICLE,
+    PALM_BEACH_DAILY_NEWS,
+    PALM_BEACH_POST,
     PALM_BEACH_TSV,
     PALM_BEACH_WATER_COMMITTEE,
     PAUL_KRASSNER,
@@ -102,6 +101,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
     SHIMON_POST_ARTICLE,
     SINGLE_PAGE,
     STACEY_PLASKETT,
+    'Tatler',
     TERJE_ROD_LARSEN,
     TEXT_OF_US_LAW,
     TRANSLATION,
@@ -113,7 +113,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
     'U.S. News',
     'US Office',
     'Vanity Fair',
-    VI_DAILY_NEWS_ARTICLE,
+    VI_DAILY_NEWS,
     WAPO,
 ]
@@ -127,7 +127,7 @@ class OtherFile(Document):
         if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
             self.log(f"Creating synthetic config for VI Daily News article...", logging.INFO)
-            self.config = DocCfg(id=self.file_id, description=VI_DAILY_NEWS_ARTICLE, category=ARTICLE)
+            self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
     def category(self) -> str | None:
         return self.config and self.config.category
@@ -175,6 +175,11 @@ class OtherFile(Document):
         return True
+    def metadata(self) -> Metadata:
+        metadata = super().metadata()
+        metadata['is_interesting'] = self.is_interesting()
+        return metadata
     def preview_text(self) -> str:
         return WHITESPACE_REGEX.sub(' ', self.text)[0:PREVIEW_CHARS]

epstein-files 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

epstein-files 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl