PyPI - epstein-files - Versions diffs - 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl - Mend

epstein-files 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

epstein_files/__init__.py +4 -1
epstein_files/documents/document.py +7 -2
epstein_files/documents/email.py +33 -13
epstein_files/documents/imessage/text_message.py +11 -15
epstein_files/documents/messenger_log.py +15 -11
epstein_files/documents/other_file.py +13 -8
epstein_files/epstein_files.py +21 -15
epstein_files/util/constant/names.py +19 -23
epstein_files/util/constant/strings.py +8 -2
epstein_files/util/constant/urls.py +1 -0
epstein_files/util/constants.py +194 -116
epstein_files/util/data.py +1 -1
epstein_files/util/doc_cfg.py +5 -4
epstein_files/util/env.py +3 -2
epstein_files/util/highlighted_group.py +30 -25
epstein_files/util/logging.py +1 -0
epstein_files/util/output.py +8 -9
epstein_files/util/rich.py +6 -1
{epstein_files-1.0.2.dist-info → epstein_files-1.0.4.dist-info}/METADATA +18 -8
epstein_files-1.0.4.dist-info/RECORD +33 -0
epstein_files-1.0.2.dist-info/RECORD +0 -33
{epstein_files-1.0.2.dist-info → epstein_files-1.0.4.dist-info}/LICENSE +0 -0
{epstein_files-1.0.2.dist-info → epstein_files-1.0.4.dist-info}/WHEEL +0 -0
{epstein_files-1.0.2.dist-info → epstein_files-1.0.4.dist-info}/entry_points.txt +0 -0

epstein_files/__init__.py CHANGED Viewed

@@ -4,7 +4,7 @@ Reformat Epstein text message files for readability and count email senders.
 For use with iMessage log files from https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_
 Install: 'poetry install'
-    Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT ./generate.py'
+    Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT epstein_generate'
 """
 from sys import exit
@@ -87,6 +87,9 @@ def epstein_search():
             console.line()
             if args.whole_file:
+                if isinstance(search_result.document, Email):
+                    search_result.document.truncation_allowed = False
                 console.print(search_result.document)
             else:
                 console.print(search_result.document.description_panel())

epstein_files/documents/document.py CHANGED Viewed

@@ -15,7 +15,7 @@ from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
 from epstein_files.util.constant.urls import *
 from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
-from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_nones
+from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_falsey
 from epstein_files.util.doc_cfg import EmailCfg, DocCfg, Metadata, TextCfg
 from epstein_files.util.env import args
 from epstein_files.util.file_helper import (DOCS_DIR, file_stem_for_id, extract_file_id, file_size,
@@ -159,7 +159,7 @@ class Document:
         if hint_msg:
             hints.append(highlighter(Text(hint_msg, style='white dim italic')))
-        return without_nones(hints)
+        return without_falsey(hints)
     def info_txt(self) -> Text | None:
         """Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
@@ -356,6 +356,11 @@ class Document:
         for f in tmpfiles:
             f.unlink()
+    @staticmethod
+    def known_author_count(docs: Sequence['Document']) -> int:
+        """Count of how many Document objects have an author attribution."""
+        return len([doc for doc in docs if doc.author])
     @staticmethod
     def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
         return sorted(docs, key=lambda doc: doc.sort_key())

epstein_files/documents/email.py CHANGED Viewed

@@ -30,7 +30,6 @@ BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communicati
 DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
 LINK_LINE_REGEX = re.compile(f"^(> )?htt")
 QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
-REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + ['********************************']
 REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
 BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
@@ -39,10 +38,16 @@ TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")
 SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
 REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
+IS_JUNK_MAIL = 'is_junk_mail'
 MAX_CHARS_TO_PRINT = 4000
 MAX_NUM_HEADER_LINES = 14
 MAX_QUOTED_REPLIES = 2
+REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
+    '********************************',
+    'Begin forwarded message',
+]
 OCR_REPAIRS: dict[str | re.Pattern, str] = {
     re.compile(r'grnail\.com'): 'gmail.com',
     re.compile(r"^(From|To)(: )?[_1.]{5,}", re.MULTILINE): rf"\1: {REDACTED}",  # Redacted email addresses
@@ -119,6 +124,7 @@ EMAIL_SIGNATURE_REGEXES = {
 # Invalid for links to EpsteinWeb
 JUNK_EMAILERS = [
     'asmallworld@travel.asmallworld.net',
+    "digest-noreply@quora.com",
     'editorialstaff@flipboard.com',
     'How To Academy',
     'Jokeland',
@@ -126,9 +132,13 @@ JUNK_EMAILERS = [
     'Saved by Internet Explorer 11',
 ]
-TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + [
-    'Alan S Halperin',
+MAILING_LISTS = [
+    INTELLIGENCE_SQUARED,
     'middle.east.update@hotmail.com',
+]
+TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
+    'Alan S Halperin',
     'Mitchell Bard',
     'Skip Rimer',
 ]
@@ -281,7 +291,7 @@ SELF_EMAILS_FILE_IDS = [
 ]
 METADATA_FIELDS = [
-    'is_junk_mail',
+    IS_JUNK_MAIL,
     'recipients',
     'sent_from_device',
 ]
@@ -294,7 +304,6 @@ class Email(Communication):
         actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
         config (EmailCfg | None) - manual config for this email (if it exists)
         header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
-        is_junk_mail (bool) - True if this is junk mail
         recipients (list[str | None]) - who this email was sent to
         sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
         signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
@@ -302,17 +311,16 @@ class Email(Communication):
     actual_text: str = field(init=False)
     config: EmailCfg | None = None
     header: EmailHeader = field(init=False)
-    is_junk_mail: bool = False
     recipients: list[str | None] = field(default_factory=list)
     sent_from_device: str | None = None
     signature_substitution_counts: dict[str, int] = field(default_factory=dict)  # defaultdict breaks asdict :(
+    truncation_allowed: bool = True
     # For logging how many headers we prettified while printing, kind of janky
     rewritten_header_ids: ClassVar[set[str]] = set([])
     def __post_init__(self):
         super().__post_init__()
-        self.is_junk_mail = self.author in JUNK_EMAILERS
         if self.config and self.config.recipients:
             self.recipients = cast(list[str | None], self.config.recipients)
@@ -331,9 +339,17 @@ class Email(Communication):
         txt = Text("OCR text of email from ", style='grey46').append(self.author_txt).append(' to ')
         return txt.append(self._recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))
+    def is_fwded_article(self) -> bool:
+        return bool(self.config and self.config.is_fwded_article)
+    def is_junk_mail(self) -> bool:
+        return self.author in JUNK_EMAILERS or self.author in MAILING_LISTS
     def metadata(self) -> Metadata:
+        local_metadata = asdict(self)
+        local_metadata[IS_JUNK_MAIL] = self.is_junk_mail()
         metadata = super().metadata()
-        metadata.update({k: v for k, v in asdict(self).items() if v and k in METADATA_FIELDS})
+        metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
         return metadata
     def subject(self) -> str:
@@ -352,17 +368,18 @@ class Email(Communication):
         """The text that comes before likely quoted replies and forwards etc."""
         if self.config and self.config.actual_text is not None:
             return self.config.actual_text
+        text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
+        if self.config and self.config.fwded_text_after:
+            return text.split(self.config.fwded_text_after)[0].strip()
         elif self.header.num_header_rows == 0:
             return self.text
-        text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
         reply_text_match = REPLY_TEXT_REGEX.search(text)
         # logger.info(f"Raw text:\n" + self.top_lines(20) + '\n\n')
         # logger.info(f"With header removed:\n" + text[0:500] + '\n\n')
-        if self.file_id in ['024624']:  # This email starts with "On September 14th"
-            return text.split('On Tue, May 14')[0].strip()
         if reply_text_match:
             actual_num_chars = len(reply_text_match.group(1))
             actual_text_pct = f"{(100 * float(actual_num_chars) / len(text)):.1f}%"
@@ -555,6 +572,9 @@ class Email(Communication):
             self._merge_lines(3, 5)
         elif self.file_id == '028931':
             self._merge_lines(3, 6)
+        elif self.file_id == '013415':
+            for _i in range(2):
+                self._merge_lines(4)
         elif self.file_id in ['033568']:
             for _i in range(5):
                 self._merge_lines(5)
@@ -637,7 +657,7 @@ class Email(Communication):
             num_chars = quote_cutoff
         # Truncate long emails but leave a note explaining what happened w/link to source document
-        if len(text) > num_chars:
+        if len(text) > num_chars and self.truncation_allowed:
             text = text[0:num_chars]
             doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
             trim_note = f"<...trimmed to {num_chars} characters of {self.length}, read the rest at {doc_link_markup}...>"

epstein_files/documents/imessage/text_message.py CHANGED Viewed

@@ -4,7 +4,7 @@ from datetime import datetime
 from rich.text import Text
-from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, STEVE_BANNON, UNKNOWN
+from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, CELINA_DUBIN, EVA, STEVE_BANNON, UNKNOWN
 from epstein_files.util.data import extract_last_name
 from epstein_files.util.highlighted_group import get_style_for_name
 from epstein_files.util.logging import logger
@@ -19,17 +19,18 @@ DISPLAY_LAST_NAME_ONLY = [
     STEVE_BANNON,
 ]
-UNKNOWN_TEXTERS = [
-    '+16463880059',
-    '+13108737937',
-    '+13108802851',
-]
+PHONE_NUMBER_MAPPING = {
+    '+19174393646': ANTHONY_SCARAMUCCI,
+    '+13109906526': STEVE_BANNON,
+    '+16463880059': EVA,
+    '+13108737937': CELINA_DUBIN,
+    '+13108802851': STEVE_BANNON,
+}
 TEXTER_MAPPING = {
     'e:': JEFFREY_EPSTEIN,
     'e:jeeitunes@gmail.com': JEFFREY_EPSTEIN,
-    '+19174393646': ANTHONY_SCARAMUCCI,
-    '+13109906526': STEVE_BANNON,
 }
@@ -37,7 +38,7 @@ TEXTER_MAPPING = {
 class TextMessage:
     """Class representing a single iMessage text message."""
     author: str | None
-    author_str: str = field(init=False)
+    author_str: str | None = None
     id_confirmed: bool = False
     text: str
     timestamp_str: str
@@ -47,14 +48,10 @@ class TextMessage:
         if self.author is None:
             self.author_str = UNKNOWN
-        elif self.author in UNKNOWN_TEXTERS:
-            logger.warning(f"Bad text from '{self.author}': \"{self.text}\"")
-            self.author_str = self.author
-            self.author = None  # TODO: this shouldn't be happening; we still know the author...
         elif self.author in DISPLAY_LAST_NAME_ONLY:
             self.author_str = extract_last_name(self.author)
         else:
-            self.author_str = self.author
+            self.author_str = self.author_str or self.author
         if not self.id_confirmed and self.author is not None and self.author != JEFFREY_EPSTEIN:
             self.author_str = self.author + ' (?)'
@@ -87,7 +84,6 @@ class TextMessage:
         return msg_txt
     def __rich__(self) -> Text:
-        # TODO: Workaround for phone numbers that sucks
         author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
         author_txt = Text(self.author_str, style=author_style)
         timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_STYLE).append(' ')

epstein_files/documents/messenger_log.py CHANGED Viewed

@@ -44,17 +44,8 @@ class MessengerLog(Communication):
     def messages(self) -> list[TextMessage]:
         """Lazily evaluated accessor for self._messages."""
-        if len(self._messages) == 0:
-            self._messages = [
-                TextMessage(
-                    # If the Sender: is redacted that means it's from self.author
-                    author=REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip()) or self.author,
-                    id_confirmed=not self.is_attribution_uncertain(),
-                    text=match.group(4).strip(),
-                    timestamp_str=match.group(2).strip(),
-                )
-                for match in MSG_REGEX.finditer(self.text)
-            ]
+        if not self._messages:
+            self._messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
         return self._messages
@@ -70,6 +61,19 @@ class MessengerLog(Communication):
     def _border_style(self) -> str:
         return self.author_style
+    def _build_message(self, match: re.Match) -> TextMessage:
+        """Turn a regex match into a TextMessage."""
+        author_str = REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip())
+        # If the Sender: is redacted that means it's from self.author
+        return TextMessage(
+            author=self.author if (author_str.startswith('+') or not author_str) else author_str,
+            author_str=author_str if author_str.startswith('+') else None,  # Preserve phone numbers
+            id_confirmed=not self.is_attribution_uncertain(),
+            text=match.group(4).strip(),
+            timestamp_str=match.group(2).strip(),
+        )
     def _extract_timestamp(self) -> datetime:
         for match in MSG_REGEX.finditer(self.text):
             timestamp_str = match.group(2).strip()

epstein_files/documents/other_file.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import re
 import logging
 import warnings
-from dataclasses import dataclass
+from dataclasses import asdict, dataclass
 from datetime import datetime
 import datefinder
@@ -15,7 +15,7 @@ from rich.text import Text
 from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, WHITESPACE_REGEX, Document
 from epstein_files.util.constant.strings import *
 from epstein_files.util.constants import *
-from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg
+from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg, Metadata
 from epstein_files.util.data import escape_single_quotes, remove_timezone, uniquify
 from epstein_files.util.file_helper import FILENAME_LENGTH
 from epstein_files.util.env import args
@@ -83,11 +83,10 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
     NOBEL_CHARITABLE_TRUST,
     'Nautilus',
     'New Yorker',
-    NYT_ARTICLE,
-    NYT_COLUMN,
+    NYT,
     PALM_BEACH_CODE_ENFORCEMENT,
-    PALM_BEACH_DAILY_ARTICLE,
-    PALM_BEACH_POST_ARTICLE,
+    PALM_BEACH_DAILY_NEWS,
+    PALM_BEACH_POST,
     PALM_BEACH_TSV,
     PALM_BEACH_WATER_COMMITTEE,
     PAUL_KRASSNER,
@@ -102,6 +101,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
     SHIMON_POST_ARTICLE,
     SINGLE_PAGE,
     STACEY_PLASKETT,
+    'Tatler',
     TERJE_ROD_LARSEN,
     TEXT_OF_US_LAW,
     TRANSLATION,
@@ -113,7 +113,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
     'U.S. News',
     'US Office',
     'Vanity Fair',
-    VI_DAILY_NEWS_ARTICLE,
+    VI_DAILY_NEWS,
     WAPO,
 ]
@@ -127,7 +127,7 @@ class OtherFile(Document):
         if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
             self.log(f"Creating synthetic config for VI Daily News article...", logging.INFO)
-            self.config = DocCfg(id=self.file_id, description=VI_DAILY_NEWS_ARTICLE, category=ARTICLE)
+            self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
     def category(self) -> str | None:
         return self.config and self.config.category
@@ -175,6 +175,11 @@ class OtherFile(Document):
         return True
+    def metadata(self) -> Metadata:
+        metadata = super().metadata()
+        metadata['is_interesting'] = self.is_interesting()
+        return metadata
     def preview_text(self) -> str:
         return WHITESPACE_REGEX.sub(' ', self.text)[0:PREVIEW_CHARS]

epstein_files/epstein_files.py CHANGED Viewed

@@ -21,11 +21,11 @@ from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
 from epstein_files.documents.other_file import OtherFile
 from epstein_files.util.constant.output_files import PICKLED_PATH
 from epstein_files.util.constant.strings import *
-from epstein_files.util.constant.urls import (EPSTEIN_WEB, JMAIL, epsteinify_name_url, epstein_web_person_url,
-     search_jmail_url, search_twitter_url)
+from epstein_files.util.constant.urls import (EPSTEIN_MEDIA, EPSTEIN_WEB, JMAIL, epstein_media_person_url,
+     epsteinify_name_url, epstein_web_person_url, search_jmail_url, search_twitter_url)
 from epstein_files.util.constants import *
 from epstein_files.util.data import dict_sets_to_lists, json_safe, sort_dict
-from epstein_files.util.doc_cfg import EmailCfg
+from epstein_files.util.doc_cfg import EmailCfg, Metadata
 from epstein_files.util.env import args, logger
 from epstein_files.util.file_helper import DOCS_DIR, file_size_str
 from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
@@ -68,6 +68,7 @@ class EpsteinFiles:
         """Iterate through files and build appropriate objects."""
         self.all_files = [f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')]
         documents = []
+        file_type_count = defaultdict(int)
         # Read through and classify all the files
         for file_arg in self.all_files:
@@ -75,12 +76,13 @@ class EpsteinFiles:
             document = Document(file_arg)
             if document.length == 0:
-                logger.warning(f"Skipping empty file: {document}")
+                logger.warning(f"Skipping empty file: {document}]")
                 continue
             cls = document_cls(document)
             documents.append(cls(file_arg, text=document.text))
             logger.info(str(documents[-1]))
+            file_type_count[cls.__name__] += 1
             if doc_timer.seconds_since_start() > SLOW_FILE_SECONDS:
                 doc_timer.print_at_checkpoint(f"Slow file: {documents[-1]} processed")
@@ -195,15 +197,13 @@ class EpsteinFiles:
     def imessage_logs_for(self, author: str | None | list[str | None]) -> Sequence[MessengerLog]:
         return MessengerLog.logs_for(author, self.imessage_logs)
-    def identified_imessage_log_count(self) -> int:
-        return len([log for log in self.imessage_logs if log.author])
     def json_metadata(self) -> str:
+        """Create a JSON string containing metadata for all the files."""
         metadata = {
-            EMAIL_CLASS: [json_safe(d.metadata()) for d in self.emails],
-            JSON_FILE_CLASS: [json_safe(d.metadata()) for d in self.json_files],
-            MESSENGER_LOG_CLASS: [json_safe(d.metadata()) for d in self.imessage_logs],
-            OTHER_FILE_CLASS: [json_safe(d.metadata()) for d in self.other_files if not isinstance(d, JsonFile)],
+            EMAIL_CLASS: _sorted_metadata(self.emails),
+            JSON_FILE_CLASS: _sorted_metadata(self.json_files),
+            MESSENGER_LOG_CLASS: _sorted_metadata(self.imessage_logs),
+            OTHER_FILE_CLASS: _sorted_metadata(self.non_json_other_files()),
         }
         return json.dumps(metadata, indent=4, sort_keys=True)
@@ -216,7 +216,7 @@ class EpsteinFiles:
         add_cols_to_table(table, ['File Type', 'Files', 'Author Known', 'Author Unknown', 'Duplicates'])
         def add_row(label: str, docs: list):
-            known = None if isinstance(docs[0], JsonFile) else len([d for d in docs if d.author])
+            known = None if isinstance(docs[0], JsonFile) else Document.known_author_count(docs)
             table.add_row(
                 label,
@@ -274,7 +274,7 @@ class EpsteinFiles:
     def print_emailer_counts_table(self) -> None:
         footer = f"Identified authors of {self.attributed_email_count():,} emails out of {len(self.emails):,}."
         counts_table = Table(title=f"Email Counts", caption=footer, header_style="bold")
-        add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_WEB, 'Twitter'])
+        add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_MEDIA, EPSTEIN_WEB, 'Twitter'])
         emailer_counts = {
             emailer: self.email_author_counts[emailer] + self.email_recipient_counts[emailer]
@@ -290,7 +290,8 @@ class EpsteinFiles:
                 str(self.email_author_counts[p]),
                 str(self.email_recipient_counts[p]),
                 '' if p is None else link_text_obj(search_jmail_url(p), JMAIL),
-                '' if not is_ok_for_epstein_web(p) else link_text_obj(epstein_web_person_url(p), EPSTEIN_WEB.lower()),
+                '' if not is_ok_for_epstein_web(p) else link_text_obj(epstein_media_person_url(p), EPSTEIN_MEDIA),
+                '' if not is_ok_for_epstein_web(p) else link_text_obj(epstein_web_person_url(p), EPSTEIN_WEB),
                 '' if p is None else link_text_obj(search_twitter_url(p), 'search X'),
             )
@@ -299,7 +300,7 @@ class EpsteinFiles:
     def print_imessage_summary(self) -> None:
         """Print summary table and stats for text messages."""
         console.print(MessengerLog.summary_table(self.imessage_logs))
-        text_summary_msg = f"\nDeanonymized {self.identified_imessage_log_count()} of "
+        text_summary_msg = f"\nDeanonymized {Document.known_author_count(self.imessage_logs)} of "
         text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files):,} files."
         console.print(text_summary_msg)
         imessage_msg_count = sum([len(log.messages()) for log in self.imessage_logs])
@@ -394,3 +395,8 @@ def is_ok_for_epstein_web(name: str | None) -> bool:
         return False
     return True
+def _sorted_metadata(docs: Sequence[Document]) -> list[Metadata]:
+    docs_sorted_by_id = sorted(docs, key=lambda d: d.file_id)
+    return [json_safe(d.metadata()) for d in docs_sorted_by_id]

epstein_files/util/constant/names.py CHANGED Viewed

@@ -198,14 +198,10 @@ OSBORNE_LLP = f"{IAN_OSBORNE} & Partners LLP"  # Ian Osborne's PR firm
 TRUMP_ORG = 'Trump Organization'
 UBS = 'UBS'
-# Locations
-PALM_BEACH = 'Palm Beach'
-VIRGIN_ISLANDS = 'Virgin Islands'
 # First and last names that should be made part of a highlighting regex for emailers
 NAMES_TO_NOT_HIGHLIGHT: list[str] = [name.lower() for name in [
-    'Al', 'Alfredo', 'Allen', 'Alex', 'Alexander', 'Amanda', 'Andres', 'Andrew',
-    'Bard', 'Barry', 'Bill', 'Black', 'Brad', 'Bruce',
+    'Al', 'Alan', 'Alfredo', 'Allen', 'Alex', 'Alexander', 'Amanda', 'Andres', 'Andrew',
+    'Bard', 'Barry', 'Bill', 'Black', 'Boris', 'Brad', 'Bruce',
     'Carolyn', 'Chris', 'Christina',
     'Dan', 'Daniel', 'Danny', 'Darren', 'Dave', 'David',
     'Ed', 'Edward', 'Edwards', 'Epstein', 'Eric', 'Erika', 'Etienne',
@@ -215,10 +211,10 @@ NAMES_TO_NOT_HIGHLIGHT: list[str] = [name.lower() for name in [
     'Ian',
     'Jack', 'James', 'Jay', 'Jean', 'Jeff', 'Jeffrey', 'Jennifer', 'Jeremy', 'jessica', 'Joel', 'John', 'Jon', 'Jonathan', 'Joseph', 'Jr',
     'Kahn', 'Katherine', 'Ken', 'Kevin',
-    'Leon', 'Lesley', 'Linda', 'Link', 'Lisa',
+    'Larry', 'Leon', 'Lesley', 'Linda', 'Link', 'Lisa',
     'Mann', 'Marc', 'Marie', 'Mark', 'Martin', 'Melanie', 'Michael', 'Mike', 'Miller', 'Mitchell', 'Miles', 'Morris', 'Moskowitz',
     'Nancy', 'Neal', 'New',
-    'Paul', 'Paula', 'Pen', 'Peter', 'Philip',
+    'Paul', 'Paula', 'Pen', 'Peter', 'Philip', 'Prince',
     'Randall', 'Reid', 'Richard', 'Robert', 'Rodriguez', 'Roger', 'Rosenberg', 'Ross', 'Roth', 'Rubin',
     'Scott', 'Sean', 'Stanley', 'Stern', 'Stephen', 'Steve', 'Steven', 'Stone', 'Susan',
     'The', 'Thomas', 'Tim', 'Tom', 'Tyler',
@@ -228,25 +224,25 @@ NAMES_TO_NOT_HIGHLIGHT: list[str] = [name.lower() for name in [
 ]]
 # Names to color white in the word counts
-OTHER_NAMES = """
-    aaron albert alberto alec alex alexandra alice allen anderson andre andres ann anna anne ariana arthur
-    baldwin barack barbro barry ben benjamin berger bert binant bob bonner boyden brad bradley brady branson bruce bruno burton
-    chapman charles charlie chris christopher clint cohen colin collins conway
-    dave davis dean debra deborah dennis diana diane diaz dickinson dixon dominique don dylan
-    ed edmond elizabeth emily entwistle erik erika etienne evelyn
+OTHER_NAMES = NAMES_TO_NOT_HIGHLIGHT + """
+    aaron albert alberto alec alexandra alice anderson andre ann anna anne ariana arthur
+    baldwin barack ben benjamin berger bert binant bob bonner boyden bradley brady branson bruno bryant burton
+    chapman charles charlie christopher clint cohen colin collins conway
+    davis dean debra deborah dennis diana diane diaz dickinson dixon dominique don dylan
+    edmond elizabeth emily entwistle erik evelyn
     ferguson flachsbart francis franco frank
-    gardner gary geoff geoffrey george gilbert goldberg gonzalez gould graham greene guarino gwyneth
+    gardner gary geoff geoffrey gilbert gloria goldberg gonzalez gould graham greene guarino gwyneth
     hancock harold harrison harry helen hirsch hofstadter horowitz hussein
     isaac isaacson
-    jamie james jane janet jason jen jennifer jim joe joel johnson jones julie justin
-    kate kathy kelly kevin kim kruger kyle
-    leonard lenny lieberman louis lynch lynn
-    marcus marianne matt matthew melissa michele michelle mike mitchell moore moscowitz
+    jamie jane janet jason jen jim joe johnson jones josh julie justin
+    kate kathy kelly kim kruger kyle
+    leo leonard lenny leslie lieberman louis lynch lynn
+    marcus marianne matt matthew melissa michele michelle moore moscowitz
     nicole nussbaum
-    paul paula paulson philip philippe
-    rafael ray richardson rob robin rodriguez ron rudolph ryan
-    sara sarah seligman serge sergey silverman sloman smith snowden sorkin stanley steele stevie stewart susan
-    ted theresa thompson tiffany tim timothy tom
+    paulson philippe
+    rafael ray richardson rob robin ron rudolph ryan
+    sara sarah seligman serge sergey silverman sloman smith snowden sorkin steele stevie stewart
+    ted theresa thompson tiffany timothy
     valeria
     walter warren weinstein weiss william
     zach zack

epstein_files/util/constant/strings.py CHANGED Viewed

@@ -30,6 +30,10 @@ REPUTATION = 'reputation'
 SOCIAL = 'social'
 SPEECH = 'speech'
+# Locations
+PALM_BEACH = 'Palm Beach'
+VIRGIN_ISLANDS = 'Virgin Islands'
 # Publications
 BBC = 'BBC'
 BLOOMBERG = 'Bloomberg'
@@ -38,10 +42,12 @@ DAILY_MAIL = 'Daily Mail'
 DAILY_TELEGRAPH = "Daily Telegraph"
 LA_TIMES = 'LA Times'
 MIAMI_HERALD = 'Miami Herald'
-NYT_ARTICLE = 'NYT article about'
-NYT_COLUMN = 'NYT column about'
+NYT = "New York Times"
+PALM_BEACH_DAILY_NEWS = f'{PALM_BEACH} Daily News'
+PALM_BEACH_POST = f'{PALM_BEACH} Post'
 THE_REAL_DEAL = 'The Real Deal'
 WAPO = 'WaPo'
+VI_DAILY_NEWS = f'{VIRGIN_ISLANDS} Daily News'
 # Site types
 EMAIL = 'email'

epstein_files/util/constant/urls.py CHANGED Viewed

@@ -71,6 +71,7 @@ epsteinify_name_url = lambda name: f"{EPSTEINIFY_URL}/?name={urllib.parse.quote(
 epstein_media_doc_url = lambda file_stem: build_doc_url(DOC_LINK_BASE_URLS[EPSTEIN_MEDIA], file_stem, True)
 epstein_media_doc_link_markup = lambda filename_or_id, style = TEXT_LINK: external_doc_link_markup(EPSTEIN_MEDIA, filename_or_id, style)
 epstein_media_doc_link_txt = lambda filename_or_id, style = TEXT_LINK: Text.from_markup(epstein_media_doc_link_markup(filename_or_id, style))
+epstein_media_person_url = lambda person: f"{EPSTEIN_MEDIA_URL}/people/{parameterize(person)}"
 epstein_web_doc_url = lambda file_stem: f"{DOC_LINK_BASE_URLS[EPSTEIN_WEB]}/{file_stem}.jpg"
 epstein_web_person_url = lambda person: f"{EPSTEIN_WEB_URL}/{parameterize(person)}"

epstein-files 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

epstein-files 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl