PyPI - epstein-files - Versions diffs - 1.1.5__tar.gz → 1.2.1__tar.gz - Mend

epstein-files 1.1.5__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

{epstein_files-1.1.5 → epstein_files-1.2.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: epstein-files
-Version: 1.1.5
+Version: 1.2.1
 Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
 Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
 License: GPL-3.0-or-later
@@ -17,6 +17,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Requires-Dist: cairosvg (>=2.8.2,<3.0.0)
 Requires-Dist: datefinder (>=0.7.3,<0.8.0)
 Requires-Dist: inflection (>=0.5.1,<0.6.0)
 Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
@@ -31,7 +32,7 @@ Project-URL: TextMessages, https://michelcrypt4d4mus.github.io/epstein_text_mess
 Project-URL: WordCounts, https://michelcrypt4d4mus.github.io/epstein_text_messages/communication_word_count_epstein_files_nov_2025.html
 Description-Content-Type: text/markdown
-# I Made Epstein's Text Messages Great Again
+# Color Highlighted Epstein Emails and Text Messages
 ![joi](https://github.com/michelcrypt4d4mus/epstein_text_messages/raw/master/docs/joi_ito_gavin_is_clever_epstein_funds_bitcoin_dev_team.png)
@@ -119,3 +120,6 @@ for file in epstein_files.other_files:
     do_stuff(file)
 ```
+# Everyone Who Sent or Received an Email in the November Document Dump
+![emails](https://github.com/michelcrypt4d4mus/epstein_text_messages/raw/master/docs/emailers_info_table.png)

{epstein_files-1.1.5 → epstein_files-1.2.1}/README.md RENAMED Viewed

@@ -1,4 +1,4 @@
-# I Made Epstein's Text Messages Great Again
+# Color Highlighted Epstein Emails and Text Messages
 ![joi](https://github.com/michelcrypt4d4mus/epstein_text_messages/raw/master/docs/joi_ito_gavin_is_clever_epstein_funds_bitcoin_dev_team.png)
@@ -85,3 +85,6 @@ for json_file in epstein_files.json_files:
 for file in epstein_files.other_files:
     do_stuff(file)
 ```
+# Everyone Who Sent or Received an Email in the November Document Dump
+![emails](https://github.com/michelcrypt4d4mus/epstein_text_messages/raw/master/docs/emailers_info_table.png)

{epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/__init__.py RENAMED Viewed

@@ -21,7 +21,8 @@ from epstein_files.util.env import args
 from epstein_files.util.file_helper import coerce_file_path, extract_file_id
 from epstein_files.util.logging import exit_with_error, logger
 from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
-     print_other_files_section, print_text_messages_section, print_email_timeline, print_json_metadata, write_urls)
+     print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
+     print_json_metadata, write_urls)
 from epstein_files.util.rich import (build_highlighter, console, print_color_key, print_title_page_header,
      print_title_page_tables, print_subtitle_panel, write_html)
 from epstein_files.util.timer import Timer
@@ -37,7 +38,10 @@ def generate_html() -> None:
     timer = Timer()
     epstein_files = EpsteinFiles.get_files(timer)
-    if args.json_metadata:
+    if args.emailers_info:
+        print_emailers_info(epstein_files)
+        exit()
+    elif args.json_metadata:
         print_json_metadata(epstein_files)
         exit()
     elif args.json_files:
@@ -55,25 +59,19 @@ def generate_html() -> None:
         exit()
     if args.output_texts:
-        imessage_logs = [log for log in epstein_files.imessage_logs if not args.names or log.author in args.names]
-        print_text_messages_section(imessage_logs)
-        timer.print_at_checkpoint(f'Printed {len(imessage_logs)} text message log files')
+        printed_logs = print_text_messages_section(epstein_files)
+        timer.log_section_complete('MessengerLog', epstein_files.imessage_logs, printed_logs)
     if args.output_emails:
-        emails_that_were_printed = print_emails_section(epstein_files)
-        timer.print_at_checkpoint(f"Printed {len(emails_that_were_printed):,} emails")
+        printed_emails = print_emails_section(epstein_files)
+        timer.log_section_complete('Email', epstein_files.emails, printed_emails)
     elif args.email_timeline:
         print_email_timeline(epstein_files)
         timer.print_at_checkpoint(f"Printed chronological emails table")
     if args.output_other:
-        if args.uninteresting:
-            files = [f for f in epstein_files.other_files if not f.is_interesting()]
-        else:
-            files = [f for f in epstein_files.other_files if args.all_other_files or f.is_interesting()]
-        print_other_files_section(files, epstein_files)
-        timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")
+        printed_files = print_other_files_section(epstein_files)
+        timer.log_section_complete('OtherFile', epstein_files.other_files, printed_files)
     write_html(args.build)
     logger.warning(f"Total time: {timer.seconds_since_start_str()}")
@@ -90,7 +88,6 @@ def epstein_diff():
 def epstein_search():
     """Search the cleaned up text of the files."""
-    _assert_positional_args()
     epstein_files = EpsteinFiles.get_files()
     for search_term in args.positional_args:
@@ -113,7 +110,6 @@ def epstein_search():
 def epstein_show():
     """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
-    _assert_positional_args()
     raw_docs: list[Document] = []
     console.line()
@@ -138,8 +134,3 @@ def epstein_show():
 def epstein_word_count() -> None:
     write_word_counts_html()
-def _assert_positional_args():
-    if not args.positional_args:
-        exit_with_error(f"No positional args provided!\n")

{epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/communication.py RENAMED Viewed

@@ -34,9 +34,6 @@ class Communication(Document):
         """Overrides super() method to apply self.author_style."""
         return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
-    def is_attribution_uncertain(self) -> bool:
-        return bool(self.config and self.config.is_attribution_uncertain)
     def summary(self) -> Text:
         return self._summary().append(CLOSE_PROPERTIES_CHAR)

{epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/document.py RENAMED Viewed

@@ -11,17 +11,19 @@ from rich.console import Console, ConsoleOptions, Group, RenderResult
 from rich.padding import Padding
 from rich.panel import Panel
 from rich.text import Text
+from rich.table import Table
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
 from epstein_files.util.constant.urls import *
 from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
-from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time_from_timestamp_str, without_falsey
+from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time, without_falsey
 from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
 from epstein_files.util.env import DOCS_DIR, args
-from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, is_local_extract_file
+from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, file_size_to_str, is_local_extract_file
 from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
-from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize
+from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table, console,
+     highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
 from epstein_files.util.search_result import MatchedLine
 ALT_LINK_STYLE = 'white dim'
@@ -55,6 +57,14 @@ OCR_REPAIRS = {
     'Nil Priell': 'Nili Priell',
 }
+SUMMARY_TABLE_COLS: list[str | dict] = [
+    'Count',
+    {'name': 'Has Author', 'style': 'honeydew2'},
+    {'name': 'No Author', 'style': 'wheat4'},
+    {'name': 'Uncertain Author', 'style': 'royal_blue1 dim'},
+    {'name': 'Size', 'justify': 'right', 'style': 'dim'},
+]
 @dataclass
 class Document:
@@ -63,7 +73,7 @@ class Document:
     Attributes:
         file_path (Path): Local path to file
-        author (str | None): Who is responsible for the text in the file
+        author (Name): Who is responsible for the text in the file
         config (DocCfg): Information about this fil
         file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
         filename (str): File's basename
@@ -74,7 +84,7 @@ class Document:
     """
     file_path: Path
     # Optional fields
-    author: str | None = None
+    author: Name = None
     config: EmailCfg | DocCfg | TextCfg | None = None
     file_id: str = field(init=False)
     filename: str = field(init=False)
@@ -121,6 +131,10 @@ class Document:
         txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
         return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))
+    def duplicate_of_id(self) -> str | None:
+        if self.config and self.config.duplicate_of_id:
+            return self.config.duplicate_of_id
     def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
         return self.external_link(epsteinify_doc_url, style, link_txt)
@@ -177,8 +191,11 @@ class Document:
         """Secondary info about this file (description recipients, etc). Overload in subclasses."""
         return None
+    def is_attribution_uncertain(self) -> bool:
+        return bool(self.config and self.config.is_attribution_uncertain)
     def is_duplicate(self) -> bool:
-        return bool(self.config and self.config.duplicate_of_id)
+        return bool(self.duplicate_of_id())
     def is_local_extract_file(self) -> bool:
         """True if extracted from other file (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
@@ -236,17 +253,6 @@ class Document:
         return text
-    def sort_key(self) -> tuple[datetime, str, int]:
-        """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
-        if self.is_duplicate():
-            sort_id = self.config.duplicate_of_id
-            dupe_idx = 1
-        else:
-            sort_id = self.file_id
-            dupe_idx = 0
-        return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
     def source_file_id(self) -> str:
         """Strip off the _1, _2, etc. suffixes for extracted documents."""
         return self.file_id[0:6]
@@ -257,7 +263,7 @@ class Document:
         txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)
         if self.timestamp:
-            timestamp_str = remove_zero_time_from_timestamp_str(self.timestamp).replace('T', ' ')
+            timestamp_str = remove_zero_time(self.timestamp).replace('T', ' ')
             txt.append(' (', style=SYMBOL_STYLE)
             txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
@@ -278,6 +284,17 @@ class Document:
         return Panel(Group(*sentences), border_style=self._class_style(), expand=False)
+    def timestamp_sort_key(self) -> tuple[datetime, str, int]:
+        """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
+        if self.is_duplicate():
+            sort_id = self.config.duplicate_of_id
+            dupe_idx = 1
+        else:
+            sort_id = self.file_id
+            dupe_idx = 0
+        return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
     def top_lines(self, n: int = 10) -> str:
         """First n lines."""
         return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
@@ -357,6 +374,32 @@ class Document:
     def __str__(self) -> str:
         return self.summary().plain
+    @classmethod
+    def file_info_table(cls, title: str, first_col_name: str) -> Table:
+        """Empty table with appropriate cols for summarizing groups of files."""
+        table = build_table(title)
+        cols = [{'name': first_col_name, 'min_width': 14}] + SUMMARY_TABLE_COLS
+        add_cols_to_table(table, cols, 'right')
+        return table
+    @classmethod
+    def files_info(cls, files: Sequence['Document'], is_author_na: bool = False) -> dict[str, str | Text]:
+        """Summary info about a group of files."""
+        file_count = len(files)
+        author_count = cls.known_author_count(files)
+        return {
+            'count': str(file_count),
+            'author_count': NA_TXT if is_author_na else str(author_count),
+            'no_author_count': NA_TXT if is_author_na else str(file_count - author_count),
+            'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain()])),
+            'bytes': file_size_to_str(sum([f.file_size() for f in files])),
+        }
+    @classmethod
+    def files_info_row(cls, files: Sequence['Document'], author_na: bool = False) -> Sequence[str | Text]:
+        return [v for v in cls.files_info(files, author_na).values()]
     @staticmethod
     def diff_files(files: list[str]) -> None:
         """Diff the contents of two Documents after all cleanup, BOM removal, etc."""
@@ -394,14 +437,18 @@ class Document:
     @staticmethod
     def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
-        return sorted(docs, key=lambda doc: doc.sort_key())
+        return sorted(docs, key=lambda doc: doc.timestamp_sort_key())
-    @classmethod
-    def uniquify(cls, documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
+    @staticmethod
+    def uniquify(documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
         """Uniquify by file_id."""
         id_map = {doc.file_id: doc for doc in documents}
         return [doc for doc in id_map.values()]
+    @staticmethod
+    def without_dupes(docs: Sequence['DocumentType']) -> list['DocumentType']:
+        return [doc for doc in docs if not doc.is_duplicate()]
 DocumentType = TypeVar('DocumentType', bound=Document)

{epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/email.py RENAMED Viewed

@@ -20,7 +20,7 @@ from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAI
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import REDACTED
 from epstein_files.util.constants import *
-from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
+from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes,
      flatten, listify, remove_timezone, uniquify)
 from epstein_files.util.doc_cfg import EmailCfg, Metadata
 from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
@@ -32,7 +32,7 @@ BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE
 BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
 DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
 LINK_LINE_REGEX = re.compile(f"^(> )?htt")
-QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
+QUOTED_REPLY_LINE_REGEX = re.compile(r'(\nFrom:(.*)|wrote:)\n', re.IGNORECASE)
 REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
 BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
@@ -55,6 +55,7 @@ REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
 OCR_REPAIRS: dict[str | re.Pattern, str] = {
     re.compile(r'grnail\.com'): 'gmail.com',
+    'Newsmax. corn': 'Newsmax.com',
     re.compile(r"^(From|To)(: )?[_1.]{5,}", re.MULTILINE): rf"\1: {REDACTED}",  # Redacted email addresses
     # These 3 must come in this order!
     re.compile(r'([/vkT]|Ai|li|(I|7)v)rote:'): 'wrote:',
@@ -79,6 +80,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
     'twitter glhsummers': 'twitter @lhsummers',
     re.compile(r"twitter\.com[i/][lI]krauss[1lt]"): "twitter.com/lkrauss1",
     re.compile(r'from my BlackBerry[0°] wireless device'): 'from my BlackBerry® wireless device',
+    re.compile(r'^INW$', re.MULTILINE): REDACTED,
     # links
     'Imps ://': 'https://',
     re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
@@ -112,7 +114,7 @@ EMAIL_SIGNATURE_REGEXES = {
     DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
     DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
     JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
-    JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*", re.IGNORECASE),
+    JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*(\nTel:.*)?(\nEmail:.*)?", re.IGNORECASE),
     KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
     LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
     LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
@@ -127,14 +129,6 @@ EMAIL_SIGNATURE_REGEXES = {
     UNKNOWN: re.compile(r"(This message is directed to and is for the use of the above-noted addressee only.*\nhereon\.)", re.DOTALL),
 }
-EMAIL_TABLE_COLS = [
-    {'name': 'Sent At', 'justify': 'left', 'style': TIMESTAMP_DIM},
-    {'name': 'From', 'justify': 'left', 'max_width': 20},
-    {'name': 'To', 'justify': 'left', 'max_width': 22},
-    {'name': 'Length', 'justify': 'right', 'style': 'wheat4'},
-    {'name': 'Subject', 'justify': 'left', 'min_width': 35, 'style': 'honeydew2'},
-]
 MAILING_LISTS = [
     CAROLYN_RANGEL,
     INTELLIGENCE_SQUARED,
@@ -142,10 +136,13 @@ MAILING_LISTS = [
     JP_MORGAN_USGIO,
 ]
-TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
+BBC_LISTS = JUNK_EMAILERS + MAILING_LISTS
+TRUNCATE_ALL_EMAILS_FROM = BBC_LISTS + [
     'Alan S Halperin',
     'Mitchell Bard',
     'Skip Rimer',
+    'Steven Victor MD',
 ]
 TRUNCATION_LENGTHS = {
@@ -253,58 +250,15 @@ TRUNCATE_TERMS = [
     'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
 ]
-# Some Paul Krassner emails have a ton of CCed parties we don't care about
-KRASSNER_RECIPIENTS = uniquify(flatten([ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']]))
-# No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
-USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIPIENTS + [
-    'Alan Dlugash',                            # CCed with Richard Kahn
-    'Alan Rogers',                           # Random CC
-    'Andrew Friendly',                       # Presumably some relation of Kelly Friendly
-    'BS Stern',                              # A random fwd of email we have
-    'Cheryl Kleen',                          # Single email from Anne Boyles, displayed under Anne Boyles
-    'Connie Zaguirre',                       # Random CC
-    'Dan Fleuette',                          # CC from sean bannon
-    'Danny Goldberg',                        # Random Paul Krassner emails
-    GERALD_LEFCOURT,                         # Single CC
-    GORDON_GETTY,                            # Random CC
-    JEFF_FULLER,                             # Random Jean Luc Brunel CC
-    'Jojo Fontanilla',                       # Random CC
-    'Joseph Vinciguerra',                    # Random CC
-    'Larry Cohen',                           # Random Bill Gates CC
-    'Lyn Fontanilla',                        # Random CC
-    'Mark Albert',                           # Random CC
-    'Matthew Schafer',                       # Random CC
-    MICHAEL_BUCHHOLTZ,                       # Terry Kafka CC
-    'Nancy Dahl',                            # covered by Lawrence Krauss (her husband)
-    'Michael Simmons',                       # Random CC
-    'Nancy Portland',                        # Lawrence Krauss CC
-    'Oliver Goodenough',                     # Robert Trivers CC
-    'Peter Aldhous',                         # Lawrence Krauss CC
-    'Players2',                              # Hoffenberg CC
-    'Sam Harris',                            # Lawrence Krauss CC
-    SAMUEL_LEFF,                             # Random CC
-    'Sean T Lehane',                         # Random CC
-    'Stephen Rubin',                         # Random CC
-    'Tim Kane',                              # Random CC
-    'Travis Pangburn',                       # Random CC
-    'Vahe Stepanian',                        # Random CC
-    # Ross Gow BCC
-    'david.brown@thetimes.co.uk',
-    'io-anne.pugh@bbc.co.uk',
-    'martin.robinson@mailonline.co.uk',
-    'nick.alwav@bbc.co.uk'
-    'nick.sommerlad@mirror.co.uk',
-    'p.peachev@independent.co.uk',
-]
 METADATA_FIELDS = [
     'is_junk_mail',
+    'is_mailing_list',
     'recipients',
     'sent_from_device',
     'subject',
 ]
+# Note the line repair happens *after* 'Importance: High' is removed
 LINE_REPAIR_MERGES = {
     '017523': 4,
     '019407': [2, 4],
@@ -312,10 +266,14 @@ LINE_REPAIR_MERGES = {
     '022673': 9,
     '022684': 9,
     '022695': 4,
+    '029773': [2, 5],
     '023067': 3,
     '025790': 2,
+    '029841': 3,
     '026345': 3,
     '026609': 4,
+    '033299': 3,
+    '026829': 3,
     '026924': [2, 4],
     '028931': [3, 6],
     '029154': [2, 5],
@@ -326,6 +284,7 @@ LINE_REPAIR_MERGES = {
     '029501': 2,
     '029835': [2, 4],
     '029889': 2,
+    '029545': [3, 5],
     '029976': 3,
     '030299': [7, 10],
     '030381': [2, 4],
@@ -359,14 +318,14 @@ class Email(Communication):
         actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
         config (EmailCfg | None) - manual config for this email (if it exists)
         header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
-        recipients (list[str | None]) - who this email was sent to
+        recipients (list[Name]) - who this email was sent to
         sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
         signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
     """
     actual_text: str = field(init=False)
     config: EmailCfg | None = None
     header: EmailHeader = field(init=False)
-    recipients: list[str | None] = field(default_factory=list)
+    recipients: list[Name] = field(default_factory=list)
     sent_from_device: str | None = None
     signature_substitution_counts: dict[str, int] = field(default_factory=dict)  # defaultdict breaks asdict :(
@@ -394,7 +353,7 @@ class Email(Communication):
                 self.recipients.extend(self._extract_emailer_names(recipient))
             # Assume mailing list emails are to Epstein
-            if self.author in MAILING_LISTS and (self.is_note_to_self() or not self.recipients):
+            if self.author in BBC_LISTS and (self.is_note_to_self() or not self.recipients):
                 self.recipients = [JEFFREY_EPSTEIN]
         # Remove self CCs but preserve self emails
@@ -423,7 +382,10 @@ class Email(Communication):
         return bool(self.config and self.config.is_fwded_article)
     def is_junk_mail(self) -> bool:
-        return self.author in JUNK_EMAILERS or self.author in MAILING_LISTS
+        return self.author in JUNK_EMAILERS
+    def is_mailing_list(self) -> bool:
+        return self.author in MAILING_LISTS or self.is_junk_mail()
     def is_note_to_self(self) -> bool:
         return self.recipients == [self.author]
@@ -431,6 +393,7 @@ class Email(Communication):
     def metadata(self) -> Metadata:
         local_metadata = asdict(self)
         local_metadata['is_junk_mail'] = self.is_junk_mail()
+        local_metadata['is_mailing_list'] = self.is_junk_mail()
         local_metadata['subject'] = self.subject() or None
         metadata = super().metadata()
         metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
@@ -473,9 +436,9 @@ class Email(Communication):
         elif self.header.num_header_rows == 0:
             return self.text
-        reply_text_match = REPLY_TEXT_REGEX.search(text)
         self.log_top_lines(20, "Raw text:", logging.DEBUG)
         self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
+        reply_text_match = REPLY_TEXT_REGEX.search(text)
         if reply_text_match:
             actual_num_chars = len(reply_text_match.group(1))
@@ -550,6 +513,8 @@ class Email(Communication):
             self.log_top_lines(msg='No email header match found!', level=log_level)
             self.header = EmailHeader(field_names=[])
+        logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
     def _extract_timestamp(self) -> datetime:
         if self.config and self.config.timestamp:
             return self.config.timestamp
@@ -585,9 +550,15 @@ class Email(Communication):
     def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
         """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
-        for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
+        if text is None:
+            header_offset = len(self.header.header_chars)
+            text = self.text[header_offset:]
+        else:
+            header_offset = 0
+        for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text)):
             if i >= n:
-                return match.end() - 1
+                return match.end() + header_offset - 1
     def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
         """Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
@@ -674,6 +645,9 @@ class Email(Communication):
         elif self.file_id in ['025329']:
             for _i in range(9):
                 self._merge_lines(2)
+        elif self.file_id in ['025812']:
+            for _i in range(2):
+                self._merge_lines(3)
         elif self.file_id == '014860':
             self._merge_lines(3)
             self._merge_lines(4)
@@ -839,19 +813,29 @@ class Email(Communication):
             self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
     @staticmethod
-    def build_emails_table(emails: list['Email'], author: str | None = '', title: str = '', show_length: bool = False) -> Table:
+    def build_emails_table(emails: list['Email'], name: Name = '', title: str = '', show_length: bool = False) -> Table:
         """Turn a set of Emails into a Table."""
-        if title and author:
+        if title and name:
             raise ValueError(f"Can't provide both 'author' and 'title' args")
-        elif author == '' and title == '':
+        elif name == '' and title == '':
             raise ValueError(f"Must provide either 'author' or 'title' arg")
-        author_style = get_style_for_name(author, allow_bold=False)
-        link_style = author_style if author else ARCHIVE_LINK_COLOR
+        author_style = get_style_for_name(name, allow_bold=False)
+        link_style = author_style if name else ARCHIVE_LINK_COLOR
+        min_width = len(name or UNKNOWN)
+        max_width = max(20, min_width)
+        columns = [
+            {'name': 'Sent At', 'justify': 'left', 'style': TIMESTAMP_DIM},
+            {'name': 'From', 'justify': 'left', 'min_width': min_width, 'max_width': max_width},
+            {'name': 'To', 'justify': 'left', 'min_width': min_width, 'max_width': max_width + 2},
+            {'name': 'Length', 'justify': 'right', 'style': 'wheat4'},
+            {'name': 'Subject', 'justify': 'left', 'min_width': 35, 'style': 'honeydew2'},
+        ]
         table = build_table(
             title or None,
-            cols=[col for col in EMAIL_TABLE_COLS if show_length or col['name'] not in ['Length']],
+            cols=[col for col in columns if show_length or col['name'] not in ['Length']],
             border_style=DEFAULT_TABLE_KWARGS['border_style'] if title else author_style,
             header_style="bold",
             highlight=True,

{epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/emails/email_header.py RENAMED Viewed

@@ -8,13 +8,12 @@ from epstein_files.util.doc_cfg import EmailCfg
 from epstein_files.util.logging import logger
 from epstein_files.util.rich import UNKNOWN
-FIELD_NAMES = ['From', 'Date', 'Sent', 'Subject']
-NON_HEADER_FIELDS = ['field_names', 'num_header_rows', 'was_initially_empty']
+FIELD_NAMES = ['Date', 'From', 'Sent', 'Subject']
 ON_BEHALF_OF = 'on behalf of'
 TO_FIELDS = ['bcc', 'cc', 'to']
 EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
-HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
+HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments|Classification|Flag):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
 EMAIL_SIMPLE_HEADER_REGEX = re.compile(rf'^{HEADER_REGEX_STR}')
 EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX = re.compile(HEADER_REGEX_STR)
 EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTALL)  # Match up to the next email header section
@@ -28,10 +27,18 @@ CONFIGURED_ACTUAL_TEXTS = [
     if isinstance(cfg, EmailCfg) and cfg.actual_text is not None
 ]
+NON_HEADER_FIELDS = [
+    'field_names',
+    'header_chars',
+    'num_header_rows',
+    'was_initially_empty',
+]
 @dataclass(kw_only=True)
 class EmailHeader:
     field_names: list[str]  # Order is same as the order header fields appear in the email file text
+    header_chars: str = ''
     num_header_rows: int = field(init=False)
     was_initially_empty: bool = False
@@ -41,6 +48,8 @@ class EmailHeader:
     subject: str | None = None
     bcc: list[str] | None = None
     cc: list[str] | None = None
+    classification: str | None = None
+    flag: str | None = None
     importance: str | None = None
     attachments: str | None = None
     to: list[str] | None = None
@@ -99,6 +108,7 @@ class EmailHeader:
             setattr(self, field_name, value)
         self.num_header_rows = len(self.field_names) + num_headers
+        self.header_chars = '\n'.join(email_lines[0:self.num_header_rows])
         log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
         logger.debug(f"{log_msg}{self}\n\nTop lines:\n\n%s", '\n'.join(email_lines[0:(num_headers + 1) * 2]))
@@ -161,7 +171,7 @@ class EmailHeader:
         if should_log_header:
             logger.debug(f"Header being parsed was this:\n\n{header}\n")
-        return EmailHeader(field_names=field_names, **kw_args)
+        return cls(field_names=field_names, header_chars=header, **kw_args)
     @staticmethod
     def cleanup_str(_str: str) -> str:

epstein-files 1.1.5__tar.gz → 1.2.1__tar.gz

epstein-files 1.1.5__tar.gz → 1.2.1__tar.gz