PyPI - epstein-files - Versions diffs - 1.1.5__py3-none-any.whl → 1.2.0__py3-none-any.whl - Mend

epstein-files 1.1.5__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

epstein_files/__init__.py +5 -1
epstein_files/documents/document.py +7 -3
epstein_files/documents/email.py +43 -65
epstein_files/documents/emails/email_header.py +4 -2
epstein_files/documents/imessage/text_message.py +3 -3
epstein_files/documents/messenger_log.py +7 -7
epstein_files/epstein_files.py +117 -115
epstein_files/person.py +350 -0
epstein_files/util/constant/names.py +35 -11
epstein_files/util/constant/output_files.py +1 -0
epstein_files/util/constant/strings.py +3 -2
epstein_files/util/constant/urls.py +14 -2
epstein_files/util/constants.py +72 -20
epstein_files/util/data.py +0 -19
epstein_files/util/doc_cfg.py +24 -14
epstein_files/util/env.py +3 -1
epstein_files/util/highlighted_group.py +154 -127
epstein_files/util/output.py +84 -152
epstein_files/util/rich.py +6 -21
epstein_files/util/word_count.py +1 -1
{epstein_files-1.1.5.dist-info → epstein_files-1.2.0.dist-info}/METADATA +2 -1
epstein_files-1.2.0.dist-info/RECORD +34 -0
epstein_files-1.1.5.dist-info/RECORD +0 -33
{epstein_files-1.1.5.dist-info → epstein_files-1.2.0.dist-info}/LICENSE +0 -0
{epstein_files-1.1.5.dist-info → epstein_files-1.2.0.dist-info}/WHEEL +0 -0
{epstein_files-1.1.5.dist-info → epstein_files-1.2.0.dist-info}/entry_points.txt +0 -0

epstein_files/__init__.py CHANGED Viewed

@@ -21,7 +21,8 @@ from epstein_files.util.env import args
 from epstein_files.util.file_helper import coerce_file_path, extract_file_id
 from epstein_files.util.logging import exit_with_error, logger
 from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
-     print_other_files_section, print_text_messages_section, print_email_timeline, print_json_metadata, write_urls)
+     print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info_png,
+     print_json_metadata, write_urls)
 from epstein_files.util.rich import (build_highlighter, console, print_color_key, print_title_page_header,
      print_title_page_tables, print_subtitle_panel, write_html)
 from epstein_files.util.timer import Timer
@@ -43,6 +44,9 @@ def generate_html() -> None:
     elif args.json_files:
         print_json_files(epstein_files)
         exit()
+    elif args.emailers_info_png:
+        print_emailers_info_png(epstein_files)
+        exit()
     print_title_page_header()

epstein_files/documents/document.py CHANGED Viewed

@@ -63,7 +63,7 @@ class Document:
     Attributes:
         file_path (Path): Local path to file
-        author (str | None): Who is responsible for the text in the file
+        author (Name): Who is responsible for the text in the file
         config (DocCfg): Information about this fil
         file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
         filename (str): File's basename
@@ -74,7 +74,7 @@ class Document:
     """
     file_path: Path
     # Optional fields
-    author: str | None = None
+    author: Name = None
     config: EmailCfg | DocCfg | TextCfg | None = None
     file_id: str = field(init=False)
     filename: str = field(init=False)
@@ -121,6 +121,10 @@ class Document:
         txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
         return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))
+    def duplicate_of_id(self) -> str | None:
+        if self.config and self.config.duplicate_of_id:
+            return self.config.duplicate_of_id
     def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
         return self.external_link(epsteinify_doc_url, style, link_txt)
@@ -178,7 +182,7 @@ class Document:
         return None
     def is_duplicate(self) -> bool:
-        return bool(self.config and self.config.duplicate_of_id)
+        return bool(self.duplicate_of_id())
     def is_local_extract_file(self) -> bool:
         """True if extracted from other file (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""

epstein_files/documents/email.py CHANGED Viewed

@@ -20,7 +20,7 @@ from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAI
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import REDACTED
 from epstein_files.util.constants import *
-from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
+from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes,
      flatten, listify, remove_timezone, uniquify)
 from epstein_files.util.doc_cfg import EmailCfg, Metadata
 from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
@@ -55,6 +55,7 @@ REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
 OCR_REPAIRS: dict[str | re.Pattern, str] = {
     re.compile(r'grnail\.com'): 'gmail.com',
+    'Newsmax. corn': 'Newsmax.com',
     re.compile(r"^(From|To)(: )?[_1.]{5,}", re.MULTILINE): rf"\1: {REDACTED}",  # Redacted email addresses
     # These 3 must come in this order!
     re.compile(r'([/vkT]|Ai|li|(I|7)v)rote:'): 'wrote:',
@@ -79,6 +80,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
     'twitter glhsummers': 'twitter @lhsummers',
     re.compile(r"twitter\.com[i/][lI]krauss[1lt]"): "twitter.com/lkrauss1",
     re.compile(r'from my BlackBerry[0°] wireless device'): 'from my BlackBerry® wireless device',
+    re.compile(r'^INW$', re.MULTILINE): REDACTED,
     # links
     'Imps ://': 'https://',
     re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
@@ -127,14 +129,6 @@ EMAIL_SIGNATURE_REGEXES = {
     UNKNOWN: re.compile(r"(This message is directed to and is for the use of the above-noted addressee only.*\nhereon\.)", re.DOTALL),
 }
-EMAIL_TABLE_COLS = [
-    {'name': 'Sent At', 'justify': 'left', 'style': TIMESTAMP_DIM},
-    {'name': 'From', 'justify': 'left', 'max_width': 20},
-    {'name': 'To', 'justify': 'left', 'max_width': 22},
-    {'name': 'Length', 'justify': 'right', 'style': 'wheat4'},
-    {'name': 'Subject', 'justify': 'left', 'min_width': 35, 'style': 'honeydew2'},
-]
 MAILING_LISTS = [
     CAROLYN_RANGEL,
     INTELLIGENCE_SQUARED,
@@ -142,10 +136,13 @@ MAILING_LISTS = [
     JP_MORGAN_USGIO,
 ]
-TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
+BBC_LISTS = JUNK_EMAILERS + MAILING_LISTS
+TRUNCATE_ALL_EMAILS_FROM = BBC_LISTS + [
     'Alan S Halperin',
     'Mitchell Bard',
     'Skip Rimer',
+    'Steven Victor MD',
 ]
 TRUNCATION_LENGTHS = {
@@ -253,58 +250,15 @@ TRUNCATE_TERMS = [
     'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
 ]
-# Some Paul Krassner emails have a ton of CCed parties we don't care about
-KRASSNER_RECIPIENTS = uniquify(flatten([ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']]))
-# No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
-USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIPIENTS + [
-    'Alan Dlugash',                            # CCed with Richard Kahn
-    'Alan Rogers',                           # Random CC
-    'Andrew Friendly',                       # Presumably some relation of Kelly Friendly
-    'BS Stern',                              # A random fwd of email we have
-    'Cheryl Kleen',                          # Single email from Anne Boyles, displayed under Anne Boyles
-    'Connie Zaguirre',                       # Random CC
-    'Dan Fleuette',                          # CC from sean bannon
-    'Danny Goldberg',                        # Random Paul Krassner emails
-    GERALD_LEFCOURT,                         # Single CC
-    GORDON_GETTY,                            # Random CC
-    JEFF_FULLER,                             # Random Jean Luc Brunel CC
-    'Jojo Fontanilla',                       # Random CC
-    'Joseph Vinciguerra',                    # Random CC
-    'Larry Cohen',                           # Random Bill Gates CC
-    'Lyn Fontanilla',                        # Random CC
-    'Mark Albert',                           # Random CC
-    'Matthew Schafer',                       # Random CC
-    MICHAEL_BUCHHOLTZ,                       # Terry Kafka CC
-    'Nancy Dahl',                            # covered by Lawrence Krauss (her husband)
-    'Michael Simmons',                       # Random CC
-    'Nancy Portland',                        # Lawrence Krauss CC
-    'Oliver Goodenough',                     # Robert Trivers CC
-    'Peter Aldhous',                         # Lawrence Krauss CC
-    'Players2',                              # Hoffenberg CC
-    'Sam Harris',                            # Lawrence Krauss CC
-    SAMUEL_LEFF,                             # Random CC
-    'Sean T Lehane',                         # Random CC
-    'Stephen Rubin',                         # Random CC
-    'Tim Kane',                              # Random CC
-    'Travis Pangburn',                       # Random CC
-    'Vahe Stepanian',                        # Random CC
-    # Ross Gow BCC
-    'david.brown@thetimes.co.uk',
-    'io-anne.pugh@bbc.co.uk',
-    'martin.robinson@mailonline.co.uk',
-    'nick.alwav@bbc.co.uk'
-    'nick.sommerlad@mirror.co.uk',
-    'p.peachev@independent.co.uk',
-]
 METADATA_FIELDS = [
     'is_junk_mail',
+    'is_mailing_list',
     'recipients',
     'sent_from_device',
     'subject',
 ]
+# Note the line repair happens *after* 'Importance: High' is removed
 LINE_REPAIR_MERGES = {
     '017523': 4,
     '019407': [2, 4],
@@ -312,10 +266,14 @@ LINE_REPAIR_MERGES = {
     '022673': 9,
     '022684': 9,
     '022695': 4,
+    '029773': [2, 5],
     '023067': 3,
     '025790': 2,
+    '029841': 3,
     '026345': 3,
     '026609': 4,
+    '033299': 3,
+    '026829': 3,
     '026924': [2, 4],
     '028931': [3, 6],
     '029154': [2, 5],
@@ -326,6 +284,7 @@ LINE_REPAIR_MERGES = {
     '029501': 2,
     '029835': [2, 4],
     '029889': 2,
+    '029545': [3, 5],
     '029976': 3,
     '030299': [7, 10],
     '030381': [2, 4],
@@ -359,14 +318,14 @@ class Email(Communication):
         actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
         config (EmailCfg | None) - manual config for this email (if it exists)
         header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
-        recipients (list[str | None]) - who this email was sent to
+        recipients (list[Name]) - who this email was sent to
         sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
         signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
     """
     actual_text: str = field(init=False)
     config: EmailCfg | None = None
     header: EmailHeader = field(init=False)
-    recipients: list[str | None] = field(default_factory=list)
+    recipients: list[Name] = field(default_factory=list)
     sent_from_device: str | None = None
     signature_substitution_counts: dict[str, int] = field(default_factory=dict)  # defaultdict breaks asdict :(
@@ -394,7 +353,7 @@ class Email(Communication):
                 self.recipients.extend(self._extract_emailer_names(recipient))
             # Assume mailing list emails are to Epstein
-            if self.author in MAILING_LISTS and (self.is_note_to_self() or not self.recipients):
+            if self.author in BBC_LISTS and (self.is_note_to_self() or not self.recipients):
                 self.recipients = [JEFFREY_EPSTEIN]
         # Remove self CCs but preserve self emails
@@ -423,7 +382,10 @@ class Email(Communication):
         return bool(self.config and self.config.is_fwded_article)
     def is_junk_mail(self) -> bool:
-        return self.author in JUNK_EMAILERS or self.author in MAILING_LISTS
+        return self.author in JUNK_EMAILERS
+    def is_mailing_list(self) -> bool:
+        return self.author in MAILING_LISTS or self.is_junk_mail()
     def is_note_to_self(self) -> bool:
         return self.recipients == [self.author]
@@ -431,6 +393,7 @@ class Email(Communication):
     def metadata(self) -> Metadata:
         local_metadata = asdict(self)
         local_metadata['is_junk_mail'] = self.is_junk_mail()
+        local_metadata['is_mailing_list'] = self.is_junk_mail()
         local_metadata['subject'] = self.subject() or None
         metadata = super().metadata()
         metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
@@ -550,6 +513,8 @@ class Email(Communication):
             self.log_top_lines(msg='No email header match found!', level=log_level)
             self.header = EmailHeader(field_names=[])
+        logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
     def _extract_timestamp(self) -> datetime:
         if self.config and self.config.timestamp:
             return self.config.timestamp
@@ -674,6 +639,9 @@ class Email(Communication):
         elif self.file_id in ['025329']:
             for _i in range(9):
                 self._merge_lines(2)
+        elif self.file_id in ['025812']:
+            for _i in range(2):
+                self._merge_lines(3)
         elif self.file_id == '014860':
             self._merge_lines(3)
             self._merge_lines(4)
@@ -839,19 +807,29 @@ class Email(Communication):
             self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
     @staticmethod
-    def build_emails_table(emails: list['Email'], author: str | None = '', title: str = '', show_length: bool = False) -> Table:
+    def build_emails_table(emails: list['Email'], name: Name = '', title: str = '', show_length: bool = False) -> Table:
         """Turn a set of Emails into a Table."""
-        if title and author:
+        if title and name:
             raise ValueError(f"Can't provide both 'author' and 'title' args")
-        elif author == '' and title == '':
+        elif name == '' and title == '':
             raise ValueError(f"Must provide either 'author' or 'title' arg")
-        author_style = get_style_for_name(author, allow_bold=False)
-        link_style = author_style if author else ARCHIVE_LINK_COLOR
+        author_style = get_style_for_name(name, allow_bold=False)
+        link_style = author_style if name else ARCHIVE_LINK_COLOR
+        min_width = len(name or UNKNOWN)
+        max_width = max(20, min_width)
+        columns = [
+            {'name': 'Sent At', 'justify': 'left', 'style': TIMESTAMP_DIM},
+            {'name': 'From', 'justify': 'left', 'min_width': min_width, 'max_width': max_width},
+            {'name': 'To', 'justify': 'left', 'min_width': min_width, 'max_width': max_width + 2},
+            {'name': 'Length', 'justify': 'right', 'style': 'wheat4'},
+            {'name': 'Subject', 'justify': 'left', 'min_width': 35, 'style': 'honeydew2'},
+        ]
         table = build_table(
             title or None,
-            cols=[col for col in EMAIL_TABLE_COLS if show_length or col['name'] not in ['Length']],
+            cols=[col for col in columns if show_length or col['name'] not in ['Length']],
             border_style=DEFAULT_TABLE_KWARGS['border_style'] if title else author_style,
             header_style="bold",
             highlight=True,

epstein_files/documents/emails/email_header.py CHANGED Viewed

@@ -8,13 +8,13 @@ from epstein_files.util.doc_cfg import EmailCfg
 from epstein_files.util.logging import logger
 from epstein_files.util.rich import UNKNOWN
-FIELD_NAMES = ['From', 'Date', 'Sent', 'Subject']
+FIELD_NAMES = ['Date', 'From', 'Sent', 'Subject']
 NON_HEADER_FIELDS = ['field_names', 'num_header_rows', 'was_initially_empty']
 ON_BEHALF_OF = 'on behalf of'
 TO_FIELDS = ['bcc', 'cc', 'to']
 EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
-HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
+HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments|Classification|Flag):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
 EMAIL_SIMPLE_HEADER_REGEX = re.compile(rf'^{HEADER_REGEX_STR}')
 EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX = re.compile(HEADER_REGEX_STR)
 EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTALL)  # Match up to the next email header section
@@ -41,6 +41,8 @@ class EmailHeader:
     subject: str | None = None
     bcc: list[str] | None = None
     cc: list[str] | None = None
+    classification: str | None = None
+    flag: str | None = None
     importance: str | None = None
     attachments: str | None = None
     to: list[str] | None = None

epstein_files/documents/imessage/text_message.py CHANGED Viewed

@@ -4,9 +4,9 @@ from datetime import datetime
 from rich.text import Text
-from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN
+from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN, Name, extract_last_name
 from epstein_files.util.constant.strings import TIMESTAMP_DIM
-from epstein_files.util.data import extract_last_name, iso_timestamp
+from epstein_files.util.data import iso_timestamp
 from epstein_files.util.highlighted_group import get_style_for_name
 from epstein_files.util.logging import logger
 from epstein_files.util.rich import TEXT_LINK, highlighter
@@ -25,7 +25,7 @@ DISPLAY_LAST_NAME_ONLY = [
 @dataclass(kw_only=True)
 class TextMessage:
     """Class representing a single iMessage text message."""
-    author: str | None
+    author: Name
     author_str: str = ''
     is_id_confirmed: bool = False
     text: str

epstein_files/documents/messenger_log.py CHANGED Viewed

@@ -10,11 +10,11 @@ from rich.text import Text
 from epstein_files.documents.communication import Communication
 from epstein_files.documents.imessage.text_message import TextMessage
-from epstein_files.util.constant.names import JEFFREY_EPSTEIN, UNKNOWN
+from epstein_files.util.constant.names import JEFFREY_EPSTEIN, Name
 from epstein_files.util.constant.strings import AUTHOR, TIMESTAMP_STYLE
 from epstein_files.util.data import days_between, days_between_str, iso_timestamp, sort_dict
 from epstein_files.util.doc_cfg import Metadata, TextCfg
-from epstein_files.util.highlighted_group import get_style_for_name, styled_name
+from epstein_files.util.highlighted_group import styled_name
 from epstein_files.util.logging import logger
 from epstein_files.util.rich import LAST_TIMESTAMP_STYLE, build_table, highlighter
@@ -35,7 +35,7 @@ class MessengerLog(Communication):
         super().__post_init__()
         self.messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
-    def first_message_at(self, name: str | None) -> datetime:
+    def first_message_at(self, name: Name) -> datetime:
         return self.messages_by(name)[0].parse_timestamp()
     def info_txt(self) -> Text | None:
@@ -54,10 +54,10 @@ class MessengerLog(Communication):
         return txt.append(')')
-    def last_message_at(self, name: str | None) -> datetime:
+    def last_message_at(self, name: Name) -> datetime:
         return self.messages_by(name)[-1].parse_timestamp()
-    def messages_by(self, name: str | None) -> list[TextMessage]:
+    def messages_by(self, name: Name) -> list[TextMessage]:
         """Return all messages by 'name'."""
         return [m for m in self.messages if m.author == name]
@@ -129,9 +129,9 @@ class MessengerLog(Communication):
             yield message
     @classmethod
-    def count_authors(cls, imessage_logs: list['MessengerLog']) -> dict[str | None, int]:
+    def count_authors(cls, imessage_logs: list['MessengerLog']) -> dict[Name, int]:
         """Count up how many texts were sent by each author."""
-        sender_counts: dict[str | None, int] = defaultdict(int)
+        sender_counts: dict[Name, int] = defaultdict(int)
         for message_log in imessage_logs:
             for message in message_log.messages:

epstein-files 1.1.5__py3-none-any.whl → 1.2.0__py3-none-any.whl

epstein-files 1.1.5__py3-none-any.whl → 1.2.0__py3-none-any.whl