epstein-files 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +75 -135
- epstein_files/documents/communication.py +9 -9
- epstein_files/documents/document.py +115 -87
- epstein_files/documents/email.py +154 -85
- epstein_files/documents/emails/email_header.py +7 -6
- epstein_files/documents/imessage/text_message.py +3 -2
- epstein_files/documents/json_file.py +17 -0
- epstein_files/documents/messenger_log.py +62 -3
- epstein_files/documents/other_file.py +165 -17
- epstein_files/epstein_files.py +128 -169
- epstein_files/util/constant/names.py +8 -1
- epstein_files/util/constant/output_files.py +29 -0
- epstein_files/util/constant/strings.py +27 -0
- epstein_files/util/constant/urls.py +25 -9
- epstein_files/util/constants.py +1018 -1045
- epstein_files/util/data.py +20 -55
- epstein_files/util/{file_cfg.py → doc_cfg.py} +121 -43
- epstein_files/util/env.py +19 -20
- epstein_files/util/file_helper.py +38 -21
- epstein_files/util/highlighted_group.py +229 -177
- epstein_files/util/logging.py +63 -0
- epstein_files/util/output.py +180 -0
- epstein_files/util/rich.py +29 -17
- epstein_files/util/search_result.py +14 -6
- epstein_files/util/timer.py +24 -0
- epstein_files/util/word_count.py +2 -1
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/METADATA +20 -4
- epstein_files-1.0.2.dist-info/RECORD +33 -0
- epstein_files-1.0.2.dist-info/entry_points.txt +7 -0
- epstein_files-1.0.0.dist-info/RECORD +0 -28
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/WHEEL +0 -0
epstein_files/documents/email.py
CHANGED
@@ -1,7 +1,6 @@
 import logging
 import re
-from
-from dataclasses import dataclass, field
+from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from typing import ClassVar, cast
 
@@ -9,6 +8,7 @@ from dateutil.parser import parse
 from rich.console import Console, ConsoleOptions, RenderResult
 from rich.padding import Padding
 from rich.panel import Panel
+from rich.table import Table
 from rich.text import Text
 
 from epstein_files.documents.communication import Communication
@@ -19,13 +19,14 @@ from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import REDACTED, URL_SIGNIFIERS
 from epstein_files.util.constants import *
 from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
-                                     remove_timezone, uniquify)
-from epstein_files.util.
+                                     flatten, remove_timezone, uniquify)
+from epstein_files.util.doc_cfg import EmailCfg, Metadata
 from epstein_files.util.highlighted_group import get_style_for_name
+from epstein_files.util.logging import logger
 from epstein_files.util.rich import *
 
-BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|
-BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,])$')
+BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
+BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
 DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
 LINK_LINE_REGEX = re.compile(f"^(> )?htt")
 QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
@@ -39,8 +40,8 @@ TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")
 SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
 REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
 MAX_CHARS_TO_PRINT = 4000
+MAX_NUM_HEADER_LINES = 14
 MAX_QUOTED_REPLIES = 2
-VALID_HEADER_LINES = 14
 
 OCR_REPAIRS: dict[str | re.Pattern, str] = {
     re.compile(r'grnail\.com'): 'gmail.com',
@@ -71,12 +72,15 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
     'Imps ://': 'https://',
     re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
     # Subject lines
-
-
-
-
-
-
+    "Arrested in\nInauguration Day Riot": "Arrested in Inauguration Day Riot",
+    "as Putin Mayhem Tests President's Grip\non GOP": "as Putin Mayhem Tests President's Grip on GOP",
+    "avoids testimony from alleged\nvictims": "avoids testimony from alleged victims",
+    "but\nwatchdogs say probe is tainted": "watchdogs say probe is tainted",
+    "Christmas comes\nearly for most of macro": "Christmas comes early for most of macro", # 023717
+    "but majority still made good\nmoney because": "but majority still made good money because", # 023717
+    "COVER UP SEX ABUSE CRIMES\nBY THE WHITE HOUSE": "COVER UP SEX ABUSE CRIMES BY THE WHITE HOUSE",
+    'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
+    "War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
     re.compile(r"deadline re Mr Bradley Edwards vs Mr\s*Jeffrey Epstein", re.I): "deadline re Mr Bradley Edwards vs Mr Jeffrey Epstein",
     re.compile(r"Following Plea That Implicated Trump -\s*https://www.npr.org/676040070", re.I): "Following Plea That Implicated Trump - https://www.npr.org/676040070",
     re.compile(r"for Attorney General -\s+Wikisource, the"): r"for Attorney General - Wikisource, the",
@@ -89,9 +93,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
     'AVG°': 'AVGO',
 }
 
-
-
-EMAIL_SIGNATURES = {
+EMAIL_SIGNATURE_REGEXES = {
     ARIANE_DE_ROTHSCHILD: re.compile(r"Ensemble.*\nCe.*\ndestinataires.*\nremercions.*\nautorisee.*\nd.*\nLe.*\ncontenues.*\nEdmond.*\nRoth.*\nlo.*\nRoth.*\ninfo.*\nFranc.*\n.2.*", re.I),
     BARBRO_C_EHNBOM: re.compile(r"Barbro C.? Ehn.*\nChairman, Swedish-American.*\n((Office|Cell|Sweden):.*\n)*(360.*\nNew York.*)?"),
     DANNY_FROST: re.compile(r"Danny Frost\nDirector.*\nManhattan District.*\n212.*", re.IGNORECASE),
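The OCR_REPAIRS mapping above mixes plain-string keys with compiled re.Pattern keys. A minimal sketch (not from the package; apply_ocr_repairs is a hypothetical name) of how such a mapping might be applied to a document's text:

```python
# Hedged sketch: applying a str-or-Pattern -> str repairs mapping like OCR_REPAIRS.
# The package's real application helper isn't shown in this diff.
import re

def apply_ocr_repairs(text: str, repairs: dict[str | re.Pattern, str]) -> str:
    for target, replacement in repairs.items():
        if isinstance(target, re.Pattern):
            text = target.sub(replacement, text)       # regex keys absorb OCR noise like stray newlines
        else:
            text = text.replace(target, replacement)   # plain keys are literal substring fixes
    return text

print(apply_ocr_repairs("mail me at foo@grnail.com", {re.compile(r'grnail\.com'): 'gmail.com'}))
# -> mail me at foo@gmail.com
```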
@@ -103,7 +105,7 @@ EMAIL_SIGNATURES = {
     KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
     LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
     LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
-    MARTIN_WEINBERG: re.compile(
+    MARTIN_WEINBERG: re.compile(r"(Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*)?(\n.*([cC]ell|Office))*\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
     STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
     PETER_MANDELSON: re.compile(r'Disclaimer This email and any attachments to it may be.*?with[ \n]+number(.*?EC4V[ \n]+6BJ)?', re.DOTALL | re.IGNORECASE),
     PAUL_BARRETT: re.compile(r"Paul Barrett[\n\s]+Alpha Group Capital LLC[\n\s]+(142 W 57th Street, 11th Floor, New York, NY 10019?[\n\s]+)?(al?[\n\s]*)?ALPHA GROUP[\n\s]+CAPITAL"),
@@ -147,7 +149,6 @@ TRUNCATE_TERMS = [
     'quote from The Colbert Report distinguishes',
     'co-inventor of the GTX Smart Shoe',
     'my latest Washington Post column',
-    'Whether you donated to Poetry in America through',
     'supported my humanities work at Harvard',
     'Calendar of Major Events, Openings, and Fundraisers',
     'Nuclear Operator Raises Alarm on Crisis',
@@ -181,7 +182,6 @@ TRUNCATE_TERMS = [
     'We can also discuss single stock and Topix banks',
     'We are recording unprecedented divergences in falling equity vol',
     'As previously discussed between you and Ariane',
-    'The US trade war against China: The view from Beijing',
     'no evidence you got the latest so i have sent you just the key message',
     # Joscha Bach
     'Cells seem to be mostly indistinguishable (except',
@@ -204,6 +204,8 @@ TRUNCATE_TERMS = [
     'General Election: Trump vs. Clinton LA Times/USC Tracking',
     'Location: Quicken Loans Arena in Cleveland, OH',
     'A friendly discussion about Syria with a former US State Department',
+    # Robert Kuhn
+    'The US trade war against China: The view from Beijing',
     # Tom / Paul Krassner
     'I forgot to post my cartoon from week before last, about Howard Schultz',
     # Bannon
@@ -221,23 +223,26 @@ TRUNCATE_TERMS = [
     'lecture in Heidelberg Oct 14 but they had to cancel',
     # Nikolic
     'people from LifeBall',
-    # Random
-    'Little Hodiaki',
-    "It began with deep worries regarding China's growth path",
-    'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
     # Epstein
     'David Ben Gurion was asked why he, after 2000',
     # Lisa New
     'The raw materials for that period include interviews',
+    'Whether you donated to Poetry in America through',
+    # Random
+    'Little Hodiaki',
+    "It began with deep worries regarding China's growth path",
+    'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
 ]
 
-
+# Some Paul Krassner emails have a ton of CCed parties we don't care about
+KRASSNER_RECIPIENTS = uniquify(flatten(ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']))
 
 # No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
 USELESS_EMAILERS = IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS + \
     KRASSNER_RECIPIENTS + \
     FLIGHT_IN_2012_PEOPLE + [
         'Alan Rogers', # Random CC
+        'Andrew Friendly', # Presumably some relation of Kelly Friendly
         'BS Stern', # A random fwd of email we have
         'Cheryl Kleen', # Single email from Anne Boyles, displayed under Anne Boyles
         'Connie Zaguirre', # Random CC
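The new KRASSNER_RECIPIENTS constant builds a de-duplicated flat list from several per-file recipient lists via uniquify(flatten(...)). The package's real helpers live in epstein_files.util.data and aren't shown in this diff; a hedged sketch of what such helpers typically look like:

```python
# Hypothetical implementations of flatten()/uniquify(); the actual ones in
# epstein_files.util.data may differ.
from typing import Iterable, TypeVar

T = TypeVar('T')

def flatten(list_of_lists: Iterable[Iterable[T]]) -> list[T]:
    """Collapse an iterable of iterables into one flat list."""
    return [item for sublist in list_of_lists for item in sublist]

def uniquify(items: Iterable[T]) -> list[T]:
    """Drop duplicates while preserving first-seen order."""
    return list(dict.fromkeys(items))

recipients_per_email = [['Paul Krassner', 'Nancy Cain'], ['Nancy Cain', 'Harry Shearer']]
print(uniquify(flatten(recipients_per_email)))
# -> ['Paul Krassner', 'Nancy Cain', 'Harry Shearer']
```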
@@ -268,24 +273,41 @@ USELESS_EMAILERS = IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS + \
 ]
 
 # Emails sent by epstein to himself that are just notes
-
+SELF_EMAILS_FILE_IDS = [
     '026677',
-    '029752',
+    '029752', # TODO: jokeland...
     '030238',
     # '033274', # TODO: Epstein's note to self doesn't get printed if we don't set the recipients to [None]
 ]
 
+METADATA_FIELDS = [
+    'is_junk_mail',
+    'recipients',
+    'sent_from_device',
+]
+
 
 @dataclass
 class Email(Communication):
+    """
+    Attributes:
+        actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
+        config (EmailCfg | None) - manual config for this email (if it exists)
+        header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
+        is_junk_mail (bool) - True if this is junk mail
+        recipients (list[str | None]) - who this email was sent to
+        sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
+        signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
+    """
     actual_text: str = field(init=False)
+    config: EmailCfg | None = None
     header: EmailHeader = field(init=False)
     is_junk_mail: bool = False
     recipients: list[str | None] = field(default_factory=list)
     sent_from_device: str | None = None
-    signature_substitution_counts: dict[str, int] = field(default_factory=
+    signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
 
-    #
+    # For logging how many headers we prettified while printing, kind of janky
     rewritten_header_ids: ClassVar[set[str]] = set([])
 
     def __post_init__(self):
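The "# defaultdict breaks asdict :(" comment refers to dataclasses.asdict() rebuilding dict-valued fields via their own type, which fails for defaultdict on many Python versions because defaultdict's first positional argument must be a callable. A minimal standalone illustration (not package code):

```python
# Why the diff switches to a plain dict default_factory: asdict() reconstructs
# dict fields as type(value)(items), and defaultdict(<generator>) raises
# TypeError on many Python versions.
from collections import defaultdict
from dataclasses import asdict, dataclass, field

@dataclass
class WithDefaultDict:
    counts: defaultdict = field(default_factory=lambda: defaultdict(int))

@dataclass
class WithPlainDict:
    counts: dict = field(default_factory=dict)

print(asdict(WithPlainDict()))       # {'counts': {}} - works fine
try:
    print(asdict(WithDefaultDict()))
except TypeError as e:
    print(f"asdict failed: {e}")     # e.g. "first argument must be callable or None"
```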
@@ -298,35 +320,34 @@ class Email(Communication):
         for recipient in self.header.recipients():
             self.recipients.extend(self._get_names(recipient))
 
-
+        # Remove self CCs
+        recipients = [r for r in self.recipients if r != self.author or self.file_id in SELF_EMAILS_FILE_IDS]
         self.recipients = list(set(recipients))
-        self.text = self.
+        self.text = self._prettify_text()
         self.actual_text = self._actual_text()
         self.sent_from_device = self._sent_from_device()
-        logger.debug(f"Constructed {self.description()}")
-
-    def description(self) -> Text:
-        """One line summary mostly for logging."""
-        txt = self._description()
-
-        if len(self.recipients) > 0:
-            txt.append(', ').append(key_value_txt('recipients', self._recipients_txt()))
-
-        return txt.append(CLOSE_PROPERTIES_CHAR)
-
-    def idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
-        """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
-        for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
-            if i >= n:
-                return match.end() - 1
 
     def info_txt(self) -> Text:
         txt = Text("OCR text of email from ", style='grey46').append(self.author_txt).append(' to ')
         return txt.append(self._recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))
 
+    def metadata(self) -> Metadata:
+        metadata = super().metadata()
+        metadata.update({k: v for k, v in asdict(self).items() if v and k in METADATA_FIELDS})
+        return metadata
+
     def subject(self) -> str:
         return self.header.subject or ''
 
+    def summary(self) -> Text:
+        """One line summary mostly for logging."""
+        txt = self._summary()
+
+        if len(self.recipients) > 0:
+            txt.append(', ').append(key_value_txt('recipients', self._recipients_txt()))
+
+        return txt.append(CLOSE_PROPERTIES_CHAR)
+
     def _actual_text(self) -> str:
         """The text that comes before likely quoted replies and forwards etc."""
         if self.config and self.config.actual_text is not None:
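The new Email.metadata() override merges a whitelist of truthy dataclass fields into the parent's metadata. A hedged standalone sketch of the same filtering pattern (the Communication base class and Metadata type aren't shown in this diff, so a plain dict stands in):

```python
# Whitelist-and-filter pattern behind the new metadata(): take asdict() of the
# instance and keep only whitelisted, truthy fields. Names below are stand-ins.
from dataclasses import asdict, dataclass, field

METADATA_FIELDS = ['is_junk_mail', 'recipients', 'sent_from_device']

@dataclass
class EmailLike:
    is_junk_mail: bool = False
    recipients: list = field(default_factory=list)
    sent_from_device: str | None = None
    text: str = ''  # not whitelisted, never exported

    def metadata(self) -> dict:
        base = {'file_id': '012345'}  # stand-in for super().metadata()
        base.update({k: v for k, v in asdict(self).items() if v and k in METADATA_FIELDS})
        return base

print(EmailLike(recipients=['Jane Doe'], text='hi').metadata())
# -> {'file_id': '012345', 'recipients': ['Jane Doe']}   (falsy/unlisted fields dropped)
```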
@@ -339,8 +360,8 @@ class Email(Communication):
         # logger.info(f"Raw text:\n" + self.top_lines(20) + '\n\n')
         # logger.info(f"With header removed:\n" + text[0:500] + '\n\n')
 
-        if self.file_id in ['024624']:
-            return text
+        if self.file_id in ['024624']: # This email starts with "On September 14th"
+            return text.split('On Tue, May 14')[0].strip()
 
         if reply_text_match:
             actual_num_chars = len(reply_text_match.group(1))
@@ -355,7 +376,6 @@ class Email(Communication):
             if field_string not in text:
                 continue
 
-            logger.debug(f"'{self.url_slug}': Splitting based on '{field_string.strip()}'")
             pre_from_text = text.split(field_string)[0]
             actual_num_chars = len(pre_from_text)
             actual_text_pct = f"{(100 * float(actual_num_chars) / len(text)):.1f}%"
@@ -377,29 +397,6 @@ class Email(Communication):
 
         return style.replace('bold', '').strip()
 
-    def _cleaned_up_text(self) -> str:
-        """Add newline after headers in text if actual header wasn't empty, remove bad lines, etc."""
-        # Insert line breaks now unless header is broken, in which case we'll do it later after fixing header
-        text = self.text if self.header.was_initially_empty else _add_line_breaks(self.text)
-        text = REPLY_REGEX.sub(r'\n\1', text) # Newlines between quoted replies
-
-        for name, signature_regex in EMAIL_SIGNATURES.items():
-            signature_replacement = f'<...snipped {name.lower()} legal signature...>'
-            text, num_replaced = signature_regex.subn(signature_replacement, text)
-            self.signature_substitution_counts[name] += num_replaced
-
-        return collapse_newlines(text).strip()
-
-    def _debug_info(self) -> str:
-        info = [
-            f"id={self.file_id}",
-            f"url_slug={self.url_slug}",
-            f"file_path='{self.file_path}'",
-            f"is_local_extract_file={self.is_local_extract_file()}",
-        ]
-
-        return f" " + "\n ".join(info)
-
     def _extract_author(self) -> None:
         self._extract_header()
         super()._extract_author()
@@ -418,9 +415,8 @@ class Email(Communication):
         if self.header.is_empty():
             self.header.repair_empty_header(self.lines)
         else:
-
-
-            log_fxn(msg)
+            log_level = logging.INFO if self.config else logging.WARNING
+            self.log_top_lines(msg='No email header match found!', level=log_level)
             self.header = EmailHeader(field_names=[])
 
     def _extract_timestamp(self) -> datetime:
@@ -432,7 +428,7 @@ class Email(Communication):
         if timestamp:
             return timestamp
 
-        searchable_lines = self.lines[0:
+        searchable_lines = self.lines[0:MAX_NUM_HEADER_LINES]
         searchable_text = '\n'.join(searchable_lines)
         date_match = DATE_HEADER_REGEX.search(searchable_text)
 
@@ -442,7 +438,7 @@ class Email(Communication):
         if timestamp:
             return timestamp
 
-        logger.debug(f"Failed to find timestamp, falling back to parsing {
+        logger.debug(f"Failed to find timestamp, falling back to parsing {MAX_NUM_HEADER_LINES} lines...")
 
         for line in searchable_lines:
             if not TIMESTAMP_LINE_REGEX.search(line):
@@ -476,6 +472,12 @@ class Email(Communication):
         names_found = names_found or [emailer_str]
         return [_reverse_first_and_last_names(name) for name in names_found]
 
+    def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
+        """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
+        for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
+            if i >= n:
+                return match.end() - 1
+
     def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
         """Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
         idx2 = idx2 if idx2 is not None else (idx + 1)
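The helper now added as _idx_of_nth_quoted_reply() finds the character offset of the nth "... wrote:" line with re.finditer. A self-contained sketch of the same nth-match technique, using example text rather than the package's classes:

```python
# Iterate finditer() matches and return the end offset of the (n+1)th one,
# or None if there are fewer quoted replies than that.
import re

QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)

def idx_of_nth_quoted_reply(text: str, n: int = 2) -> int | None:
    for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text)):
        if i >= n:
            return match.end() - 1  # offset just before the trailing newline
    return None

email_text = "Sounds good.\nOn Mon, Bob wrote:\nOk.\nOn Sun, Alice wrote:\nHi.\nOn Sat, Bob wrote:\nHello.\n"
cutoff = idx_of_nth_quoted_reply(email_text)
print(email_text[:cutoff])  # keeps everything up through the 3rd 'wrote:' line
```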
@@ -490,6 +492,20 @@ class Email(Communication):
 
         self._set_computed_fields(lines=lines)
 
+    def _prettify_text(self) -> str:
+        """Add newlines before quoted replies and snip signatures."""
+        # Insert line breaks now unless header is broken, in which case we'll do it later after fixing header
+        text = self.text if self.header.was_initially_empty else _add_line_breaks(self.text)
+        text = REPLY_REGEX.sub(r'\n\1', text) # Newlines between quoted replies
+
+        for name, signature_regex in EMAIL_SIGNATURE_REGEXES.items():
+            signature_replacement = f'<...snipped {name.lower()} legal signature...>'
+            text, num_replaced = signature_regex.subn(signature_replacement, text)
+            self.signature_substitution_counts[name] = self.signature_substitution_counts.get(name, 0)
+            self.signature_substitution_counts[name] += num_replaced
+
+        return collapse_newlines(text).strip()
+
     def _recipients_txt(self) -> Text:
         """Text object with comma separated colored versions of all recipients."""
         recipients = [r or UNKNOWN for r in self.recipients] if len(self.recipients) > 0 else [UNKNOWN]
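_prettify_text() tallies how many signatures were snipped per person via Pattern.subn(), which returns both the new string and the number of replacements. A minimal example of that counting pattern with hypothetical signer data:

```python
# Pattern.subn() returns (new_text, number_of_replacements), which feeds a
# per-signer tally like signature_substitution_counts. "Jane Doe" is made up.
import re

signature_regexes = {
    'Jane Doe': re.compile(r"Jane Doe\nExample Corp\n555-0100", re.IGNORECASE),
}
signature_substitution_counts: dict[str, int] = {}

text = "See attached.\nJane Doe\nExample Corp\n555-0100\n\nThanks!\nJane Doe\nExample Corp\n555-0100\n"

for name, regex in signature_regexes.items():
    text, num_replaced = regex.subn(f'<...snipped {name.lower()} legal signature...>', text)
    signature_substitution_counts[name] = signature_substitution_counts.get(name, 0) + num_replaced

print(signature_substitution_counts)  # {'Jane Doe': 2}
print(text)
```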
@@ -500,6 +516,14 @@ class Email(Communication):
             for r in recipients
         ], join=', ')
 
+    def _remove_line(self, idx: int) -> None:
+        """Remove a line from self.lines."""
+        num_lines = idx * 2
+        self.log_top_lines(num_lines, msg=f'before removal of line {idx}')
+        del self.lines[idx]
+        self._set_computed_fields(lines=self.lines)
+        self.log_top_lines(num_lines, msg=f'after removal of line {idx}')
+
     def _repair(self) -> None:
         """Repair particularly janky files."""
         if BAD_FIRST_LINE_REGEX.match(self.lines[0]):
@@ -510,21 +534,37 @@ class Email(Communication):
 
         if self.file_id in ['031442']:
             self._merge_lines(0) # Merge 1st and 2nd rows
-        elif self.file_id in '021729
+        elif self.file_id in '021729 025790 029282 029501 029889 030626 031384 031428 033097 033512 033583 029498 033583'.split():
             self._merge_lines(2) # Merge 3rd and 4th rows
 
             if self.file_id in ['030626']: # Merge 6th and 7th (now 5th and 6th) rows
                 self._merge_lines(4)
-
+        elif self.file_id == '029889':
+            self._merge_lines(2, 5)
+        elif self.file_id in ['029498', '031428']:
+            self._merge_lines(2, 4)
+        elif self.file_id in ['029976', '023067']:
             self._merge_lines(3) # Merge 4th and 5th rows
-        elif self.file_id in '026609 029402 032405'.split():
+        elif self.file_id in '026609 029402 032405 022695'.split():
             self._merge_lines(4) # Merge 5th and 6th rows
+        elif self.file_id in ['019407', '031980', '030384', '033144', '030999', '033575', '029835', '030381']:
+            self._merge_lines(2, 4)
+        elif self.file_id in ['029154', '029163']:
+            self._merge_lines(2, 5)
+        elif self.file_id in ['033228', '032063']:
+            self._merge_lines(3, 5)
+        elif self.file_id == '028931':
+            self._merge_lines(3, 6)
         elif self.file_id in ['033568']:
             for _i in range(5):
                 self._merge_lines(5)
         elif self.file_id in ['025329']:
             for _i in range(9):
                 self._merge_lines(2)
+        elif self.file_id == '033486':
+            self._merge_lines(7, 9)
+        elif self.file_id == '030299':
+            self._merge_lines(7, 10)
         elif self.file_id == '029977':
             self._set_computed_fields(text=self.text.replace('Sent 9/28/2012 2:41:02 PM', 'Sent: 9/28/2012 2:41:02 PM'))
 
@@ -533,6 +573,11 @@ class Email(Communication):
 
             self._merge_lines(4)
             self._merge_lines(2, 4)
+        elif self.file_id == '025041':
+            self._remove_line(4)
+            self._remove_line(4)
+        elif self.file_id == '029692':
+            self._remove_line(3)
 
         if old_text != self.text:
             self.log(f"Modified text, old:\n\n" + '\n'.join(old_text.split('\n')[0:12]) + '\n', logging.INFO)
@@ -568,21 +613,21 @@ class Email(Communication):
         self._set_computed_fields(lines=new_lines)
 
     def _sent_from_device(self) -> str | None:
-        """Find any 'Sent from my iPhone' style
+        """Find any 'Sent from my iPhone' style signature line if it exist in the 'actual_text'."""
         sent_from_match = SENT_FROM_REGEX.search(self.actual_text)
 
         if sent_from_match:
             sent_from = sent_from_match.group(0)
             return 'S' + sent_from[1:] if sent_from.startswith('sent') else sent_from
 
-    def __rich_console__(self,
+    def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
         logger.debug(f"Printing '{self.filename}'...")
         yield self.file_info_panel()
-        text = self.text
         should_rewrite_header = self.header.was_initially_empty and self.header.num_header_rows > 0
-        quote_cutoff = self.
+        quote_cutoff = self._idx_of_nth_quoted_reply(text=self.text) # Trim if there's many quoted replies
         num_chars = MAX_CHARS_TO_PRINT
         trim_footer_txt = None
+        text = self.text
 
         if self.file_id in TRUNCATION_LENGTHS:
             num_chars = TRUNCATION_LENGTHS[self.file_id]
@@ -611,7 +656,7 @@ class Email(Communication):
 
             lines += text.split('\n')[num_lines_to_skip:]
             text = self.header.rewrite_header() + '\n' + '\n'.join(lines)
-            text = _add_line_breaks(text) # This was skipped when
+            text = _add_line_breaks(text) # This was skipped when _prettify_text() w/a broken header so we do it now
             self.rewritten_header_ids.add(self.file_id)
 
         panel_txt = highlighter(text)
@@ -628,6 +673,30 @@ class Email(Communication):
         if should_rewrite_header:
             self.log_top_lines(self.header.num_header_rows + 4, f'Original header:', logging.INFO)
 
+    @staticmethod
+    def build_table(emails: list['Email'], _author: str | None) -> Table:
+        """Turn a set of Email objects into a Table."""
+        author = _author or UNKNOWN
+
+        table = Table(
+            title=f"Emails to/from {author} starting {emails[0].timestamp.date()}",
+            border_style=get_style_for_name(author, allow_bold=False),
+            header_style="bold"
+        )
+
+        table.add_column('From', justify='left')
+        table.add_column('Timestamp', justify='center')
+        table.add_column('Subject', justify='left', style='honeydew2', min_width=60)
+
+        for email in emails:
+            table.add_row(
+                email.author_txt,
+                email.epstein_media_link(link_txt=email.timestamp_without_seconds()),
+                highlighter(email.subject())
+            )
+
+        return table
+
 
 def _add_line_breaks(email_text: str) -> str:
     return EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX.sub(r'\n\1\n', email_text).strip()
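The new Email.build_table() returns a rich Table. A minimal sketch of how such a table is typically constructed and rendered, built standalone rather than from the package's own Email objects:

```python
# Standalone rich Table demo mirroring the columns used by build_table();
# the row values here are invented placeholders.
from rich.console import Console
from rich.table import Table

table = Table(title="Emails to/from Jane Doe starting 2015-01-01", header_style="bold")
table.add_column('From', justify='left')
table.add_column('Timestamp', justify='center')
table.add_column('Subject', justify='left', min_width=60)
table.add_row('Jane Doe', '2015-01-01 09:30', 'Re: schedule')

Console().print(table)  # renders a bordered table with a bold header row
```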
epstein_files/documents/emails/email_header.py
CHANGED
@@ -4,8 +4,8 @@ from dataclasses import asdict, dataclass, field
 
 from epstein_files.util.constant.strings import AUTHOR, REDACTED
 from epstein_files.util.constants import ALL_CONFIGS
-from epstein_files.util.
-from epstein_files.util.
+from epstein_files.util.doc_cfg import EmailCfg
+from epstein_files.util.logging import logger
 from epstein_files.util.rich import UNKNOWN
 
 FIELD_NAMES = ['From', 'Date', 'Sent', 'Subject']
@@ -21,11 +21,11 @@ EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTA
 TIME_REGEX = re.compile(r'^(\d{1,2}/\d{1,2}/\d{2,4}|Thursday|Monday|Tuesday|Wednesday|Friday|Saturday|Sunday).*')
 
 BAD_NAME_CHARS_REGEX = re.compile(r"[\"'\[\]*><•]")
-BAD_EMAILER_REGEX = re.compile(r'^(>|11111111)|agreed|ok|sexy|
+BAD_EMAILER_REGEX = re.compile(r'^(>|11111111)|agreed|ok|sexy|re:|fwd:|Multiple Senders|((sent|attachments|subject|importance).*|.*(january|201\d|hysterical|i have|image0|so that people|article 1.?|momminnemummin|These conspiracy theories|your state|undisclosed|www\.theguardian|talk in|it was a|what do|cc:|call (back|me)).*)$', re.IGNORECASE)
 
 CONFIGURED_ACTUAL_TEXTS = [
     cfg.actual_text for cfg in ALL_CONFIGS
-    if isinstance(cfg,
+    if isinstance(cfg, EmailCfg) and cfg.actual_text is not None
 ]
 
 
@@ -70,7 +70,7 @@ class EmailHeader:
             raise RuntimeError(f"Ran out of header rows to check for '{field_name}'")
 
         value = email_lines[row_number_to_check]
-        log_prefix = f"Looks like '{value}' is a mismatch for '{field_name}'
+        log_prefix = f"Looks like '{value}' is a mismatch for '{field_name}'"
 
         if field_name == AUTHOR:
             if value in CONFIGURED_ACTUAL_TEXTS:
@@ -99,7 +99,8 @@ class EmailHeader:
             setattr(self, field_name, value)
 
         self.num_header_rows = len(self.field_names) + num_headers
-
+        log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
+        logger.debug(f"{log_msg}{self}\n\nTop lines:\n\n%s", '\n'.join(email_lines[0:(num_headers + 1) * 2]))
 
     def rewrite_header(self) -> str:
         header_fields = {}
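The logger.debug call added to EmailHeader (apparently in repair_empty_header, given the "Corrected empty header" message) passes the joined top lines as a separate %s argument. With logging's %-style parameters the interpolation into the final message is deferred until a handler actually emits the record, though the argument expression itself still runs at call time. A small illustration with made-up data:

```python
# Deferred %-style interpolation in the logging module; names and data here
# are placeholders, not the package's own.
import logging

logging.basicConfig(level=logging.DEBUG, format="%(levelname)s %(message)s")
logger = logging.getLogger("email_header_demo")

email_lines = ["From: Jane Doe", "Sent: 1/1/15", "Subject: hello", "body..."]
num_headers = 3
logger.debug("Corrected empty header using %d lines to:\n%s",
             num_headers, '\n'.join(email_lines[0:num_headers + 1]))
```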
epstein_files/documents/imessage/text_message.py
CHANGED
@@ -7,7 +7,8 @@ from rich.text import Text
 from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, STEVE_BANNON, UNKNOWN
 from epstein_files.util.data import extract_last_name
 from epstein_files.util.highlighted_group import get_style_for_name
-from epstein_files.util.
+from epstein_files.util.logging import logger
+from epstein_files.util.rich import TEXT_LINK, highlighter
 
 MSG_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
 PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
@@ -55,7 +56,7 @@ class TextMessage:
         else:
             self.author_str = self.author
 
-        if not self.id_confirmed and self.author is not None:
+        if not self.id_confirmed and self.author is not None and self.author != JEFFREY_EPSTEIN:
             self.author_str = self.author + ' (?)'
 
     def timestamp(self) -> datetime:
epstein_files/documents/json_file.py
CHANGED
@@ -1,14 +1,19 @@
+import json
+import logging
 from dataclasses import dataclass
 from pathlib import Path
+from typing import ClassVar
 
 from rich.text import Text
 
 from epstein_files.documents.other_file import OtherFile
+from epstein_files.util.constant.strings import JSON
 
 
 @dataclass
 class JsonFile(OtherFile):
     """File containing JSON data."""
+    strip_whitespace: ClassVar[bool] = False
 
     def __post_init__(self):
         super().__post_init__()
@@ -16,8 +21,20 @@ class JsonFile(OtherFile):
         if self.url_slug.endswith('.txt') or self.url_slug.endswith('.json'):
             self.url_slug = Path(self.url_slug).stem
 
+        self._set_computed_fields(text=self.formatted_json())
+
+    def category(self) -> str:
+        return JSON
+
+    def formatted_json(self) -> str:
+        return json.dumps(self.json_data(), indent=4)
+
     def info_txt(self) -> Text | None:
         return Text(f"JSON file, possibly iMessage or similar app metadata", style='white dim italic')
 
     def is_interesting(self):
         return False
+
+    def json_data(self) -> object:
+        with open(self.file_path, encoding='utf-8-sig') as f:
+            return json.load(f)
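The new JsonFile.json_data() opens files with encoding='utf-8-sig', which transparently strips a UTF-8 byte-order mark that would otherwise make json.load() fail. A minimal standalone demonstration with a temporary file (not package code):

```python
# A UTF-8 BOM at the start of a file is decoded away by 'utf-8-sig' but left
# in place by plain 'utf-8', where it breaks JSON parsing.
import json
import tempfile

with tempfile.NamedTemporaryFile('wb', suffix='.json', delete=False) as f:
    f.write(b'\xef\xbb\xbf{"app": "iMessage", "version": 1}')  # BOM + JSON
    path = f.name

with open(path, encoding='utf-8-sig') as f:
    print(json.load(f))  # {'app': 'iMessage', 'version': 1}

try:
    with open(path, encoding='utf-8') as f:
        json.load(f)
except json.JSONDecodeError as e:
    print(f"plain utf-8 fails: {e}")
```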