PyPI - epstein-files - Versions diffs - 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl - Mend

epstein-files 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

epstein_files/__init__.py +31 -18
epstein_files/documents/communication.py +9 -5
epstein_files/documents/document.py +225 -136
epstein_files/documents/doj_file.py +242 -0
epstein_files/documents/doj_files/full_text.py +166 -0
epstein_files/documents/email.py +138 -163
epstein_files/documents/emails/email_header.py +21 -11
epstein_files/documents/emails/emailers.py +223 -0
epstein_files/documents/imessage/text_message.py +2 -3
epstein_files/documents/json_file.py +18 -14
epstein_files/documents/messenger_log.py +23 -39
epstein_files/documents/other_file.py +48 -44
epstein_files/epstein_files.py +54 -33
epstein_files/person.py +142 -110
epstein_files/util/constant/names.py +29 -6
epstein_files/util/constant/output_files.py +2 -0
epstein_files/util/constant/strings.py +12 -6
epstein_files/util/constant/urls.py +17 -0
epstein_files/util/constants.py +101 -174
epstein_files/util/data.py +2 -0
epstein_files/util/doc_cfg.py +20 -15
epstein_files/util/env.py +24 -16
epstein_files/util/file_helper.py +28 -6
epstein_files/util/helpers/debugging_helper.py +13 -0
epstein_files/util/helpers/env_helpers.py +21 -0
epstein_files/util/highlighted_group.py +57 -16
epstein_files/util/layout/left_bar_panel.py +26 -0
epstein_files/util/logging.py +28 -13
epstein_files/util/output.py +33 -10
epstein_files/util/rich.py +28 -2
epstein_files/util/word_count.py +7 -7
{epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/METADATA +14 -1
epstein_files-1.5.0.dist-info/RECORD +40 -0
epstein_files-1.4.1.dist-info/RECORD +0 -34
{epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
{epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
{epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +0 -0

epstein_files/documents/email.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import json
 import logging
 import re
-from collections import defaultdict
 from copy import deepcopy
 from dataclasses import asdict, dataclass, field
 from datetime import datetime
@@ -16,13 +15,14 @@ from rich.text import Text
 from epstein_files.documents.communication import Communication
 from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, INFO_INDENT
-from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAIL_SIMPLE_HEADER_REGEX,
-     EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, FIELDS_COLON_PATTERN, TIME_REGEX, EmailHeader)
+from epstein_files.documents.emails.email_header import (EMAIL_SIMPLE_HEADER_REGEX,
+     EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, FIELDS_COLON_PATTERN, EmailHeader)
+from epstein_files.documents.emails.emailers import extract_emailer_names
 from epstein_files.documents.other_file import OtherFile
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import REDACTED
 from epstein_files.util.constants import *
-from epstein_files.util.data import TIMEZONE_INFO, collapse_newlines, escape_single_quotes, remove_timezone
+from epstein_files.util.data import AMERICAN_TIME_REGEX, TIMEZONE_INFO, collapse_newlines, remove_timezone
 from epstein_files.util.doc_cfg import EmailCfg, Metadata
 from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
 from epstein_files.util.highlighted_group import JUNK_EMAILERS, get_style_for_name
@@ -32,7 +32,6 @@ from epstein_files.util.rich import *
 BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
 BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Hide caption|Importance:?\s*High|[iI,•]|[1i] (_ )?[il]|, [-,]|L\._|_filtered|.*(yiv0232|font-family:|margin-bottom:).*)$')
 BAD_SUBJECT_CONTINUATIONS = ['orwarded', 'Hi ', 'Sent ', 'AmLaw', 'Original Message', 'Privileged', 'Sorry', '---']
-DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
 FIELDS_COLON_REGEX = re.compile(FIELDS_COLON_PATTERN)
 LINK_LINE_REGEX = re.compile(f"^[>• ]*htt")
 LINK_LINE2_REGEX = re.compile(r"^[-\w.%&=/]{5,}$")
@@ -44,7 +43,6 @@ DATE_HEADER_REGEX = re.compile(r'(?:Date|Sent):? +(?!by|from|to|via)([^\n]{6,})\
 TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")
 LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
-SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
 REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
 URL_SIGNIFIERS = ['?amp', 'amp?', 'cd=', 'click', 'CMP=', 'contentId', 'ft=', 'gclid', 'htm', 'mp=', 'keywords=', 'Id=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'sp=', 'usg=', 'utm']
 APPEARS_IN = 'appears in'
@@ -107,6 +105,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
     'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
     "War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
     "Subject; RE": "Subject: RE",
+    "straining relations between UK and\nAmerica": "straining relations between UK and America",
     re.compile(r"deadline re Mr Bradley Edwards vs Mr\s*Jeffrey Epstein", re.I): "deadline re Mr Bradley Edwards vs Mr Jeffrey Epstein",
     re.compile(r"Following Plea That Implicated Trump -\s*https://www.npr.org/676040070", re.I): "Following Plea That Implicated Trump - https://www.npr.org/676040070",
     re.compile(r"for Attorney General -\s+Wikisource, the"): r"for Attorney General - Wikisource, the",
@@ -344,6 +343,10 @@ LINE_REPAIR_MERGES = {
     '033575': [[2, 4]],
     '033576': [[3]],
     '033583': [[2]],
+    # Note DOJ file line adjustments happen *after* DojFile._repair() is called
+    'EFTA00039689': [[4]],
+    'EFTA00040118': [[2], [2], [2], [2], [2], [2], [6], [6]],
 }
@@ -351,12 +354,13 @@ LINE_REPAIR_MERGES = {
 class Email(Communication):
     """
     Attributes:
-        actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
-        config (EmailCfg | None) - manual config for this email (if it exists)
-        header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
-        recipients (list[Name]) - who this email was sent to
-        sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
-        signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
+        actual_text (str) - Best effort at the text actually sent in this email, excluding quoted replies and forwards.
+        config (EmailCfg, optional) - Manual config for this email (if it exists).
+        header (EmailHeader) - Header data extracted from the text (from/to/sent/subject etc).
+        recipients (list[Name]) - People to whom this email was sent.
+        sent_from_device (str, optional) - "Sent from my iPhone" style signature (if it exists).
+        signature_substitution_counts (dict[str, int]) - Number of times a signature was replaced with
+            <...snipped...> for each participant
     """
     attached_docs: list[OtherFile] = field(default_factory=list)
     actual_text: str = field(init=False)
@@ -371,53 +375,33 @@ class Email(Communication):
     # For logging how many headers we prettified while printing, kind of janky
     rewritten_header_ids: ClassVar[set[str]] = set([])
-    def __post_init__(self):
-        self.filename = self.file_path.name
-        self.file_id = extract_file_id(self.filename)
-        # Special handling for copying properties out of the config for the document this one was extracted from
-        if self.is_local_extract_file():
-            self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
-            extracted_from_doc_id = self.url_slug.split('_')[-1]
-            if extracted_from_doc_id in ALL_FILE_CONFIGS:
-                self._set_config_for_extracted_file(ALL_FILE_CONFIGS[extracted_from_doc_id])
-        super().__post_init__()
-        if self.config and self.config.recipients:
-            self.recipients = self.config.recipients
-        else:
-            for recipient in self.header.recipients():
-                self.recipients.extend(self._extract_emailer_names(recipient))
-            # Assume mailing list emails are to Epstein
-            if self.author in BCC_LISTS and (self.is_note_to_self() or not self.recipients):
-                self.recipients = [JEFFREY_EPSTEIN]
-        # Remove self CCs but preserve self emails
-        if not self.is_note_to_self():
-            self.recipients = [r for r in self.recipients if r != self.author]
-        self.recipients = sorted(list(set(self.recipients)), key=lambda r: r or UNKNOWN)
-        self.text = self._prettify_text()
-        self.actual_text = self._actual_text()
-        self.sent_from_device = self._sent_from_device()
+    @property
     def attachments(self) -> list[str]:
         """Returns the string in the header."""
         return (self.header.attachments or '').split(';')
+    @property
+    def border_style(self) -> str:
+        """Color emails from epstein to others with the color for the first recipient."""
+        if self.author == JEFFREY_EPSTEIN and len(self.recipients) > 0:
+            style = get_style_for_name(self.recipients[0])
+        else:
+            style = self.author_style
+        return style.replace('bold', '').strip()
+    @property
     def info_txt(self) -> Text:
-        email_type = 'fwded article' if self.is_fwded_article() else 'email'
-        txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt())
+        email_type = 'fwded article' if self.is_fwded_article else 'email'
+        txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt)
         if self.config and self.config.is_attribution_uncertain:
-            txt.append(f" {QUESTION_MARKS}", style=self.author_style())
+            txt.append(f" {QUESTION_MARKS}", style=self.author_style)
         txt.append(' to ').append(self.recipients_txt())
         return txt.append(highlighter(f" probably sent at {self.timestamp}"))
+    @property
     def is_fwded_article(self) -> bool:
         if self.config is None:
             return False
@@ -426,33 +410,78 @@ class Email(Communication):
         else:
             return bool(self.config.is_fwded_article)
+    @property
     def is_junk_mail(self) -> bool:
         return self.author in JUNK_EMAILERS
+    @property
     def is_mailing_list(self) -> bool:
-        return self.author in MAILING_LISTS or self.is_junk_mail()
+        return self.author in MAILING_LISTS or self.is_junk_mail
+    @property
     def is_note_to_self(self) -> bool:
         return self.recipients == [self.author]
-    def is_from_or_to(self, name: str) -> bool:
-        return name in [self.author] + self.recipients
+    @property
     def is_word_count_worthy(self) -> bool:
-        if self.is_fwded_article():
+        if self.is_fwded_article:
             return bool(self.config.fwded_text_after) or len(self.actual_text) < 150
         else:
-            return not self.is_mailing_list()
+            return not self.is_mailing_list
+    @property
     def metadata(self) -> Metadata:
         local_metadata = asdict(self)
-        local_metadata['is_junk_mail'] = self.is_junk_mail()
-        local_metadata['is_mailing_list'] = self.is_junk_mail()
-        local_metadata['subject'] = self.subject() or None
-        metadata = super().metadata()
+        local_metadata['is_junk_mail'] = self.is_junk_mail
+        local_metadata['is_mailing_list'] = self.is_junk_mail
+        local_metadata['subject'] = self.subject or None
+        metadata = super().metadata
         metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
         return metadata
+    @property
+    def subject(self) -> str:
+        if self.config and self.config.subject:
+            return self.config.subject
+        else:
+            return self.header.subject or ''
+    def __post_init__(self):
+        self.filename = self.file_path.name
+        self.file_id = extract_file_id(self.filename)
+        # Special handling for copying properties out of the config for the document this one was extracted from
+        if self.is_local_extract_file:
+            self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
+            extracted_from_doc_id = self.url_slug.split('_')[-1]
+            if extracted_from_doc_id in ALL_FILE_CONFIGS:
+                self._set_config_for_extracted_file(ALL_FILE_CONFIGS[extracted_from_doc_id])
+        super().__post_init__()
+        if self.config and self.config.recipients:
+            self.recipients = self.config.recipients
+        else:
+            for recipient in self.header.recipients():
+                self.recipients.extend(extract_emailer_names(recipient))
+            # Assume mailing list emails are to Epstein
+            if self.author in BCC_LISTS and (self.is_note_to_self or not self.recipients):
+                self.recipients = [JEFFREY_EPSTEIN]
+        # Remove self CCs but preserve self emails
+        if not self.is_note_to_self:
+            self.recipients = [r for r in self.recipients if r != self.author]
+        self.recipients = sorted(list(set(self.recipients)), key=lambda r: r or UNKNOWN)
+        self.text = self._prettify_text()
+        self.actual_text = self._extract_actual_text()
+        self.sent_from_device = self._sent_from_device()
+    def is_from_or_to(self, name: str) -> bool:
+        return name in [self.author] + self.recipients
     def recipients_txt(self, max_full_names: int = 2) -> Text:
         """Text object with comma separated colored versions of all recipients."""
         recipients = [r or UNKNOWN for r in self.recipients] if len(self.recipients) > 0 else [UNKNOWN]
@@ -463,12 +492,6 @@ class Email(Communication):
             for r in recipients
         ], join=', ')
-    def subject(self) -> str:
-        if self.config and self.config.subject:
-            return self.config.subject
-        else:
-            return self.header.subject or ''
     def summary(self) -> Text:
         """One line summary mostly for logging."""
         txt = self._summary()
@@ -478,7 +501,7 @@ class Email(Communication):
         return txt.append(CLOSE_PROPERTIES_CHAR)
-    def _actual_text(self) -> str:
+    def _extract_actual_text(self) -> str:
         """The text that comes before likely quoted replies and forwards etc."""
         if self.config and self.config.actual_text is not None:
             return self.config.actual_text
@@ -490,7 +513,6 @@ class Email(Communication):
         elif self.header.num_header_rows == 0:
             return self.text
-        # import pdb;pdb.set_trace()
         self.log_top_lines(20, "Raw text:", logging.DEBUG)
         self.log(f"With {self.header.num_header_rows} header lines removed:\n{text[0:500]}\n\n", logging.DEBUG)
         reply_text_match = REPLY_TEXT_REGEX.search(text)
@@ -517,51 +539,24 @@ class Email(Communication):
         return text.strip()
-    def _border_style(self) -> str:
-        """Color emails from epstein to others with the color for the first recipient."""
-        if self.author == JEFFREY_EPSTEIN and len(self.recipients) > 0:
-            style = get_style_for_name(self.recipients[0])
-        else:
-            style = self.author_style()
-        return style.replace('bold', '').strip()
     def _extract_author(self) -> None:
+        """Overloads superclass method, called at instantiation time."""
         self._extract_header()
         super()._extract_author()
         if not self.author and self.header.author:
-            authors = self._extract_emailer_names(self.header.author)
+            authors = extract_emailer_names(self.header.author)
             self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
-    def _extract_emailer_names(self, emailer_str: str) -> list[str]:
-        """Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
-        emailer_str = EmailHeader.cleanup_str(emailer_str)
-        if len(emailer_str) == 0:
-            return []
-        names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
-        if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
-            if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
-                logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
-            else:
-                logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
-            return names_found
-        names_found = names_found or [emailer_str]
-        return [_reverse_first_and_last_names(name) for name in names_found]
     def _extract_header(self) -> None:
-        """Extract an EmailHeader object from the OCR text."""
+        """Extract an `EmailHeader` from the OCR text."""
         header_match = EMAIL_SIMPLE_HEADER_REGEX.search(self.text)
         if header_match:
             self.header = EmailHeader.from_header_lines(header_match.group(0))
-            if self.header.is_empty():
+            # DOJ file OCR text is broken in a less consistent way than the HOUSE_OVERSIGHT files
+            if self.header.is_empty() and not self.is_doj_file:
                 self.header.repair_empty_header(self.lines)
         else:
             log_level = logging.INFO if self.config else logging.WARNING
@@ -571,22 +566,15 @@ class Email(Communication):
         logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
     def _extract_timestamp(self) -> datetime:
-        if self.config and self.config.timestamp():
-            return self.config.timestamp()
-        elif self.header.sent_at:
-            timestamp = _parse_timestamp(self.header.sent_at)
-            if timestamp:
-                return timestamp
+        """Find the time this email was sent."""
+        if self.header.sent_at and (timestamp := _parse_timestamp(self.header.sent_at)):
+            return timestamp
         searchable_lines = self.lines[0:MAX_NUM_HEADER_LINES]
         searchable_text = '\n'.join(searchable_lines)
-        date_match = DATE_HEADER_REGEX.search(searchable_text)
-        if date_match:
-            timestamp = _parse_timestamp(date_match.group(1))
-            if timestamp:
+        if (date_match := DATE_HEADER_REGEX.search(searchable_text)):
+            if (timestamp := _parse_timestamp(date_match.group(1))):
                 return timestamp
         logger.debug(f"Failed to find timestamp, falling back to parsing {MAX_NUM_HEADER_LINES} lines...")
@@ -595,18 +583,16 @@ class Email(Communication):
             if not TIMESTAMP_LINE_REGEX.search(line):
                 continue
-            timestamp = _parse_timestamp(line)
-            if timestamp:
+            if (timestamp := _parse_timestamp(line)):
                 logger.debug(f"Fell back to timestamp {timestamp} in line '{line}'...")
                 return timestamp
         no_timestamp_msg = f"No timestamp found in '{self.file_path.name}'"
-        if self.is_duplicate():
-            logger.warning(f"{no_timestamp_msg} but timestamp should be copied from {self.duplicate_of_id()}")
+        if self.is_duplicate:
+            logger.warning(f"{no_timestamp_msg} but timestamp should be copied from {self.duplicate_of_id}")
         else:
-            raise RuntimeError(f"{no_timestamp_msg}, top lines:\n{searchable_text}")
+            raise RuntimeError(f"{no_timestamp_msg}, top lines:\n" + '\n'.join(self.lines[0:MAX_NUM_HEADER_LINES + 10]))
     def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES) -> int | None:
         """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
@@ -653,7 +639,7 @@ class Email(Communication):
         # Share / Tweet lines
         if self.author == KATHRYN_RUEMMLER:
-            text = '\n'.join([l for l in text.split('\n') if l not in ['Share', 'Tweet', 'Bookmark it']])
+            text = '\n'.join([line for line in text.split('\n') if line not in ['Share', 'Tweet', 'Bookmark it']])
         return collapse_newlines(text).strip()
@@ -666,7 +652,7 @@ class Email(Communication):
         self.log_top_lines(num_lines, msg=f'after removal of line {idx}')
     def _repair(self) -> None:
-        """Repair particularly janky files."""
+        """Repair particularly janky files. Note that OCR_REPAIRS are applied *after* other line adjustments."""
         if BAD_FIRST_LINE_REGEX.match(self.lines[0]):
             self._set_computed_fields(lines=self.lines[1:])
@@ -694,13 +680,17 @@ class Email(Communication):
             self.log(f"Modified text, old:\n\n" + '\n'.join(old_text.split('\n')[0:12]) + '\n')
             self.log_top_lines(12, 'Result of modifications')
-        lines = self.repair_ocr_text(OCR_REPAIRS, self.text).split('\n')
+        repaired_text = self._repair_links_and_quoted_subjects(self.repair_ocr_text(OCR_REPAIRS, self.text))
+        self._set_computed_fields(text=repaired_text)
+    def _repair_links_and_quoted_subjects(self, text: str) -> str:
+        """Repair links that the OCR has broken into multiple lines as well as 'Subject:' lines."""
+        lines = text.split('\n')
         subject_line = next((line for line in lines if line.startswith('Subject:')), None) or ''
         subject = subject_line.split(':')[1].strip() if subject_line else ''
         new_lines = []
         i = 0
-        # Fix links and quoted subjects (remove spaces, merge multiline links to a single line)
         while i < len(lines):
             line = lines[i]
@@ -708,8 +698,8 @@ class Email(Communication):
                 while i < (len(lines) - 1) \
                         and not lines[i + 1].startswith('htt') \
                         and (lines[i + 1].endswith('/') \
-                             or any(s in lines[i + 1] for s in URL_SIGNIFIERS) \
-                             or LINK_LINE2_REGEX.match(lines[i + 1])):
+                            or any(s in lines[i + 1] for s in URL_SIGNIFIERS) \
+                            or LINK_LINE2_REGEX.match(lines[i + 1])):
                     logger.debug(f"{self.filename}: Joining link lines\n   1. {line}\n   2. {lines[i + 1]}\n")
                     line += lines[i + 1]
                     i += 1
@@ -726,25 +716,19 @@ class Email(Communication):
                     pass
                 elif (subject.endswith(next_line) and next_line != subject) \
                         or (FIELDS_COLON_REGEX.search(next_next) and not FIELDS_COLON_REGEX.search(next_line)):
-                    self.warn(f"Fixing broken subject line\n  line: '{line}'\n    next: '{next_line}'\n    next: '{next_next}'\nsubject='{subject}'\n")
+                    self.log(f"Fixing broken subject line\n  line: '{line}'\n    next: '{next_line}'\n    next: '{next_next}'\nsubject='{subject}'\n")
                     line += f" {next_line}"
                     i += 1
             new_lines.append(line)
-            # TODO: hacky workaround to get a working link for HOUSE_OVERSIGHT_032564
-            if self.file_id == '032564' and line == 'http://m.huffpost.com/us/entry/us_599f532ae4b0dOef9f1c129d':
-                new_lines.append('(ed. note: an archived version of the above link is here: https://archive.is/hJxT3 )')
             i += 1
-        self._set_computed_fields(lines=new_lines)
+        logger.debug(f"----after line repair---\n" + '\n'.join(new_lines[0:20]) + "\n---")
+        return '\n'.join(lines)
     def _sent_from_device(self) -> str | None:
         """Find any 'Sent from my iPhone' style signature line if it exist in the 'actual_text'."""
-        sent_from_match = SENT_FROM_REGEX.search(self.actual_text)
-        if sent_from_match:
+        if (sent_from_match := SENT_FROM_REGEX.search(self.actual_text)):
             sent_from = sent_from_match.group(0)
             return 'S' + sent_from[1:] if sent_from.startswith('sent') else sent_from
@@ -756,9 +740,7 @@ class Email(Communication):
         else:
             self.config = EmailCfg(id=self.file_id)
-        extracted_from_description = extracted_from_doc_cfg.complete_description()
-        if extracted_from_description:
+        if (extracted_from_description := extracted_from_doc_cfg.complete_description):
             extracted_description = f"{APPEARS_IN} {extracted_from_description}"
             if isinstance(extracted_from_doc_cfg, EmailCfg):
@@ -783,11 +765,11 @@ class Email(Communication):
             num_chars = args.truncate
         elif self.config and self.config.truncate_to is not None:
             num_chars = len(self.text) if self.config.truncate_to == NO_TRUNCATE else self.config.truncate_to
-        elif self.is_interesting():
+        elif self.is_interesting:
             num_chars = len(self.text)
         elif self.author in TRUNCATE_EMAILS_FROM \
                 or any([self.is_from_or_to(n) for n in TRUNCATE_EMAILS_FROM_OR_TO]) \
-                or self.is_fwded_article() \
+                or self.is_fwded_article \
                 or includes_truncate_term:
             num_chars = min(quote_cutoff or MAX_CHARS_TO_PRINT, TRUNCATED_CHARS)
         else:
@@ -807,18 +789,18 @@ class Email(Communication):
                 else:
                     num_chars = quote_cutoff
             else:
-                num_chars = min(self.file_size(), MAX_CHARS_TO_PRINT)
+                num_chars = min(self.file_size, MAX_CHARS_TO_PRINT)
             # Always print whole email for 1st email for user
-            if self._is_first_for_user and num_chars < self.file_size() and not self.is_duplicate():
+            if self._is_first_for_user and num_chars < self.file_size and not self.is_duplicate:
                 logger.info(f"{self} Overriding cutoff {num_chars} for first email")
-                num_chars = self.file_size()
+                num_chars = self.file_size
         log_args = {
             'num_chars': num_chars,
             '_is_first_for_user': self._is_first_for_user,
             'author_truncate': self.author in TRUNCATE_EMAILS_FROM,
-            'is_fwded_article': self.is_fwded_article(),
+            'is_fwded_article': self.is_fwded_article,
             'is_quote_cutoff': quote_cutoff == num_chars,
             'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
             'quote_cutoff': quote_cutoff,
@@ -838,8 +820,8 @@ class Email(Communication):
         # Truncate long emails but leave a note explaining what happened w/link to source document
         if len(text) > num_chars:
             text = text[0:num_chars]
-            doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style())
-            trim_note = f"<...trimmed to {num_chars:,} characters of {self.length():,}, read the rest at {doc_link_markup}...>"
+            doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
+            trim_note = f"<...trimmed to {num_chars:,} characters of {self.length:,}, read the rest at {doc_link_markup}...>"
             trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
         # Rewrite broken headers where the values are on separate lines from the field names
@@ -855,7 +837,7 @@ class Email(Communication):
             lines += text.split('\n')[num_lines_to_skip:]
             text = self.header.rewrite_header() + '\n' + '\n'.join(lines)
-            text = _add_line_breaks(text)  # This was skipped when _prettify_text() w/a broken header so we do it now
+            text = _add_line_breaks(text)
             self.rewritten_header_ids.add(self.file_id)
         lines = [
@@ -867,7 +849,7 @@ class Email(Communication):
         email_txt_panel = Panel(
             highlighter(text).append('...\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
-            border_style=self._border_style(),
+            border_style=self.border_style,
             expand=False,
             subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
         )
@@ -914,11 +896,11 @@ class Email(Communication):
         for email in emails:
             fields = [
-                email.epstein_media_link(link_txt=email.timestamp_without_seconds(), style=link_style),
-                email.author_txt(),
+                email.epstein_media_link(link_txt=email.timestamp_without_seconds, style=link_style),
+                email.author_txt,
                 email.recipients_txt(max_full_names=1),
-                f"{email.length()}",
-                email.subject(),
+                f"{email.length}",
+                email.subject,
             ]
             if not show_length:
@@ -935,21 +917,14 @@ def _add_line_breaks(email_text: str) -> str:
 def _parse_timestamp(timestamp_str: str) -> None | datetime:
     try:
-        timestamp_str = timestamp_str.replace('(GMT-05:00)', 'EST')
-        timestamp_str = BAD_TIMEZONE_REGEX.sub(' ', timestamp_str).strip()
-        timestamp = parse(timestamp_str, tzinfos=TIMEZONE_INFO)
+        if (american_date_match := AMERICAN_TIME_REGEX.search(timestamp_str)):
+            timestamp_str = american_date_match.group(1)
+        else:
+            timestamp_str = timestamp_str.replace('(GMT-05:00)', 'EST')
+            timestamp_str = BAD_TIMEZONE_REGEX.sub(' ', timestamp_str).strip()
+        timestamp = parse(timestamp_str, fuzzy=True, tzinfos=TIMEZONE_INFO)
         logger.debug(f'Parsed timestamp "%s" from string "%s"', timestamp, timestamp_str)
         return remove_timezone(timestamp)
     except Exception as e:
         logger.debug(f'Failed to parse "{timestamp_str}" to timestamp!')
-def _reverse_first_and_last_names(name: str) -> str:
-    if '@' in name:
-        return name.lower()
-    if ', ' in name:
-        names = name.split(', ')
-        return f"{names[1]} {names[0]}"
-    else:
-        return name

epstein_files/documents/emails/email_header.py CHANGED Viewed

@@ -2,7 +2,8 @@ import json
 import re
 from dataclasses import asdict, dataclass, field
-from epstein_files.util.constant.strings import AUTHOR, REDACTED, indented
+from epstein_files.documents.emails.emailers import BAD_EMAILER_REGEX, TIME_REGEX
+from epstein_files.util.constant.strings import AUTHOR, indented
 from epstein_files.util.constants import ALL_CONFIGS
 from epstein_files.util.doc_cfg import EmailCfg
 from epstein_files.util.logging import logger
@@ -13,17 +14,29 @@ ON_BEHALF_OF = 'on behalf of'
 TO_FIELDS = ['bcc', 'cc', 'to']
 EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
-FIELD_PATTERNS = ['Date', 'From', 'Sent', 'To', r"C[cC]", r"B[cC][cC]", 'Importance', 'Subject', 'Attachments', 'Classification', 'Flag', 'Reply-To']
+FIELD_PATTERNS = [
+    'Date',
+    'From',
+    'Sent',
+    'To',
+    r"C[cC]",
+    r"B[cC][cC]",
+    'Importance',
+    'Subject',
+    'Attachments',
+    'Classification',
+    'Flag',
+    'Reply-To',
+    'Inline-Images'
+]
+DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}(From|Subject):')  # IDed 140 emails out of 3777 DOJ files with just 'From:' match
 FIELDS_PATTERN = '|'.join(FIELD_PATTERNS)
 FIELDS_COLON_PATTERN = fr"^({FIELDS_PATTERN}):"
 HEADER_REGEX_STR = fr"(((?:(?:{FIELDS_PATTERN}|Bee):|on behalf of ?)(?! +(by |from my|via )).*\n){{3,}})"
 EMAIL_SIMPLE_HEADER_REGEX = re.compile(rf'^{HEADER_REGEX_STR}')
 EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX = re.compile(HEADER_REGEX_STR)
 EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTALL)  # Match up to the next email header section
-TIME_REGEX = re.compile(r'^(\d{1,2}/\d{1,2}/\d{2,4}|Thursday|Monday|Tuesday|Wednesday|Friday|Saturday|Sunday).*')
-BAD_NAME_CHARS_REGEX = re.compile(r"[\"'\[\]*><•]")
-BAD_EMAILER_REGEX = re.compile(r'^(>|11111111)|agreed|ok|sexy|re:|fwd:|Multiple Senders|((sent|attachments|subject|importance).*|.*(january|201\d|hysterical|i have|image0|so that people|article 1.?|momminnemummin|These conspiracy theories|your state|undisclosed|www\.theguardian|talk in|it was a|what do|cc:|call (back|me)).*)$', re.IGNORECASE)
 CONFIGURED_ACTUAL_TEXTS = [
     cfg.actual_text for cfg in ALL_CONFIGS
@@ -54,6 +67,7 @@ class EmailHeader:
     classification: str | None = None
     flag: str | None = None
     importance: str | None = None
+    inline_images: str | None = None
     attachments: str | None = None
     to: list[str] | None = None
     reply_to: str | None = None
@@ -112,7 +126,7 @@ class EmailHeader:
         self.header_chars = '\n'.join(email_lines[0:self.num_header_rows])
         log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
-        logger.warning(
+        logger.info(
             f"{log_msg}{self}\n\n[top lines]:\n\n%s\n\n[body_lines]:\n\n%s\n\n",
             indented('\n'.join(email_lines[0:(num_headers + 1) * 2]), prefix='> '),
             indented('\n'.join(email_lines[self.num_header_rows:self.num_header_rows + 5]), prefix='> '),
@@ -181,7 +195,3 @@ class EmailHeader:
             logger.debug(f"Header being parsed was this:\n\n{header}\n")
         return cls(field_names=field_names, header_chars=header, **kw_args)
-    @staticmethod
-    def cleanup_str(_str: str) -> str:
-        return BAD_NAME_CHARS_REGEX.sub('', _str.replace(REDACTED, '')).strip().strip('_').strip()

epstein-files 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl

epstein-files 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl