epstein_files-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +194 -0
- epstein_files/documents/communication.py +53 -0
- epstein_files/documents/document.py +357 -0
- epstein_files/documents/email.py +655 -0
- epstein_files/documents/emails/email_header.py +167 -0
- epstein_files/documents/imessage/text_message.py +93 -0
- epstein_files/documents/json_file.py +23 -0
- epstein_files/documents/messenger_log.py +73 -0
- epstein_files/documents/other_file.py +117 -0
- epstein_files/epstein_files.py +437 -0
- epstein_files/util/constant/common_words.py +94 -0
- epstein_files/util/constant/html.py +57 -0
- epstein_files/util/constant/names.py +261 -0
- epstein_files/util/constant/strings.py +47 -0
- epstein_files/util/constant/urls.py +103 -0
- epstein_files/util/constants.py +1552 -0
- epstein_files/util/data.py +131 -0
- epstein_files/util/env.py +80 -0
- epstein_files/util/file_cfg.py +172 -0
- epstein_files/util/file_helper.py +81 -0
- epstein_files/util/highlighted_group.py +620 -0
- epstein_files/util/rich.py +324 -0
- epstein_files/util/search_result.py +15 -0
- epstein_files/util/word_count.py +191 -0
- epstein_files-1.0.0.dist-info/LICENSE +674 -0
- epstein_files-1.0.0.dist-info/METADATA +60 -0
- epstein_files-1.0.0.dist-info/RECORD +28 -0
- epstein_files-1.0.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
from dataclasses import asdict, dataclass, field
|
|
4
|
+
|
|
5
|
+
from epstein_files.util.constant.strings import AUTHOR, REDACTED
|
|
6
|
+
from epstein_files.util.constants import ALL_CONFIGS
|
|
7
|
+
from epstein_files.util.env import logger
|
|
8
|
+
from epstein_files.util.file_cfg import MessageCfg
|
|
9
|
+
from epstein_files.util.rich import UNKNOWN
|
|
10
|
+
|
|
11
|
+
# Header field labels as they appear verbatim in the email text.
FIELD_NAMES = ['From', 'Date', 'Sent', 'Subject']

# EmailHeader dataclass fields that are housekeeping data, not parsed from the email itself.
NON_HEADER_FIELDS = ['field_names', 'num_header_rows', 'was_initially_empty']

ON_BEHALF_OF = 'on behalf of'

# Recipient-style fields; their values are parsed as ';'-separated lists.
TO_FIELDS = ['bcc', 'cc', 'to']
EMAILER_FIELDS = [AUTHOR] + TO_FIELDS

# Matches a run of 3+ consecutive header lines. 'Bee'/'B[cC]{2}' presumably cover OCR
# misreads of 'Bcc' — TODO confirm. The negative lookahead rejects prose lines such as
# 'Sent by ...', 'Sent from my ...', and 'Sent via ...'.
HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
EMAIL_SIMPLE_HEADER_REGEX = re.compile(rf'^{HEADER_REGEX_STR}')  # Header anchored at the start of the text
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX = re.compile(HEADER_REGEX_STR)  # Header anywhere in the text
EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTALL) # Match up to the next email header section

# Matches strings beginning with a numeric date ('1/2/03') or a day-of-week name.
TIME_REGEX = re.compile(r'^(\d{1,2}/\d{1,2}/\d{2,4}|Thursday|Monday|Tuesday|Wednesday|Friday|Saturday|Sunday).*')

# Characters stripped from names by EmailHeader.cleanup_str().
BAD_NAME_CHARS_REGEX = re.compile(r"[\"'\[\]*><•]")

# Strings that clearly aren't a real sender/recipient (quoted text, OCR noise, etc.;
# presumably hand-tuned against the corpus — TODO confirm).
BAD_EMAILER_REGEX = re.compile(r'^(>|11111111)|agreed|ok|sexy|rt|re:|fwd:|Multiple Senders|((sent|attachments|subject|importance).*|.*(january|201\d|hysterical|i have|image0|so that people|article 1.?|momminnemummin|These conspiracy theories|your state|undisclosed|www\.theguardian|talk in|it was a|what do|cc:|call (back|me)).*)$', re.IGNORECASE)

# Every configured actual_text value from MessageCfg entries; used to spot known OCR text.
CONFIGURED_ACTUAL_TEXTS = [
    cfg.actual_text for cfg in ALL_CONFIGS
    if isinstance(cfg, MessageCfg) and cfg.actual_text is not None
]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(kw_only=True)
class EmailHeader:
    """Parsed representation of an email's header block (From/Date/To/etc. lines)."""

    field_names: list[str] # Order is same as the order header fields appear in the email file text
    num_header_rows: int = field(init=False)  # Number of text rows the header occupies
    was_initially_empty: bool = False  # True if no field had a value when first parsed

    # Fields from the email text
    author: str | None = None
    sent_at: str | None = None
    subject: str | None = None
    bcc: list[str] | None = None
    cc: list[str] | None = None
    importance: str | None = None
    attachments: str | None = None
    to: list[str] | None = None

    def __post_init__(self):
        self.num_header_rows = len(self.field_names)
        self.was_initially_empty = self.is_empty()

    def as_dict(self) -> dict[str, str | None]:
        """Remove housekeeping fields that don't actually come from the email."""
        return {k: v for k, v in asdict(self).items() if k not in NON_HEADER_FIELDS}

    def is_empty(self) -> bool:
        """True if no header field parsed from the email has a truthy value."""
        return not any([v for _k, v in self.as_dict().items()])

    def recipients(self) -> list[str]:
        """All recipients across the To/Cc/Bcc fields (empty list if none)."""
        return (self.to or []) + (self.cc or []) + (self.bcc or [])

    def repair_empty_header(self, email_lines: list[str]) -> None:
        """Fill in field values for headers whose labels and values landed on separate lines.

        Looks len(field_names) lines ahead of each label for its value, nudging the
        offset up/down when the candidate line is obviously wrong for the field.
        Raises RuntimeError when it runs past the end of email_lines.
        """
        num_headers = len(self.field_names)

        # Sometimes the headers and values are on separate lines and we need to do some shenanigans
        for i, field_name in enumerate(self.field_names):
            row_number_to_check = i + num_headers # Look ahead 3 lines if there's 3 header fields, 4 if 4, etc.

            if row_number_to_check > (len(email_lines) - 1):
                raise RuntimeError(f"Ran out of header rows to check for '{field_name}'")

            value = email_lines[row_number_to_check]
            # NOTE: log_prefix already ends with ', ' so the f-strings below emit a double comma.
            log_prefix = f"Looks like '{value}' is a mismatch for '{field_name}', "

            if field_name == AUTHOR:
                # A line of known OCR'd message text means the author line is further down.
                if value in CONFIGURED_ACTUAL_TEXTS:
                    logger.info(f"{log_prefix}, trying the next line...")
                    num_headers += 1
                    value = email_lines[i + num_headers]
                elif TIME_REGEX.match(value) or value == 'Darren,' or BAD_EMAILER_REGEX.match(value):
                    logger.info(f"{log_prefix}, decrementing num_headers and skipping...")
                    num_headers -= 1
                    continue
            elif field_name in TO_FIELDS:
                if TIME_REGEX.match(value):
                    logger.info(f"{log_prefix}, trying next line...")
                    num_headers += 1
                    value = email_lines[i + num_headers]
                elif BAD_EMAILER_REGEX.match(value):
                    logger.info(f"{log_prefix}, decrementing num_headers and skipping...")
                    num_headers -= 1
                    continue
                # NOTE(review): nesting reconstructed — this URL check appears to belong to the
                # TO_FIELDS branch (the subsequent split turns '' into []); confirm against VCS.
                elif value.startswith('http'):
                    logger.info(f"{log_prefix}, using empty string instead...")
                    value = ''

                # Recipient fields become lists split on ';'.
                value = [v.strip() for v in value.split(';') if len(v.strip()) > 0]

            setattr(self, field_name, value)

        # Labels occupy len(field_names) rows and values occupy ~num_headers more,
        # so the repaired header spans the sum of both.
        self.num_header_rows = len(self.field_names) + num_headers
        logger.debug(f"Corrected empty header using {self.num_header_rows} lines to:\n%s\n\nTop lines:\n\n%s", self, '\n'.join(email_lines[0:(num_headers + 1) * 2]))

    def rewrite_header(self) -> str:
        """Render the parsed fields back into 'Label: value' lines, in original field order."""
        header_fields = {}

        for field_name in self.field_names:
            if field_name == AUTHOR:
                header_fields['From'] = self.author or ''
            elif field_name == 'sent_at':
                # A sent_at matching known OCR'd text is bogus; emit an empty Date instead.
                if self.sent_at in CONFIGURED_ACTUAL_TEXTS:
                    header_fields['Date'] = ''
                else:
                    header_fields['Date'] = self.sent_at or ''
            elif field_name in TO_FIELDS:
                header_fields[field_name.title()] = '; '.join(getattr(self, field_name) or [])
            else:
                header_fields[field_name.title()] = getattr(self, field_name) or ''

        return '\n'.join([f"{k}: {v}" for k, v in header_fields.items()])

    def __str__(self) -> str:
        return json.dumps(self.as_dict(), sort_keys=True, indent=4)

    @classmethod
    def from_header_lines(cls, header: str) -> 'EmailHeader':
        """Parse a raw header block (one 'Label: value' per line) into an EmailHeader.

        'on behalf of' lines set the author; 'Bee' is normalized to 'bcc'; the first
        value seen for a key wins and duplicates are logged.
        """
        kw_args = {}
        field_names = []
        should_log_header = False

        for line in [l.strip() for l in header.strip().split('\n')]:
            if line.lower().startswith(ON_BEHALF_OF):
                author = line.removeprefix(ON_BEHALF_OF).strip()

                if len(author) > 0:
                    kw_args[AUTHOR] = author

                continue

            #logger.debug(f"extracting header line: '{line}'")
            key, value = [element.strip() for element in line.split(':', 1)]
            value = value.rstrip('_')  # Trailing underscores are presumably redaction residue — TODO confirm
            key = AUTHOR if key == 'From' else ('sent_at' if key in ['Date', 'Sent'] else key.lower())
            key = 'bcc' if key == 'bee' else key  # 'Bee' is an OCR misread of 'Bcc'

            if kw_args.get(key):
                logger.debug(f'Already have value "{kw_args[key]}" at key "{key}", not overwriting with "{value}"')
                should_log_header = True
                continue

            field_names.append(key)

            if key in TO_FIELDS:
                recipients = [element.strip() for element in value.split(';')]
                recipients = [r for r in recipients if len(r) > 0]
                kw_args[key] = None if len(value) == 0 else [r if len(r) > 0 else UNKNOWN for r in recipients]
            else:
                # NOTE(review): key is already lowercased above unless it equals AUTHOR;
                # the extra .lower() here is presumably a no-op — verify AUTHOR's casing.
                kw_args[key.lower()] = None if len(value) == 0 else value

        if should_log_header:
            logger.debug(f"Header being parsed was this:\n\n{header}\n")

        return EmailHeader(field_names=field_names, **kw_args)

    @staticmethod
    def cleanup_str(_str: str) -> str:
        """Strip redaction markers, bad name characters, and stray underscores/whitespace."""
        return BAD_NAME_CHARS_REGEX.sub('', _str.replace(REDACTED, '')).strip().strip('_').strip()
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
from rich.text import Text
|
|
6
|
+
|
|
7
|
+
from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, STEVE_BANNON, UNKNOWN
|
|
8
|
+
from epstein_files.util.data import extract_last_name
|
|
9
|
+
from epstein_files.util.highlighted_group import get_style_for_name
|
|
10
|
+
from epstein_files.util.rich import TEXT_LINK, highlighter, logger
|
|
11
|
+
|
|
12
|
+
# strptime format of iMessage timestamps, e.g. '12/25/18 10:30:00 PM'.
MSG_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
# Matches strings that start with digits and/or '+' (i.e. look like phone numbers).
PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
# Rich style used for the '[timestamp]' prefix on each rendered message.
TIMESTAMP_STYLE = 'turquoise4 dim'

# People rendered by last name only in message transcripts.
DISPLAY_LAST_NAME_ONLY = [
    JEFFREY_EPSTEIN,
    STEVE_BANNON,
]

# Phone numbers whose owner couldn't be identified; messages from them are
# logged as bad and treated as having no known author.
UNKNOWN_TEXTERS = [
    '+16463880059',
    '+13108737937',
    '+13108802851',
]

# Maps raw sender strings / phone numbers in the logs to known identities.
TEXTER_MAPPING = {
    'e:': JEFFREY_EPSTEIN,
    'e:jeeitunes@gmail.com': JEFFREY_EPSTEIN,
    '+19174393646': ANTHONY_SCARAMUCCI,
    '+13109906526': STEVE_BANNON,
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass(kw_only=True)
class TextMessage:
    """Class representing a single iMessage text message."""
    author: str | None          # Canonical author name (None when unknown/anonymized)
    author_str: str = field(init=False)  # Display form of the author, derived in __post_init__
    id_confirmed: bool = False  # True when the counterparty attribution is confirmed
    text: str                   # Raw message text
    timestamp_str: str          # Raw timestamp string, parseable with MSG_DATE_FORMAT

    def __post_init__(self):
        # Normalize raw sender strings / phone numbers to known identities first.
        self.author = TEXTER_MAPPING.get(self.author or UNKNOWN, self.author)

        if self.author is None:
            self.author_str = UNKNOWN
        elif self.author in UNKNOWN_TEXTERS:
            logger.warning(f"Bad text from '{self.author}': \"{self.text}\"")
            self.author_str = self.author
            self.author = None # TODO: this shouldn't be happening; we still know the author...
        elif self.author in DISPLAY_LAST_NAME_ONLY:
            self.author_str = extract_last_name(self.author)
        else:
            self.author_str = self.author

        # NOTE(review): this rebuilds author_str from the full author name, clobbering the
        # last-name-only form chosen above — presumably intentional for unconfirmed IDs, but confirm.
        if not self.id_confirmed and self.author is not None:
            self.author_str = self.author + ' (?)'

    def timestamp(self) -> datetime:
        """timestamp_str parsed to a (naive) datetime."""
        return datetime.strptime(self.timestamp_str, MSG_DATE_FORMAT)

    def _message(self) -> Text:
        """Render the message body, re-joining links that OCR split across lines."""
        lines = self.text.split('\n')

        # Fix multiline links
        if self.text.startswith('http'):
            text = self.text

            if len(lines) > 1 and not lines[0].endswith('html'):
                # A trailing '-' on the second line presumably marks a twice-wrapped URL — TODO confirm.
                if len(lines) > 2 and lines[1].endswith('-'):
                    text = text.replace('\n', '', 2)
                else:
                    text = text.replace('\n', '', 1)

            lines = text.split('\n')
            # NOTE(review): pop() takes the LAST line as the link target; for link-then-commentary
            # messages this looks inverted — verify against real fixtures.
            link_text = lines.pop()
            msg_txt = Text('').append(Text.from_markup(f"[link={link_text}]{link_text}[/link]", style=TEXT_LINK))

            if len(lines) > 0:
                msg_txt.append('\n' + ' '.join(lines))
        else:
            msg_txt = highlighter(' '.join(lines)) # remove newlines

        return msg_txt

    def __rich__(self) -> Text:
        """'[timestamp] author: message' as a styled rich Text."""
        # TODO: Workaround for phone numbers that sucks
        author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
        author_txt = Text(self.author_str, style=author_style)
        timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_STYLE).append(' ')
        return Text('').append(timestamp_txt).append(author_txt).append(': ', style='dim').append(self._message())
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from rich.text import Text
|
|
5
|
+
|
|
6
|
+
from epstein_files.documents.other_file import OtherFile
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class JsonFile(OtherFile):
    """File containing JSON data."""

    def __post_init__(self):
        super().__post_init__()

        # Drop the file extension so the slug matches how these files are addressed elsewhere.
        if self.url_slug.endswith(('.txt', '.json')):
            self.url_slug = Path(self.url_slug).stem

    def info_txt(self) -> Text | None:
        """One-line description of this file for display purposes."""
        # Was an f-string with no placeholders (ruff F541); plain literal is identical.
        return Text("JSON file, possibly iMessage or similar app metadata", style='white dim italic')

    def is_interesting(self) -> bool:
        """JSON metadata files are never considered interesting."""
        return False
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
from rich.console import Console, ConsoleOptions, RenderResult
|
|
6
|
+
from rich.text import Text
|
|
7
|
+
|
|
8
|
+
from epstein_files.documents.communication import Communication
|
|
9
|
+
from epstein_files.documents.imessage.text_message import MSG_DATE_FORMAT, TextMessage
|
|
10
|
+
from epstein_files.util.rich import logger
|
|
11
|
+
|
|
12
|
+
# Hint text shown depending on whether the counterparty attribution is certain.
CONFIRMED_MSG = 'Found confirmed counterparty'
GUESSED_MSG = 'This is probably a conversation with'
# Splits a log into (sender, timestamp, AM/PM, message) chunks; each message body
# runs lazily until the next '\nSender' or end of text.
MSG_REGEX = re.compile(r'Sender:(.*?)\nTime:(.*? (AM|PM)).*?Message:(.*?)\s*?((?=(\nSender)|\Z))', re.DOTALL)
# Matches sender strings that are presumably OCR artifacts of a redaction bar — TODO confirm character set.
REDACTED_AUTHOR_REGEX = re.compile(r"^([-+•_1MENO.=F]+|[4Ide])$")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class MessengerLog(Communication):
    """Class representing one iMessage log file (one conversation between Epstein and some counterparty)."""
    _messages: list[TextMessage] = field(default_factory=list)

    def first_message_at(self, name: str | None) -> datetime:
        """Timestamp of the earliest message sent by 'name'."""
        return self.messages_by(name)[0].timestamp()

    def info_txt(self) -> Text | None:
        """Dim one-liner identifying the confirmed (or guessed) counterparty."""
        if self.is_attribution_uncertain():
            hint_msg = GUESSED_MSG
        else:
            hint_msg = CONFIRMED_MSG

        counterparty_txt = Text(self.author_or_unknown(), style=self.author_style + ' bold')
        return Text(f"({hint_msg} ", style='dim').append(counterparty_txt).append(')')

    def last_message_at(self, name: str | None) -> datetime:
        """Timestamp of the latest message sent by 'name'."""
        return self.messages_by(name)[-1].timestamp()

    def messages(self) -> list[TextMessage]:
        """Lazily evaluated accessor for self._messages."""
        if self._messages:
            return self._messages

        for msg_match in MSG_REGEX.finditer(self.text):
            sender = REDACTED_AUTHOR_REGEX.sub('', msg_match.group(1).strip())

            self._messages.append(TextMessage(
                # If the Sender: is redacted that means it's from self.author
                author=sender or self.author,
                id_confirmed=not self.is_attribution_uncertain(),
                text=msg_match.group(4).strip(),
                timestamp_str=msg_match.group(2).strip(),
            ))

        return self._messages

    def messages_by(self, name: str | None) -> list[TextMessage]:
        """Return all messages by 'name'."""
        return [msg for msg in self.messages() if msg.author == name]

    def _border_style(self) -> str:
        """Panel border takes the counterparty's style."""
        return self.author_style

    def _extract_timestamp(self) -> datetime:
        """Return the first message timestamp that parses cleanly; raise if none do."""
        for msg_match in MSG_REGEX.finditer(self.text):
            timestamp_str = msg_match.group(2).strip()

            try:
                return datetime.strptime(timestamp_str, MSG_DATE_FORMAT)
            except ValueError as e:
                logger.info(f"[WARNING] Failed to parse '{timestamp_str}' to datetime! Using next match. Error: {e}'")

        raise RuntimeError(f"{self}: No timestamp found!")

    def __rich_console__(self, _console: Console, _options: ConsoleOptions) -> RenderResult:
        """Render the file info panel followed by every message."""
        yield self.file_info_panel()
        yield Text('')
        yield from self.messages()
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import warnings
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
|
|
6
|
+
import datefinder
|
|
7
|
+
import dateutil
|
|
8
|
+
from rich.markup import escape
|
|
9
|
+
from rich.panel import Panel
|
|
10
|
+
from rich.text import Text
|
|
11
|
+
|
|
12
|
+
from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, WHITESPACE_REGEX, Document
|
|
13
|
+
from epstein_files.util.constants import UNINTERESTING_PREFIXES
|
|
14
|
+
from epstein_files.util.data import escape_single_quotes, remove_timezone, uniquify
|
|
15
|
+
from epstein_files.util.env import args, logger
|
|
16
|
+
from epstein_files.util.rich import highlighter, logger
|
|
17
|
+
|
|
18
|
+
MAX_EXTRACTED_TIMESTAMPS = 100  # Stop scanning a document after this many in-range timestamps
MAX_DAYS_SPANNED_TO_BE_VALID = 10  # Extracted timestamps spanning more days than this get logged for review
MIN_TIMESTAMP = datetime(2000, 1, 1)  # Earliest timestamp considered plausible
MID_TIMESTAMP = datetime(2007, 1, 1)
MAX_TIMESTAMP = datetime(2022, 12, 31)  # Latest timestamp considered plausible
# Preview is 50% longer unless every "other" file is being displayed.
PREVIEW_CHARS = int(580 * (1 if args.all_other_files else 1.5))
LOG_INDENT = '\n '
TIMESTAMP_LOG_INDENT = f'{LOG_INDENT} '
VAST_HOUSE = 'vast house' # Michael Wolff article draft about Epstein indicator
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class OtherFile(Document):
    """File that is not an email, an iMessage log, or JSON data."""

    def configured_description(self) -> str | None:
        """Overloads superclass method. Joins the configured author and description, if any."""
        if self.config is None:
            return None

        pieces = [p for p in [self.config.author, self.config.description] if p]
        return ' '.join(pieces) if pieces else None

    def description(self) -> Text:
        """One line summary mostly for logging."""
        return super().description().append(CLOSE_PROPERTIES_CHAR)

    def description_panel(self, include_hints=True) -> Panel:
        """Panelized description() with info_txt(), used in search results."""
        return super().description_panel(include_hints=include_hints)

    def highlighted_preview_text(self) -> Text:
        """preview_text() run through the highlighter; falls back to plain Text on failure.

        Fixes vs. original: preview_text() was computed three times and the same string
        was logged twice under two labels, while the caught exception was dropped.
        """
        preview = escape(self.preview_text())

        try:
            return highlighter(preview)
        except Exception as e:
            logger.error(f"Failed to apply markup in string '{escape_single_quotes(self.preview_text())}'\n"
                         f"File: '{self.filename}'\n"
                         f"Error: {e}\n")

            return Text(preview)

    def is_interesting(self):
        """False for lame prefixes and duplicates."""
        hints = self.hints()

        if self.is_duplicate:
            return False
        elif len(hints) == 0:
            return True

        # str.startswith accepts a tuple; replaces the manual prefix loop.
        return not hints[0].plain.startswith(tuple(UNINTERESTING_PREFIXES))

    def preview_text(self) -> str:
        """First PREVIEW_CHARS chars of the text with all whitespace collapsed to single spaces."""
        return WHITESPACE_REGEX.sub(' ', self.text)[0:PREVIEW_CHARS]

    def _extract_timestamp(self) -> datetime | None:
        """Return configured timestamp or value extracted by scanning text with datefinder."""
        if self.config and self.config.timestamp:
            return self.config.timestamp

        timestamps: list[datetime] = []

        # datefinder/dateutil emit noisy warnings on messy OCR text; silence them for the scan.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", module="datefinder")
            warnings.filterwarnings("ignore", module="dateutil")

            try:
                for timestamp in datefinder.find_dates(self.text, strict=True):
                    timestamp = remove_timezone(timestamp)

                    # Only keep timestamps inside the plausible window.
                    if MIN_TIMESTAMP < timestamp < MAX_TIMESTAMP:
                        timestamps.append(timestamp)

                        if len(timestamps) >= MAX_EXTRACTED_TIMESTAMPS:
                            break
            except ValueError as e:
                logger.warning(f"Error while iterating through datefinder.find_dates(): {e}")

        if len(timestamps) == 0:
            self.log_top_lines(15, msg=f"{self.file_id}: No timestamps found", level=logging.INFO)
            return None
        elif len(timestamps) == 1:
            return timestamps[0]
        else:
            timestamps = sorted(uniquify(timestamps), reverse=True)
            self._log_extracted_timestamps_info(timestamps)
            return timestamps[0]  # Most recent timestamp appearing in text is usually the closest

    def _log_extracted_timestamps_info(self, timestamps: list[datetime]) -> None:
        """Log extracted timestamps when they span an implausibly wide date range.

        Expects 'timestamps' sorted descending (as produced by _extract_timestamp()).
        """
        num_days_spanned = (timestamps[0] - timestamps[-1]).days
        timestamps_log_msg = f"Extracted {len(timestamps)} timestamps spanning {num_days_spanned} days{TIMESTAMP_LOG_INDENT}"
        timestamps_log_msg += TIMESTAMP_LOG_INDENT.join(str(dt) for dt in timestamps)

        if num_days_spanned > MAX_DAYS_SPANNED_TO_BE_VALID and VAST_HOUSE not in self.text:
            # The guard above excludes texts containing VAST_HOUSE, so the original
            # "DEBUG if VAST_HOUSE in self.text else INFO" ternary could never yield
            # DEBUG — the level is always INFO here (dead code removed).
            self.log_top_lines(15, msg=timestamps_log_msg, level=logging.INFO)
|