epstein-files 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,131 @@
+ """
+ Helpers for dealing with various kinds of data.
+ """
+ import itertools
+ import re
+ import time
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from typing import TypeVar
+
+ from dateutil import tz
+ from dateutil.parser import parse
+ from rich.text import Text
+
+ from epstein_files.util.constant import names
+ from epstein_files.util.env import args, logger
+
+ T = TypeVar('T')
+
+ ISO_DATE_REGEX = re.compile(r'\d{4}-\d{2}(-\d{2})?')
+ MULTINEWLINE_REGEX = re.compile(r"\n{2,}")
+ CONSTANT_VAR_REGEX = re.compile(r"^[A-Z_]+$")
+ ALL_NAMES = [v for k, v in vars(names).items() if isinstance(v, str) and CONSTANT_VAR_REGEX.match(k)]
+
+ PACIFIC_TZ = tz.gettz("America/Los_Angeles")
+ TIMEZONE_INFO = {"PST": PACIFIC_TZ, "PDT": PACIFIC_TZ}  # Suppresses annoying warnings from parse() calls
+
+
+ def collapse_newlines(text: str) -> str:
+     return MULTINEWLINE_REGEX.sub('\n\n', text)
+
+
+ def date_str(timestamp: datetime | None) -> str | None:
+     return timestamp.isoformat()[0:10] if timestamp else None
+
+
+ def dict_sets_to_lists(d: dict[str, set]) -> dict[str, list]:
+     return {k: sorted(v) for k, v in d.items()}
+
+
+ def extract_datetime(s: str) -> datetime | None:
+     match = ISO_DATE_REGEX.search(s)
+
+     if not match:
+         return None
+
+     date_str = match.group(0)  # NB: shadows the module level date_str() function
+
+     # NB: ISO_DATE_REGEX requires at least YYYY-MM, so the bare year branch appears unreachable
+     if len(date_str) == 4:
+         date_str += '-01-01'
+     elif len(date_str) == 7:
+         date_str += '-01'
+
+     return parse(date_str, tzinfos=TIMEZONE_INFO)
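+
+ # e.g. extract_datetime('scanned 2019-07 memo') returns datetime(2019, 7, 1, 0, 0) (hypothetical input)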
+
+
+ def extract_last_name(name: str) -> str:
+     if ' ' not in name:
+         return name
+
+     parts = name.split()
+
+     if parts[-1].startswith('Jr') and len(parts[-1]) <= 3:
+         return ' '.join(parts[-2:])
+     else:
+         return parts[-1]
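+
+ # e.g. (hypothetical names) extract_last_name('John Doe Jr.') -> 'Doe Jr.'; extract_last_name('Jane Doe') -> 'Doe'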
+
+
+ def flatten(_list: list[list[T]]) -> list[T]:
+     return list(itertools.chain.from_iterable(_list))
+
+
+ def iso_timestamp(dt: datetime) -> str:
+     return dt.isoformat().replace('T', ' ')
+
+
+ def listify(listlike: list | str | Text | None) -> list:
+     """Wrap 'listlike' in a list. Returns an empty list if 'listlike' is None or an empty string."""
+     if isinstance(listlike, list):
+         return listlike
+     elif listlike:
+         return [listlike]
+     else:
+         return []
+
+
+ def ordinal_str(n: int) -> str:
+     if 11 <= (n % 100) <= 13:
+         suffix = 'th'
+     else:
+         suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
+
+     return str(n) + suffix
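+
+ # e.g. ordinal_str(1) -> '1st', ordinal_str(12) -> '12th', ordinal_str(23) -> '23rd'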
+
+
+ def patternize(_pattern: str | re.Pattern) -> re.Pattern:
+     return _pattern if isinstance(_pattern, re.Pattern) else re.compile(rf"({_pattern})", re.IGNORECASE)
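+
+ # e.g. patternize('foo') compiles to r'(foo)' with re.IGNORECASE; an existing re.Pattern passes through unchanged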
+
+
+ def remove_timezone(timestamp: datetime) -> datetime:
+     if timestamp.tzinfo:
+         timestamp = timestamp.astimezone(timezone.utc).replace(tzinfo=None)
+         logger.debug(f" -> Converted to UTC: {timestamp}")
+
+     return timestamp
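+
+ # e.g. an aware datetime at 12:00 PDT (UTC-7) comes back as a naive datetime at 19:00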
+
+
+ def sort_dict(d: dict[str | None, int] | dict[str, int]) -> list[tuple[str | None, int]]:
+     sort_key = lambda e: (e[0] or '').lower() if args.sort_alphabetical else [-e[1], (e[0] or '').lower()]
+     return sorted(d.items(), key=sort_key)
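+
+ # Default sort is by count (descending) then lowercased name, e.g. {'bob': 1, 'ann': 2} -> [('ann', 2), ('bob', 1)];
+ # with --sort-alphabetical it sorts by lowercased name only.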
+
+
+ @dataclass
+ class Timer:
+     started_at: float = field(default_factory=time.perf_counter)
+     checkpoint_at: float = field(default_factory=time.perf_counter)
+
+     def print_at_checkpoint(self, msg: str) -> None:
+         logger.warning(f"{msg} in {self.seconds_since_checkpoint()}")
+         self.checkpoint_at = time.perf_counter()
+
+     def seconds_since_checkpoint(self) -> str:
+         return f"{(time.perf_counter() - self.checkpoint_at):.2f} seconds"
+
+     def seconds_since_start(self) -> str:
+         return f"{(time.perf_counter() - self.started_at):.2f} seconds"
+
+
+ escape_double_quotes = lambda text: text.replace('"', r'\"')
+ escape_single_quotes = lambda text: text.replace("'", r"\'")
+ uniquify = lambda _list: list(set(_list))  # NB: does not preserve element order
@@ -0,0 +1,80 @@
+ import logging
+ from argparse import ArgumentParser
+ from os import environ
+ from pathlib import Path
+ from sys import argv
+
+ from rich.logging import RichHandler
+
+ DEFAULT_WIDTH = 154
+ HTML_SCRIPTS = ['generate_html.py', 'count_words.py']
+
+
+ parser = ArgumentParser(description="Parse Epstein OCR docs and generate HTML page.")
+ parser.add_argument('--build', '-b', action='store_true', help='write HTML to docs/index.html')
+ parser.add_argument('--all-emails', '-ae', action='store_true', help='include all the emails')
+ parser.add_argument('--all-email-tables', '-aet', action='store_true', help='all email tables (except Epstein)')
+ parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of a limited selection')
+ parser.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
+ parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
+ parser.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
+ parser.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
+ parser.add_argument('--output-texts', '-ot', action='store_true', help='generate text messages section')
+ parser.add_argument('--pickled', '-p', action='store_true', help='use pickled EpsteinFiles object')
+ parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='generate new pickled EpsteinFiles object')
+ parser.add_argument('--sort-alphabetical', '-alpha', action='store_true', help='sort emailers alphabetically in counts table')
+ parser.add_argument('--suppress-output', '-s', action='store_true', help='no output to terminal (use with --build)')
+ parser.add_argument('--use-epstein-web-links', '-use', action='store_true', help='use epsteinweb.org links instead of epsteinify.com')
+ parser.add_argument('--search-other', '-so', action='store_true', help='search for string in non-email/text files (only used by search script)')
+ parser.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use')
+ parser.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (only used by search script)')
+ parser.add_argument('--debug', '-d', action='store_true', help='set log level to INFO')
+ parser.add_argument('--deep-debug', '-dd', action='store_true', help='set log level to DEBUG')
+ parser.add_argument('--suppress-logs', '-sl', action='store_true', help='set log level to FATAL')
+ parser.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats at the end')
+ parser.add_argument('positional_args', nargs='*', help='Optional args (only used by helper scripts)')
+ args = parser.parse_args()
+
+ is_env_var_set = lambda s: len(environ.get(s) or '') > 0
+ current_script = Path(argv[0]).name
+ is_html_script = current_script in HTML_SCRIPTS
+
+ args.deep_debug = args.deep_debug or is_env_var_set('DEEP_DEBUG')
+ args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
+ args.output_emails = args.output_emails or args.all_emails
+ args.output_other_files = args.output_other_files or args.all_other_files
+ args.pickled = args.pickled or is_env_var_set('PICKLED') or args.colors_only or len(args.names or []) > 0
+ args.width = args.width if is_html_script else None
+ specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]
+
+
+ # Setup logging
+ logging.basicConfig(level="NOTSET", format="%(message)s", datefmt="[%X]", handlers=[RichHandler()])
+ logger = logging.getLogger("rich")
+
+ if args.deep_debug:
+     logger.setLevel(logging.DEBUG)
+ elif args.debug:
+     logger.setLevel(logging.INFO)
+ elif args.suppress_logs:
+     logger.setLevel(logging.FATAL)
+ else:
+     logger.setLevel(logging.WARNING)
+
+ datefinder_logger = logging.getLogger('datefinder')  # Suppress annoying output
+ datefinder_logger.setLevel(logger.level)
+
+
+ # Massage args that depend on other args into the appropriate state
+ if not (args.output_texts or args.output_emails or args.output_other_files):
+     logger.warning("No output section chosen; outputting default of texts, selected emails, and other files...")
+     args.output_texts = True
+     args.output_emails = True
+     args.output_other_files = True
+
+ if args.use_epstein_web_links:
+     logger.warning("Using epsteinweb.org links instead of epsteinify.com links...")
+
+ if args.debug:
+     logger.warning(f"is_html_script={is_html_script}, specified_names={specified_names}, args={args}")
@@ -0,0 +1,172 @@
+ import re
+ from copy import deepcopy
+ from dataclasses import Field, asdict, dataclass, field, fields
+ from datetime import datetime
+ from typing import Generator, Literal
+
+ from dateutil.parser import parse
+
+ from epstein_files.util.constant.names import constantize_name
+ from epstein_files.util.constant.strings import AUTHOR
+
+ DuplicateType = Literal['same', 'earlier', 'quoted', 'redacted']
+
+ INDENT = ' '
+ INDENT_NEWLINE = f'\n{INDENT}'
+ INDENTED_JOIN = f',{INDENT_NEWLINE}'
+ CONSTANTIZE_NAMES = False  # When True, repr() of these classes returns strings of usable code
+ MAX_LINE_LENGTH = 250
+
+ REASON_MAPPING: dict[DuplicateType, str] = {
+     'earlier': 'earlier draft of',
+     'quoted': 'quoted in full in',
+     'redacted': 'redacted version of',
+     'same': 'the same as',
+ }
+
+ FIELD_SORT_KEY = {
+     'id': 'a',
+     'author': 'aa',
+     'attribution_reason': 'zz',
+ }
+
+
+ @dataclass(kw_only=True)
+ class FileCfg:
+     """Convenience class encapsulating info about files that must be manually configured.
+
+     Attributes:
+         id (str): ID of the file
+         author (str | None): Author of the document (if any)
+         date (str | None): If passed it will be immediately parsed into the 'timestamp' field
+         description (str | None): Description of the file
+         dupe_of_id (str | None): If this is a dupe, the ID of the duplicated file. This file will be suppressed
+         dupe_type (DuplicateType | None): The type of duplicate this file is (redacted, quoted, etc.)
+         duplicate_ids (list[str]): Inverse of 'dupe_of_id' - this file will NOT be suppressed but 'duplicate_ids' will be
+         timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
+         was_generated (bool): True if this object was generated by the duplicate_cfgs() method
+     """
+     id: str
+     author: str | None = None
+     date: str | None = None
+     description: str | None = None
+     dupe_of_id: str | None = None
+     dupe_type: DuplicateType | None = None
+     duplicate_ids: list[str] = field(default_factory=list)
+     timestamp: datetime | None = None
+     was_generated: bool = False  # True if this object was generated by duplicate_cfgs()
+
+     def __post_init__(self):
+         if self.dupe_of_id:
+             self.dupe_type = self.dupe_type or 'same'
+
+         if self.date:
+             self.timestamp = parse(self.date)
+
+     def duplicate_reason(self) -> str | None:
+         if self.dupe_type is not None:
+             return REASON_MAPPING[self.dupe_type]
+
+         return None
+
+     def duplicate_cfgs(self) -> Generator['FileCfg', None, None]:
+         for dupe_id in self.duplicate_ids:
+             dupe_cfg = deepcopy(self)
+             dupe_cfg.id = dupe_id
+             dupe_cfg.dupe_of_id = self.id
+             dupe_cfg.dupe_type = self.dupe_type or 'same'
+             dupe_cfg.was_generated = True
+             yield dupe_cfg
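+
+     # e.g. (hypothetical IDs) FileCfg(id='000001', duplicate_ids=['000002'], dupe_type='redacted').duplicate_cfgs()
+     # yields one copy with id='000002', dupe_of_id='000001', and was_generated=True.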
+
+     def non_null_field_names(self) -> list[str]:
+         return [f.name for f in self.sorted_fields() if getattr(self, f.name)]
+
+     def sorted_fields(self) -> list[Field]:
+         return sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name))
+
+     def _props_strs(self) -> list[str]:
+         props = []
+         add_prop = lambda f, value: props.append(f"{f.name}={value}")
+
+         for _field in self.sorted_fields():
+             value = getattr(self, _field.name)
+
+             if value is None or value is False or (isinstance(value, list) and len(value) == 0):
+                 continue
+             elif _field.name == AUTHOR:
+                 add_prop(_field, constantize_name(str(value)) if CONSTANTIZE_NAMES else f"'{value}'")
+             elif _field.name == 'recipients' and isinstance(value, list):
+                 recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
+                 add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
+             elif isinstance(value, datetime):
+                 value_str = re.sub(' 00:00:00', '', str(value))
+                 add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
+             elif _field.name == 'description':
+                 add_prop(_field, value.strip())
+             elif isinstance(value, str):
+                 if "'" in value:
+                     value = '"' + value.replace('"', r'\"') + '"'
+                 else:
+                     value = "'" + value.replace("'", r'\'') + "'"
+
+                 add_prop(_field, value)
+             else:
+                 add_prop(_field, str(value))
+
+         return props
+
+     def __eq__(self, other: 'FileCfg') -> bool:
+         """Return True if all fields match except 'id', 'dupe_of_id', and 'dupe_type' ('duplicate_ids' is compared)."""
+         for _field in self.sorted_fields():
+             if _field.name == 'id' or _field.name.startswith('dupe'):
+                 continue
+             elif getattr(self, _field.name) != getattr(other, _field.name):
+                 return False
+
+         return True
+
+     def __repr__(self) -> str:
+         props = self._props_strs()
+         type_str = f"{type(self).__name__}("
+         single_line_repr = type_str + ', '.join(props) + ')'
+
+         if (len(single_line_repr) < MAX_LINE_LENGTH or self.non_null_field_names() == ['id', 'description']) and '#' not in (self.description or ''):
+             repr_str = single_line_repr
+         else:
+             repr_str = f"{type_str}{INDENT_NEWLINE}" + INDENTED_JOIN.join(props)
+             repr_str += ',' if props else ''
+             repr_str += '\n)'
+
+         if CONSTANTIZE_NAMES:
+             repr_str = INDENT + INDENT_NEWLINE.join(repr_str.split('\n'))
+             return repr_str.replace(',,', ',').replace(',),', '),').replace(',),', '),')
+         else:
+             return repr_str
+
+
+ @dataclass(kw_only=True)
+ class MessageCfg(FileCfg):
+     """
+     Convenience class uniting the various configured properties for a given Communication file.
+     Manual config is always required for MessengerLog author attribution. It's also often needed for Email
+     files to handle the terrible OCR text provided by Congress, which mangles a lot of the email headers.
+
+     Attributes:
+         actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
+         attribution_reason (str | None): Optional explanation of why this email was attributed to this author.
+         is_attribution_uncertain (bool): True if we have a good idea of who the author is but are not 100% certain.
+         is_fwded_article (bool): True if this is a newspaper article someone fwded. Used to exclude articles from word counting.
+         recipients (list[str | None]): Who received the email.
+     """
+     actual_text: str | None = None  # Override for the Email._actual_text() method for particularly broken emails
+     attribution_reason: str | None = None
+     is_attribution_uncertain: bool = False
+     is_fwded_article: bool = False
+     recipients: list[str | None] = field(default_factory=list)
+
+     # Redeclared because @dataclass would otherwise generate its own __eq__ and __repr__ for the subclass
+     def __eq__(self, other: 'FileCfg') -> bool:
+         return super().__eq__(other)
+
+     def __repr__(self) -> str:
+         return super().__repr__()
+
+     @classmethod
+     def from_file_cfg(cls, cfg: FileCfg) -> 'MessageCfg':
+         return cls(**asdict(cfg))
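+
+ # e.g. MessageCfg.from_file_cfg(FileCfg(id='000001')) builds a MessageCfg carrying over all FileCfg fields
+ # (hypothetical ID; asdict() works here because FileCfg fields are simple values and lists)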
@@ -0,0 +1,81 @@
+ import re
+ from os import environ
+ from pathlib import Path
+ from sys import exit
+
+ from epstein_files.util.constant.strings import HOUSE_OVERSIGHT_PREFIX
+
+ EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
+ DOCS_DIR_ENV = environ.get(EPSTEIN_DOCS_DIR_ENV_VAR_NAME)
+ DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
+
+ if not DOCS_DIR_ENV:
+     print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!")
+     exit(1)
+ elif not DOCS_DIR.exists():
+     print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!")
+     exit(1)
+
+ JSON_DIR = DOCS_DIR.joinpath('json_files')
+ HTML_DIR = Path('docs')
+ EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
+ GH_PAGES_HTML_PATH = HTML_DIR.joinpath('index.html')
+ WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_emails_word_count.html')
+ EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
+ PICKLED_PATH = Path("the_epstein_files.pkl.gz")
+
+ FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}(\d{{6}})")
+ FILE_ID_REGEX = re.compile(fr".*{FILE_STEM_REGEX.pattern}(_\d{{1,2}})?(\.txt(\.json)?)?")
+ FILENAME_LENGTH = len(HOUSE_OVERSIGHT_PREFIX) + 6
+ KB = 1024
+ MB = KB * KB
+
+
+ # Handles both string and int 'id' args.
+ file_stem_for_id = lambda id: f"{HOUSE_OVERSIGHT_PREFIX}{int(id):06d}"
+ filename_for_id = lambda id: file_stem_for_id(id) + '.txt'
+
+
+ def coerce_file_stem(filename_or_id: int | str) -> str:
+     """Generate a valid file_stem no matter what form the argument comes in."""
+     if isinstance(filename_or_id, str) and filename_or_id.startswith(HOUSE_OVERSIGHT_PREFIX):
+         file_stem = file_stem_for_id(extract_file_id(filename_or_id))
+     else:
+         file_stem = file_stem_for_id(filename_or_id)
+
+     if not FILE_STEM_REGEX.match(file_stem):
+         raise RuntimeError(f"Invalid stem '{file_stem}' from '{filename_or_id}'")
+
+     return file_stem
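+
+ # e.g. coerce_file_stem(29835) and coerce_file_stem('HOUSE_OVERSIGHT_029835_1.txt') should both
+ # return 'HOUSE_OVERSIGHT_029835' (assuming HOUSE_OVERSIGHT_PREFIX == 'HOUSE_OVERSIGHT_')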
+
+
+ def extract_file_id(filename: str | Path) -> str:
+     file_match = FILE_ID_REGEX.match(str(filename))
+
+     if not file_match:
+         raise RuntimeError(f"Failed to extract file ID from {filename}")
+
+     return file_match.group(1)
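+
+ # e.g. extract_file_id('HOUSE_OVERSIGHT_029835_1.txt') returns '029835' (assuming the standard prefix)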
+
+
+ def file_size_str(file_path: str | Path) -> str:
+     file_size = float(Path(file_path).stat().st_size)
+     digits = 2
+
+     if file_size > MB:
+         size_num = file_size / MB
+         size_str = 'MB'
+     elif file_size > KB:
+         size_num = file_size / KB
+         size_str = 'KB'
+         digits = 1
+     else:
+         return f"{int(file_size)} B"
+
+     return f"{size_num:,.{digits}f} {size_str}"
+
+
+ def is_local_extract_file(filename: str | Path) -> bool:
+     """Return True if 'filename' is of the form 'HOUSE_OVERSIGHT_029835_1.txt'."""
+     file_match = FILE_ID_REGEX.match(str(filename))
+     return bool(file_match and file_match.group(2))