epstein-files 1.0.0-py3-none-any.whl → 1.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +75 -135
- epstein_files/documents/communication.py +9 -9
- epstein_files/documents/document.py +115 -87
- epstein_files/documents/email.py +154 -85
- epstein_files/documents/emails/email_header.py +7 -6
- epstein_files/documents/imessage/text_message.py +3 -2
- epstein_files/documents/json_file.py +17 -0
- epstein_files/documents/messenger_log.py +62 -3
- epstein_files/documents/other_file.py +165 -17
- epstein_files/epstein_files.py +128 -169
- epstein_files/util/constant/names.py +8 -1
- epstein_files/util/constant/output_files.py +29 -0
- epstein_files/util/constant/strings.py +27 -0
- epstein_files/util/constant/urls.py +25 -9
- epstein_files/util/constants.py +1018 -1045
- epstein_files/util/data.py +20 -55
- epstein_files/util/{file_cfg.py → doc_cfg.py} +121 -43
- epstein_files/util/env.py +19 -20
- epstein_files/util/file_helper.py +38 -21
- epstein_files/util/highlighted_group.py +229 -177
- epstein_files/util/logging.py +63 -0
- epstein_files/util/output.py +180 -0
- epstein_files/util/rich.py +29 -17
- epstein_files/util/search_result.py +14 -6
- epstein_files/util/timer.py +24 -0
- epstein_files/util/word_count.py +2 -1
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/METADATA +20 -4
- epstein_files-1.0.2.dist-info/RECORD +33 -0
- epstein_files-1.0.2.dist-info/entry_points.txt +7 -0
- epstein_files-1.0.0.dist-info/RECORD +0 -28
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/WHEEL +0 -0
epstein_files/util/data.py
CHANGED
@@ -3,17 +3,13 @@ Helpers for dealing with various kinds of data.
 """
 import itertools
 import re
-import time
-from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from dateutil import tz
 from typing import TypeVar

-from dateutil.parser import parse
-from rich.text import Text
-
 from epstein_files.util.constant import names
-from epstein_files.util.env import args
+from epstein_files.util.env import args
+from epstein_files.util.logging import logger

 T = TypeVar('T')

@@ -23,37 +19,22 @@ CONSTANT_VAR_REGEX = re.compile(r"^[A-Z_]+$")
 ALL_NAMES = [v for k, v in vars(names).items() if isinstance(v, str) and CONSTANT_VAR_REGEX.match(k)]

 PACIFIC_TZ = tz.gettz("America/Los_Angeles")
-TIMEZONE_INFO = {"
-
-
-def collapse_newlines(text: str) -> str:
-    return MULTINEWLINE_REGEX.sub('\n\n', text)
+TIMEZONE_INFO = {"PDT": PACIFIC_TZ, "PST": PACIFIC_TZ} # Suppresses annoying warnings from parse() calls


-
-
+collapse_newlines = lambda text: MULTINEWLINE_REGEX.sub('\n\n', text)
+date_str = lambda dt: dt.isoformat()[0:10] if dt else None
+escape_double_quotes = lambda text: text.replace('"', r'\"')
+escape_single_quotes = lambda text: text.replace("'", r"\'")
+iso_timestamp = lambda dt: dt.isoformat().replace('T', ' ')
+uniquify = lambda _list: list(set(_list))
+without_nones = lambda _list: [e for e in _list if e]


 def dict_sets_to_lists(d: dict[str, set]) -> dict[str, list]:
     return {k: sorted(list(v)) for k, v in d.items()}


-def extract_datetime(s: str) -> datetime | None:
-    match = ISO_DATE_REGEX.search(s)
-
-    if not match:
-        return None
-
-    date_str = match.group(0)
-
-    if len(date_str) == 4:
-        date_str += '-01-01'
-    elif len(date_str) == 7:
-        date_str += '-01'
-
-    return parse(date_str, tzinfos=TIMEZONE_INFO)
-
-
 def extract_last_name(name: str) -> str:
     if ' ' not in name:
         return name
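The helpers added above are one-line lambdas; a rough sketch of their behavior (example values invented, not taken from the package):

    from datetime import datetime

    dt = datetime(2019, 7, 6, 14, 30)
    date_str(dt)                     # '2019-07-06' (first ten chars of the ISO string)
    iso_timestamp(dt)                # '2019-07-06 14:30:00' (ISO string with 'T' replaced)
    uniquify(['a', 'b', 'a'])        # ['a', 'b'] (order not guaranteed: set round-trip)
    without_nones([1, None, 0, 2])   # [1, 2] (drops every falsy element, not just None)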
@@ -70,14 +51,19 @@ def flatten(_list: list[list[T]]) -> list[T]:
     return list(itertools.chain.from_iterable(_list))


-def
-    return
+def json_safe(d: dict) -> dict:
+    return {
+        'None' if k is None else k: v.isoformat() if isinstance(v, datetime) else v
+        for k,v in d.items()
+    }


-def listify(listlike
+def listify(listlike) -> list:
     """Create a list of 'listlike'. Returns empty list if 'listlike' is None or empty string."""
     if isinstance(listlike, list):
         return listlike
+    elif listlike is None:
+        return [None]
     elif listlike:
         return [listlike]
     else:
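json_safe() makes a dict JSON-serializable by stringifying None keys and converting datetime values to ISO strings, and listify() now wraps a bare None as [None]. A minimal sketch of the behavior defined in this hunk (values invented):

    from datetime import datetime

    json_safe({None: 1, 'sent': datetime(2019, 7, 6)})
    # => {'None': 1, 'sent': '2019-07-06T00:00:00'}

    listify(None)    # [None] (new branch)
    listify('x')     # ['x']
    listify(['x'])   # ['x'] (lists pass through unchanged)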
@@ -93,8 +79,8 @@ def ordinal_str(n: int) -> str:
     return str(n) + suffix


-def patternize(_pattern: str | re.Pattern):
-    return _pattern if isinstance(_pattern, re.Pattern) else re.compile(
+def patternize(_pattern: str | re.Pattern) -> re.Pattern:
+    return _pattern if isinstance(_pattern, re.Pattern) else re.compile(fr"({_pattern})", re.IGNORECASE)


 def remove_timezone(timestamp: datetime) -> datetime:
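patternize() now always returns a compiled pattern, wrapping string arguments in a capture group and compiling them case-insensitively. Sketch:

    import re

    patternize('maxwell')          # equivalent to re.compile(r'(maxwell)', re.IGNORECASE)
    patternize(re.compile('x'))    # already-compiled patterns pass through unchanged
    patternize('maxwell').search('Ghislaine Maxwell').group(1)   # 'Maxwell'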
@@ -108,24 +94,3 @@ def remove_timezone(timestamp: datetime) -> datetime:
 def sort_dict(d: dict[str | None, int] | dict[str, int]) -> list[tuple[str | None, int]]:
     sort_key = lambda e: (e[0] or '').lower() if args.sort_alphabetical else [-e[1], (e[0] or '').lower()]
     return sorted(d.items(), key=sort_key)
-
-
-@dataclass
-class Timer:
-    started_at: float = field(default_factory=lambda: time.perf_counter())
-    checkpoint_at: float = field(default_factory=lambda: time.perf_counter())
-
-    def print_at_checkpoint(self, msg: str) -> None:
-        logger.warning(f"{msg} in {self.seconds_since_checkpoint()}")
-        self.checkpoint_at = time.perf_counter()
-
-    def seconds_since_checkpoint(self) -> str:
-        return f"{(time.perf_counter() - self.checkpoint_at):.2f} seconds"
-
-    def seconds_since_start(self) -> str:
-        return f"{(time.perf_counter() - self.started_at):.2f} seconds"
-
-
-escape_double_quotes = lambda text: text.replace('"', r'\"')
-escape_single_quotes = lambda text: text.replace("'", r"\'")
-uniquify = lambda _list: list(set(_list))
epstein_files/util/{file_cfg.py → doc_cfg.py}
RENAMED
@@ -6,22 +6,27 @@ from typing import Generator, Literal

 from dateutil.parser import parse

-from epstein_files.util.constant.names import
-from epstein_files.util.constant.strings import
+from epstein_files.util.constant.names import *
+from epstein_files.util.constant.strings import *
+from epstein_files.util.data import without_nones

-DuplicateType = Literal['
+DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
+Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]

+# Misc
+CONSTANTIZE_NAMES = False # A flag set to True that causes repr() of these classes to return strings of usable code
 INDENT = ' '
 INDENT_NEWLINE = f'\n{INDENT}'
 INDENTED_JOIN = f',{INDENT_NEWLINE}'
-
-
+MAX_LINE_LENGTH = 150
+REPUTATION_MGMT = f'{REPUTATION} management'
+SAME = 'same'

-
-    'earlier': 'earlier draft of',
+DUPE_TYPE_STRS: dict[DuplicateType, str] = {
+    'earlier': 'an earlier draft of',
     'quoted': 'quoted in full in',
-    'redacted': 'redacted version of',
-
+    'redacted': 'a redacted version of',
+    SAME: 'the same as',
 }

 FIELD_SORT_KEY = {
@@ -30,57 +35,116 @@ FIELD_SORT_KEY = {
     'attribution_reason': 'zz',
 }

+FINANCIAL_REPORTS_AUTHORS = [
+    BOFA,
+    DEUTSCHE_BANK,
+    ELECTRON_CAPITAL_PARTNERS,
+    GOLDMAN_INVESTMENT_MGMT,
+    'Invesco',
+    JP_MORGAN,
+    'Morgan Stanley',
+    'S&P',
+]
+
+# Fields like timestamp and author are better added from the Document object
+INVALID_FOR_METADATA = [
+    'actual_text',
+    'date',
+    'id',
+    'timestamp',
+    'was_generated',
+]
+

 @dataclass(kw_only=True)
-class
-    """
+class DocCfg:
+    """
+    Encapsulates info about files that needs to be manually configured because it cannot be programmatically inferred.

     Attributes:
         id (str): ID of file
         author (str | None): Author of the document (if any)
+        category (str | None): Type of file
         date (str | None): If passed will be immediated parsed into the 'timestamp' field
         dupe_of_id (str | None): If this is a dupe the ID of the duplicated file. This file will be suppressed
-        dupe_type (DuplicateType | None): The type of duplicate this file is
-        duplicate_ids (list[str]): Inverse of 'dupe_of_id' - this file will NOT be suppressed but 'duplicate_ids' will be
+        dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
+        duplicate_ids (list[str]): Inverse of 'dupe_of_id' - this file will NOT be suppressed but 'duplicate_ids' will be
+        is_interesting (bool): Override other considerations and always consider this file interesting
         timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
         was_generated (bool): True if this object was generated by the duplicate_cfgs() method
     """
     id: str
     author: str | None = None
+    category: str | None = None
     date: str | None = None
     description: str | None = None
     dupe_of_id: str | None = None
     dupe_type: DuplicateType | None = None
     duplicate_ids: list[str] = field(default_factory=list)
+    is_interesting: bool = False
     timestamp: datetime | None = None
-    was_generated: bool = False
+    was_generated: bool = False

     def __post_init__(self):
-        if self.dupe_of_id:
-            self.dupe_type = self.dupe_type or 'same'
-
         if self.date:
             self.timestamp = parse(self.date)

+        if self.dupe_of_id or self.duplicate_ids:
+            self.dupe_type = self.dupe_type or SAME
+
     def duplicate_reason(self) -> str | None:
         if self.dupe_type is not None:
-            return
+            return DUPE_TYPE_STRS[self.dupe_type]

-    def duplicate_cfgs(self) -> Generator['
+    def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
+        """Create synthetic DocCfg objects that set the 'dupe_of_id' field to point back to this object."""
         for id in self.duplicate_ids:
             dupe_cfg = deepcopy(self)
             dupe_cfg.id = id
             dupe_cfg.dupe_of_id = self.id
-            dupe_cfg.
+            dupe_cfg.duplicate_ids = []
+            dupe_cfg.dupe_type = self.dupe_type
             dupe_cfg.was_generated = True
             yield dupe_cfg

+    def info_str(self) -> str | None:
+        """String that summarizes what is known about this document."""
+        if self.category == REPUTATION:
+            return f"{REPUTATION_MGMT}: {self.description}"
+        elif self.author and self.description:
+            if self.category in [ACADEMIA, BOOK]:
+                return self.title_by_author()
+            elif self.category == FINANCE and self.author in FINANCIAL_REPORTS_AUTHORS:
+                return f"{self.author} report: '{self.description}'"
+            elif self.category == LEGAL and 'v.' in self.author:
+                return f"{self.author}: '{self.description}'"
+        elif self.category and self.author is None and self.description is None:
+            return self.category
+
+        pieces = without_nones([self.author, self.description])
+        return ' '.join(pieces) if pieces else None
+
+    def metadata(self) -> Metadata:
+        non_null_fields = {k: v for k, v in asdict(self).items() if v and k not in INVALID_FOR_METADATA}
+
+        if self.category in [EMAIL, TEXT_MESSAGE]:
+            del non_null_fields['category']
+
+        return non_null_fields
+
     def non_null_field_names(self) -> list[str]:
         return [f.name for f in self.sorted_fields() if getattr(self, f.name)]

     def sorted_fields(self) -> list[Field]:
         return sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name))

+    def title_by_author(self) -> str:
+        if not (self.author and self.description):
+            raise RuntimeError(f"Can't call title_by_author() without author and description!")
+
+        title = self.description if '"' in self.description else f"'{self.description}'"
+        return f"{title} by {self.author}"
+
     def _props_strs(self) -> list[str]:
         props = []
         add_prop = lambda f, value: props.append(f"{f.name}={value}")
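Putting the new DocCfg pieces together: a config with duplicate_ids now defaults dupe_type to SAME and can emit synthetic configs for its dupes. An illustrative sketch (the IDs and field values are invented):

    cfg = DocCfg(id='012345', description='Q3 outlook', duplicate_ids=['012399'])
    cfg.dupe_type                                   # 'same' (set by __post_init__)
    cfg.duplicate_reason()                          # 'the same as'
    [d.dupe_of_id for d in cfg.duplicate_cfgs()]    # ['012345']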
@@ -92,14 +156,16 @@ class FileCfg:
                 continue
             elif _field.name == AUTHOR:
                 add_prop(_field, constantize_name(str(value)) if CONSTANTIZE_NAMES else f"'{value}'")
+            elif _field.name == 'category' and value in [EMAIL, TEXT_MESSAGE]:
+                continue
             elif _field.name == 'recipients' and isinstance(value, list):
                 recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
                 add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
+            elif _field.name == 'timestamp' and self.date is not None:
+                continue # Don't print both timestamp and date
             elif isinstance(value, datetime):
                 value_str = re.sub(' 00:00:00', '', str(value))
                 add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
-            elif _field.name == 'description':
-                add_prop(_field, value.strip())
             elif isinstance(value, str):
                 if "'" in value:
                     value = '"' + value.replace('"', r'\"') + '"'
@@ -112,22 +178,12 @@ class FileCfg:

         return props

-    def __eq__(self, other: 'FileCfg') -> bool:
-        """Return True if everything matches other than the two 'dupe_' fields ('duplicate_ids' is compared)."""
-        for _field in self.sorted_fields():
-            if _field.name == 'id' or _field.name.startswith('dupe'):
-                continue
-            elif getattr(self, _field.name) != getattr(other, _field.name):
-                return False
-
-        return True
-
     def __repr__(self) -> str:
         props = self._props_strs()
         type_str = f"{type(self).__name__}("
         single_line_repr = type_str + ', '.join(props) + f')'

-        if
+        if len(single_line_repr) < MAX_LINE_LENGTH:
             repr_str = single_line_repr
         else:
             repr_str = f"{type_str}{INDENT_NEWLINE}" + INDENTED_JOIN.join(props)
@@ -142,31 +198,53 @@ class FileCfg:


 @dataclass(kw_only=True)
-class
+class CommunicationCfg(DocCfg):
     """
-    Convenience class to unite various configured properties for a given Communication file.
     Manual config is always required for MessengerLog author attribution. It's also often needed for Email
     files to handle the terrible OCR text that Congress provided which messes up a lot of the email headers.

     Attributes:
-        actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
         attribution_reason (str | None): Optional explanation of why this email was attributed to this author.
         is_attribution_uncertain (bool): True if we have a good idea of who the author is but are not 100% certain
+    """
+    attribution_reason: str | None = None
+    is_attribution_uncertain: bool = False
+
+    def __repr__(self) -> str:
+        return super().__repr__()
+
+
+@dataclass(kw_only=True)
+class EmailCfg(CommunicationCfg):
+    """
+    Attributes:
+        actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
         is_fwded_article (bool): True if this is a newspaper article someone fwded. Used to exclude articles from word counting.
         recipients (list[str | None]): Who received the email
     """
     actual_text: str | None = None # Override for the Email._actual_text() method for particularly broken emails
-    attribution_reason: str | None = None
-    is_attribution_uncertain: bool = False
     is_fwded_article: bool = False
     recipients: list[str | None] = field(default_factory=list)

-    def
-
+    def __post_init__(self):
+        super().__post_init__()
+        self.category = EMAIL
+
+    @classmethod
+    def from_doc_cfg(cls, cfg: DocCfg) -> 'EmailCfg':
+        return cls(**asdict(cfg))

+    # This is necessary for some dumb reason. @dataclass(repr=False) doesn't cut it
     def __repr__(self) -> str:
         return super().__repr__()

-
-
-
+
+@dataclass(kw_only=True)
+class TextCfg(CommunicationCfg):
+    def __post_init__(self):
+        super().__post_init__()
+        self.category = TEXT_MESSAGE
+
+    # This is necessary for some dumb reason. @dataclass(repr=False) doesn't cut it
+    def __repr__(self) -> str:
+        return super().__repr__()
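The new subclasses pin category in __post_init__, so callers never set it by hand, and EmailCfg.from_doc_cfg() upgrades a plain DocCfg. Roughly (field values invented):

    email_cfg = EmailCfg(id='000001', recipients=['Jeffrey Epstein'])
    email_cfg.category               # EMAIL (forced by __post_init__)

    TextCfg(id='000002').category    # TEXT_MESSAGE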
epstein_files/util/env.py
CHANGED
@@ -4,42 +4,43 @@ from os import environ
 from pathlib import Path
 from sys import argv

-from
+from epstein_files.util.logging import datefinder_logger, env_log_level, logger

 DEFAULT_WIDTH = 154
-HTML_SCRIPTS = ['generate_html.py', 'count_words.py']
+HTML_SCRIPTS = ['epstein_generate', 'generate_html.py', 'count_words.py']


 parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML page.")
-parser.add_argument('--build', '-b', action='store_true', help='write
-parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails')
-parser.add_argument('--all-
-parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of a limited selection')
+parser.add_argument('--build', '-b', action='store_true', help='write output to file')
+parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
+parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just interesting ones')
 parser.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
 parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
+parser.add_argument('--output-file', '-out', metavar='FILE', default='index.html', help='write output to FILE in docs/ (default=index.html)')
 parser.add_argument('--output-emails', '-oe', action='store_true', help='generate other files section')
 parser.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
 parser.add_argument('--output-texts', '-ot', action='store_true', help='generate other files section')
 parser.add_argument('--pickled', '-p', action='store_true', help='use pickled EpsteinFiles object')
 parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='generate new pickled EpsteinFiles object')
+parser.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (only used by scripts)')
 parser.add_argument('--sort-alphabetical', '-alpha', action='store_true', help='sort emailers alphabetically in counts table')
 parser.add_argument('--suppress-output', '-s', action='store_true', help='no output to terminal (use with --build)')
-parser.add_argument('--use-epstein-web-links', '-use', action='store_true', help='use epsteinweb.org links instead of
-parser.add_argument('--search-other', '-so', action='store_true', help='search for string in non email/text files (only used by search script)')
+parser.add_argument('--use-epstein-web-links', '-use', action='store_true', help='use epsteinweb.org links instead of epstein.media')
 parser.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use')
 parser.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (only used by search script)')
 parser.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
 parser.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
+parser.add_argument('--make-clean', '-mc', action='store_true', help='delete all build artifact HTML and JSON files')
 parser.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
+parser.add_argument('--json-metadata', '-jm', action='store_true', help='dump JSON metadata for all files')
 parser.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats at the end')
 parser.add_argument('positional_args', nargs='*', help='Optional args (only used by helper scripts)')
 args = parser.parse_args()

-is_env_var_set = lambda s: len(environ.get(s) or '') > 0
 current_script = Path(argv[0]).name
+is_env_var_set = lambda s: len(environ.get(s) or '') > 0
 is_html_script = current_script in HTML_SCRIPTS

-args.deep_debug = args.deep_debug or is_env_var_set('DEEP_DEBUG')
 args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
 args.output_emails = args.output_emails or args.all_emails
 args.output_other_files = args.output_other_files or args.all_other_files
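Combined with the new epstein_generate entry point (see entry_points.txt in the file list above), a build invocation using the new flags might look like this (illustrative only):

    epstein_generate --build --output-file index.html --all-emails --json-metadata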
@@ -48,27 +49,25 @@ args.width = args.width if is_html_script else None
 specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]


-#
-logging.basicConfig(level="NOTSET", format="%(message)s", datefmt="[%X]", handlers=[RichHandler()])
-# logging.basicConfig(level="DEBUG", handlers=[RichHandler()])
-logger = logging.getLogger("rich")
-
+# Log level args
 if args.deep_debug:
     logger.setLevel(logging.DEBUG)
 elif args.debug:
     logger.setLevel(logging.INFO)
 elif args.suppress_logs:
     logger.setLevel(logging.FATAL)
-
+elif not env_log_level:
     logger.setLevel(logging.WARNING)

-
+logger.info(f'Log level set to {logger.level}...')
 datefinder_logger.setLevel(logger.level)


 # Massage args that depend on other args to the appropriate state
-if not (args.output_texts or args.output_emails or args.output_other_files):
-
+if not (args.json_metadata or args.output_texts or args.output_emails or args.output_other_files):
+    if is_html_script:
+        logger.warning(f"No output section chosen; outputting default of texts, selected emails, and other files...")
+
     args.output_texts = True
     args.output_emails = True
     args.output_other_files = True
@@ -77,4 +76,4 @@ if args.use_epstein_web_links:
     logger.warning(f"Using links to epsteinweb.org links instead of epsteinify.com...")

 if args.debug:
-    logger.warning(f"
+    logger.warning(f"Invocation args:\nis_html_script={is_html_script},\nspecified_names={specified_names},\nargs={args}")
epstein_files/util/file_helper.py
CHANGED
@@ -3,7 +3,7 @@ from os import environ
 from pathlib import Path
 from sys import exit

-from epstein_files.util.constant.strings import HOUSE_OVERSIGHT_PREFIX
+from epstein_files.util.constant.strings import FILE_NAME_REGEX, FILE_STEM_REGEX, HOUSE_OVERSIGHT_PREFIX

 EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
 DOCS_DIR_ENV = environ[EPSTEIN_DOCS_DIR_ENV_VAR_NAME]
@@ -16,30 +16,23 @@ elif not DOCS_DIR.exists():
     print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!")
     exit(1)

-JSON_DIR = DOCS_DIR.joinpath('json_files')
-HTML_DIR = Path('docs')
 EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
-
-WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_emails_word_count.html')
-EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
-PICKLED_PATH = Path("the_epstein_files.pkl.gz")
-
-FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}(\d{{6}})")
-FILE_ID_REGEX = re.compile(fr".*{FILE_STEM_REGEX.pattern}(_\d{{1,2}})?(\.txt(\.json)?)?")
+FILE_ID_REGEX = re.compile(fr".*{FILE_NAME_REGEX.pattern}")
 FILENAME_LENGTH = len(HOUSE_OVERSIGHT_PREFIX) + 6
 KB = 1024
 MB = KB * KB


 # Handles both string and int 'id' args.
-
+id_str = lambda id: f"{int(id):06d}"
 filename_for_id = lambda id: file_stem_for_id(id) + '.txt'


 def coerce_file_stem(filename_or_id: int | str) -> str:
     """Generate a valid file_stem no matter what form the argument comes in."""
     if isinstance(filename_or_id, str) and filename_or_id.startswith(HOUSE_OVERSIGHT_PREFIX):
-
+        file_id = extract_file_id(filename_or_id)
+        file_stem = file_stem_for_id(file_id)
     else:
         file_stem = file_stem_for_id(filename_or_id)

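coerce_file_stem() and the helpers added in the next hunk accept ints, short ID strings, or full filenames. Assuming HOUSE_OVERSIGHT_PREFIX == 'HOUSE_OVERSIGHT_' and that FILE_NAME_REGEX still captures the six-digit ID as group 1:

    coerce_file_stem(29835)                           # 'HOUSE_OVERSIGHT_029835' (id_str zero-pads to six digits)
    coerce_file_stem('HOUSE_OVERSIGHT_029835.txt')    # 'HOUSE_OVERSIGHT_029835'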
@@ -49,32 +42,56 @@ def coerce_file_stem(filename_or_id: int | str) -> str:
     return file_stem


-def
-
+def coerce_file_name(filename_or_id: int | str) -> str:
+    return coerce_file_stem(filename_or_id) + '.txt'
+
+
+def coerce_file_path(filename_or_id: int | str) -> Path:
+    return DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
+
+
+def extract_file_id(filename_or_id: int | str | Path) -> str:
+    if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
+        return id_str(filename_or_id)
+
+    file_match = FILE_ID_REGEX.match(str(filename_or_id))

     if not file_match:
-        raise RuntimeError(f"Failed to extract file ID from {
+        raise RuntimeError(f"Failed to extract file ID from {filename_or_id}")

     return file_match.group(1)


+def file_size(file_path: str | Path) -> int:
+    return Path(file_path).stat().st_size
+
+
 def file_size_str(file_path: str | Path) -> str:
-
+    size = file_size(file_path)
     digits = 2

-    if
-        size_num =
+    if size > MB:
+        size_num = float(size) / MB
         size_str = 'MB'
-    elif
-        size_num =
+    elif size > KB:
+        size_num = float(size) / KB
         size_str = 'kb'
         digits = 1
     else:
-        return f"{
+        return f"{size} b"

     return f"{size_num:,.{digits}f} {size_str}"


+def file_stem_for_id(id: int | str) -> str:
+    if isinstance(id, int) or (isinstance(id, str) and len(id) <= 6):
+        return f"{HOUSE_OVERSIGHT_PREFIX}{id_str(id)}"
+    elif len(id) == 8:
+        return f"{HOUSE_OVERSIGHT_PREFIX}{id}"
+    else:
+        raise RuntimeError(f"Unknown kind of file id {id}")
+
+
 def is_local_extract_file(filename) -> bool:
     """Return true if filename is of form 'HOUSE_OVERSIGHT_029835_1.txt'."""
     file_match = FILE_ID_REGEX.match(str(filename))