epstein-files 1.0.10__py3-none-any.whl → 1.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +7 -9
- epstein_files/documents/communication.py +2 -2
- epstein_files/documents/document.py +94 -81
- epstein_files/documents/email.py +47 -5
- epstein_files/documents/imessage/text_message.py +4 -13
- epstein_files/documents/json_file.py +13 -1
- epstein_files/documents/messenger_log.py +32 -19
- epstein_files/documents/other_file.py +67 -44
- epstein_files/epstein_files.py +22 -15
- epstein_files/util/constant/names.py +11 -10
- epstein_files/util/constant/strings.py +2 -1
- epstein_files/util/constants.py +98 -88
- epstein_files/util/data.py +1 -1
- epstein_files/util/doc_cfg.py +32 -62
- epstein_files/util/env.py +29 -17
- epstein_files/util/file_helper.py +12 -29
- epstein_files/util/highlighted_group.py +34 -17
- epstein_files/util/logging.py +1 -7
- epstein_files/util/output.py +13 -8
- epstein_files/util/rich.py +15 -10
- epstein_files/util/word_count.py +65 -5
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/METADATA +1 -1
- epstein_files-1.0.12.dist-info/RECORD +33 -0
- epstein_files/count_words.py +0 -72
- epstein_files-1.0.10.dist-info/RECORD +0 -34
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/WHEEL +0 -0
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/entry_points.txt +0 -0
epstein_files/util/doc_cfg.py
CHANGED
@@ -8,7 +8,7 @@ from dateutil.parser import parse
 
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
-from epstein_files.util.data import without_falsey
+from epstein_files.util.data import remove_time_from_timestamp_str, without_falsey
 
 DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
 Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]
@@ -47,12 +47,11 @@ FINANCIAL_REPORTS_AUTHORS = [
 ]
 
 # Fields like timestamp and author are better added from the Document object
-
+NON_METADATA_FIELDS = [
     'actual_text',
     'date',
     'id',
-    '
-    'was_generated',
+    'is_synthetic',
 ]
 
 
@@ -68,10 +67,10 @@ class DocCfg:
         date (str | None): If passed will be immediated parsed into the 'timestamp' field
         dupe_of_id (str | None): If this is a dupe the ID of the duplicated file. This file will be suppressed
         dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
-        duplicate_ids (list[str]):
+        duplicate_ids (list[str]): IDs of *other* documents that are dupes of this document
         is_interesting (bool): Override other considerations and always consider this file interesting
         timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
-
+        is_synthetic (bool): True if this config was generated by the duplicate_cfgs() method
    """
    id: str
    author: str | None = None
@@ -82,8 +81,8 @@ class DocCfg:
    dupe_type: DuplicateType | None = None
    duplicate_ids: list[str] = field(default_factory=list)
    is_interesting: bool = False
+   is_synthetic: bool = False
    timestamp: datetime | None = None
-   was_generated: bool = False
 
    def __post_init__(self):
        if self.date:
@@ -92,66 +91,48 @@ class DocCfg:
        if self.dupe_of_id or self.duplicate_ids:
            self.dupe_type = self.dupe_type or SAME
 
-    def
-        if self.dupe_type is not None:
-            return DUPE_TYPE_STRS[self.dupe_type]
-
-    def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
-        """Create synthetic DocCfg objects that set the 'dupe_of_id' field to point back to this object."""
-        for id in self.duplicate_ids:
-            dupe_cfg = deepcopy(self)
-            dupe_cfg.id = id
-            dupe_cfg.dupe_of_id = self.id
-            dupe_cfg.duplicate_ids = []
-            dupe_cfg.dupe_type = self.dupe_type
-            dupe_cfg.was_generated = True
-            yield dupe_cfg
-
-    def info_str(self) -> str | None:
+    def complete_description(self) -> str | None:
        """String that summarizes what is known about this document."""
-        if self.category and not self.description:
+        if self.category and not self.description and not self.author:
            return self.category
        elif self.category == REPUTATION:
            return f"{REPUTATION_MGMT}: {self.description}"
+        elif self.category == SKYPE_LOG:
+            msg = f"{self.category} of conversation with {self.author}" if self.author else self.category
+            return f"{msg} {self.description}" if self.description else msg
        elif self.author and self.description:
            if self.category in [ACADEMIA, BOOK]:
-
+                title = self.description if '"' in self.description else f"'{self.description}'"
+                return f"{title} by {self.author}"
            elif self.category == FINANCE and self.author in FINANCIAL_REPORTS_AUTHORS:
                return f"{self.author} report: '{self.description}'"
            elif self.category == LEGAL and 'v.' in self.author:
-                return f"{self.author}:
+                return f"{self.author}: {self.description}"
        elif self.category and self.author is None and self.description is None:
            return self.category
 
        pieces = without_falsey([self.author, self.description])
        return ' '.join(pieces) if pieces else None
 
-    def
-
-
-
-
-
-
-
-
-
-
-    def sorted_fields(self) -> list[Field]:
-        return sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name))
-
-    def title_by_author(self) -> str:
-        if not (self.author and self.description):
-            raise RuntimeError(f"Can't call title_by_author() without author and description!")
+    def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
+        """Create synthetic DocCfg objects that set the 'dupe_of_id' field to point back to this object."""
+        for id in self.duplicate_ids:
+            dupe_cfg = deepcopy(self)
+            dupe_cfg.id = id
+            dupe_cfg.dupe_of_id = self.id
+            dupe_cfg.duplicate_ids = []
+            dupe_cfg.dupe_type = self.dupe_type
+            dupe_cfg.is_synthetic = True
+            yield dupe_cfg
 
-
-        return
+    def metadata(self) -> Metadata:
+        return {k: v for k, v in asdict(self).items() if k not in NON_METADATA_FIELDS and v}
 
    def _props_strs(self) -> list[str]:
        props = []
        add_prop = lambda f, value: props.append(f"{f.name}={value}")
 
-        for _field in self.
+        for _field in sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name)):
            value = getattr(self, _field.name)
 
            if value is None or value is False or (isinstance(value, list) and len(value) == 0):
@@ -160,13 +141,13 @@ class DocCfg:
                add_prop(_field, constantize_name(str(value)) if CONSTANTIZE_NAMES else f"'{value}'")
            elif _field.name == 'category' and value in [EMAIL, TEXT_MESSAGE]:
                continue
-            elif _field.name == 'recipients' and
+            elif _field.name == 'recipients' and value:
                recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
                add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
            elif _field.name == 'timestamp' and self.date is not None:
                continue  # Don't print both timestamp and date
            elif isinstance(value, datetime):
-                value_str =
+                value_str = remove_time_from_timestamp_str(value)
                add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
            elif isinstance(value, str):
                if "'" in value:
@@ -221,22 +202,15 @@ class EmailCfg(CommunicationCfg):
    """
    Attributes:
        actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
+        fwded_text_after (str | None): If set, any text after this is a fwd of an article or similar
        is_fwded_article (bool): True if this is a newspaper article someone fwded. Used to exclude articles from word counting.
        recipients (list[str | None]): Who received the email
    """
-    actual_text: str | None = None
-    fwded_text_after: str | None = None
+    actual_text: str | None = None
+    fwded_text_after: str | None = None
    is_fwded_article: bool = False
    recipients: list[str | None] = field(default_factory=list)
 
-    def __post_init__(self):
-        super().__post_init__()
-        self.category = EMAIL
-
-    @classmethod
-    def from_doc_cfg(cls, cfg: DocCfg) -> 'EmailCfg':
-        return cls(**asdict(cfg))
-
    # This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
    def __repr__(self) -> str:
        return super().__repr__()
@@ -244,10 +218,6 @@ class EmailCfg(CommunicationCfg):
 
 @dataclass(kw_only=True)
 class TextCfg(CommunicationCfg):
-    def __post_init__(self):
-        super().__post_init__()
-        self.category = TEXT_MESSAGE
-
    # This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
    def __repr__(self) -> str:
        return super().__repr__()
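As a rough illustration of the renamed flag (was_generated → is_synthetic) and the reorganized DocCfg helpers, the sketch below shows how the new methods appear to fit together; it is not part of the package diff, the document IDs are hypothetical, and the constructor signature is assumed from the fields visible above.

    from epstein_files.util.doc_cfg import DocCfg

    # Hypothetical document IDs, purely for illustration
    cfg = DocCfg(id='029835', duplicate_ids=['029836', '029837'])

    for dupe_cfg in cfg.duplicate_cfgs():
        # Each synthetic config points back at the original and is now flagged
        # via 'is_synthetic' rather than the old 'was_generated' field
        assert dupe_cfg.dupe_of_id == cfg.id
        assert dupe_cfg.is_synthetic

    # metadata() drops NON_METADATA_FIELDS ('id', 'date', 'is_synthetic', ...) and falsey values
    print(cfg.metadata())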
epstein_files/util/env.py
CHANGED

@@ -2,7 +2,7 @@ import logging
 from argparse import ArgumentParser
 from os import environ
 from pathlib import Path
-from sys import argv
+from sys import argv, exit
 
 from rich_argparse_plus import RichHelpFormatterPlus
 
@@ -11,28 +11,30 @@ from epstein_files.util.logging import env_log_level, logger
 COUNT_WORDS_SCRIPT = 'epstein_word_count'
 DEFAULT_WIDTH = 145
 HTML_SCRIPTS = ['epstein_generate', COUNT_WORDS_SCRIPT]
+EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
+
 
 RichHelpFormatterPlus.choose_theme('morning_glory')
 parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML pages.", formatter_class=RichHelpFormatterPlus)
+parser.add_argument('--make-clean', action='store_true', help='delete all HTML build artifact and write latest URLs to .urls.env')
 parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
-parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='ovewrite cached
+parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='re-parse the files and ovewrite cached data')
 
-output = parser.add_argument_group('OUTPUT')
+output = parser.add_argument_group('OUTPUT', 'Options used by epstein_generate.')
 output.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
 output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
-output.add_argument('--build', '-b', action='store_true', help='write output to
-output.add_argument('--json-
-output.add_argument('--
-output.add_argument('--output-emails', '-oe', action='store_true', help='generate
-output.add_argument('--output-
-output.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
+output.add_argument('--build', '-b', action='store_true', help='write HTML output to a file')
+output.add_argument('--json-files', action='store_true', help='pretty print all the raw JSON data files in the collection and exit')
+output.add_argument('--json-metadata', action='store_true', help='dump JSON metadata for all files and exit')
+output.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
+output.add_argument('--output-other', '-oo', action='store_true', help='generate other files section')
 output.add_argument('--output-texts', '-ot', action='store_true', help='generate text messages section')
 output.add_argument('--sort-alphabetical', action='store_true', help='sort emailers alphabetically intead of by email count')
 output.add_argument('--suppress-output', action='store_true', help='no output to terminal (use with --build)')
 output.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use (in characters)')
-output.add_argument('--use-epstein-web
+output.add_argument('--use-epstein-web', action='store_true', help='use epsteinweb.org links instead of epstein.media')
 
-scripts = parser.add_argument_group('SCRIPTS', '
+scripts = parser.add_argument_group('SCRIPTS', 'Options used by epstein_search, epstein_show, and epstein_diff.')
 scripts.add_argument('positional_args', nargs='*', help='strings to searchs for, file IDs to show or diff, etc.')
 scripts.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (used by epstein_show)')
 scripts.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (used by epstein_search)')
@@ -42,23 +44,35 @@ debug.add_argument('--colors-only', '-c', action='store_true', help='print heade
 debug.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
 debug.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
 debug.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats about the files')
+debug.add_argument('--skip-other-files', '-sof', action='store_true', help='skip parsing non email/text files')
 debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
 args = parser.parse_args()
 
+
+# Verify Epstein docs can be found
+DOCS_DIR_ENV = environ.get(EPSTEIN_DOCS_DIR_ENV_VAR_NAME)
+DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
+
+if not DOCS_DIR_ENV:
+    print(f"\n ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!\n")
+    exit(1)
+elif not DOCS_DIR.exists():
+    print(f"\n ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!\n")
+    exit(1)
+
 current_script = Path(argv[0]).name
 is_env_var_set = lambda s: len(environ.get(s) or '') > 0
 is_html_script = current_script in HTML_SCRIPTS
 
 args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
 args.output_emails = args.output_emails or args.all_emails
-args.
+args.output_other = args.output_other or args.all_other_files
 args.overwrite_pickle = args.overwrite_pickle or (is_env_var_set('OVERWRITE_PICKLE') and not is_env_var_set('PICKLED'))
 args.width = args.width if is_html_script else None
 is_output_selected = any([arg.startswith('output_') and value for arg, value in vars(args).items()])
 is_output_selected = is_output_selected or args.json_metadata or args.colors_only
 specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]
 
-
 # Log level args
 if args.deep_debug:
     logger.setLevel(logging.DEBUG)
@@ -74,11 +88,9 @@ logger.info(f'Log level set to {logger.level}...')
 # Massage args that depend on other args to the appropriate state
 if current_script == 'epstein_generate' and not (is_output_selected or args.make_clean):
     logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
-    args.output_texts = True
-    args.output_emails = True
-    args.output_other_files = True
+    args.output_texts = args.output_emails = args.output_other = True
 
-if args.
+if args.use_epstein_web:
     logger.warning(f"Using links to epsteinweb.org links instead of epsteinify.com...")
 
 if args.debug:
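Because the docs-directory check now runs at import time in util/env.py rather than in file_helper.py, the EPSTEIN_DOCS_DIR environment variable has to be set before the module is imported. A minimal sketch (not part of the package diff; the path is hypothetical):

    import os

    # Must point at an existing directory of the released .txt files, otherwise
    # the module prints an ERROR and calls exit(1) during import
    os.environ['EPSTEIN_DOCS_DIR'] = '/data/house_oversight_docs'

    from epstein_files.util.env import DOCS_DIR
    print(DOCS_DIR)  # resolved Path to the docs directory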
epstein_files/util/file_helper.py
CHANGED

@@ -1,20 +1,9 @@
 import re
-from os import environ
 from pathlib import Path
-from sys import exit
 
 from epstein_files.util.constant.strings import FILE_NAME_REGEX, FILE_STEM_REGEX, HOUSE_OVERSIGHT_PREFIX
-
-
-DOCS_DIR_ENV = environ[EPSTEIN_DOCS_DIR_ENV_VAR_NAME]
-DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
-
-if not DOCS_DIR_ENV:
-    print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!")
-    exit(1)
-elif not DOCS_DIR.exists():
-    print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!")
-    exit(1)
+from epstein_files.util.env import DOCS_DIR
+from epstein_files.util.logging import logger
 
 EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
 FILE_ID_REGEX = re.compile(fr".*{FILE_NAME_REGEX.pattern}")
@@ -22,10 +11,13 @@ FILENAME_LENGTH = len(HOUSE_OVERSIGHT_PREFIX) + 6
 KB = 1024
 MB = KB * KB
 
+file_size = lambda file_path: Path(file_path).stat().st_size
+file_size_str = lambda file_path: file_size_to_str(file_size(file_path))
 
-#
+# Coerce methods handle both string and int arguments.
+coerce_file_name = lambda filename_or_id: coerce_file_stem(filename_or_id) + '.txt'
+coerce_file_path = lambda filename_or_id: DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
 id_str = lambda id: f"{int(id):06d}"
-filename_for_id = lambda id: file_stem_for_id(id) + '.txt'
 
 
 def coerce_file_stem(filename_or_id: int | str) -> str:
@@ -42,14 +34,6 @@ def coerce_file_stem(filename_or_id: int | str) -> str:
     return file_stem
 
 
-def coerce_file_name(filename_or_id: int | str) -> str:
-    return coerce_file_stem(filename_or_id) + '.txt'
-
-
-def coerce_file_path(filename_or_id: int | str) -> Path:
-    return DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
-
-
 def extract_file_id(filename_or_id: int | str | Path) -> str:
     if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
         return id_str(filename_or_id)
@@ -62,12 +46,7 @@ def extract_file_id(filename_or_id: int | str | Path) -> str:
         return file_match.group(1)
 
 
-def
-    return Path(file_path).stat().st_size
-
-
-def file_size_str(file_path: str | Path) -> str:
-    size = file_size(file_path)
+def file_size_to_str(size: int) -> str:
     digits = 2
 
     if size > MB:
@@ -96,3 +75,7 @@ def is_local_extract_file(filename) -> bool:
     """Return true if filename is of form 'HOUSE_OVERSIGHT_029835_1.txt'."""
     file_match = FILE_ID_REGEX.match(str(filename))
     return True if file_match and file_match.group(2) else False
+
+
+def log_file_write(file_path: str | Path) -> None:
+    logger.warning(f"Wrote {file_size_str(file_path)} to '{file_path}'")
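The former coerce_file_name()/coerce_file_path()/file_size() functions are now one-line lambdas, file_size_str() is rebuilt on top of the new file_size_to_str(size), and log_file_write() has moved here from util/logging.py. A rough usage sketch (not part of the package diff), assuming coerce_file_stem() still expands a bare numeric ID into the HOUSE_OVERSIGHT file stem:

    from epstein_files.util.file_helper import (
        coerce_file_name, coerce_file_path, extract_file_id, file_size_str, id_str
    )

    print(id_str(29835))            # '029835' (zero padded to 6 digits)
    print(coerce_file_name(29835))  # presumably 'HOUSE_OVERSIGHT_029835.txt'
    print(coerce_file_path(29835))  # that filename joined onto DOCS_DIR

    # Pulls the 6 digit ID back out of a filename or Path
    print(extract_file_id('HOUSE_OVERSIGHT_029835.txt'))

    # file_size_str() now delegates to file_size_to_str(size_in_bytes),
    # assuming the file actually exists on disk
    print(file_size_str(coerce_file_path(29835)))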
epstein_files/util/highlighted_group.py
CHANGED

@@ -2,6 +2,7 @@ import re
 from dataclasses import dataclass, field
 
 from rich.highlighter import RegexHighlighter
+from rich.text import Text
 
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
@@ -21,7 +22,7 @@ EPSTEIN_ESTATE_EXECUTOR = f"Epstein {ESTATE_EXECUTOR}"
 REGEX_STYLE_PREFIX = 'regex'
 SIMPLE_NAME_REGEX = re.compile(r"^[-\w ]+$", re.IGNORECASE)
 
-
+CATEGORY_STYLE_MAPPING = {
     ARTICLE: JOURNALIST,
     ARTS: ENTERTAINER,
     BOOK: JOURNALIST,
@@ -31,6 +32,12 @@ CATEGORY_LABEL_MAPPING = {
     REPUTATION: PUBLICIST,
 }
 
+CATEGORY_STYLES = {
+    JSON: 'dark_red',
+    JUNK: 'grey19',
+    'letter': 'medium_orchid1'
+}
+
 
 @dataclass(kw_only=True)
 class HighlightedText:
@@ -156,7 +163,7 @@ HIGHLIGHTED_NAMES = [
    HighlightedNames(
        label=BUSINESS,
        style='spring_green4',
-        pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
+        pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|New Leaf Ventures|Park Partners|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
        emailers = {
            ALIREZA_ITTIHADIEH: 'CEO Freestream Aircraft Limited',
            BARBRO_C_EHNBOM: 'Swedish pharmaceuticals, SALSS',
@@ -216,6 +223,7 @@ HIGHLIGHTED_NAMES = [
            'Linda Pinto': 'interior design at Alberto Pinto Cabinet',
            MERWIN_DELA_CRUZ: None, # HOUSE_OVERSIGHT_032652 Groff says "Jojo and Merwin both requested off Nov. 25 and 26"
            NADIA_MARCINKO: 'pilot',
+            'Sean J. Lancaster': 'airplane reseller',
        }
    ),
    HighlightedNames(
@@ -253,6 +261,8 @@
            MARTIN_WEINBERG: CRIMINAL_DEFENSE_ATTORNEY,
            MICHAEL_MILLER: 'Steptoe LLP partner',
            REID_WEINGARTEN: 'Steptoe LLP partner',
+            ROBERT_D_CRITTON_JR: 'criminal defense attorney',
+            'Robert Gold': None,
            'Roy Black': CRIMINAL_DEFENSE_2008,
            SCOTT_J_LINK: None,
            TONJA_HADDAD_COLEMAN: f'{EPSTEIN_V_ROTHSTEIN_EDWARDS_ATTORNEY}, maybe daughter of Fred Haddad?',
@@ -303,15 +313,17 @@
        }
    ),
    HighlightedNames(
-        label=
+        label=FINANCE,
        style='green',
-        pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
+        pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|((anti.?)?money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
        emailers={
            AMANDA_ENS: 'Citigroup',
+            BRAD_WECHSLER: f"head of {LEON_BLACK}'s personal investment vehicle according to FT",
            DANIEL_SABBA: 'UBS Investment Bank',
            DAVID_FISZEL: 'CIO Honeycomb Asset Management',
            JES_STALEY: 'former CEO of Barclays',
            JIDE_ZEITLIN: 'former partner at Goldman Sachs, allegations of sexual misconduct',
+            'Laurie Cameron': 'currency trading',
            LEON_BLACK: 'Apollo CEO',
            MARC_LEON: 'Luxury Properties Sari Morrocco',
            MELANIE_SPINELLA: f'representative of {LEON_BLACK}',
@@ -325,6 +337,7 @@
        style='deep_pink2',
        pattern=r'Cambridge|(Derek\s*)?Bok|Elisa(\s*New)?|Harvard(\s*(Business|Law|University)(\s*School)?)?|(Jonathan\s*)?Zittrain|(Stephen\s*)?Kosslyn',
        emailers = {
+            "Donald Rubin": f"Professor of Statistics",
            "Kelly Friendly": f"longtime aide and spokesperson of {LARRY_SUMMERS}",
            LARRY_SUMMERS: 'board of Digital Currency Group (DCG), Harvard president, Obama economic advisor',
            'Leah Reis-Dennis': 'producer for Lisa New\'s Poetry in America',
@@ -370,7 +383,7 @@
    HighlightedNames(
        label=JOURNALIST,
        style='bright_yellow',
-        pattern=r'Palm\s*Beach\s*(Daily\s*News|Post)|ABC(\s*News)?|Alex\s*Yablon|(Andrew\s*)?Marra|Arianna(\s*Huffington)?|(Arthur\s*)?Kretchmer|BBC|Bloomberg|Breitbart|Charlie\s*Rose|China\s*Daily|CNBC|CNN(politics?)?|Con[cs]hita|Sarnoff|(?<!Virgin[-\s]Islands[-\s])Daily\s*(Beast|Mail|News|Telegraph)|(David\s*)?Pecker|David\s*Brooks|Ed\s*Krassenstein|(Emily\s*)?Michot|Ezra\s*Klein|(George\s*)?Stephanopoulus|Globe\s*and\s*Mail|Good\s*Morning\s*America|Graydon(\s*Carter)?|Huffington(\s*Post)?|Ingram, David|(James\s*)?Patterson|Jonathan\s*Karl|Julie\s*(K.?\s*)?Brown|(Katie\s*)?Couric|Keith\s*Larsen|L\.?A\.?\s*Times|Miami\s*Herald|(Michele\s*)?Dargan|(National\s*)?Enquirer|(The\s*)?N(ew\s*)?Y(ork\s*)?(P(ost)?|T(imes)?)|(The\s*)?New\s*Yorker|NYer|PERVERSION\s*OF\s*JUSTICE|Politico|Pro\s*Publica|(Sean\s*)?Hannity|Sulzberger|SunSentinel|Susan Edelman|(Uma\s*)?Sanghvi|(The\s*)?Wa(shington\s*)?Po(st)?|Viceland|Vick[iy]\s*Ward|Vox|WGBH|(The\s*)?Wall\s*Street\s*Journal|WSJ|[-\w.]+@(bbc|independent|mailonline|mirror|thetimes)\.co\.uk',
+        pattern=r'Palm\s*Beach\s*(Daily\s*News|Post)|ABC(\s*News)?|Alex\s*Yablon|(Andrew\s*)?Marra|Arianna(\s*Huffington)?|(Arthur\s*)?Kretchmer|BBC|Bloomberg|Breitbart|Charlie\s*Rose|China\s*Daily|CNBC|CNN(politics?)?|Con[cs]hita|Sarnoff|(?<!Virgin[-\s]Islands[-\s])Daily\s*(Beast|Mail|News|Telegraph)|(David\s*)?Pecker|David\s*Brooks|Ed\s*Krassenstein|(Emily\s*)?Michot|Ezra\s*Klein|(George\s*)?Stephanopoulus|Globe\s*and\s*Mail|Good\s*Morning\s*America|Graydon(\s*Carter)?|Huffington(\s*Post)?|Ingram, David|(James\s*)?(Hill|Patterson)|Jonathan\s*Karl|Julie\s*(K.?\s*)?Brown|(Katie\s*)?Couric|Keith\s*Larsen|L\.?A\.?\s*Times|Miami\s*Herald|(Michele\s*)?Dargan|(National\s*)?Enquirer|(The\s*)?N(ew\s*)?Y(ork\s*)?(P(ost)?|T(imes)?)|(The\s*)?New\s*Yorker|NYer|PERVERSION\s*OF\s*JUSTICE|Politico|Pro\s*Publica|(Sean\s*)?Hannity|Sulzberger|SunSentinel|Susan Edelman|(Uma\s*)?Sanghvi|(The\s*)?Wa(shington\s*)?Po(st)?|Viceland|Vick[iy]\s*Ward|Vox|WGBH|(The\s*)?Wall\s*Street\s*Journal|WSJ|[-\w.]+@(bbc|independent|mailonline|mirror|thetimes)\.co\.uk',
        emailers = {
            EDWARD_JAY_EPSTEIN: 'reporter who wrote about the kinds of crimes Epstein was involved in, no relation to Jeffrey',
            'James Hill': 'ABC News',
@@ -390,7 +403,7 @@
    HighlightedNames(
        label='law enforcement',
        style='color(24) bold',
-        pattern=r'ag|(Alicia\s*)?Valle|attorney|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC
+        pattern=r'ag|(Alicia\s*)?Valle|AML|attorney|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC?|CIA|CIS|CVRA|Dep(artmen)?t\.?\s*of\s*(the\s*)?(Justice|Treasury)|DHS|DOJ|FBI|FCPA|FDIC|Federal\s*Bureau\s*of\s*Investigation|FinCEN|FINRA|FOIA|FTC|IRS|(James\s*)?Comey|(Jennifer\s*Shasky\s*)?Calvery|((Judge|Mark)\s*)?(Carney|Filip)|(Kirk )?Blouin|KYC|NIH|NS(A|C)|OCC|OFAC|(Lann?a\s*)?Belohlavek|lawyer|(Michael\s*)?Reiter|OGE|Office\s*of\s*Government\s*Ethics|Police Code Enforcement|(Preet\s*)?Bharara|SCOTUS|SD(FL|NY)|Southern\s*District\s*of\s*(Florida|New\s*York)|SEC|Secret\s*Service|Securities\s*and\s*Exchange\s*Commission|State\s*Dep(artmen)?t|Strzok|Supreme\s*Court|Treasury\s*(Dep(artmen)?t|Secretary)|TSA|USAID|(William\s*J\.?\s*)?Zloch',
        emailers = {
            ANN_MARIE_VILLAFANA: 'southern district of Florida U.S. Attorney',
            DANNY_FROST: 'Director of Communications at Manhattan DA',
@@ -450,6 +463,7 @@
            IAN_OSBORNE: f"{OSBORNE_LLP} reputation repairer possibly hired by Epstein ca. 2011-06",
            MICHAEL_SITRICK: 'crisis PR',
            PEGGY_SIEGAL: 'socialite',
+            'R. Couri Hay': None,
            ROSS_GOW: 'Acuity Reputation Management',
            TYLER_SHEARS: f"{REPUTATION_MGMT}, worked on Epstein's Google search results with {CHRISTINA_GALBRAITH}",
        }
@@ -477,6 +491,7 @@
        style='red bold',
        pattern=r'Alfa\s*Bank|Anya\s*Rasulova|Chernobyl|Day\s+One\s+Ventures|(Dmitry\s)?(Kiselyov|(Lana\s*)?Pozhidaeva|Medvedev|Rybolo(o?l?ev|vlev))|Dmitry|FSB|GRU|KGB|Kislyak|Kremlin|Kuznetsova|Lavrov|Lukoil|Moscow|(Oleg\s*)?Deripaska|Oleksandr Vilkul|Rosneft|RT|St.?\s*?Petersburg|Russian?|Sberbank|Soviet(\s*Union)?|USSR|Vladimir|(Vladimir\s*)?(Putin|Yudashkin)|Women\s*Empowerment|Xitrans',
        emailers = {
+            'Dasha Zhukova': 'art collector, daughter of Alexander Zhukov',
            MASHA_DROKOVA: 'silicon valley VC, former Putin Youth',
            RENATA_BOLOTOVA: 'former aspiring model, now fund manager at New York State Insurance Fund',
            SVETLANA_POZHIDAEVA: f'Epstein\'s Russian assistant who was recommended for a visa by Sergei Belyakov (FSB) and {DAVID_BLAINE}',
@@ -485,14 +500,16 @@
    HighlightedNames(
        label=ACADEMIA,
        style='light_goldenrod2',
-        pattern=r'Alain Forget|Brotherton|Carl\s*Sagan|Columbia|David Grosof|J(ames|im)\s*Watson|(Lord\s*)?Martin\s*Rees|Massachusetts\s*Institute\s*of\s*Technology|MIT(\s*Media\s*Lab)?|Media\s*Lab|Minsky|((Noam|Valeria)\s*)?Chomsky|Praluent|Regeneron|(Richard\s*)?Dawkins|Sanofi|Stanford|(Stephen\s*)?Hawking|(Steven?\s*)?Pinker|UCLA',
+        pattern=r'Alain Forget|Brotherton|Carl\s*Sagan|Columbia|David Grosof|J(ames|im)\s*Watson|(Lord\s*)?Martin\s*Rees|Massachusetts\s*Institute\s*of\s*Technology|MIT(\s*Media\s*Lab)?|Media\s*Lab|Minsky|((Noam|Valeria)\s*)?Chomsky|Norman\s*Finkelstein|Praluent|Regeneron|(Richard\s*)?Dawkins|Sanofi|Stanford|(Stephen\s*)?Hawking|(Steven?\s*)?Pinker|UCLA',
        emailers = {
            DAVID_HAIG: None,
            JOSCHA_BACH: 'cognitive science / AI research',
            'Daniel Kahneman': 'Nobel economic sciences laureate and cognitivie psychologist (?)',
+            'Ed Boyden': 'Associate Professor, MIT Media Lab neurobiology',
            LAWRENCE_KRAUSS: 'theoretical physicist',
            LINDA_STONE: 'ex-Microsoft, MIT Media Lab',
            MARK_TRAMO: 'professor of neurology at UCLA',
+            'Nancy Dahl': f'wife of {LAWRENCE_KRAUSS}',
            NEAL_KASSELL: 'professor of neurosurgery at University of Virginia',
            PETER_ATTIA: 'longevity medicine',
            ROBERT_TRIVERS: 'evolutionary biology',
@@ -588,7 +605,7 @@
    HighlightedText(
        label='phone_number',
        style='bright_green',
-        pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})
+        pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|(\b|\+)[\d+]{10,12}\b",
    ),
 ]
 
@@ -648,18 +665,14 @@ def get_info_for_name(name: str) -> str | None:
 
 
 def get_style_for_category(category: str) -> str | None:
-    if category in
+    if category in CATEGORY_STYLES:
+        return CATEGORY_STYLES[category]
+    elif category in [CONFERENCE, SPEECH]:
        return f"{get_style_for_category(ACADEMIA)} dim"
-    elif category == JSON:
-        return 'dark_red'
-    elif category == JUNK:
-        return 'grey19'
-    elif category == 'letter':
-        return 'medium_orchid1'
    elif category == SOCIAL:
-        return
+        return get_style_for_category(PUBLICIST)
 
-    category =
+    category = CATEGORY_STYLE_MAPPING.get(category, category)
 
    for highlight_group in HIGHLIGHTED_NAMES:
        if highlight_group.label == category:
@@ -672,6 +685,10 @@ def get_style_for_name(name: str | None, default_style: str = DEFAULT, allow_bol
    return style if allow_bold else style.replace('bold', '').strip()
 
 
+def styled_category(category: str) -> Text:
+    return Text(category, get_style_for_category(category) or 'wheat4')
+
+
 def _get_highlight_group_for_name(name: str) -> HighlightedNames | None:
    for highlight_group in HIGHLIGHTED_NAMES:
        if highlight_group.regex.search(name):
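The per-category colors that previously lived in an if/elif chain inside get_style_for_category() are now data in CATEGORY_STYLES, and the new styled_category() wraps the result in a rich Text. A small sketch of how that appears to be used (not part of the package diff; the Console setup is assumed):

    from rich.console import Console
    from epstein_files.util.highlighted_group import get_style_for_category, styled_category

    console = Console()
    print(get_style_for_category('letter'))   # 'medium_orchid1' via CATEGORY_STYLES
    console.print(styled_category('letter'))  # same label rendered as a styled rich Text

    # Categories with no configured style fall back to 'wheat4' in styled_category()
    console.print(styled_category('some unknown category'))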
epstein_files/util/logging.py
CHANGED

@@ -1,6 +1,5 @@
 import logging
 from os import environ
-from pathlib import Path
 
 import datefinder
 import rich_argparse_plus
@@ -10,7 +9,6 @@ from rich.logging import RichHandler
 from rich.theme import Theme
 
 from epstein_files.util.constant.strings import *
-from epstein_files.util.file_helper import file_size_str
 
 FILENAME_STYLE = 'gray27'
 
@@ -34,7 +32,7 @@ LOG_LEVEL_ENV_VAR = 'LOG_LEVEL'
 # Augment the standard log highlighter with 'epstein_filename' matcher
 class LogHighlighter(ReprHighlighter):
     highlights = ReprHighlighter.highlights + [
-        *[fr"(?P<{doc_type}>{doc_type})" for doc_type in DOC_TYPE_STYLES.keys()],
+        *[fr"(?P<{doc_type}>{doc_type}(Cfg)?)" for doc_type in DOC_TYPE_STYLES.keys()],
         "(?P<epstein_filename>" + FILE_NAME_REGEX.pattern + ')',
     ]
 
@@ -60,7 +58,3 @@ if env_log_level_str:
 
     logger.warning(f"Setting log level to {env_log_level} based on {LOG_LEVEL_ENV_VAR} env var...")
     logger.setLevel(env_log_level)
-
-
-def log_file_write(file_path: str | Path) -> None:
-    logger.warning(f"Wrote {file_size_str(file_path)} to '{file_path}'")
epstein_files/util/output.py
CHANGED

@@ -11,7 +11,8 @@ from epstein_files.util.constant.names import *
 from epstein_files.util.constant.output_files import JSON_FILES_JSON_PATH, JSON_METADATA_PATH
 from epstein_files.util.data import dict_sets_to_lists
 from epstein_files.util.env import args, specified_names
-from epstein_files.util.
+from epstein_files.util.file_helper import log_file_write
+from epstein_files.util.logging import logger
 from epstein_files.util.rich import *
 
 PRINT_COLOR_KEY_EVERY_N_EMAILS = 150
@@ -60,7 +61,6 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
     """Returns number of emails printed."""
     print_section_header(('Selections from ' if not args.all_emails else '') + 'His Emails')
     print_other_site_link(is_header=False)
-
     emailers_to_print: list[str | None]
     emailer_tables: list[str | None] = []
     already_printed_emails: list[Email] = []
@@ -106,8 +106,8 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
     _verify_all_emails_were_printed(epstein_files, already_printed_emails)
 
     fwded_articles = [e for e in already_printed_emails if e.config and e.config.is_fwded_article]
-
-    logger.warning(f"
+    log_msg = f"Rewrote {len(Email.rewritten_header_ids)} email headers (out of {len(already_printed_emails)})"
+    logger.warning(f"{log_msg}, {len(fwded_articles)} of the emails were forwarded articles.")
     return len(already_printed_emails)
 
 
@@ -121,11 +121,11 @@ def print_json_files(epstein_files: EpsteinFiles):
     else:
         for json_file in epstein_files.json_files:
             console.line(2)
-            console.print(json_file.
+            console.print(json_file.summary_panel())
             console.print_json(json_file.json_str(), indent=4, sort_keys=False)
 
 
-def
+def write_json_metadata(epstein_files: EpsteinFiles) -> None:
     json_str = epstein_files.json_metadata()
 
     if args.build:
@@ -187,8 +187,13 @@ def write_urls() -> None:
 def _verify_all_emails_were_printed(epstein_files: EpsteinFiles, already_printed_emails: list[Email]) -> None:
     """Log warnings if some emails were never printed."""
     email_ids_that_were_printed = set([email.file_id for email in already_printed_emails])
-    logger.warning(f"Printed {len(already_printed_emails)} emails of {len(email_ids_that_were_printed)} unique file IDs.")
+    logger.warning(f"Printed {len(already_printed_emails):,} emails of {len(email_ids_that_were_printed):,} unique file IDs.")
+    missed_an_email = False
 
     for email in epstein_files.emails:
-        if email.file_id not in email_ids_that_were_printed and not email.is_duplicate:
+        if email.file_id not in email_ids_that_were_printed and not email.is_duplicate():
             logger.warning(f"Failed to print {email.summary()}")
+            missed_an_email = True
+
+    if not missed_an_email:
+        logger.warning(f"All {len(epstein_files.emails):,} emails printed at least once.")