PyPI - epstein-files - Versions diffs - 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl - Mend

epstein-files 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

epstein_files/__init__.py +66 -131
epstein_files/documents/document.py +12 -3
epstein_files/documents/email.py +33 -13
epstein_files/documents/imessage/text_message.py +11 -15
epstein_files/documents/messenger_log.py +15 -11
epstein_files/documents/other_file.py +13 -8
epstein_files/epstein_files.py +51 -43
epstein_files/util/constant/names.py +21 -24
epstein_files/util/constant/output_files.py +29 -0
epstein_files/util/constant/strings.py +8 -2
epstein_files/util/constant/urls.py +11 -7
epstein_files/util/constants.py +325 -227
epstein_files/util/data.py +12 -33
epstein_files/util/doc_cfg.py +7 -14
epstein_files/util/env.py +5 -3
epstein_files/util/file_helper.py +0 -22
epstein_files/util/highlighted_group.py +31 -26
epstein_files/util/logging.py +7 -0
epstein_files/util/output.py +179 -0
epstein_files/util/rich.py +22 -10
{epstein_files-1.0.1.dist-info → epstein_files-1.0.3.dist-info}/METADATA +32 -7
epstein_files-1.0.3.dist-info/RECORD +33 -0
epstein_files-1.0.3.dist-info/entry_points.txt +7 -0
epstein_files-1.0.1.dist-info/RECORD +0 -30
{epstein_files-1.0.1.dist-info → epstein_files-1.0.3.dist-info}/LICENSE +0 -0
{epstein_files-1.0.1.dist-info → epstein_files-1.0.3.dist-info}/WHEEL +0 -0

epstein_files/util/data.py CHANGED Viewed

@@ -3,15 +3,10 @@ Helpers for dealing with various kinds of data.
 """
 import itertools
 import re
-import time
-from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from dateutil import tz
 from typing import TypeVar
-from dateutil.parser import parse
-from rich.text import Text
 from epstein_files.util.constant import names
 from epstein_files.util.env import args
 from epstein_files.util.logging import logger
@@ -24,27 +19,20 @@ CONSTANT_VAR_REGEX = re.compile(r"^[A-Z_]+$")
 ALL_NAMES = [v for k, v in vars(names).items() if isinstance(v, str) and CONSTANT_VAR_REGEX.match(k)]
 PACIFIC_TZ = tz.gettz("America/Los_Angeles")
-TIMEZONE_INFO = {"PST": PACIFIC_TZ, "PDT": PACIFIC_TZ}  # Suppresses annoying warnings from parse() calls
-def dict_sets_to_lists(d: dict[str, set]) -> dict[str, list]:
-    return {k: sorted(list(v)) for k, v in d.items()}
+TIMEZONE_INFO = {"PDT": PACIFIC_TZ, "PST": PACIFIC_TZ}  # Suppresses annoying warnings from parse() calls
-def extract_datetime(s: str) -> datetime | None:
-    match = ISO_DATE_REGEX.search(s)
-    if not match:
-        return None
-    date_str = match.group(0)
+collapse_newlines = lambda text: MULTINEWLINE_REGEX.sub('\n\n', text)
+date_str = lambda dt: dt.isoformat()[0:10] if dt else None
+escape_double_quotes = lambda text: text.replace('"', r'\"')
+escape_single_quotes = lambda text: text.replace("'", r"\'")
+iso_timestamp = lambda dt: dt.isoformat().replace('T', ' ')
+uniquify = lambda _list: list(set(_list))
+without_falsey = lambda _list: [e for e in _list if e]
-    if len(date_str) == 4:
-        date_str += '-01-01'
-    elif len(date_str) == 7:
-        date_str += '-01'
-    return parse(date_str, tzinfos=TIMEZONE_INFO)
+def dict_sets_to_lists(d: dict[str, set]) -> dict[str, list]:
+    return {k: sorted(list(v)) for k, v in d.items()}
 def extract_last_name(name: str) -> str:
@@ -91,8 +79,8 @@ def ordinal_str(n: int) -> str:
     return str(n) + suffix
-def patternize(_pattern: str | re.Pattern):
-    return _pattern if isinstance(_pattern, re.Pattern) else re.compile(rf"({_pattern})", re.IGNORECASE)
+def patternize(_pattern: str | re.Pattern) -> re.Pattern:
+    return _pattern if isinstance(_pattern, re.Pattern) else re.compile(fr"({_pattern})", re.IGNORECASE)
 def remove_timezone(timestamp: datetime) -> datetime:
@@ -106,12 +94,3 @@ def remove_timezone(timestamp: datetime) -> datetime:
 def sort_dict(d: dict[str | None, int] | dict[str, int]) -> list[tuple[str | None, int]]:
     sort_key = lambda e: (e[0] or '').lower() if args.sort_alphabetical else [-e[1], (e[0] or '').lower()]
     return sorted(d.items(), key=sort_key)
-collapse_newlines = lambda text: MULTINEWLINE_REGEX.sub('\n\n', text)
-date_str = lambda dt: dt.isoformat()[0:10] if dt else None
-escape_double_quotes = lambda text: text.replace('"', r'\"')
-escape_single_quotes = lambda text: text.replace("'", r"\'")
-iso_timestamp = lambda dt: dt.isoformat().replace('T', ' ')
-uniquify = lambda _list: list(set(_list))
-without_nones = lambda _list: [e for e in _list if e]

epstein_files/util/doc_cfg.py CHANGED Viewed

@@ -8,7 +8,7 @@ from dateutil.parser import parse
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
-from epstein_files.util.data import without_nones
+from epstein_files.util.data import without_falsey
 DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
 Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]
@@ -116,10 +116,12 @@ class DocCfg:
                 return self.title_by_author()
             elif self.category == FINANCE and self.author in FINANCIAL_REPORTS_AUTHORS:
                 return f"{self.author} report: '{self.description}'"
+            elif self.category == LEGAL and 'v.' in self.author:
+                return f"{self.author}: '{self.description}'"
         elif self.category and self.author is None and self.description is None:
             return self.category
-        pieces = without_nones([self.author, self.description])
+        pieces = without_falsey([self.author, self.description])
         return ' '.join(pieces) if pieces else None
     def metadata(self) -> Metadata:
@@ -176,16 +178,6 @@ class DocCfg:
         return props
-    def __eq__(self, other: 'DocCfg') -> bool:
-        """Return True if everything matches other than the two 'dupe_' fields ('duplicate_ids' is compared)."""
-        for _field in self.sorted_fields():
-            if _field.name == 'id' or _field.name.startswith('dupe'):
-                continue
-            elif getattr(self, _field.name) != getattr(other, _field.name):
-                return False
-        return True
     def __repr__(self) -> str:
         props = self._props_strs()
         type_str = f"{type(self).__name__}("
@@ -231,6 +223,7 @@ class EmailCfg(CommunicationCfg):
         recipients (list[str | None]): Who received the email
     """
     actual_text: str | None = None  # Override for the Email._actual_text() method for particularly broken emails
+    fwded_text_after: str | None = None  # If set, any text after this is a fwd of an article or similar
     is_fwded_article: bool = False
     recipients: list[str | None] = field(default_factory=list)
@@ -242,7 +235,7 @@ class EmailCfg(CommunicationCfg):
     def from_doc_cfg(cls, cfg: DocCfg) -> 'EmailCfg':
         return cls(**asdict(cfg))
-    # This is necessary for some dumb reason. @dataclass(repr=False) doesn't cut it
+    # This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
     def __repr__(self) -> str:
         return super().__repr__()
@@ -253,6 +246,6 @@ class TextCfg(CommunicationCfg):
         super().__post_init__()
         self.category = TEXT_MESSAGE
-    # This is necessary for some dumb reason. @dataclass(repr=False) doesn't cut it
+    # This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
     def __repr__(self) -> str:
         return super().__repr__()

epstein_files/util/env.py CHANGED Viewed

@@ -6,16 +6,18 @@ from sys import argv
 from epstein_files.util.logging import datefinder_logger, env_log_level, logger
+COUNT_WORDS_SCRIPT = 'count_words.py'
 DEFAULT_WIDTH = 154
-HTML_SCRIPTS = ['generate_html.py', 'count_words.py']
+HTML_SCRIPTS = ['epstein_generate', 'generate_html.py', COUNT_WORDS_SCRIPT]
 parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML page.")
-parser.add_argument('--build', '-b', action='store_true', help='write HTML to docs/index.html')
+parser.add_argument('--build', '-b', action='store_true', help='write output to file')
 parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
 parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just interesting ones')
 parser.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
 parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
+parser.add_argument('--output-file', '-out', metavar='FILE', default='index.html', help='write output to FILE in docs/ (default=index.html)')
 parser.add_argument('--output-emails', '-oe', action='store_true', help='generate other files section')
 parser.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
 parser.add_argument('--output-texts', '-ot', action='store_true', help='generate other files section')
@@ -64,7 +66,7 @@ datefinder_logger.setLevel(logger.level)
 # Massage args that depend on other args to the appropriate state
 if not (args.json_metadata or args.output_texts or args.output_emails or args.output_other_files):
-    if is_html_script:
+    if is_html_script and current_script != COUNT_WORDS_SCRIPT and not args.make_clean:
         logger.warning(f"No output section chosen; outputting default of texts, selected emails, and other files...")
     args.output_texts = True

epstein_files/util/file_helper.py CHANGED Viewed

@@ -8,7 +8,6 @@ from epstein_files.util.constant.strings import FILE_NAME_REGEX, FILE_STEM_REGEX
 EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
 DOCS_DIR_ENV = environ[EPSTEIN_DOCS_DIR_ENV_VAR_NAME]
 DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
-PICKLED_PATH = Path("the_epstein_files.pkl.gz")
 if not DOCS_DIR_ENV:
     print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!")
@@ -17,20 +16,7 @@ elif not DOCS_DIR.exists():
     print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!")
     exit(1)
-HTML_DIR = Path('docs')
 EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
-EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
-GH_PAGES_HTML_PATH = HTML_DIR.joinpath('index.html')
-JSON_METADATA_PATH = HTML_DIR.joinpath('epstein_files_nov_2025_cryptadamus_metadata.json')
-WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_emails_word_count.html')
-BUILD_ARTIFACTS = [
-    EPSTEIN_WORD_COUNT_HTML_PATH,
-    GH_PAGES_HTML_PATH,
-    JSON_METADATA_PATH,
-    WORD_COUNT_HTML_PATH,
-]
 FILE_ID_REGEX = re.compile(fr".*{FILE_NAME_REGEX.pattern}")
 FILENAME_LENGTH = len(HOUSE_OVERSIGHT_PREFIX) + 6
 KB = 1024
@@ -110,11 +96,3 @@ def is_local_extract_file(filename) -> bool:
     """Return true if filename is of form 'HOUSE_OVERSIGHT_029835_1.txt'."""
     file_match = FILE_ID_REGEX.match(str(filename))
     return True if file_match and file_match.group(2) else False
-def make_clean() -> None:
-    """Delete all build artifacts."""
-    for build_file in BUILD_ARTIFACTS:
-        if build_file.exists():
-            print(f"Removing build file '{build_file}'...")
-            build_file.unlink()

epstein_files/util/highlighted_group.py CHANGED Viewed

@@ -2,7 +2,6 @@ import re
 from dataclasses import dataclass, field
 from rich.highlighter import RegexHighlighter
-from rich.text import Text
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
@@ -10,7 +9,7 @@ from epstein_files.util.constant.urls import ARCHIVE_LINK_COLOR
 from epstein_files.util.constants import (EMAILER_ID_REGEXES, EPSTEIN_V_ROTHSTEIN_EDWARDS, HEADER_ABBREVIATIONS,
      OSBORNE_LLP, REPLY_REGEX, SENT_FROM_REGEX, VIRGIN_ISLANDS)
 from epstein_files.util.doc_cfg import *
-from epstein_files.util.data import extract_last_name, listify
+from epstein_files.util.data import extract_last_name, listify, without_falsey
 CIVIL_ATTORNEY = 'civil attorney'
 CRIMINAL_DEFENSE_ATTORNEY = 'criminal defense attorney'
@@ -48,7 +47,6 @@ class HighlightedText:
     label: str = ''
     pattern: str = ''
     style: str
-    # Computed fields
     regex: re.Pattern = field(init=False)
     theme_style_name: str = field(init=False)
     _capture_group_label: str = field(init=False)
@@ -76,7 +74,7 @@ class HighlightedNames(HighlightedText):
     Attributes:
         category (str): optional string to use as an override for self.label in some contexts
         emailers (dict[str, str | None]): optional names to construct regexes for (values are descriptions)
-        _pattern (str): complete regex pattern that combines 'pattern' with 'emailers'
+        _pattern (str): regex pattern combining 'pattern' with first & last names of all 'emailers'
     """
     category: str = ''
     emailers: dict[str, str | None] = field(default_factory=dict)
@@ -102,7 +100,7 @@ class HighlightedNames(HighlightedText):
             self.emailers.get(name),
         ]
-        info_pieces = [p for p in info_pieces if p is not None]
+        info_pieces = without_falsey(info_pieces)
         return ', '.join(info_pieces) if info_pieces else None
     def _emailer_pattern(self, name: str) -> str:
@@ -114,10 +112,10 @@ class HighlightedNames(HighlightedText):
         if name in EMAILER_ID_REGEXES:
             pattern = EMAILER_ID_REGEXES[name].pattern
-            # Include regex for last name
-            # TODO: handle word boundary issue for names that end in symbols
-            if SIMPLE_NAME_REGEX.match(last_name) and last_name.lower() not in NAMES_TO_NOT_HIGHLIGHT:
-                pattern += fr"|{last_name}"
+            # Include regex for first and last names
+            for partial_name in [first_name, last_name]:
+                if SIMPLE_NAME_REGEX.match(partial_name) and partial_name.lower() not in NAMES_TO_NOT_HIGHLIGHT:
+                    pattern += fr"|{partial_name}"
             return pattern
         elif ' ' not in name:
@@ -163,7 +161,7 @@ HIGHLIGHTED_NAMES = [
             ALIREZA_ITTIHADIEH: 'CEO Freestream Aircraft Limited',
             BARBRO_C_EHNBOM: 'Swedish pharmaceuticals',
             FRED_HADDAD: "co-founder of Heck's in West Virginia",
-            GERALD_BARTON: "Maryland property developer, fan of Trump's Irish golf course",
+            GERALD_BARTON: "Maryland property developer Landmark Land Company, fan of Trump's Irish golf course",
             GORDON_GETTY: 'heir of oil tycoon J. Paul Getty',
             NICHOLAS_RIBIS: 'Hilton CEO, former president of Trump Organization',
             'Philip Kafka': 'president of Prince Concepts (and son of Terry Kafka?)',
@@ -272,7 +270,7 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(
         label='europe',
         style='light_sky_blue3',
-        pattern=r'(Angela )?Merk(el|le)|Austria|(Benjamin\s*)?Harnwell|Berlin|Brexit(eers?)?|Brit(ain|ish)|Brussels|Cannes|(Caroline|Jack)?\s*Lang(, Caroline)?|Cypr(iot|us)|Davos|ECB|EU|Europe(an)?(\s*Union)?|France|Geneva|Germany?|Gillard|Gree(ce|k)|Ital(ian|y)|Jacques|(Kevin\s*)?Rudd|Le\s*Pen|London|Macron|Melusine|Munich|(Natalia\s*)?Veselnitskaya|(Nicholas\s*)?Sarkozy|Nigel(\s*Farage)?|Oslo|Paris|Polish|(Sebastian )?Kurz|(Vi(c|k)tor\s+)?Orbah?n|Edward Rod Larsen|Strasbourg|Strauss[- ]?Kahn|Swed(en|ish)(?![-\s]+America)|Switzerland|(Tony\s)?Blair|Ukrain(e|ian)|Vienna|(Vitaly\s*)?Churkin|Zug',
+        pattern=r'(Angela )?Merk(el|le)|Austria|(Benjamin\s*)?Harnwell|Berlin|Borge|Boris\s*Johnson|Brexit(eers?)?|Brit(ain|ish)|Brussels|Cannes|(Caroline|Jack)?\s*Lang(, Caroline)?|Cypr(iot|us)|Davos|ECB|England|EU|Europe(an)?(\s*Union)?|Fr(ance|ench)|Geneva|Germany?|Gillard|Gree(ce|k)|Ital(ian|y)|Jacques|(Kevin\s*)?Rudd|Le\s*Pen|London|Macron|Melusine|Munich|(Natalia\s*)?Veselnitskaya|(Nicholas\s*)?Sarkozy|Nigel(\s*Farage)?|Norw(ay|egian)|Oslo|Paris|Polish|(Sebastian )?Kurz|(Vi(c|k)tor\s+)?Orbah?n|Edward Rod Larsen|Strasbourg|Strauss[- ]?Kahn|Swed(en|ish)(?![-\s]+America)|Switzerland|(Tony\s)?Blair|Ukrain(e|ian)|Vienna|(Vitaly\s*)?Churkin|Zug',
         emailers = {
             ANDRZEJ_DUDA: 'former president of Poland',
             MIROSLAV_LAJCAK: 'Russia-friendly Slovakian politician, friend of Steve Bannon',
@@ -306,7 +304,7 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(
         label='finance',
         style='green',
-        pattern=r'Apollo|Ari\s*Glass|(Bernie\s*)?Madoff|Black(rock|stone)|BofA|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
+        pattern=r'Apollo|Ari\s*Glass|(Bernie\s*)?Madoff|Black(rock|stone)|BofA|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
         emailers={
             AMANDA_ENS: 'Citigroup',
             DANIEL_SABBA: 'UBS Investment Bank',
@@ -342,7 +340,7 @@ HIGHLIGHTED_NAMES = [
         emailers = {
             ANIL_AMBANI: 'chairman of Reliance Group',
             VINIT_SAHNI: None,
-            ZUBAIR_KHAN: 'Tranchulas CEO, InsightsPod founder',
+            ZUBAIR_KHAN: 'cybersecurity firm Tranchulas CEO, InsightsPod founder, based in Islamabad and Dubai',
         }
     ),
     HighlightedNames(
@@ -391,7 +389,7 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(
         label='law enforcement',
         style='color(24) bold',
-        pattern=r'ag|(Alicia\s*)?Valle|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC|CIA|CIS|CVRA|Dep(artmen)?t\.?\s*of\s*(the\s*)?(Justice|Treasury)|DHS|DOJ|FBI|FCPA|FDIC|Federal\s*Bureau\s*of\s*Investigation|FinCEN|FINRA|FOIA|FTC|IRS|(James\s*)?Comey|(Jennifer\s*Shasky\s*)?Calvery|((Judge|Mark)\s*)?(Carney|Filip)|(Kirk )?Blouin|KYC|NIH|NS(A|C)|OCC|OFAC|(Lann?a\s*)?Belohlavek|(Michael\s*)?Reiter|OGE|Office\s*of\s*Government\s*Ethics|Police Code Enforcement|(Preet\s*)?Bharara|SCOTUS|SD(FL|NY)|Southern\s*District\s*of\s*(Florida|New\s*York)|SEC|Securities\s*and\s*Exchange\s*Commission|State\s*Dep(artmen)?t|Strzok|Supreme\s*Court|Treasury\s*(Dep(artmen)?t|Secretary)|TSA|USAID|(William\s*J\.?\s*)?Zloch',
+        pattern=r'ag|(Alicia\s*)?Valle|attorney|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC|CIA|CIS|CVRA|Dep(artmen)?t\.?\s*of\s*(the\s*)?(Justice|Treasury)|DHS|DOJ|FBI|FCPA|FDIC|Federal\s*Bureau\s*of\s*Investigation|FinCEN|FINRA|FOIA|FTC|IRS|(James\s*)?Comey|(Jennifer\s*Shasky\s*)?Calvery|((Judge|Mark)\s*)?(Carney|Filip)|(Kirk )?Blouin|KYC|NIH|NS(A|C)|OCC|OFAC|(Lann?a\s*)?Belohlavek|lawyer|(Michael\s*)?Reiter|OGE|Office\s*of\s*Government\s*Ethics|Police Code Enforcement|(Preet\s*)?Bharara|SCOTUS|SD(FL|NY)|Southern\s*District\s*of\s*(Florida|New\s*York)|SEC|Securities\s*and\s*Exchange\s*Commission|State\s*Dep(artmen)?t|Strzok|Supreme\s*Court|Treasury\s*(Dep(artmen)?t|Secretary)|TSA|USAID|(William\s*J\.?\s*)?Zloch',
         emailers = {
             ANN_MARIE_VILLAFANA: 'southern district of Florida U.S. Attorney',
             DANNY_FROST: 'Director of Communications at Manhattan DA',
@@ -426,7 +424,7 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(
         label='modeling',
         style='pale_violet_red1',
-        pattern=r'\w+@mc2mm.com|(Nicole\s*)?Junkerman',
+        pattern=r'\w+@mc2mm.com|model(ed|ing)|(Nicole\s*)?Junkerman',
         emailers = {
             'Abi Schwinck': 'MC2 Model Management (?)',
             DANIEL_SIAD: None,
@@ -458,7 +456,8 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(
         label='republicans',
         style='bold dark_red',
-        pattern=r'Alberto\sGonzale[sz]|(Alex\s*)?Acosta|(Bill\s*)?Barr|Bill\s*Shine|(Bob\s*)?Corker|(John\s*(R.?\s*)?)Bolton|Broidy|(Chris\s)?Christie|Devin\s*Nunes|(Don\s*)?McGa[hn]n|McMaster|(George\s*)?Nader|GOP|(Brett\s*)?Kavanaugh|Kissinger|Kobach|Koch\s*Brothers|Kolfage|Kudlow|Lewandowski|(Marco\s)?Rubio|(Mark\s*)Meadows|Mattis|(?<!Merwin Dela )Cruz|(Michael\s)?Hayden|((General|Mike)\s*)?(Flynn|Pence)|(Mitt\s*)?Romney|Mnuchin|Nikki|Haley|(Paul\s+)?Manafort|(Peter\s)?Navarro|Pompeo|Reagan|Republican|(?<!Cynthia )(Richard\s*)?Nixon|Sasse|(Rex\s*)?Tillerson',
+        pattern=r'Alberto\sGonzale[sz]|(Alex\s*)?Acosta|(Bill\s*)?Barr|Bill\s*Shine|(Bob\s*)?Corker|(John\s*(R.?\s*)?)Bolton|Broidy|(Chris\s)?Christie|Devin\s*Nunes|(Don\s*)?McGa[hn]n|McMaster|(George\s*)?Nader|GOP|(Brett\s*)?Kavanaugh|Kissinger|Kobach|Koch\s*Brothers|Kolfage|Kudlow|Lewandowski|(Marco\s)?Rubio|(Mark\s*)Meadows|Mattis|(?<!Merwin Dela )Cruz|(Michael\s)?Hayden|((General|Mike)\s*)?(Flynn|Pence)|(Mitt\s*)?Romney|Mnuchin|Nikki|Haley|(Paul\s+)?Manafort|(Peter\s)?Navarro|Pompeo|Reagan|Reince|Priebus|Republican|(?<!Cynthia )(Richard\s*)?Nixon|Sasse|(Rex\s*)?Tillerson',
+        # There's no emails from these people, they're just here to automate the regex creation for both first + last names
         emailers = {
             RUDY_GIULIANI: 'disbarred formed mayor of New York City',
             TULSI_GABBARD: None,
@@ -475,7 +474,7 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(
         label='russia',
         style='red bold',
-        pattern=r'Alfa\s*Bank|Anya\s*Rasulova|Chernobyl|Day\s+One\s+Ventures|(Dmitry\s)?(Kiselyov|(Lana\s*)?Pozhidaeva|Medvedev|Rybolo(o?l?ev|vlev))|Dmitry|FSB|GRU|KGB|Kislyak|Kremlin|Kuznetsova|Lavrov|Lukoil|Moscow|(Oleg\s*)?Deripaska|Oleksandr Vilkul|Rosneft|RT|St.?\s*?Petersburg|Russian?|Sberbank|Soviet(\s*Union)?|USSR|(Vladimir\s*)?(Putin|Yudashkin)|Women\s*Empowerment|Xitrans',
+        pattern=r'Alfa\s*Bank|Anya\s*Rasulova|Chernobyl|Day\s+One\s+Ventures|(Dmitry\s)?(Kiselyov|(Lana\s*)?Pozhidaeva|Medvedev|Rybolo(o?l?ev|vlev))|Dmitry|FSB|GRU|KGB|Kislyak|Kremlin|Kuznetsova|Lavrov|Lukoil|Moscow|(Oleg\s*)?Deripaska|Oleksandr Vilkul|Rosneft|RT|St.?\s*?Petersburg|Russian?|Sberbank|Soviet(\s*Union)?|USSR|Vladimir|(Vladimir\s*)?(Putin|Yudashkin)|Women\s*Empowerment|Xitrans',
         emailers = {
             MASHA_DROKOVA: 'silicon valley VC, former Putin Youth',
             RENATA_BOLOTOVA: 'former aspiring model, now fund manager at New York State Insurance Fund',
@@ -519,7 +518,7 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(
         label='trump',
         style='red3 bold',
-        pattern=r"@?realDonaldTrump|(Alan\s*)?Weiss?elberg|\bDJ?T\b|Donald J. Tramp|(Donald\s+(J\.\s+)?)?Trump(ism|\s*Properties)?|Don(ald| *Jr)(?! Rubin)|Ivana|(Madeleine\s*)?Westerhout|Mar[-\s]*a[-\s]*Lago|(Marla\s*)?Maples|(Matt(hew)? )?Calamari|\bMatt C\b|Melania|(Michael (J.? )?)?Boccio|Roger\s+Stone|rona|(The\s*)?Art\s*of\s*the\s*Deal",
+        pattern=r"@?realDonaldTrump|(Alan\s*)?Weiss?elberg|\bDJ?T\b|Donald J. Tramp|(Donald\s+(J\.\s+)?)?Trump(ism|\s*Properties)?|Don(ald| *Jr)(?! Rubin)|Ivana|(Madeleine\s*)?Westerhout|Mar[-\s]*a[-\s]*Lago|(Marla\s*)?Maples|(Matt(hew)? )?Calamari|\bMatt C\b|Melania|(Michael (J.? )?)?Boccio|Rebekah\s*Mercer|Roger\s+Stone|rona|(The\s*)?Art\s*of\s*the\s*Deal",
         emailers = {
             'Bruce Moskowitz': "'Trump's health guy' according to Epstein",
         },
@@ -541,7 +540,7 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(
         label=VIRGIN_ISLANDS,
         style='sea_green1',
-        pattern=r'Bahamas|Caribb?ean|Dominican\s*Republic|(Great|Little)\s*St.?\s*James|Haiti(an)?|(John\s*)deJongh(\s*Jr\.?)|(Kenneth E\. )?Mapp|Palm\s*Beach(?!\s*Post)|PBI|S(ain)?t.?\s*Thomas|USVI|VI|(The\s*)?Virgin\s*Islands(\s*Daily\s*News)?',  # TODO: VI Daily News should be yellow but it's hard bc Daily News xists
+        pattern=r'Antigua|Bahamas|Caribb?ean|Dominican\s*Republic|(Great|Little)\s*St.?\s*James|Haiti(an)?|(John\s*)deJongh(\s*Jr\.?)|(Kenneth E\. )?Mapp|Palm\s*Beach(?!\s*Post)|PBI|S(ain)?t.?\s*Thomas|USVI|VI|(The\s*)?Virgin\s*Islands(\s*Daily\s*News)?',  # TODO: VI Daily News should be yellow but it's hard bc Daily News xists
         emailers = {
             CECILE_DE_JONGH: f'First lady 2007-2015',
             STACEY_PLASKETT: 'non-voting member of Congress',
@@ -561,7 +560,7 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(
         label=STEVE_BANNON,
         style='color(58)',
-        pattern=r'((Steve|Sean)\s*)?Bannon?',
+        pattern=r'((Steve|Sean)\s*)?Bannon?|(American\s*)?Dharma',
     ),
     HighlightedNames(
         emailers={STEVEN_HOFFENBERG: HEADER_ABBREVIATIONS['Hoffenberg']},
@@ -578,7 +577,18 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(emailers={PRINCE_ANDREW: 'British royal family'}, style='dodger_blue1'),
     HighlightedNames(emailers={SOON_YI_PREVIN: "wife of Woody Allen"}, style='hot_pink'),
     HighlightedNames(emailers={SULTAN_BIN_SULAYEM: 'CEO of DP World, chairman of ports in Dubai'}, style='green1'),
-    HighlightedText(label='unknown', style='cyan', pattern=r'\(unknown\)'),  # HighlightedText bc of word boundary issue
+    # HighlightedText not HighlightedNames bc of word boundary issue
+    HighlightedText(
+        label='unknown',
+        style='cyan',
+        pattern=r'\(unknown\)'
+    ),
+    HighlightedText(
+        label='phone_number',
+        style='bright_green',
+        pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|[\d+]{10,12}",
+    ),
 ]
 # Highlight regexes for things other than names, only used by RegexHighlighter pattern matching
@@ -593,11 +603,6 @@ HIGHLIGHTED_TEXTS = [
         style=f'{ARCHIVE_LINK_COLOR} underline',
         pattern=r"https?:[^\s]+",
     ),
-    HighlightedText(
-        label='phone_number',
-        style='bright_green',
-        pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|[\d+]{10,12}",
-    ),
     HighlightedText(
         label='quoted_reply_line',
         style='dim',

epstein_files/util/logging.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import logging
 from os import environ
+from pathlib import Path
 from rich.console import Console
 from rich.highlighter import ReprHighlighter
@@ -7,6 +8,7 @@ from rich.logging import RichHandler
 from rich.theme import Theme
 from epstein_files.util.constant.strings import *
+from epstein_files.util.file_helper import file_size_str
 FILENAME_STYLE = 'gray27'
@@ -27,6 +29,7 @@ LOG_THEME[f"{ReprHighlighter.base_style}epstein_filename"] = FILENAME_STYLE
 LOG_LEVEL_ENV_VAR = 'LOG_LEVEL'
+# Augment the standard log highlighter with 'epstein_filename' matcher
 class LogHighlighter(ReprHighlighter):
     highlights = ReprHighlighter.highlights + [
         *[fr"(?P<{doc_type}>{doc_type})" for doc_type in DOC_TYPE_STYLES.keys()],
@@ -55,3 +58,7 @@ if env_log_level_str:
     logger.warning(f"Setting log level to {env_log_level} based on {LOG_LEVEL_ENV_VAR} env var...")
     logger.setLevel(env_log_level)
     datefinder_logger.setLevel(env_log_level)
+def log_file_write(file_path: str | Path) -> None:
+    """Log, at WARNING level, the size (as formatted by file_size_str()) and path of a file that was written."""
+    logger.warning(f"Wrote {file_size_str(file_path)} to '{file_path}'")

epstein_files/util/output.py ADDED Viewed

@@ -0,0 +1,179 @@
+from rich.padding import Padding
+from epstein_files.documents.email import Email
+from epstein_files.documents.messenger_log import MessengerLog
+from epstein_files.epstein_files import EpsteinFiles, count_by_month
+from epstein_files.util.constant.output_files import JSON_METADATA_PATH
+from epstein_files.util.constant import urls
+from epstein_files.util.constant.html import *
+from epstein_files.util.constant.names import *
+from epstein_files.util.constant.strings import EMAIL_CLASS, MESSENGER_LOG_CLASS
+from epstein_files.util.data import dict_sets_to_lists
+from epstein_files.util.env import args, specified_names
+from epstein_files.util.logging import log_file_write, logger
+from epstein_files.util.rich import *
+PRINT_COLOR_KEY_EVERY_N_EMAILS = 150
+# Order matters. Default names to print emails for.
+DEFAULT_EMAILERS = [
+    JEREMY_RUBIN,
+    AL_SECKEL,
+    JOI_ITO,
+    JABOR_Y,
+    STEVEN_SINOFSKY,
+    DANIEL_SIAD,
+    JEAN_LUC_BRUNEL,
+    STEVEN_HOFFENBERG,
+    EHUD_BARAK,
+    MARTIN_NOWAK,
+    MASHA_DROKOVA,
+    RENATA_BOLOTOVA,
+    STEVE_BANNON,
+    OLIVIER_COLOM,
+    BORIS_NIKOLIC,
+    PRINCE_ANDREW,
+    JIDE_ZEITLIN,
+    DAVID_STERN,
+    MOHAMED_WAHEED_HASSAN,
+    JENNIFER_JACQUET,
+    TYLER_SHEARS,
+    CHRISTINA_GALBRAITH,
+    None,
+]
+# Order matters. Default names to print tables w/email subject, timestamp, etc for. # TODO: get rid of this ?
+DEFAULT_EMAILER_TABLES: list[str | None] = [
+    GHISLAINE_MAXWELL,
+    LEON_BLACK,
+    SULTAN_BIN_SULAYEM,
+    DEEPAK_CHOPRA,
+    ARIANE_DE_ROTHSCHILD,
+]
+if len(set(DEFAULT_EMAILERS).intersection(set(DEFAULT_EMAILER_TABLES))) > 0:
+    raise RuntimeError(f"Some names appear in both DEFAULT_EMAILERS and DEFAULT_EMAILER_TABLES")
+def print_emails(epstein_files: EpsteinFiles) -> int:
+    """Returns number of emails printed."""
+    print_section_header(('Selections from ' if not args.all_emails else '') + 'His Emails')
+    print_other_site_link(is_header=False)
+    emailers_to_print: list[str | None]
+    emailer_tables: list[str | None] = []
+    already_printed_emails: list[Email] = []
+    num_emails_printed_since_last_color_key = 0
+    if specified_names:
+        emailers_to_print = specified_names
+    else:
+        epstein_files.print_emailer_counts_table()
+        if args.all_emails:
+            emailers_to_print = sorted(epstein_files.all_emailers(), key=lambda e: epstein_files.earliest_email_at(e))
+            console.print('Email conversations are sorted chronologically based on time of the first email.')
+            print_numbered_list_of_emailers(emailers_to_print, epstein_files)
+        else:
+            emailers_to_print = DEFAULT_EMAILERS
+            emailer_tables = DEFAULT_EMAILER_TABLES
+            console.print('Email conversations grouped by counterparty can be found in the order listed below.')
+            print_numbered_list_of_emailers(emailers_to_print)
+            console.print("\nAfter that there's tables linking to (but not displaying) all known emails for each of these people:")
+            print_numbered_list_of_emailers(emailer_tables)
+    for author in emailers_to_print:
+        author_emails = epstein_files.print_emails_for(author)
+        already_printed_emails.extend(author_emails)
+        num_emails_printed_since_last_color_key += len(author_emails)
+        # Print color key every once in a while
+        if num_emails_printed_since_last_color_key > PRINT_COLOR_KEY_EVERY_N_EMAILS:
+            print_color_key()
+            num_emails_printed_since_last_color_key = 0
+    if emailer_tables:
+        print_author_header(f"Email Tables for {len(emailer_tables)} Other People", 'white')
+        for name in DEFAULT_EMAILER_TABLES:
+            epstein_files.print_emails_table_for(name)
+    if not specified_names:
+        epstein_files.print_email_device_info()
+    if args.all_emails:
+        _verify_all_emails_were_printed(epstein_files, already_printed_emails)
+    logger.warning(f"Rewrote {len(Email.rewritten_header_ids)} headers of {len(epstein_files.emails)} emails")
+    return len(already_printed_emails)
+def print_json_metadata(epstein_files: EpsteinFiles) -> None:
+    json_str = epstein_files.json_metadata()
+    if args.build:
+        with open(JSON_METADATA_PATH, 'w') as f:
+            f.write(json_str)
+            log_file_write(JSON_METADATA_PATH)
+    else:
+        console.print_json(json_str, indent=4, sort_keys=True)
+def print_json_stats(epstein_files: EpsteinFiles) -> None:
+    console.line(5)
+    console.print(Panel('JSON Stats Dump', expand=True, style='reverse bold'), '\n')
+    print_json(f"{MESSENGER_LOG_CLASS} Sender Counts", MessengerLog.count_authors(epstein_files.imessage_logs), skip_falsey=True)
+    print_json(f"{EMAIL_CLASS} Author Counts", epstein_files.email_author_counts, skip_falsey=True)
+    print_json(f"{EMAIL_CLASS} Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
+    print_json("Email signature_substitution_countss", epstein_files.email_signature_substitution_counts(), skip_falsey=True)
+    print_json("email_author_device_signatures", dict_sets_to_lists(epstein_files.email_authors_to_device_signatures))
+    print_json("email_sent_from_devices", dict_sets_to_lists(epstein_files.email_device_signatures_to_authors))
+    print_json("email_unknown_recipient_file_ids", epstein_files.email_unknown_recipient_file_ids())
+    print_json("count_by_month", count_by_month(epstein_files.all_documents()))
+def print_text_messages(epstein_files: EpsteinFiles) -> None:
+    print_section_header('Text Messages')
+    print_centered("(conversations are sorted chronologically based on timestamp of first message)\n", style='gray30')
+    authors: list[str | None] = specified_names if specified_names else [JEFFREY_EPSTEIN]
+    log_files = epstein_files.imessage_logs_for(authors)
+    for log_file in log_files:
+        console.print(Padding(log_file))
+        console.line(2)
+    epstein_files.print_imessage_summary()
+def write_urls() -> None:
+    """Write _URL style constant variables to a file bash scripts can load as env vars."""
+    if args.output_file == 'index.html':
+        logger.warning(f"Can't write env vars to '{args.output_file}', writing to '{URLS_ENV}' instead.\n")
+        args.output_file = URLS_ENV
+    url_vars = {
+        k: v for k, v in vars(urls).items()
+        if isinstance(v, str) and k.split('_')[-1] in ['URL'] and 'github.io' in v and 'BASE' not in k
+    }
+    with open(args.output_file, 'w') as f:
+        for var_name, url in url_vars.items():
+            key_value = f"{var_name}='{url}'"
+            if not args.suppress_output:
+                console.print(key_value, style='dim')
+            f.write(f"{key_value}\n")
+    console.line()
+    logger.warning(f"Wrote {len(url_vars)} URL variables to '{args.output_file}'\n")
+def _verify_all_emails_were_printed(epstein_files: EpsteinFiles, already_printed_emails: list[Email]) -> None:
+    """Log warnings if some emails were never printed."""
+    email_ids_that_were_printed = set([email.file_id for email in already_printed_emails])
+    logger.warning(f"Printed {len(already_printed_emails)} emails of {len(email_ids_that_were_printed)} unique file IDs.")
+    for email in epstein_files.emails:
+        if email.file_id not in email_ids_that_were_printed and not email.is_duplicate:
+            logger.warning(f"Failed to print {email.summary()}")

epstein-files 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

epstein-files 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl