PyPI - epstein-files - Versions diffs - 1.0.9__py3-none-any.whl → 1.0.11__py3-none-any.whl - Mend

epstein-files 1.0.9py3-none-any.whl → 1.0.11py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

epstein_files/__init__.py +11 -13
epstein_files/documents/document.py +92 -49
epstein_files/documents/email.py +7 -4
epstein_files/documents/imessage/text_message.py +3 -12
epstein_files/documents/json_file.py +13 -1
epstein_files/documents/messenger_log.py +32 -19
epstein_files/documents/other_file.py +66 -43
epstein_files/epstein_files.py +22 -15
epstein_files/util/constant/names.py +4 -4
epstein_files/util/constants.py +86 -80
epstein_files/util/doc_cfg.py +17 -25
epstein_files/util/env.py +29 -17
epstein_files/util/file_helper.py +13 -24
epstein_files/util/highlighted_group.py +25 -17
epstein_files/util/logging.py +0 -6
epstein_files/util/output.py +12 -7
epstein_files/util/rich.py +15 -10
epstein_files/util/word_count.py +65 -5
{epstein_files-1.0.9.dist-info → epstein_files-1.0.11.dist-info}/METADATA +1 -1
epstein_files-1.0.11.dist-info/RECORD +33 -0
epstein_files/count_words.py +0 -72
epstein_files-1.0.9.dist-info/RECORD +0 -34
{epstein_files-1.0.9.dist-info → epstein_files-1.0.11.dist-info}/LICENSE +0 -0
{epstein_files-1.0.9.dist-info → epstein_files-1.0.11.dist-info}/WHEEL +0 -0
{epstein_files-1.0.9.dist-info → epstein_files-1.0.11.dist-info}/entry_points.txt +0 -0

epstein_files/util/file_helper.py CHANGED Viewed

@@ -1,20 +1,9 @@
 import re
-from os import environ
 from pathlib import Path
-from sys import exit
 from epstein_files.util.constant.strings import FILE_NAME_REGEX, FILE_STEM_REGEX, HOUSE_OVERSIGHT_PREFIX
-EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
-DOCS_DIR_ENV = environ[EPSTEIN_DOCS_DIR_ENV_VAR_NAME]
-DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
-if not DOCS_DIR_ENV:
-    print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!")
-    exit(1)
-elif not DOCS_DIR.exists():
-    print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!")
-    exit(1)
+from epstein_files.util.env import DOCS_DIR
+from epstein_files.util.logging import logger
 EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
 FILE_ID_REGEX = re.compile(fr".*{FILE_NAME_REGEX.pattern}")
@@ -23,9 +12,10 @@ KB = 1024
 MB = KB * KB
-# Handles both string and int 'id' args.
+# Coerce methods handle both string and int arguments.
+coerce_file_name = lambda filename_or_id: coerce_file_stem(filename_or_id) + '.txt'
+coerce_file_path = lambda filename_or_id: DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
 id_str = lambda id: f"{int(id):06d}"
-filename_for_id = lambda id: file_stem_for_id(id) + '.txt'
 def coerce_file_stem(filename_or_id: int | str) -> str:
@@ -42,14 +32,6 @@ def coerce_file_stem(filename_or_id: int | str) -> str:
     return file_stem
-def coerce_file_name(filename_or_id: int | str) -> str:
-    return coerce_file_stem(filename_or_id) + '.txt'
-def coerce_file_path(filename_or_id: int | str) -> Path:
-    return DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
 def extract_file_id(filename_or_id: int | str | Path) -> str:
     if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
         return id_str(filename_or_id)
@@ -67,7 +49,10 @@ def file_size(file_path: str | Path) -> int:
 def file_size_str(file_path: str | Path) -> str:
-    size = file_size(file_path)
+    return file_size_to_str(file_size(file_path))
+def file_size_to_str(size: int) -> str:
     digits = 2
     if size > MB:
@@ -96,3 +81,7 @@ def is_local_extract_file(filename) -> bool:
     """Return true if filename is of form 'HOUSE_OVERSIGHT_029835_1.txt'."""
     file_match = FILE_ID_REGEX.match(str(filename))
     return True if file_match and file_match.group(2) else False
+def log_file_write(file_path: str | Path) -> None:
+    logger.warning(f"Wrote {file_size_str(file_path)} to '{file_path}'")

epstein_files/util/highlighted_group.py CHANGED Viewed

@@ -2,6 +2,7 @@ import re
 from dataclasses import dataclass, field
 from rich.highlighter import RegexHighlighter
+from rich.text import Text
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
@@ -21,7 +22,7 @@ EPSTEIN_ESTATE_EXECUTOR = f"Epstein {ESTATE_EXECUTOR}"
 REGEX_STYLE_PREFIX = 'regex'
 SIMPLE_NAME_REGEX = re.compile(r"^[-\w ]+$", re.IGNORECASE)
-CATEGORY_LABEL_MAPPING = {
+CATEGORY_STYLE_MAPPING = {
     ARTICLE: JOURNALIST,
     ARTS: ENTERTAINER,
     BOOK: JOURNALIST,
@@ -31,6 +32,12 @@ CATEGORY_LABEL_MAPPING = {
     REPUTATION: PUBLICIST,
 }
+CATEGORY_STYLES = {
+    JSON: 'dark_red',
+    JUNK: 'grey19',
+    'letter': 'medium_orchid1'
+}
 @dataclass(kw_only=True)
 class HighlightedText:
@@ -156,7 +163,7 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(
         label=BUSINESS,
         style='spring_green4',
-        pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
+        pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|New Leaf Ventures|Park Partners|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
         emailers = {
             ALIREZA_ITTIHADIEH: 'CEO Freestream Aircraft Limited',
             BARBRO_C_EHNBOM: 'Swedish pharmaceuticals, SALSS',
@@ -270,7 +277,7 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(
         label='europe',
         style='light_sky_blue3',
-        pattern=r'(Angela )?Merk(el|le)|Austria|(Benjamin\s*)?Harnwell|Berlin|Borge|Boris\s*Johnson|Brexit(eers?)?|Brit(ain|ish)|Brussels|Cannes|(Caroline|Jack)?\s*Lang(, Caroline)?|Cypr(iot|us)|Davos|ECB|England|EU|Europe(an)?(\s*Union)?|Fr(ance|ench)|Geneva|Germany?|Gillard|Gree(ce|k)|Ital(ian|y)|Jacques|(Kevin\s*)?Rudd|Le\s*Pen|London|Macron|Melusine|Munich|(Natalia\s*)?Veselnitskaya|(Nicholas\s*)?Sarkozy|Nigel(\s*Farage)?|Norw(ay|egian)|Oslo|Paris|Polish|(Sebastian )?Kurz|(Vi(c|k)tor\s+)?Orbah?n|Edward Rod Larsen|Strasbourg|Strauss[- ]?Kahn|Swed(en|ish)(?![-\s]+America)|Switzerland|(Tony\s)?Blair|Ukrain(e|ian)|Vienna|(Vitaly\s*)?Churkin|Zug',
+        pattern=r'(Angela )?Merk(el|le)|Austria|(Benjamin\s*)?Harnwell|Berlin|Borge|Boris\s*Johnson|Brexit(eers?)?|Brit(ain|ish)|Brussels|Cannes|(Caroline|Jack)?\s*Lang(, Caroline)?|Cypr(iot|us)|Davos|ECB|England|EU|Europe(an)?(\s*Union)?|Fr(ance|ench)|Geneva|Germany?|Gillard|Gree(ce|k)|Ital(ian|y)|Jacques|(Kevin\s*)?Rudd|Le\s*Pen|London|Macron|Melusine|Munich|(Natalia\s*)?Veselnitskaya|(Nicholas\s*)?Sarkozy|Nigel(\s*Farage)?|Norw(ay|egian)|Oslo|Paris|Polish|(Sebastian )?Kurz|(Vi(c|k)tor\s+)?Orbah?n|Edward Rod Larsen|Strasbourg|Strauss[- ]?Kahn|Swed(en|ish)(?![-\s]+America)|Switzerland|(Tony\s)?Blair|U\.?K\.?|Ukrain(e|ian)|Vienna|(Vitaly\s*)?Churkin|Zug',
         emailers = {
             ANDRZEJ_DUDA: 'former president of Poland',
             MIROSLAV_LAJCAK: 'Russia-friendly Slovakian politician, friend of Steve Bannon',
@@ -305,7 +312,7 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(
         label='finance',
         style='green',
-        pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
+        pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|((anti.?)?money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
         emailers={
             AMANDA_ENS: 'Citigroup',
             DANIEL_SABBA: 'UBS Investment Bank',
@@ -325,6 +332,7 @@ HIGHLIGHTED_NAMES = [
         style='deep_pink2',
         pattern=r'Cambridge|(Derek\s*)?Bok|Elisa(\s*New)?|Harvard(\s*(Business|Law|University)(\s*School)?)?|(Jonathan\s*)?Zittrain|(Stephen\s*)?Kosslyn',
         emailers = {
+            "Donald Rubin": f"Professor of Statistics",
             "Kelly Friendly": f"longtime aide and spokesperson of {LARRY_SUMMERS}",
             LARRY_SUMMERS: 'board of Digital Currency Group (DCG), Harvard president, Obama economic advisor',
             'Leah Reis-Dennis': 'producer for Lisa New\'s Poetry in America',
@@ -390,7 +398,7 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(
         label='law enforcement',
         style='color(24) bold',
-        pattern=r'ag|(Alicia\s*)?Valle|attorney|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC|CIA|CIS|CVRA|Dep(artmen)?t\.?\s*of\s*(the\s*)?(Justice|Treasury)|DHS|DOJ|FBI|FCPA|FDIC|Federal\s*Bureau\s*of\s*Investigation|FinCEN|FINRA|FOIA|FTC|IRS|(James\s*)?Comey|(Jennifer\s*Shasky\s*)?Calvery|((Judge|Mark)\s*)?(Carney|Filip)|(Kirk )?Blouin|KYC|NIH|NS(A|C)|OCC|OFAC|(Lann?a\s*)?Belohlavek|lawyer|(Michael\s*)?Reiter|OGE|Office\s*of\s*Government\s*Ethics|Police Code Enforcement|(Preet\s*)?Bharara|SCOTUS|SD(FL|NY)|Southern\s*District\s*of\s*(Florida|New\s*York)|SEC|Securities\s*and\s*Exchange\s*Commission|State\s*Dep(artmen)?t|Strzok|Supreme\s*Court|Treasury\s*(Dep(artmen)?t|Secretary)|TSA|USAID|(William\s*J\.?\s*)?Zloch',
+        pattern=r'ag|(Alicia\s*)?Valle|AML|attorney|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC?|CIA|CIS|CVRA|Dep(artmen)?t\.?\s*of\s*(the\s*)?(Justice|Treasury)|DHS|DOJ|FBI|FCPA|FDIC|Federal\s*Bureau\s*of\s*Investigation|FinCEN|FINRA|FOIA|FTC|IRS|(James\s*)?Comey|(Jennifer\s*Shasky\s*)?Calvery|((Judge|Mark)\s*)?(Carney|Filip)|(Kirk )?Blouin|KYC|NIH|NS(A|C)|OCC|OFAC|(Lann?a\s*)?Belohlavek|lawyer|(Michael\s*)?Reiter|OGE|Office\s*of\s*Government\s*Ethics|Police Code Enforcement|(Preet\s*)?Bharara|SCOTUS|SD(FL|NY)|Southern\s*District\s*of\s*(Florida|New\s*York)|SEC|Secret\s*Service|Securities\s*and\s*Exchange\s*Commission|State\s*Dep(artmen)?t|Strzok|Supreme\s*Court|Treasury\s*(Dep(artmen)?t|Secretary)|TSA|USAID|(William\s*J\.?\s*)?Zloch',
         emailers = {
             ANN_MARIE_VILLAFANA: 'southern district of Florida U.S. Attorney',
             DANNY_FROST: 'Director of Communications at Manhattan DA',
@@ -399,7 +407,7 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(
         label=LOBBYIST,
         style='light_coral',
-        pattern=r'[BR]ob Crowe|Stanley Rosenberg',
+        pattern=r'[BR]ob Crowe|CSIS|Stanley Rosenberg',
         emailers = {
             'Joshua Cooper Ramo': 'co-CEO of Henry Kissinger Associates',
             KATHERINE_KEATING: 'Daughter of former Australian PM',
@@ -457,7 +465,7 @@ HIGHLIGHTED_NAMES = [
     HighlightedNames(
         label='republicans',
         style='bold dark_red',
-        pattern=r'Alberto\sGonzale[sz]|(Alex\s*)?Acosta|(Bill\s*)?Barr|Bill\s*Shine|(Bob\s*)?Corker|(John\s*(R.?\s*)?)Bolton|Broidy|(Chris\s)?Christie|Devin\s*Nunes|(Don\s*)?McGa[hn]n|McMaster|(George\s*)?Nader|GOP|(Brett\s*)?Kavanaugh|Kissinger|Kobach|Koch\s*Brothers|Kolfage|Kudlow|Lewandowski|(Marco\s)?Rubio|(Mark\s*)Meadows|Mattis|(?<!Merwin Dela )Cruz|(Michael\s)?Hayden|((General|Mike)\s*)?(Flynn|Pence)|(Mitt\s*)?Romney|Mnuchin|Nikki|Haley|(Paul\s+)?Manafort|(Peter\s)?Navarro|Pompeo|Reagan|Reince|Priebus|Republican|(?<!Cynthia )(Richard\s*)?Nixon|Sasse|(Rex\s*)?Tillerson',
+        pattern=r'Alberto\sGonzale[sz]|(Alex\s*)?Acosta|(Bill\s*)?Barr|Bill\s*Shine|(Bob\s*)?Corker|(John\s*(R.?\s*)?)Bolton|Broidy|(Chris\s)?Christie|Devin\s*Nunes|(Don\s*)?McGa[hn]n|McMaster|(George\s*)?Nader|GOP|(Brett\s*)?Kavanaugh|Kissinger|Kobach|Koch\s*Brothers|Kolfage|Kudlow|Lewandowski|(Marco\s)?Rubio|(Mark\s*)Meadows|Mattis|McCain|(?<!Merwin Dela )Cruz|(Michael\s)?Hayden|((General|Mike)\s*)?(Flynn|Pence)|(Mitt\s*)?Romney|Mnuchin|Nikki|Haley|(Paul\s+)?(Manafort|Volcker)|(Peter\s)?Navarro|Pompeo|Reagan|Reince|Priebus|Republican|(Rex\s*)?Tillerson|(?<!Cynthia )(Richard\s*)?Nixon|Sasse',
         # There are no emails from these people; they're just here to automate the regex creation for both first + last names
         emailers = {
             RUDY_GIULIANI: 'disbarred formed mayor of New York City',
@@ -588,7 +596,7 @@ HIGHLIGHTED_NAMES = [
     HighlightedText(
         label='phone_number',
         style='bright_green',
-        pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|\b[\d+]{10,12}\b",
+        pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|(\b|\+)[\d+]{10,12}\b",
     ),
 ]
@@ -648,18 +656,14 @@ def get_info_for_name(name: str) -> str | None:
 def get_style_for_category(category: str) -> str | None:
-    if category in [CONFERENCE, SPEECH]:
+    if category in CATEGORY_STYLES:
+        return CATEGORY_STYLES[category]
+    elif category in [CONFERENCE, SPEECH]:
         return f"{get_style_for_category(ACADEMIA)} dim"
-    elif category == JSON:
-        return 'dark_red'
-    elif category == JUNK:
-        return 'grey19'
-    elif category == 'letter':
-        return 'medium_orchid1'
     elif category == SOCIAL:
-        return f"{get_style_for_category(PUBLICIST)} dim"
+        return f"{get_style_for_category(PUBLICIST)}"
-    category = CATEGORY_LABEL_MAPPING.get(category, category)
+    category = CATEGORY_STYLE_MAPPING.get(category, category)
     for highlight_group in HIGHLIGHTED_NAMES:
         if highlight_group.label == category:
@@ -672,6 +676,10 @@ def get_style_for_name(name: str | None, default_style: str = DEFAULT, allow_bol
     return style if allow_bold else style.replace('bold', '').strip()
+def styled_category(category: str) -> Text:
+    return Text(category, get_style_for_category(category) or 'wheat4')
 def _get_highlight_group_for_name(name: str) -> HighlightedNames | None:
     for highlight_group in HIGHLIGHTED_NAMES:
         if highlight_group.regex.search(name):

epstein_files/util/logging.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import logging
 from os import environ
-from pathlib import Path
 import datefinder
 import rich_argparse_plus
@@ -10,7 +9,6 @@ from rich.logging import RichHandler
 from rich.theme import Theme
 from epstein_files.util.constant.strings import *
-from epstein_files.util.file_helper import file_size_str
 FILENAME_STYLE = 'gray27'
@@ -60,7 +58,3 @@ if env_log_level_str:
     logger.warning(f"Setting log level to {env_log_level} based on {LOG_LEVEL_ENV_VAR} env var...")
     logger.setLevel(env_log_level)
-def log_file_write(file_path: str | Path) -> None:
-    logger.warning(f"Wrote {file_size_str(file_path)} to '{file_path}'")

epstein_files/util/output.py CHANGED Viewed

@@ -11,7 +11,8 @@ from epstein_files.util.constant.names import *
 from epstein_files.util.constant.output_files import JSON_FILES_JSON_PATH, JSON_METADATA_PATH
 from epstein_files.util.data import dict_sets_to_lists
 from epstein_files.util.env import args, specified_names
-from epstein_files.util.logging import log_file_write, logger
+from epstein_files.util.file_helper import log_file_write
+from epstein_files.util.logging import logger
 from epstein_files.util.rich import *
 PRINT_COLOR_KEY_EVERY_N_EMAILS = 150
@@ -60,7 +61,6 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
     """Returns number of emails printed."""
     print_section_header(('Selections from ' if not args.all_emails else '') + 'His Emails')
     print_other_site_link(is_header=False)
     emailers_to_print: list[str | None]
     emailer_tables: list[str | None] = []
     already_printed_emails: list[Email] = []
@@ -106,8 +106,8 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
         _verify_all_emails_were_printed(epstein_files, already_printed_emails)
     fwded_articles = [e for e in already_printed_emails if e.config and e.config.is_fwded_article]
-    logger.warning(f"{len(fwded_articles)} of {len(already_printed_emails)} emails were forwarded articles.")
-    logger.warning(f"Rewrote {len(Email.rewritten_header_ids)} headers of {len(epstein_files.emails)} emails.")
+    log_msg = f"Rewrote {len(Email.rewritten_header_ids)} email headers (out of {len(already_printed_emails)})"
+    logger.warning(f"{log_msg}, {len(fwded_articles)} of the emails were forwarded articles.")
     return len(already_printed_emails)
@@ -121,7 +121,7 @@ def print_json_files(epstein_files: EpsteinFiles):
     else:
         for json_file in epstein_files.json_files:
             console.line(2)
-            console.print(json_file.description_panel())
+            console.print(json_file.summary_panel())
             console.print_json(json_file.json_str(), indent=4, sort_keys=False)
@@ -187,8 +187,13 @@ def write_urls() -> None:
 def _verify_all_emails_were_printed(epstein_files: EpsteinFiles, already_printed_emails: list[Email]) -> None:
     """Log warnings if some emails were never printed."""
     email_ids_that_were_printed = set([email.file_id for email in already_printed_emails])
-    logger.warning(f"Printed {len(already_printed_emails)} emails of {len(email_ids_that_were_printed)} unique file IDs.")
+    logger.warning(f"Printed {len(already_printed_emails):,} emails of {len(email_ids_that_were_printed):,} unique file IDs.")
+    missed_an_email = False
     for email in epstein_files.emails:
-        if email.file_id not in email_ids_that_were_printed and not email.is_duplicate:
+        if email.file_id not in email_ids_that_were_printed and not email.is_duplicate():
             logger.warning(f"Failed to print {email.summary()}")
+            missed_an_email = True
+    if not missed_an_email:
+        logger.warning(f"All {len(epstein_files.emails):,} emails printed at least once.")

epstein_files/util/rich.py CHANGED Viewed

@@ -20,8 +20,9 @@ from epstein_files.util.constant.urls import *
 from epstein_files.util.constants import FALLBACK_TIMESTAMP, HEADER_ABBREVIATIONS
 from epstein_files.util.data import json_safe
 from epstein_files.util.env import args
+from epstein_files.util.file_helper import log_file_write
 from epstein_files.util.highlighted_group import ALL_HIGHLIGHTS, HIGHLIGHTED_NAMES, EpsteinHighlighter
-from epstein_files.util.logging import log_file_write, logger
+from epstein_files.util.logging import logger
 TITLE_WIDTH = 50
 NUM_COLOR_KEY_COLS = 4
@@ -30,6 +31,7 @@ QUESTION_MARK_TXT = Text(QUESTION_MARKS, style='dim')
 GREY_NUMBERS = [58, 39, 39, 35, 30, 27, 23, 23, 19, 19, 15, 15, 15]
 DEFAULT_NAME_STYLE = 'gray46'
+INFO_STYLE = 'white dim italic'
 KEY_STYLE='honeydew2 bold'
 SECTION_HEADER_STYLE = 'bold white on blue3'
 SOCIAL_MEDIA_LINK_STYLE = 'pale_turquoise4'
@@ -239,23 +241,26 @@ def print_numbered_list_of_emailers(_list: list[str | None], epstein_files = Non
 def print_other_site_link(is_header: bool = True) -> None:
     """Print a link to the emails site if we're building text messages site and vice versa."""
     site_type: SiteType = EMAIL if args.all_emails else TEXT_MESSAGE
+    link_style = OTHER_SITE_LINK_STYLE if is_header else 'light_slate_grey bold'
     if is_header:
         print_starred_header(f"This is the Epstein {site_type.title()}s site", num_spaces=4, num_stars=14)
     other_site_type: SiteType = TEXT_MESSAGE if site_type == EMAIL else EMAIL
-    other_site_msg = "another site for" + (' all of' if other_site_type == EMAIL else '')
+    other_site_msg = "another site with" + (' all of' if other_site_type == EMAIL else '')
     other_site_msg += f" Epstein's {other_site_type}s also generated by this code"
-    markup_msg = link_markup(SITE_URLS[other_site_type], other_site_msg, OTHER_SITE_LINK_STYLE)
+    markup_msg = link_markup(SITE_URLS[other_site_type], other_site_msg, link_style)
     print_centered(parenthesize(Text.from_markup(markup_msg)), style='bold')
-    if is_header:
-        word_count_link = link_text_obj(WORD_COUNT_URL, 'most frequently used words in the emails and texts', AUX_SITE_LINK_STYLE)
-        print_centered(parenthesize(word_count_link))
-        metadata_link = link_text_obj(JSON_METADATA_URL, 'author attribution explanations', AUX_SITE_LINK_STYLE)
-        print_centered(parenthesize(metadata_link))
-        json_link = link_text_obj(WORD_COUNT_URL, "epstein's json files", AUX_SITE_LINK_STYLE)
-        print_centered(parenthesize(json_link))
+    if not is_header:
+        return
+    word_count_link = link_text_obj(WORD_COUNT_URL, 'most frequently used words in the emails and texts', AUX_SITE_LINK_STYLE)
+    print_centered(parenthesize(word_count_link))
+    metadata_link = link_text_obj(JSON_METADATA_URL, 'author attribution explanations', AUX_SITE_LINK_STYLE)
+    print_centered(parenthesize(metadata_link))
+    json_link = link_text_obj(WORD_COUNT_URL, "epstein's json files", AUX_SITE_LINK_STYLE)
+    print_centered(parenthesize(json_link))
 def print_page_title(expand: bool = True, width: int | None = None) -> None:

epstein_files/util/word_count.py CHANGED Viewed

@@ -9,18 +9,22 @@ from rich.padding import Padding
 from rich.text import Text
 from epstein_files.documents.emails.email_header import EmailHeader
-from epstein_files.util.constant.common_words import COMMON_WORDS, UNSINGULARIZABLE_WORDS
+from epstein_files.epstein_files import EpsteinFiles
+from epstein_files.util.constant.common_words import COMMON_WORDS_LIST, COMMON_WORDS, UNSINGULARIZABLE_WORDS
 from epstein_files.util.constant.names import OTHER_NAMES
+from epstein_files.util.constant.output_files import WORD_COUNT_HTML_PATH
 from epstein_files.util.data import ALL_NAMES, flatten, sort_dict
-from epstein_files.util.env import args
+from epstein_files.util.env import args, specified_names
 from epstein_files.util.logging import logger
-from epstein_files.util.rich import highlighter
-from epstein_files.util.search_result import SearchResult
+from epstein_files.util.rich import (console, highlighter, print_centered, print_color_key, print_page_title,
+     print_panel, print_starred_header, write_html)
+from epstein_files.util.search_result import MatchedLine, SearchResult
+from epstein_files.util.timer import Timer
 FIRST_AND_LAST_NAMES = flatten([n.split() for n in ALL_NAMES])
 FIRST_AND_LAST_NAMES = [n.lower() for n in FIRST_AND_LAST_NAMES] + OTHER_NAMES
-HTML_REGEX = re.compile(r"com/|cae-v2w=|content-(transfe|type)|font(/|-(family|size))|http|\.html?\??|margin-bottom|padding-left|quoted-printable|region=|text-decoration|ttps|www|\.(gif|jpe?g|png);?$")
+HTML_REGEX = re.compile(r"^http|#yiv|com/|cae-v2w=|content-(transfe|type)|font(/|-(family|size))|http|\.html?\??|margin-bottom|padding-left|quoted-printable|region=|text-decoration|ttps|www|\.(gif|jpe?g|png);?$")
 HYPHENATED_WORD_REGEX = re.compile(r"[a-z]+-[a-z]+", re.IGNORECASE)
 OK_SYMBOL_WORDS = ['mar-a-lago', 'p/e', 's&p', ':)', ':).', ';)', ':-)', ';-)']
 ONLY_SYMBOLS_REGEX = re.compile(r"^[^a-zA-Z0-9]+$")
@@ -187,6 +191,62 @@ class WordCount:
         yield f"Showing {len(word_txts):,} words appearing at least {MIN_COUNT_CUTOFF} times (out of {len(self.count):,} words)."
+def write_word_counts_html() -> None:
+    """
+    Tally word usage across emails and iMessage logs, print the report, and write it to WORD_COUNT_HTML_PATH.
+    Duplicate emails, junk mail, and forwarded articles are excluded. If `specified_names` is non-empty
+    only emails / messages authored by one of those names are counted. Email lines and iMessage texts
+    matching HTML_REGEX are skipped.
+    """
+    timer = Timer()
+    epstein_files = EpsteinFiles.get_files(timer)
+    email_subjects: set[str] = set()
+    word_count = WordCount()
+    # Remove dupes, junk mail, and fwded articles from emails
+    emails = [e for e in epstein_files.emails if not (e.is_duplicate() or e.is_junk_mail() or e.is_fwded_article())]
+    for email in emails:
+        if specified_names and email.author not in specified_names:
+            continue
+        logger.info(f"Counting words in {email}\n  [SUBJECT] {email.subject()}")
+        lines = email.actual_text.split('\n')
+        # Count each subject line only once (a subject already seen as 'Re: <subject>' is also skipped)
+        if email.subject() not in email_subjects and f'Re: {email.subject()}' not in email_subjects:
+            email_subjects.add(email.subject())
+            lines.append(email.subject())
+        for i, line in enumerate(lines):
+            if HTML_REGEX.search(line):
+                continue
+            for word in line.split():
+                word_count.tally_word(word, SearchResult(email, [MatchedLine(line, i)]))
+    # Add in iMessage conversation words
+    imessage_logs = epstein_files.imessage_logs_for(specified_names) if specified_names else epstein_files.imessage_logs
+    for imessage_log in imessage_logs:
+        logger.info(f"Counting words in {imessage_log}")
+        for i, msg in enumerate(imessage_log.messages):
+            if specified_names and msg.author not in specified_names:
+                continue
+            # Test this message's own text: `line` would be a stale leftover from the email loop above
+            # (or unbound entirely if that loop never iterated over any lines).
+            if HTML_REGEX.search(msg.text):
+                continue
+            for word in msg.text.split():
+                word_count.tally_word(word, SearchResult(imessage_log, [MatchedLine(msg.text, i)]))
+    print_page_title(expand=False)
+    print_starred_header(f"Most Common Words in {len(emails):,} Emails and {len(imessage_logs)} iMessage Logs")
+    print_centered(f"(excluding {len(COMMON_WORDS_LIST)} particularly common words at bottom)", style='dim')
+    console.line()
+    print_color_key()
+    console.line()
+    console.print(word_count)
+    console.line(2)
+    print_panel(f"{len(COMMON_WORDS_LIST):,} Excluded Words", centered=True)
+    console.print(', '.join(COMMON_WORDS_LIST), highlight=False)
+    write_html(WORD_COUNT_HTML_PATH)
+    timer.print_at_checkpoint("Finished counting words")
 def _word_style(word: str | None) -> str:
     word = word or ''
     return 'bright_white' if word in FIRST_AND_LAST_NAMES else 'grey53'

{epstein_files-1.0.9.dist-info → epstein_files-1.0.11.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: epstein-files
-Version: 1.0.9
+Version: 1.0.11
 Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
 Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
 License: GPL-3.0-or-later

epstein_files-1.0.11.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,33 @@
+epstein_files/__init__.py,sha256=4zxX1tw-0xMwpM-Sbq7PezV0YNS9zN-P6gc9BQ1BqKU,4710
+epstein_files/documents/communication.py,sha256=SunZdjMhR9v6y8LlQ6jhIu8vYjSndaBK0Su1mKnhfj0,2060
+epstein_files/documents/document.py,sha256=dECV0bSnOJzPfOIHyHeG5rNxKd6uwuiso35-sQZg9No,18353
+epstein_files/documents/email.py,sha256=yXiW7mB4myU8G9DY7PnnqazaCqeAR3dHr35NfBplfRU,38519
+epstein_files/documents/emails/email_header.py,sha256=wkPfSLbmzkAeQwvhf0bAeFDLPbQT-EeG0v8vNNLYktM,7502
+epstein_files/documents/imessage/text_message.py,sha256=3HlNp75JIoMlWj7PaUWIFry3qlGEmpGu5OmdmsBYS34,2807
+epstein_files/documents/json_file.py,sha256=HsnVWPZXVxTF_DadL2YtJtsiXKXOd18PUs05O33tjNc,1317
+epstein_files/documents/messenger_log.py,sha256=uSPlg85jGTwod1cV9f7MtxSNqmMZ61JBFzoiRNqg52M,6263
+epstein_files/documents/other_file.py,sha256=S_Y-SxYYYXtx42JHmhFWl5BbTduNI7cwQjeYHBJA7sc,9950
+epstein_files/epstein_files.py,sha256=SaD4DJJ5tRxY97Ei4BdOgLzHQ9wrBVGrP64CSqdmk-w,18691
+epstein_files/util/constant/common_words.py,sha256=aR0UjoWmxyR49XS-DtHECQ1CiA_bK8hNP6CQ1TS9yZA,3696
+epstein_files/util/constant/html.py,sha256=9U098TGzlghGg4WfxLYHyub5JGR17Dv7VP5i2MSu8Kk,1415
+epstein_files/util/constant/names.py,sha256=KKJEYFpdOp4xDwXe5dhrqYgF12oJODvVSFpAB28Q76A,10153
+epstein_files/util/constant/output_files.py,sha256=BkV4_gmdj46RfGy5SFYp6dgTty3FtlBth5YGmaGutls,1700
+epstein_files/util/constant/strings.py,sha256=FDtksfH50PSxtSBw9XhmqxtrgRgGxdIvGiAR2bbPpu4,1899
+epstein_files/util/constant/urls.py,sha256=0IdCVVvXib0i-4TZFkVHoS4zCbjOBZWcr6NkGxsmQWM,4981
+epstein_files/util/constants.py,sha256=LPSI6Z0n3ChFDnMGYVO80cGuSKZf0OoyUzLih_jlRKI,111434
+epstein_files/util/data.py,sha256=xwTqrbAi7ZDJM0iyFVOevnokP_oIQ2npkRjHzF1KGGY,2908
+epstein_files/util/doc_cfg.py,sha256=OZlocAWldfR8Nomiad4FxQeyhNMbd0PQ-rumKn2nWBg,9641
+epstein_files/util/env.py,sha256=HnYcfHSNkwVJ_T75Woy43_OpDyxD0KHPj3GxcVx86N4,5751
+epstein_files/util/file_helper.py,sha256=-higKqc9J5IfNpzMzg-9j1ps3beV4N2cw8kdAxfm7NA,2835
+epstein_files/util/highlighted_group.py,sha256=fU-8ns50uUolzPEAxadF5AnPLjn383KpEeyRXfFbv_U,35971
+epstein_files/util/logging.py,sha256=8e22WaBfDAKEmkcr3Gb4TdqtFSkU4FQDpk3Z6hfSzbw,1977
+epstein_files/util/output.py,sha256=UzTU0mNHEmeJr3w2TXAp19X497GB6_-HyW0mfztI1jk,8120
+epstein_files/util/rich.py,sha256=8-4IA5bwPBdDPqkPdymq3zVKB9hfy3nrT7fUrN_XevY,14744
+epstein_files/util/search_result.py,sha256=1fxe0KPBQXBk4dLfu6m0QXIzYfZCzvaSkWqvghJGzxY,567
+epstein_files/util/timer.py,sha256=8hxW4Y1JcTUfnBrHh7sL2pM9xu1sL4HFQM4CmmzTarU,837
+epstein_files/util/word_count.py,sha256=8qBTuq3d0Q-3fwiuECKWi2RfL-KUiZD8TciwvfL0D_o,9353
+epstein_files-1.0.11.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+epstein_files-1.0.11.dist-info/METADATA,sha256=HBW3t1F9lkoN6GIR7ySV2kBYnJhNEF9otDZWnf03jUo,5480
+epstein_files-1.0.11.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+epstein_files-1.0.11.dist-info/entry_points.txt,sha256=5qYgwAXpxegeAicD_rzda_trDRnUC51F5UVDpcZ7j6Q,240
+epstein_files-1.0.11.dist-info/RECORD,,

epstein_files/count_words.py DELETED Viewed

@@ -1,72 +0,0 @@
-# Count word usage in emails and texts
-import re
-from epstein_files.epstein_files import EpsteinFiles
-from epstein_files.util.constant.common_words import COMMON_WORDS_LIST
-from epstein_files.util.constant.output_files import WORD_COUNT_HTML_PATH
-from epstein_files.util.env import args, specified_names
-from epstein_files.util.logging import logger
-from epstein_files.util.rich import (console, print_centered, print_color_key, print_page_title, print_panel,
-     print_starred_header, write_html)
-from epstein_files.util.search_result import MatchedLine, SearchResult
-from epstein_files.util.timer import Timer
-from epstein_files.util.word_count import WordCount
-HTML_REGEX = re.compile(r"^http|#yiv")
-def write_word_counts_html() -> None:
-    timer = Timer()
-    epstein_files = EpsteinFiles.get_files(timer)
-    email_subjects: set[str] = set()
-    word_count = WordCount()
-    # Remove dupes, junk mail, and fwded articles from emails
-    emails = [
-        e for e in epstein_files.emails
-        if not (e.is_duplicate or e.is_junk_mail() or (e.config and e.config.is_fwded_article)) \
-            and (len(specified_names) == 0 or e.author in specified_names)
-    ]
-    for email in emails:
-        logger.info(f"Counting words in {email}\n  [SUBJECT] {email.subject()}")
-        lines = email.actual_text.split('\n')
-        if email.subject() not in email_subjects and f'Re: {email.subject()}' not in email_subjects:
-            email_subjects.add(email.subject())
-            lines.append(email.subject())
-        for i, line in enumerate(lines):
-            if HTML_REGEX.search(line):
-                continue
-            for word in line.split():
-                word_count.tally_word(word, SearchResult(email, [MatchedLine(line, i)]))
-    # Add in iMessage conversation words
-    imessage_logs = epstein_files.imessage_logs_for(specified_names) if specified_names else epstein_files.imessage_logs
-    for imessage_log in imessage_logs:
-        logger.info(f"Counting words in {imessage_log}")
-        for msg in imessage_log.messages():
-            if len(specified_names) > 0 and msg.author not in specified_names:
-                continue
-            elif HTML_REGEX.search(line):
-                continue
-            for word in msg.text.split():
-                word_count.tally_word(word, SearchResult(imessage_log, [msg.text]))
-    print_page_title(expand=False)
-    print_starred_header(f"Most Common Words in {len(emails):,} Emails and {len(imessage_logs)} iMessage Logs")
-    print_centered(f"(excluding {len(COMMON_WORDS_LIST)} particularly common words at bottom)", style='dim')
-    console.line()
-    print_color_key()
-    console.line()
-    console.print(word_count)
-    console.line(2)
-    print_panel(f"{len(COMMON_WORDS_LIST):,} Excluded Words", centered=True)
-    console.print(', '.join(COMMON_WORDS_LIST), highlight=False)
-    write_html(WORD_COUNT_HTML_PATH)
-    timer.print_at_checkpoint(f"Finished counting words")

epstein_files-1.0.9.dist-info/RECORD DELETED Viewed

@@ -1,34 +0,0 @@
-epstein_files/__init__.py,sha256=w68EUhAzri8a_c9HggKMfoHTvPbVAO-u3NuHVizDc7U,4940
-epstein_files/count_words.py,sha256=i1pYaQzX7b9S3pyV3RM_8asbQJ1PEk8wJgLOG6Mf0D8,2966
-epstein_files/documents/communication.py,sha256=SunZdjMhR9v6y8LlQ6jhIu8vYjSndaBK0Su1mKnhfj0,2060
-epstein_files/documents/document.py,sha256=BUaioSvOmfsR-ULa6hJy3WYg-hBDC-kqafUheMJ-jFY,16665
-epstein_files/documents/email.py,sha256=H34b2zt_TrPUgXHwZXybjmLE9-QNAtezs9NVSCPOSGM,38462
-epstein_files/documents/emails/email_header.py,sha256=wkPfSLbmzkAeQwvhf0bAeFDLPbQT-EeG0v8vNNLYktM,7502
-epstein_files/documents/imessage/text_message.py,sha256=wfWPQhwGG5Yzyhbr1NAQAY0bzRjjqVZmh8SPl48XmAM,3025
-epstein_files/documents/json_file.py,sha256=1Cx_3uM38Dwgrbs8fU55TUZKSrCsmd4QpHKWtfWdudw,1089
-epstein_files/documents/messenger_log.py,sha256=DHlQpbLbMITMpMtCYk2vcRc7-CTvYvOXql-9nDUc3tQ,5887
-epstein_files/documents/other_file.py,sha256=NdVlCYcyzHvOInReqF-zvHJI1hwtzMWW9ekDojHIb4U,9091
-epstein_files/epstein_files.py,sha256=EEx8Auwv8z0FkRrCi7wE8iuuRQd6K1rQDMc2vdbrsh4,18298
-epstein_files/util/constant/common_words.py,sha256=aR0UjoWmxyR49XS-DtHECQ1CiA_bK8hNP6CQ1TS9yZA,3696
-epstein_files/util/constant/html.py,sha256=9U098TGzlghGg4WfxLYHyub5JGR17Dv7VP5i2MSu8Kk,1415
-epstein_files/util/constant/names.py,sha256=flIZCafFXHiaSy-G2QhYH0hNfkeJBH6Gz7p9AdvYgC0,10125
-epstein_files/util/constant/output_files.py,sha256=BkV4_gmdj46RfGy5SFYp6dgTty3FtlBth5YGmaGutls,1700
-epstein_files/util/constant/strings.py,sha256=FDtksfH50PSxtSBw9XhmqxtrgRgGxdIvGiAR2bbPpu4,1899
-epstein_files/util/constant/urls.py,sha256=0IdCVVvXib0i-4TZFkVHoS4zCbjOBZWcr6NkGxsmQWM,4981
-epstein_files/util/constants.py,sha256=MsWZQs3qd9N1HlgC7MoSKRF6ssbmWlUXX49REsp3qQs,110867
-epstein_files/util/data.py,sha256=xwTqrbAi7ZDJM0iyFVOevnokP_oIQ2npkRjHzF1KGGY,2908
-epstein_files/util/doc_cfg.py,sha256=5Pb__bP00mKi9ACv33omZQA-TBzumc7D2Td_Mk4M5DY,9822
-epstein_files/util/env.py,sha256=PaPBi27-npU9egt9LHxr5qR65B2DPHwt7Xc9sx5VN-M,5225
-epstein_files/util/file_helper.py,sha256=v_bE10MHEcXti9DVJo4WqyOsG83Xrv05S3Vc70cYJkk,3082
-epstein_files/util/highlighted_group.py,sha256=L7R63oyDG_lQ9Vv5gB_rRwJgHS2sdMfXHs9xcuDzqdc,35667
-epstein_files/util/logging.py,sha256=4hVl1Qw1qRMSVEYKXZxrvdQuSIMBgTPskzvNMNu8268,2185
-epstein_files/util/output.py,sha256=wLjFBGR5ffn4cLep12G3OmUR0H3WtEMXeVMOXtd-6ig,7909
-epstein_files/util/rich.py,sha256=rdHzn4XRB2erQSf2yYyPakRmd9ixqBUdS8-BVOUAXnE,14603
-epstein_files/util/search_result.py,sha256=1fxe0KPBQXBk4dLfu6m0QXIzYfZCzvaSkWqvghJGzxY,567
-epstein_files/util/timer.py,sha256=8hxW4Y1JcTUfnBrHh7sL2pM9xu1sL4HFQM4CmmzTarU,837
-epstein_files/util/word_count.py,sha256=eGzcsoAvMcutRUFOJnVuEp9_28H74to7T9jTdGUZnuI,6757
-epstein_files-1.0.9.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-epstein_files-1.0.9.dist-info/METADATA,sha256=QTK8iM7ZkD2742Gk9c8yPyG5LV1QLNOrjKguJALSX1c,5479
-epstein_files-1.0.9.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
-epstein_files-1.0.9.dist-info/entry_points.txt,sha256=5qYgwAXpxegeAicD_rzda_trDRnUC51F5UVDpcZ7j6Q,240
-epstein_files-1.0.9.dist-info/RECORD,,

{epstein_files-1.0.9.dist-info → epstein_files-1.0.11.dist-info}/LICENSE RENAMED Viewed

File without changes

{epstein_files-1.0.9.dist-info → epstein_files-1.0.11.dist-info}/WHEEL RENAMED Viewed

File without changes

{epstein_files-1.0.9.dist-info → epstein_files-1.0.11.dist-info}/entry_points.txt RENAMED Viewed

File without changes

epstein-files 1.0.9__py3-none-any.whl → 1.0.11__py3-none-any.whl

epstein-files 1.0.9__py3-none-any.whl → 1.0.11__py3-none-any.whl