epstein-files 1.0.4__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
1
  import re
2
2
  from copy import deepcopy
3
+ from typing import cast
3
4
 
4
5
  from dateutil.parser import parse
5
6
 
@@ -84,7 +85,7 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
84
85
  JAMES_HILL: re.compile(r"hill, james e.|james.e.hill@abc.com", re.IGNORECASE),
85
86
  JEAN_LUC_BRUNEL: re.compile(r'Jean[- ]Luc Brunel?', re.IGNORECASE),
86
87
  JEFF_FULLER: re.compile(r"jeff@mc2mm.com|Jeff Fuller", re.IGNORECASE),
87
- JEFFREY_EPSTEIN: re.compile(r'[djl]ee[vy]acation[©@]?g?(mail.com)?|Epstine|\bJEE?\b|Jeffrey E((sp|ps)tein?)?|jeeproject@yahoo.com|J Jep|Jeffery Edwards|(?<!Mark L. )Epstein', re.IGNORECASE),
88
+ JEFFREY_EPSTEIN: re.compile(r'[djl]\s?ee[vy]acation[©@]?g?(mail.com)?|Epstine|\bJEE?\b|Jeffrey E((sp|ps)tein?)?|jeeproject@yahoo.com|J Jep|Jeffery Edwards|(?<!Mark L. )Epstein', re.IGNORECASE),
88
89
  JESSICA_CADWELL: re.compile(r'Jessica Cadwell?', re.IGNORECASE),
89
90
  JOHNNY_EL_HACHEM: re.compile(r'el hachem johnny|johnny el hachem', re.IGNORECASE),
90
91
  JOI_ITO: re.compile(r'ji@media.mit.?edu|(joichi|joi)( Ito)?', re.IGNORECASE),
@@ -94,7 +95,7 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
94
95
  LANDON_THOMAS: re.compile(r'lando[nr] thomas( jr)?|thomas jr.?, lando[nr]', re.IGNORECASE),
95
96
  LARRY_SUMMERS: re.compile(r'(La(wrence|rry).{1,5})?Summers?|^LH$|LHS|Ihsofficel', re.IGNORECASE),
96
97
  LAWRANCE_VISOSKI: re.compile(r'La(rry|wrance) Visoski?|Lvjet', re.IGNORECASE),
97
- LAWRENCE_KRAUSS: re.compile(r'Lawrence Kraus|lawkrauss', re.IGNORECASE),
98
+ LAWRENCE_KRAUSS: re.compile(r'Lawrence Kraus|[jl]awkrauss', re.IGNORECASE),
98
99
  LEON_BLACK: re.compile(r'Leon Black?', re.IGNORECASE),
99
100
  MANUELA_MARTINEZ: re.compile(fr'Manuela (- Mega Partners|Martinez)', re.IGNORECASE),
100
101
  MARIANA_IDZKOWSKA: re.compile(r'Mariana [Il]d[źi]kowska?', re.IGNORECASE),
@@ -268,7 +269,7 @@ SHIMON_POST = 'The Shimon Post'
268
269
  SHIMON_POST_ARTICLE = f'selection of articles about the mideast'
269
270
  SINGLE_PAGE = 'single page of'
270
271
  STRANGE_BEDFELLOWS = "'Strange Bedfellows' list of invitees f. Johnny Depp, Woody Allen, Obama, and more"
271
- SWEDISH_LIFE_SCIENCES_SUMMIT = f"{BARBRO_C_EHNBOM}'s Swedish American Life Science Summit"
272
+ SWEDISH_LIFE_SCIENCES_SUMMIT = f"{BARBRO_C_EHNBOM}'s Swedish American Life Science Summit (SALSS)"
272
273
  THE_REAL_DEAL_ARTICLE = 'article by Keith Larsen'
273
274
  TRUMP_DISCLOSURES = f"Donald Trump financial disclosures from U.S. Office of Government Ethics"
274
275
  UBS_CIO_REPORT = 'CIO Monthly Extended report'
@@ -371,8 +372,8 @@ TEXTS_CONFIG = CONFIRMED_TEXTS_CONFIG + UNCONFIRMED_TEXTS_CONFIG
371
372
  ########################################################################################################
372
373
 
373
374
  # Some emails have a lot of uninteresting CCs
374
- IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS: list[str | None] = ['Allen West', 'Rafael Bardaji', 'Philip Kafka', 'Herb Goodman', 'Grant Seeger', 'Lisa Albert', 'Janet Kafka', 'James Ramsey', 'ACT for America', 'John Zouzelka', 'Joel Dunn', 'Nate McClain', 'Bennet Greenwald', 'Taal Safdie', 'Uri Fouzailov', 'Neil Anderson', 'Nate White', 'Rita Hortenstine', 'Henry Hortenstine', 'Gary Gross', 'Forrest Miller', 'Bennett Schmidt', 'Val Sherman', 'Marcie Brown', 'Michael Horowitz', 'Marshall Funk']
375
- FLIGHT_IN_2012_PEOPLE: list[str | None] = ['Francis Derby', 'Januiz Banasiak', 'Louella Rabuyo', 'Richard Barnnet']
375
+ IRAN_DEAL_RECIPIENTS = ['Allen West', 'Rafael Bardaji', 'Philip Kafka', 'Herb Goodman', 'Grant Seeger', 'Lisa Albert', 'Janet Kafka', 'James Ramsey', 'ACT for America', 'John Zouzelka', 'Joel Dunn', 'Nate McClain', 'Bennet Greenwald', 'Taal Safdie', 'Uri Fouzailov', 'Neil Anderson', 'Nate White', 'Rita Hortenstine', 'Henry Hortenstine', 'Gary Gross', 'Forrest Miller', 'Bennett Schmidt', 'Val Sherman', 'Marcie Brown', 'Michael Horowitz', 'Marshall Funk']
376
+ FLIGHT_IN_2012_PEOPLE = ['Francis Derby', 'Januiz Banasiak', 'Louella Rabuyo', 'Richard Barnnet']
376
377
 
377
378
  EMAILS_CONFIG = [
378
379
  EmailCfg(id='032436', author=ALIREZA_ITTIHADIEH, attribution_reason='Signature'),
@@ -491,9 +492,6 @@ EMAILS_CONFIG = [
491
492
  EmailCfg(id='032727', author=KATHRYN_RUEMMLER, attribution_reason=KATHY_REASON, is_attribution_uncertain=True),
492
493
  EmailCfg(id='030478', author=LANDON_THOMAS),
493
494
  EmailCfg(id='029013', author=LARRY_SUMMERS, recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
494
- EmailCfg(id='032206', author=LAWRENCE_KRAUSS), # More of a text convo?
495
- EmailCfg(id='032208', author=LAWRENCE_KRAUSS, recipients=[JEFFREY_EPSTEIN]), # More of a text convo?
496
- EmailCfg(id='032209', author=LAWRENCE_KRAUSS, recipients=[JEFFREY_EPSTEIN]), # More of a text convo?
497
495
  EmailCfg(id='029196', author=LAWRENCE_KRAUSS, recipients=[JEFFREY_EPSTEIN], actual_text='Talk in 40?'),
498
496
  EmailCfg(id='033593', author=LAWRANCE_VISOSKI, attribution_reason='Signature'),
499
497
  EmailCfg(id='033370', author=LAWRANCE_VISOSKI, attribution_reason=LARRY_REASON),
@@ -575,7 +573,7 @@ EMAILS_CONFIG = [
575
573
  attribution_reason='ends with "Respectfully, terry"',
576
574
  author=TERRY_KAFKA,
577
575
  fwded_text_after='From: Mike Cohen',
578
- recipients=[JEFFREY_EPSTEIN, MARK_EPSTEIN, MICHAEL_BUCHHOLTZ] + IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS,
576
+ recipients=[JEFFREY_EPSTEIN, MARK_EPSTEIN, MICHAEL_BUCHHOLTZ] + IRAN_DEAL_RECIPIENTS,
579
577
  duplicate_ids=['028482'],
580
578
  ),
581
579
  EmailCfg(id='029992', author=TERRY_KAFKA, attribution_reason='Quoted reply'),
@@ -600,7 +598,6 @@ EMAILS_CONFIG = [
600
598
  EmailCfg(id='022202', recipients=[JEAN_LUC_BRUNEL], attribution_reason='Follow up / reply', duplicate_ids=['029975']),
601
599
  EmailCfg(id='022187', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
602
600
  EmailCfg(id='031489', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (unfixable)
603
- EmailCfg(id='032210', recipients=[JEFFREY_EPSTEIN]), # More of a text convo?
604
601
  EmailCfg(id='030347', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
605
602
  EmailCfg(id='030367', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
606
603
  EmailCfg(id='033274', recipients=[JEFFREY_EPSTEIN]), # this is a note sent to self
@@ -751,7 +748,7 @@ EMAILS_CONFIG = [
751
748
  EmailCfg(id='031118', duplicate_ids=['019465']),
752
749
  EmailCfg(id='031912', duplicate_ids=['032158']),
753
750
  EmailCfg(id='030587', duplicate_ids=['030514']),
754
- EmailCfg(id='029773', duplicate_ids=['012685']),
751
+ EmailCfg(id='029773', duplicate_ids=['012685'], fwded_text_after='Omar Quadhafi'),
755
752
  EmailCfg(id='033297', duplicate_ids=['033586']),
756
753
  EmailCfg(id='031089', duplicate_ids=['018084']),
757
754
  EmailCfg(id='031088', duplicate_ids=['030885']),
@@ -1195,7 +1192,7 @@ OTHER_FILES_CONFERENCES = [
1195
1192
  DocCfg(id='019300', author=SVETLANA_POZHIDAEVA, description=f'{WOMEN_EMPOWERMENT} f. {KATHRYN_RUEMMLER}', date='2019-04-05'),
1196
1193
  DocCfg(id='022267', author=SVETLANA_POZHIDAEVA, description=f'{WOMEN_EMPOWERMENT} founder essay about growing the seminar business'),
1197
1194
  DocCfg(id='022407', author=SVETLANA_POZHIDAEVA, description=f'{WOMEN_EMPOWERMENT} seminar pitch deck'),
1198
- DocCfg(id='017524', author=SWEDISH_LIFE_SCIENCES_SUMMIT, description=f"2012 program"),
1195
+ DocCfg(id='017524', author=SWEDISH_LIFE_SCIENCES_SUMMIT, description=f"2012 program emailed to Epstein by {BARBRO_C_EHNBOM} in 031226", date='2012-08-18'),
1199
1196
  DocCfg(id='026747', author=SWEDISH_LIFE_SCIENCES_SUMMIT, description=f"2017 program", date='2017-08-23'),
1200
1197
  DocCfg(id='014951', author='TED Talks', description=f"2017 program", date='2017-04-20'),
1201
1198
  DocCfg(id='024179', author=UN_GENERAL_ASSEMBLY, description=f'president and first lady schedule', date='2012-09-21'),
@@ -1326,7 +1323,7 @@ OTHER_FILES_LETTERS = [
1326
1323
  ]
1327
1324
 
1328
1325
  OTHER_FILES_PROPERTY = [
1329
- DocCfg(id='026759', author='Great Bay Condominium Owners Association', description=f'{PRESS_RELEASE} by about Hurricane Irma damage', date='2017-09-13'),
1326
+ DocCfg(id='026759', author='Great Bay Condominium Owners Association', description=f'{PRESS_RELEASE} about Hurricane Irma damage', date='2017-09-13'),
1330
1327
  DocCfg(id='016602', author=PALM_BEACH_CODE_ENFORCEMENT, description='board minutes', date='2008-04-17'),
1331
1328
  DocCfg(id='016554', author=PALM_BEACH_CODE_ENFORCEMENT, description='board minutes', date='2008-07-17', duplicate_ids=['016616', '016574']),
1332
1329
  DocCfg(id='027068', author=THE_REAL_DEAL, description=f"{THE_REAL_DEAL_ARTICLE} Palm House Hotel Bankruptcy and EB-5 Visa Fraud Allegations"),
@@ -1379,8 +1376,8 @@ OTHER_FILES_SOCIAL = [
1379
1376
  ]
1380
1377
 
1381
1378
  OTHER_FILES_POLITICS = [
1382
- DocCfg(id='029918', author=DIANA_DEGETTE_CAMPAIGN, description=f"bio", date='2012-01-01'),
1383
- DocCfg(id='031184', author=DIANA_DEGETTE_CAMPAIGN, description=f"fundraiser invitation"),
1379
+ DocCfg(id='029918', author=DIANA_DEGETTE_CAMPAIGN, description=f"bio", date='2012-09-27'),
1380
+ DocCfg(id='031184', author=DIANA_DEGETTE_CAMPAIGN, description=f"invitation to fundraiser hosted by {BARBRO_C_EHNBOM}", date='2012-09-27'),
1384
1381
  DocCfg(id='026827', author='Scowcroft Group', description=f'report on ISIS', date='2015-11-14'),
1385
1382
  DocCfg(id='024294', author=STACEY_PLASKETT, description=f"campaign flier", date='2016-10-01'),
1386
1383
  DocCfg(
@@ -1482,6 +1479,11 @@ OTHER_FILES_ARTS = [
1482
1479
  OTHER_FILES_MISC = [
1483
1480
  DocCfg(id='022780', category=FLIGHT_LOGS),
1484
1481
  DocCfg(id='022816', category=FLIGHT_LOGS),
1482
+ DocCfg(id='032206', category=SKYPE_LOG, author=LAWRENCE_KRAUSS),
1483
+ DocCfg(id='032208', category=SKYPE_LOG, author=LAWRENCE_KRAUSS),
1484
+ DocCfg(id='032209', category=SKYPE_LOG, author=LAWRENCE_KRAUSS),
1485
+ DocCfg(id='018224', category=SKYPE_LOG, author=LAWRENCE_KRAUSS, description=f'conversations with linkspirit (French?) and {LAWRENCE_KRAUSS}'),
1486
+ DocCfg(id='032210', category=SKYPE_LOG, description=f'conversation with linkspirit'),
1485
1487
  DocCfg(
1486
1488
  id='025147',
1487
1489
  author=BROCKMAN_INC,
@@ -1496,7 +1498,6 @@ OTHER_FILES_MISC = [
1496
1498
  DocCfg(id='027074', author=FEMALE_HEALTH_COMPANY, description=f"pitch deck (USAID was a customer)"),
1497
1499
  DocCfg(id='032735', author=GORDON_GETTY, description=f"on Trump", date='2018-03-20'), # Dated based on concurrent emails from Getty
1498
1500
  DocCfg(id='025540', author=JEFFREY_EPSTEIN, description=f"rough draft of Epstein's side of the story?"),
1499
- DocCfg(id='018224', author=LAWRENCE_KRAUSS, description=f"Skype conversation log"),
1500
1501
  DocCfg(id='026634', author='Michael Carrier', description=f"comments about an Apollo linked hedge fund 'DE Fund VIII'"),
1501
1502
  DocCfg(id='031425', author=SCOTT_J_LINK, description=f'completely redacted email from'),
1502
1503
  DocCfg(id='020447', author='Working Group on Chinese Influence Activities in the U.S.', description=f'Promoting Constructive Vigilance'),
@@ -1589,8 +1590,8 @@ SENT_FROM_REGEX = re.compile(r'^(?:(Please forgive|Sorry for all the) typos.{1,4
1589
1590
 
1590
1591
 
1591
1592
  # Error checking.
1592
- if len(OTHER_FILES_CONFIG) != 438:
1593
- logger.warning(f"Only {len(OTHER_FILES_CONFIG)} configured other files!")
1593
+ if len(OTHER_FILES_CONFIG) != 442:
1594
+ logger.warning(f"Found {len(OTHER_FILES_CONFIG)} configured other files!")
1594
1595
 
1595
1596
  encountered_file_ids = set()
1596
1597
 
@@ -109,7 +109,9 @@ class DocCfg:
109
109
 
110
110
  def info_str(self) -> str | None:
111
111
  """String that summarizes what is known about this document."""
112
- if self.category == REPUTATION:
112
+ if self.category and not self.description:
113
+ return self.category
114
+ elif self.category == REPUTATION:
113
115
  return f"{REPUTATION_MGMT}: {self.description}"
114
116
  elif self.author and self.description:
115
117
  if self.category in [ACADEMIA, BOOK]:
epstein_files/util/env.py CHANGED
@@ -6,36 +6,41 @@ from sys import argv
6
6
 
7
7
  from epstein_files.util.logging import datefinder_logger, env_log_level, logger
8
8
 
9
- COUNT_WORDS_SCRIPT = 'count_words.py'
10
- DEFAULT_WIDTH = 154
11
- HTML_SCRIPTS = ['epstein_generate', 'generate_html.py', COUNT_WORDS_SCRIPT]
9
+ COUNT_WORDS_SCRIPT = 'epstein_word_count'
10
+ DEFAULT_WIDTH = 145
11
+ HTML_SCRIPTS = ['epstein_generate', COUNT_WORDS_SCRIPT]
12
12
 
13
13
 
14
14
  parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML page.")
15
- parser.add_argument('--build', '-b', action='store_true', help='write output to file')
16
- parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
17
- parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just interesting ones')
18
- parser.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
19
15
  parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
20
- parser.add_argument('--output-file', '-out', metavar='FILE', default='index.html', help='write output to FILE in docs/ (default=index.html)')
21
- parser.add_argument('--output-emails', '-oe', action='store_true', help='generate other files section')
22
- parser.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
23
- parser.add_argument('--output-texts', '-ot', action='store_true', help='generate other files section')
24
- parser.add_argument('--pickled', '-p', action='store_true', help='use pickled EpsteinFiles object')
25
- parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='generate new pickled EpsteinFiles object')
26
- parser.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (only used by scripts)')
27
- parser.add_argument('--sort-alphabetical', '-alpha', action='store_true', help='sort emailers alphabetically in counts table')
28
- parser.add_argument('--suppress-output', '-s', action='store_true', help='no output to terminal (use with --build)')
29
- parser.add_argument('--use-epstein-web-links', '-use', action='store_true', help='use epsteinweb.org links instead of epstein.media')
30
- parser.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use')
31
- parser.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (only used by search script)')
32
- parser.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
33
- parser.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
34
- parser.add_argument('--make-clean', '-mc', action='store_true', help='delete all build artifact HTML and JSON files')
35
- parser.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
36
- parser.add_argument('--json-metadata', '-jm', action='store_true', help='dump JSON metadata for all files')
37
- parser.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats at the end')
38
- parser.add_argument('positional_args', nargs='*', help='Optional args (only used by helper scripts)')
16
+ parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='overwrite cached EpsteinFiles')
17
+
18
+ output = parser.add_argument_group('OUTPUT')
19
+ output.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
20
+ output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
21
+ output.add_argument('--build', '-b', action='store_true', help='write output to HTML file')
22
+ output.add_argument('--make-clean', action='store_true', help='delete all HTML build artifacts and write latest URLs to .urls.env')
23
+ output.add_argument('--output-emails', '-oe', action='store_true', help='generate other files section')
24
+ output.add_argument('--output-json-files', action='store_true', help='pretty print all the raw JSON data files in the collection')
25
+ output.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
26
+ output.add_argument('--output-texts', '-ot', action='store_true', help='generate other files section')
27
+ output.add_argument('--suppress-output', action='store_true', help='no output to terminal (use with --build)')
28
+ output.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use (in characters)')
29
+ output.add_argument('--use-epstein-web-links', action='store_true', help='use epsteinweb.org links instead of epstein.media')
30
+
31
+ scripts = parser.add_argument_group('SCRIPTS', 'Arguments used only by epstein_search, epstein_show, epstein_diff')
32
+ scripts.add_argument('positional_args', nargs='*', help='strings to search for, file IDs to show or diff, etc.')
33
+ scripts.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (only used by scripts)')
34
+ scripts.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (only used by epstein_search)')
35
+
36
+ debug = parser.add_argument_group('DEBUG')
37
+ debug.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
38
+ debug.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
39
+ debug.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
40
+ debug.add_argument('--json-metadata', '-jm', action='store_true', help='dump JSON metadata for all files')
41
+ debug.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats at the end')
42
+ debug.add_argument('--sort-alphabetical', action='store_true', help='sort emailers alphabetically in counts table')
43
+ debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
39
44
  args = parser.parse_args()
40
45
 
41
46
  current_script = Path(argv[0]).name
@@ -45,7 +50,7 @@ is_html_script = current_script in HTML_SCRIPTS
45
50
  args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
46
51
  args.output_emails = args.output_emails or args.all_emails
47
52
  args.output_other_files = args.output_other_files or args.all_other_files
48
- args.pickled = args.pickled or is_env_var_set('PICKLED') or args.colors_only or len(args.names or []) > 0
53
+ args.overwrite_pickle = args.overwrite_pickle or (is_env_var_set('OVERWRITE_PICKLE') and not is_env_var_set('PICKLED'))
49
54
  args.width = args.width if is_html_script else None
50
55
  specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]
51
56
 
@@ -66,8 +71,8 @@ datefinder_logger.setLevel(logger.level)
66
71
 
67
72
  # Massage args that depend on other args to the appropriate state
68
73
  if not (args.json_metadata or args.output_texts or args.output_emails or args.output_other_files):
69
- if is_html_script and current_script != COUNT_WORDS_SCRIPT and not args.make_clean:
70
- logger.warning(f"No output section chosen; outputting default of texts, selected emails, and other files...")
74
+ if is_html_script and current_script != COUNT_WORDS_SCRIPT and not args.make_clean and not args.colors_only:
75
+ logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
71
76
 
72
77
  args.output_texts = True
73
78
  args.output_emails = True
@@ -77,4 +82,4 @@ if args.use_epstein_web_links:
77
82
  logger.warning(f"Using links to epsteinweb.org links instead of epsteinify.com...")
78
83
 
79
84
  if args.debug:
80
- logger.warning(f"Invocation args:\nis_html_script={is_html_script},\nspecified_names={specified_names},\nargs={args}")
85
+ logger.warning(f"Invocation args:\ncurrent_script={current_script}\nis_html_script={is_html_script},\nspecified_names={specified_names},\nargs={args}")
@@ -159,7 +159,7 @@ HIGHLIGHTED_NAMES = [
159
159
  pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
160
160
  emailers = {
161
161
  ALIREZA_ITTIHADIEH: 'CEO Freestream Aircraft Limited',
162
- BARBRO_C_EHNBOM: 'Swedish pharmaceuticals',
162
+ BARBRO_C_EHNBOM: 'Swedish pharmaceuticals, SALSS',
163
163
  FRED_HADDAD: "co-founder of Heck's in West Virginia",
164
164
  GERALD_BARTON: "Maryland property developer Landmark Land Company, fan of Trump's Irish golf course",
165
165
  GORDON_GETTY: 'heir of oil tycoon J. Paul Getty',
@@ -296,6 +296,7 @@ HIGHLIGHTED_NAMES = [
296
296
  emailers = {
297
297
  DAVID_STERN: f'emailed Epstein from Moscow, appears to know chairman of {DEUTSCHE_BANK}',
298
298
  JONATHAN_FARKAS: "heir to the Alexander's department store fortune",
299
+ 'linkspirit': "Skype username of someone Epstein communicated with",
299
300
  'Peter Thomas Roth': 'student of Epstein at Dalton, skincare company founder',
300
301
  STEPHEN_HANSON: None,
301
302
  TOM_BARRACK: 'long time friend of Trump',
@@ -304,7 +305,7 @@ HIGHLIGHTED_NAMES = [
304
305
  HighlightedNames(
305
306
  label='finance',
306
307
  style='green',
307
- pattern=r'Apollo|Ari\s*Glass|(Bernie\s*)?Madoff|Black(rock|stone)|BofA|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
308
+ pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
308
309
  emailers={
309
310
  AMANDA_ENS: 'Citigroup',
310
311
  DANIEL_SABBA: 'UBS Investment Bank',
@@ -587,7 +588,7 @@ HIGHLIGHTED_NAMES = [
587
588
  HighlightedText(
588
589
  label='phone_number',
589
590
  style='bright_green',
590
- pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|[\d+]{10,12}",
591
+ pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|\b[\d+]{10,12}\b",
591
592
  ),
592
593
  ]
593
594
 
@@ -1,13 +1,14 @@
1
+ import json
2
+
1
3
  from rich.padding import Padding
2
4
 
3
5
  from epstein_files.documents.email import Email
4
6
  from epstein_files.documents.messenger_log import MessengerLog
5
7
  from epstein_files.epstein_files import EpsteinFiles, count_by_month
6
- from epstein_files.util.constant.output_files import JSON_METADATA_PATH
7
- from epstein_files.util.constant import urls
8
+ from epstein_files.util.constant import output_files
8
9
  from epstein_files.util.constant.html import *
9
10
  from epstein_files.util.constant.names import *
10
- from epstein_files.util.constant.strings import EMAIL_CLASS, MESSENGER_LOG_CLASS
11
+ from epstein_files.util.constant.output_files import JSON_FILES_JSON_PATH, JSON_METADATA_PATH
11
12
  from epstein_files.util.data import dict_sets_to_lists
12
13
  from epstein_files.util.env import args, specified_names
13
14
  from epstein_files.util.logging import log_file_write, logger
@@ -108,6 +109,20 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
108
109
  return len(already_printed_emails)
109
110
 
110
111
 
112
+ def print_json_files(epstein_files: EpsteinFiles):
113
+ if args.build:
114
+ json_data = {json_file.url_slug: json_file.json_data() for json_file in epstein_files.json_files}
115
+
116
+ with open(JSON_FILES_JSON_PATH, 'w') as f:
117
+ f.write(json.dumps(json_data, sort_keys=True))
118
+ log_file_write(JSON_FILES_JSON_PATH)
119
+ else:
120
+ for json_file in epstein_files.json_files:
121
+ console.line(2)
122
+ console.print(json_file.description_panel())
123
+ console.print_json(json_file.json_str(), indent=4, sort_keys=False)
124
+
125
+
111
126
  def print_json_metadata(epstein_files: EpsteinFiles) -> None:
112
127
  json_str = epstein_files.json_metadata()
113
128
 
@@ -122,9 +137,9 @@ def print_json_metadata(epstein_files: EpsteinFiles) -> None:
122
137
  def print_json_stats(epstein_files: EpsteinFiles) -> None:
123
138
  console.line(5)
124
139
  console.print(Panel('JSON Stats Dump', expand=True, style='reverse bold'), '\n')
125
- print_json(f"{MESSENGER_LOG_CLASS} Sender Counts", MessengerLog.count_authors(epstein_files.imessage_logs), skip_falsey=True)
126
- print_json(f"{EMAIL_CLASS} Author Counts", epstein_files.email_author_counts, skip_falsey=True)
127
- print_json(f"{EMAIL_CLASS} Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
140
+ print_json(f"MessengerLog Sender Counts", MessengerLog.count_authors(epstein_files.imessage_logs), skip_falsey=True)
141
+ print_json(f"Email Author Counts", epstein_files.email_author_counts, skip_falsey=True)
142
+ print_json(f"Email Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
128
143
  print_json("Email signature_substitution_countss", epstein_files.email_signature_substitution_counts(), skip_falsey=True)
129
144
  print_json("email_author_device_signatures", dict_sets_to_lists(epstein_files.email_authors_to_device_signatures))
130
145
  print_json("email_sent_from_devices", dict_sets_to_lists(epstein_files.email_device_signatures_to_authors))
@@ -147,16 +162,12 @@ def print_text_messages(epstein_files: EpsteinFiles) -> None:
147
162
 
148
163
  def write_urls() -> None:
149
164
  """Write _URL style constant variables to a file bash scripts can load as env vars."""
150
- if args.output_file == 'index.html':
151
- logger.warning(f"Can't write env vars to '{args.output_file}', writing to '{URLS_ENV}' instead.\n")
152
- args.output_file = URLS_ENV
165
+ url_vars = {k: v for k, v in vars(output_files).items() if k.endswith('URL') and not k.startswith('GH')}
153
166
 
154
- url_vars = {
155
- k: v for k, v in vars(urls).items()
156
- if isinstance(v, str) and k.split('_')[-1] in ['URL'] and 'github.io' in v and 'BASE' not in k
157
- }
167
+ if not args.suppress_output:
168
+ console.line()
158
169
 
159
- with open(args.output_file, 'w') as f:
170
+ with open(URLS_ENV, 'w') as f:
160
171
  for var_name, url in url_vars.items():
161
172
  key_value = f"{var_name}='{url}'"
162
173
 
@@ -165,8 +176,10 @@ def write_urls() -> None:
165
176
 
166
177
  f.write(f"{key_value}\n")
167
178
 
168
- console.line()
169
- logger.warning(f"Wrote {len(url_vars)} URL variables to '{args.output_file}'\n")
179
+ if not args.suppress_output:
180
+ console.line()
181
+
182
+ logger.warning(f"Wrote {len(url_vars)} URL variables to '{URLS_ENV}'\n")
170
183
 
171
184
 
172
185
  def _verify_all_emails_were_printed(epstein_files: EpsteinFiles, already_printed_emails: list[Email]) -> None:
@@ -14,7 +14,8 @@ from rich.theme import Theme
14
14
 
15
15
  from epstein_files.util.constant.html import CONSOLE_HTML_FORMAT, HTML_TERMINAL_THEME, PAGE_TITLE
16
16
  from epstein_files.util.constant.names import UNKNOWN
17
- from epstein_files.util.constant.strings import DEFAULT, EMAIL, NA, OTHER_SITE_LINK_STYLE, QUESTION_MARKS, SiteType
17
+ from epstein_files.util.constant.output_files import SITE_URLS
18
+ from epstein_files.util.constant.strings import DEFAULT, EMAIL, NA, QUESTION_MARKS, TEXT_MESSAGE, SiteType
18
19
  from epstein_files.util.constant.urls import *
19
20
  from epstein_files.util.constants import FALLBACK_TIMESTAMP, HEADER_ABBREVIATIONS
20
21
  from epstein_files.util.data import json_safe
@@ -31,11 +32,22 @@ GREY_NUMBERS = [58, 39, 39, 35, 30, 27, 23, 23, 19, 19, 15, 15, 15]
31
32
  DEFAULT_NAME_STYLE = 'gray46'
32
33
  KEY_STYLE='honeydew2 bold'
33
34
  SECTION_HEADER_STYLE = 'bold white on blue3'
34
- SOCIAL_MEDIA_LINK_STYLE = 'cyan3 bold'
35
+ SOCIAL_MEDIA_LINK_STYLE = 'pale_turquoise4'
35
36
  SUBSTACK_POST_LINK_STYLE = 'bright_cyan'
36
37
  SYMBOL_STYLE = 'grey70'
38
+ TABLE_BORDER_STYLE = 'grey46'
39
+ TABLE_TITLE_STYLE = f"gray85 italic"
37
40
  TITLE_STYLE = 'black on bright_white bold'
38
41
 
42
+ AUX_SITE_LINK_STYLE = 'dark_orange3'
43
+ OTHER_SITE_LINK_STYLE = 'dark_goldenrod'
44
+
45
+ DEFAULT_TABLE_KWARGS = {
46
+ 'border_style': TABLE_BORDER_STYLE,
47
+ 'header_style': "bold",
48
+ 'title_style': TABLE_TITLE_STYLE,
49
+ }
50
+
39
51
  HIGHLIGHTED_GROUP_COLOR_KEYS = [
40
52
  Text(highlight_group.label.replace('_', ' '), style=highlight_group.style)
41
53
  for highlight_group in sorted(HIGHLIGHTED_NAMES, key=lambda hg: hg.label)
@@ -79,7 +91,11 @@ def build_highlighter(pattern: str) -> EpsteinHighlighter:
79
91
  return TempHighlighter()
80
92
 
81
93
 
82
- def join_texts(txts: list[Text], join: str = ' ', encloser: str = '') -> Text:
94
+ def build_table(title: str | None, **kwargs) -> Table:
95
+ return Table(title=title, **{**DEFAULT_TABLE_KWARGS, **kwargs})
96
+
97
+
98
+ def join_texts(txts: list[Text], join: str = ' ', encloser: str = '', encloser_style: str = 'wheat4') -> Text:
83
99
  """Join rich.Text objs into one."""
84
100
  if encloser:
85
101
  if len(encloser) != 2:
@@ -91,8 +107,9 @@ def join_texts(txts: list[Text], join: str = ' ', encloser: str = '') -> Text:
91
107
 
92
108
  txt = Text('')
93
109
 
94
- for i, link in enumerate(txts):
95
- txt.append(join if i >= 1 else '').append(enclose_start).append(link).append(enclose_end)
110
+ for i, _txt in enumerate(txts):
111
+ txt.append(join if i >= 1 else '').append(enclose_start, style=encloser_style)
112
+ txt.append(_txt).append(enclose_end, style=encloser_style)
96
113
 
97
114
  return txt
98
115
 
@@ -132,7 +149,7 @@ def print_centered_link(url: str, link_text: str, style: str | None = None) -> N
132
149
 
133
150
 
134
151
  def print_color_key() -> None:
135
- color_table = Table(title=f'Rough Guide to Highlighted Colors', show_header=False)
152
+ color_table = build_table('Rough Guide to Highlighted Colors', show_header=False)
136
153
  num_colors = len(HIGHLIGHTED_GROUP_COLOR_KEYS)
137
154
  row_number = 0
138
155
 
@@ -164,7 +181,7 @@ def print_header(epstein_files: 'EpsteinFiles') -> None:
164
181
  print_centered(f"if you think there's an attribution error or can deanonymize an {UNKNOWN} contact {CRYPTADAMUS_TWITTER}", 'grey46')
165
182
  print_centered('note this site is based on the OCR text provided by Congress which is not always the greatest', 'grey23')
166
183
  print_centered(f"(thanks to {link_markup('https://x.com/ImDrinknWyn', '@ImDrinknWyn', 'dodger_blue3')} + others for help attributing redacted emails)")
167
- print_centered_link(ATTRIBUTIONS_URL, "(some explanations of author attributions)", style='magenta')
184
+ print_centered_link(JSON_METADATA_URL, "(explanations of author attributions)", style='magenta')
168
185
 
169
186
 
170
187
  def print_json(label: str, obj: object, skip_falsey: bool = False) -> None:
@@ -231,24 +248,28 @@ def print_other_site_link(is_header: bool = True) -> None:
231
248
  other_site_msg += f" Epstein's {other_site_type}s also generated by this code"
232
249
  markup_msg = link_markup(SITE_URLS[other_site_type], other_site_msg, OTHER_SITE_LINK_STYLE)
233
250
  print_centered(parenthesize(Text.from_markup(markup_msg)), style='bold')
234
- word_count_link = link_text_obj(WORD_COUNT_URL, 'site showing the most frequently used words in these communiques', OTHER_SITE_LINK_STYLE)
235
- print_centered(parenthesize(word_count_link))
236
- metadata_link = link_text_obj(JSON_METADATA_URL, 'metadata with author attribution explanations', OTHER_SITE_LINK_STYLE)
237
- print_centered(parenthesize(metadata_link))
251
+
252
+ if is_header:
253
+ word_count_link = link_text_obj(WORD_COUNT_URL, 'most frequently used words in the emails and texts', AUX_SITE_LINK_STYLE)
254
+ print_centered(parenthesize(word_count_link))
255
+ metadata_link = link_text_obj(JSON_METADATA_URL, 'author attribution explanations', AUX_SITE_LINK_STYLE)
256
+ print_centered(parenthesize(metadata_link))
257
+ json_link = link_text_obj(WORD_COUNT_URL, "epstein's json files", AUX_SITE_LINK_STYLE)
258
+ print_centered(parenthesize(json_link))
238
259
 
239
260
 
240
261
  def print_page_title(expand: bool = True, width: int | None = None) -> None:
241
262
  title_panel = Panel(Text(PAGE_TITLE, justify='center'), expand=expand, style=TITLE_STYLE, width=width)
242
263
  console.print(Align.center(vertically_pad(title_panel)))
243
- print_social_media_links()
264
+ _print_social_media_links()
244
265
  console.line(2)
245
266
 
246
267
 
247
268
  def print_panel(msg: str, style: str = 'black on white', padding: tuple | None = None, centered: bool = False) -> None:
248
269
  _padding: list[int] = list(padding or [0, 0, 0, 0])
249
270
  _padding[2] += 1 # Bottom pad
250
- panel = Panel(Text.from_markup(msg, justify='center'), width=70, style=style)
251
271
  actual_padding: tuple[int, int, int, int] = tuple(_padding)
272
+ panel = Panel(Text.from_markup(msg, justify='center'), width=70, style=style)
252
273
 
253
274
  if centered:
254
275
  console.print(Align.center(Padding(panel, actual_padding)))
@@ -262,19 +283,6 @@ def print_section_header(msg: str, style: str = SECTION_HEADER_STYLE, is_centere
262
283
  console.print(Padding(panel, (3, 0, 1, 0)))
263
284
 
264
285
 
265
- def print_social_media_links() -> None:
266
- print_centered_link(SUBSTACK_URL, "I Made Epstein's Text Messages Great Again (And You Should Read Them)", style=f'{SUBSTACK_POST_LINK_STYLE} bold')
267
- print_centered_link(SUBSTACK_URL, SUBSTACK_URL.removeprefix('https://'), style=f'{SUBSTACK_POST_LINK_STYLE} dim')
268
-
269
- social_links = [
270
- link_text_obj('https://x.com/Cryptadamist/status/1990866804630036988', '@cryptadamist', style=SOCIAL_MEDIA_LINK_STYLE),
271
- link_text_obj('https://cryptadamus.substack.com/', 'substack', style=SOCIAL_MEDIA_LINK_STYLE),
272
- link_text_obj('https://universeodon.com/@cryptadamist/115572634993386057', 'mastodon', style=SOCIAL_MEDIA_LINK_STYLE),
273
- ]
274
-
275
- print_centered(join_texts(social_links, join=' ', encloser='[]'))
276
-
277
-
278
286
  def print_starred_header(msg: str, num_stars: int = 7, num_spaces: int = 2, style: str = TITLE_STYLE) -> None:
279
287
  stars = '*' * num_stars
280
288
  spaces = ' ' * num_spaces
@@ -314,7 +322,7 @@ def write_html(output_path: Path) -> None:
314
322
 
315
323
 
316
324
  def _print_abbreviations_table() -> None:
317
- table = Table(title="Abbreviations Used Frequently In These Conversations", header_style="bold", show_header=False)
325
+ table = build_table(title="Abbreviations Used Frequently In These Conversations", show_header=False)
318
326
  table.add_column("Abbreviation", justify="center", style='bold')
319
327
  table.add_column("Translation", style="white", justify="center")
320
328
 
@@ -326,7 +334,7 @@ def _print_abbreviations_table() -> None:
326
334
 
327
335
  def _print_external_links() -> None:
328
336
  console.line()
329
- print_starred_header('External Links', num_stars=0, num_spaces=20, style=f"italic")
337
+ print_centered(Text('External Links', style=TABLE_TITLE_STYLE))
330
338
  presser_link = link_text_obj(OVERSIGHT_REPUBLICANS_PRESSER_URL, 'Official Oversight Committee Press Release')
331
339
  raw_docs_link = join_texts([link_text_obj(RAW_OVERSIGHT_DOCS_GOOGLE_DRIVE_URL, 'raw files', style=f"{ARCHIVE_LINK_COLOR} dim")], encloser='()')
332
340
  print_centered(join_texts([presser_link, raw_docs_link]))
@@ -335,6 +343,26 @@ def _print_external_links() -> None:
335
343
  print_centered(link_markup(COURIER_NEWSROOM_ARCHIVE_URL, 'Searchable Archive') + " (Courier Newsroom)")
336
344
  print_centered(link_markup(EPSTEINIFY_URL) + " (raw document images)")
337
345
  print_centered(link_markup(EPSTEIN_WEB_URL) + " (character summaries)")
346
+ print_centered(link_markup(EPSTEIN_MEDIA_URL) + " (raw document images)")
347
+
348
+
349
+ def _print_social_media_links() -> None:
350
+ print_centered_link(
351
+ SUBSTACK_URL,
352
+ "I Made Epstein's Text Messages Great Again (And You Should Read Them)",
353
+ style=f'{SUBSTACK_POST_LINK_STYLE} bold'
354
+ )
355
+
356
+ print_centered_link(SUBSTACK_URL, SUBSTACK_URL.removeprefix('https://'), style=f'{SUBSTACK_POST_LINK_STYLE} dim')
357
+
358
+ social_links = [
359
+ link_text_obj('https://universeodon.com/@cryptadamist/115572634993386057', '@mastodon', style=SOCIAL_MEDIA_LINK_STYLE),
360
+ link_text_obj(SUBSTACK_URL, '@substack', style=SOCIAL_MEDIA_LINK_STYLE),
361
+ link_text_obj('https://x.com/Cryptadamist/status/1990866804630036988', '@twitter', style=SOCIAL_MEDIA_LINK_STYLE),
362
+ link_text_obj('https://github.com/michelcrypt4d4mus/epstein_text_messages', '@github', style=SOCIAL_MEDIA_LINK_STYLE)
363
+ ]
364
+
365
+ print_centered(join_texts(social_links, join=' / '))#, encloser='()'))#, encloser='‹›'))
338
366
 
339
367
 
340
368
  # if args.deep_debug:
@@ -20,6 +20,14 @@ from epstein_files.util.search_result import SearchResult
20
20
  FIRST_AND_LAST_NAMES = flatten([n.split() for n in ALL_NAMES])
21
21
  FIRST_AND_LAST_NAMES = [n.lower() for n in FIRST_AND_LAST_NAMES] + OTHER_NAMES
22
22
 
23
+ HTML_REGEX = re.compile(r"com/|cae-v2w=|content-(transfe|type)|font(/|-(family|size))|http|\.html?\??|margin-bottom|padding-left|quoted-printable|region=|text-decoration|ttps|www|\.(gif|jpe?g|png);?$")
24
+ HYPHENATED_WORD_REGEX = re.compile(r"[a-z]+-[a-z]+", re.IGNORECASE)
25
+ OK_SYMBOL_WORDS = ['mar-a-lago', 'p/e', 's&p', ':)', ':).', ';)', ':-)', ';-)']
26
+ ONLY_SYMBOLS_REGEX = re.compile(r"^[^a-zA-Z0-9]+$")
27
+ SYMBOL_WORD_REGEX = re.compile(r"^[-—–@%/?.,&=]+$")
28
+ SPLIT_WORDS_BY = ['@', '/']
29
+ FLAGGED_WORDS = [] # For debugging, log extra info when one of these is encountered
30
+
23
31
  NON_SINGULARIZABLE = UNSINGULARIZABLE_WORDS + [n for n in FIRST_AND_LAST_NAMES if n.endswith('s')]
24
32
  SKIP_WORDS_REGEX = re.compile(r"^(asmallworld@|enwiki|http|imagepng|nymagcomnymetro|addresswww|mailto|www|/font|colordu|classdms|targetdblank|nymagcom|palmbeachdailynews)|jee[vy]acation|fontfamily|(gif|html?|jpe?g|utm)$")
25
33
  BAD_CHARS_REGEX = re.compile(r"[-–=+()$€£©°«—^&%!#_`,.;:'‘’\"„“”?\d\\]")
@@ -100,21 +108,13 @@ SINGULARIZATIONS = {
100
108
  'twittercom': 'twitter',
101
109
  }
102
110
 
103
- HTML_REGEX = re.compile(r"com/|cae-v2w=|content-(transfe|type)|font(/|-(family|size))|http|\.html?\??|margin-bottom|padding-left|quoted-printable|region=|text-decoration|ttps|www|\.(gif|jpe?g|png);?$")
104
- HYPHENATED_WORD_REGEX = re.compile(r"[a-z]+-[a-z]+", re.IGNORECASE)
105
- OK_SYMBOL_WORDS = ['mar-a-lago', 'p/e', 's&p', ':)', ':).', ';)', ':-)', ';-)']
106
- SYMBOL_WORD_REGEX = re.compile(r"^[-—–@%/?.,&=]+$")
107
- ONLY_SYMBOLS_REGEX = re.compile(r"^[^a-zA-Z0-9]+$")
108
- SPLIT_WORDS_BY = ['@', '/']
109
- FLAGGED_WORDS = [] # For debugging, log extra info when one of these is encountered
110
-
111
111
 
112
112
  @dataclass
113
113
  class WordCount:
114
114
  count: dict[str, int] = field(default_factory=lambda: defaultdict(int))
115
115
  singularized: dict[str, int] = field(default_factory=lambda: defaultdict(int))
116
116
 
117
- def count_word(self, word: str, document_line: SearchResult) -> None:
117
+ def tally_word(self, word: str, document_line: SearchResult) -> None:
118
118
  word = EmailHeader.cleanup_str(word).lower().strip()
119
119
  raw_word = word
120
120
 
@@ -148,7 +148,7 @@ class WordCount:
148
148
  continue
149
149
 
150
150
  for w in word.split(symbol):
151
- self.count_word(w, document_line)
151
+ self.tally_word(w, document_line)
152
152
 
153
153
  logger.info(f" Split word with '{symbol}' in it '{word}'...")
154
154
  return