epstein-files 1.0.4__py3-none-any.whl → 1.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +24 -25
- epstein_files/count_words.py +72 -0
- epstein_files/documents/document.py +1 -2
- epstein_files/documents/email.py +15 -10
- epstein_files/documents/json_file.py +4 -4
- epstein_files/documents/messenger_log.py +2 -1
- epstein_files/documents/other_file.py +2 -2
- epstein_files/epstein_files.py +40 -40
- epstein_files/util/constant/output_files.py +20 -4
- epstein_files/util/constant/strings.py +8 -8
- epstein_files/util/constant/urls.py +6 -21
- epstein_files/util/constants.py +19 -18
- epstein_files/util/doc_cfg.py +3 -1
- epstein_files/util/env.py +35 -30
- epstein_files/util/highlighted_group.py +4 -3
- epstein_files/util/output.py +29 -16
- epstein_files/util/rich.py +56 -28
- epstein_files/util/word_count.py +10 -10
- {epstein_files-1.0.4.dist-info → epstein_files-1.0.6.dist-info}/METADATA +37 -18
- epstein_files-1.0.6.dist-info/RECORD +34 -0
- {epstein_files-1.0.4.dist-info → epstein_files-1.0.6.dist-info}/entry_points.txt +1 -1
- epstein_files-1.0.4.dist-info/RECORD +0 -33
- {epstein_files-1.0.4.dist-info → epstein_files-1.0.6.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.4.dist-info → epstein_files-1.0.6.dist-info}/WHEEL +0 -0
epstein_files/util/constants.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from copy import deepcopy
|
|
3
|
+
from typing import cast
|
|
3
4
|
|
|
4
5
|
from dateutil.parser import parse
|
|
5
6
|
|
|
@@ -84,7 +85,7 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
|
|
|
84
85
|
JAMES_HILL: re.compile(r"hill, james e.|james.e.hill@abc.com", re.IGNORECASE),
|
|
85
86
|
JEAN_LUC_BRUNEL: re.compile(r'Jean[- ]Luc Brunel?', re.IGNORECASE),
|
|
86
87
|
JEFF_FULLER: re.compile(r"jeff@mc2mm.com|Jeff Fuller", re.IGNORECASE),
|
|
87
|
-
JEFFREY_EPSTEIN: re.compile(r'[djl]ee[vy]acation[©@]?g?(mail.com)?|Epstine|\bJEE?\b|Jeffrey E((sp|ps)tein?)?|jeeproject@yahoo.com|J Jep|Jeffery Edwards|(?<!Mark L. )Epstein', re.IGNORECASE),
|
|
88
|
+
JEFFREY_EPSTEIN: re.compile(r'[djl]\s?ee[vy]acation[©@]?g?(mail.com)?|Epstine|\bJEE?\b|Jeffrey E((sp|ps)tein?)?|jeeproject@yahoo.com|J Jep|Jeffery Edwards|(?<!Mark L. )Epstein', re.IGNORECASE),
|
|
88
89
|
JESSICA_CADWELL: re.compile(r'Jessica Cadwell?', re.IGNORECASE),
|
|
89
90
|
JOHNNY_EL_HACHEM: re.compile(r'el hachem johnny|johnny el hachem', re.IGNORECASE),
|
|
90
91
|
JOI_ITO: re.compile(r'ji@media.mit.?edu|(joichi|joi)( Ito)?', re.IGNORECASE),
|
|
@@ -94,7 +95,7 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
|
|
|
94
95
|
LANDON_THOMAS: re.compile(r'lando[nr] thomas( jr)?|thomas jr.?, lando[nr]', re.IGNORECASE),
|
|
95
96
|
LARRY_SUMMERS: re.compile(r'(La(wrence|rry).{1,5})?Summers?|^LH$|LHS|Ihsofficel', re.IGNORECASE),
|
|
96
97
|
LAWRANCE_VISOSKI: re.compile(r'La(rry|wrance) Visoski?|Lvjet', re.IGNORECASE),
|
|
97
|
-
LAWRENCE_KRAUSS: re.compile(r'Lawrence Kraus|
|
|
98
|
+
LAWRENCE_KRAUSS: re.compile(r'Lawrence Kraus|[jl]awkrauss', re.IGNORECASE),
|
|
98
99
|
LEON_BLACK: re.compile(r'Leon Black?', re.IGNORECASE),
|
|
99
100
|
MANUELA_MARTINEZ: re.compile(fr'Manuela (- Mega Partners|Martinez)', re.IGNORECASE),
|
|
100
101
|
MARIANA_IDZKOWSKA: re.compile(r'Mariana [Il]d[źi]kowska?', re.IGNORECASE),
|
|
@@ -268,7 +269,7 @@ SHIMON_POST = 'The Shimon Post'
|
|
|
268
269
|
SHIMON_POST_ARTICLE = f'selection of articles about the mideast'
|
|
269
270
|
SINGLE_PAGE = 'single page of'
|
|
270
271
|
STRANGE_BEDFELLOWS = "'Strange Bedfellows' list of invitees f. Johnny Depp, Woody Allen, Obama, and more"
|
|
271
|
-
SWEDISH_LIFE_SCIENCES_SUMMIT = f"{BARBRO_C_EHNBOM}'s Swedish American Life Science Summit"
|
|
272
|
+
SWEDISH_LIFE_SCIENCES_SUMMIT = f"{BARBRO_C_EHNBOM}'s Swedish American Life Science Summit (SALSS)"
|
|
272
273
|
THE_REAL_DEAL_ARTICLE = 'article by Keith Larsen'
|
|
273
274
|
TRUMP_DISCLOSURES = f"Donald Trump financial disclosures from U.S. Office of Government Ethics"
|
|
274
275
|
UBS_CIO_REPORT = 'CIO Monthly Extended report'
|
|
@@ -371,8 +372,8 @@ TEXTS_CONFIG = CONFIRMED_TEXTS_CONFIG + UNCONFIRMED_TEXTS_CONFIG
|
|
|
371
372
|
########################################################################################################
|
|
372
373
|
|
|
373
374
|
# Some emails have a lot of uninteresting CCs
|
|
374
|
-
|
|
375
|
-
FLIGHT_IN_2012_PEOPLE
|
|
375
|
+
IRAN_DEAL_RECIPIENTS = ['Allen West', 'Rafael Bardaji', 'Philip Kafka', 'Herb Goodman', 'Grant Seeger', 'Lisa Albert', 'Janet Kafka', 'James Ramsey', 'ACT for America', 'John Zouzelka', 'Joel Dunn', 'Nate McClain', 'Bennet Greenwald', 'Taal Safdie', 'Uri Fouzailov', 'Neil Anderson', 'Nate White', 'Rita Hortenstine', 'Henry Hortenstine', 'Gary Gross', 'Forrest Miller', 'Bennett Schmidt', 'Val Sherman', 'Marcie Brown', 'Michael Horowitz', 'Marshall Funk']
|
|
376
|
+
FLIGHT_IN_2012_PEOPLE = ['Francis Derby', 'Januiz Banasiak', 'Louella Rabuyo', 'Richard Barnnet']
|
|
376
377
|
|
|
377
378
|
EMAILS_CONFIG = [
|
|
378
379
|
EmailCfg(id='032436', author=ALIREZA_ITTIHADIEH, attribution_reason='Signature'),
|
|
@@ -491,9 +492,6 @@ EMAILS_CONFIG = [
|
|
|
491
492
|
EmailCfg(id='032727', author=KATHRYN_RUEMMLER, attribution_reason=KATHY_REASON, is_attribution_uncertain=True),
|
|
492
493
|
EmailCfg(id='030478', author=LANDON_THOMAS),
|
|
493
494
|
EmailCfg(id='029013', author=LARRY_SUMMERS, recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
|
|
494
|
-
EmailCfg(id='032206', author=LAWRENCE_KRAUSS), # More of a text convo?
|
|
495
|
-
EmailCfg(id='032208', author=LAWRENCE_KRAUSS, recipients=[JEFFREY_EPSTEIN]), # More of a text convo?
|
|
496
|
-
EmailCfg(id='032209', author=LAWRENCE_KRAUSS, recipients=[JEFFREY_EPSTEIN]), # More of a text convo?
|
|
497
495
|
EmailCfg(id='029196', author=LAWRENCE_KRAUSS, recipients=[JEFFREY_EPSTEIN], actual_text='Talk in 40?'),
|
|
498
496
|
EmailCfg(id='033593', author=LAWRANCE_VISOSKI, attribution_reason='Signature'),
|
|
499
497
|
EmailCfg(id='033370', author=LAWRANCE_VISOSKI, attribution_reason=LARRY_REASON),
|
|
@@ -575,7 +573,7 @@ EMAILS_CONFIG = [
|
|
|
575
573
|
attribution_reason='ends with "Respectfully, terry"',
|
|
576
574
|
author=TERRY_KAFKA,
|
|
577
575
|
fwded_text_after='From: Mike Cohen',
|
|
578
|
-
recipients=[JEFFREY_EPSTEIN, MARK_EPSTEIN, MICHAEL_BUCHHOLTZ] +
|
|
576
|
+
recipients=[JEFFREY_EPSTEIN, MARK_EPSTEIN, MICHAEL_BUCHHOLTZ] + IRAN_DEAL_RECIPIENTS,
|
|
579
577
|
duplicate_ids=['028482'],
|
|
580
578
|
),
|
|
581
579
|
EmailCfg(id='029992', author=TERRY_KAFKA, attribution_reason='Quoted reply'),
|
|
@@ -600,7 +598,6 @@ EMAILS_CONFIG = [
|
|
|
600
598
|
EmailCfg(id='022202', recipients=[JEAN_LUC_BRUNEL], attribution_reason='Follow up / reply', duplicate_ids=['029975']),
|
|
601
599
|
EmailCfg(id='022187', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
|
|
602
600
|
EmailCfg(id='031489', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (unfixable)
|
|
603
|
-
EmailCfg(id='032210', recipients=[JEFFREY_EPSTEIN]), # More of a text convo?
|
|
604
601
|
EmailCfg(id='030347', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
|
|
605
602
|
EmailCfg(id='030367', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
|
|
606
603
|
EmailCfg(id='033274', recipients=[JEFFREY_EPSTEIN]), # this is a note sent to self
|
|
@@ -751,7 +748,7 @@ EMAILS_CONFIG = [
|
|
|
751
748
|
EmailCfg(id='031118', duplicate_ids=['019465']),
|
|
752
749
|
EmailCfg(id='031912', duplicate_ids=['032158']),
|
|
753
750
|
EmailCfg(id='030587', duplicate_ids=['030514']),
|
|
754
|
-
EmailCfg(id='029773', duplicate_ids=['012685']),
|
|
751
|
+
EmailCfg(id='029773', duplicate_ids=['012685'], fwded_text_after='Omar Quadhafi'),
|
|
755
752
|
EmailCfg(id='033297', duplicate_ids=['033586']),
|
|
756
753
|
EmailCfg(id='031089', duplicate_ids=['018084']),
|
|
757
754
|
EmailCfg(id='031088', duplicate_ids=['030885']),
|
|
@@ -1195,7 +1192,7 @@ OTHER_FILES_CONFERENCES = [
|
|
|
1195
1192
|
DocCfg(id='019300', author=SVETLANA_POZHIDAEVA, description=f'{WOMEN_EMPOWERMENT} f. {KATHRYN_RUEMMLER}', date='2019-04-05'),
|
|
1196
1193
|
DocCfg(id='022267', author=SVETLANA_POZHIDAEVA, description=f'{WOMEN_EMPOWERMENT} founder essay about growing the seminar business'),
|
|
1197
1194
|
DocCfg(id='022407', author=SVETLANA_POZHIDAEVA, description=f'{WOMEN_EMPOWERMENT} seminar pitch deck'),
|
|
1198
|
-
DocCfg(id='017524', author=SWEDISH_LIFE_SCIENCES_SUMMIT, description=f"2012 program"),
|
|
1195
|
+
DocCfg(id='017524', author=SWEDISH_LIFE_SCIENCES_SUMMIT, description=f"2012 program emailed to epstein BY {BARBRO_C_EHNBOM} in 031226", date='2012-08-18'),
|
|
1199
1196
|
DocCfg(id='026747', author=SWEDISH_LIFE_SCIENCES_SUMMIT, description=f"2017 program", date='2017-08-23'),
|
|
1200
1197
|
DocCfg(id='014951', author='TED Talks', description=f"2017 program", date='2017-04-20'),
|
|
1201
1198
|
DocCfg(id='024179', author=UN_GENERAL_ASSEMBLY, description=f'president and first lady schedule', date='2012-09-21'),
|
|
@@ -1326,7 +1323,7 @@ OTHER_FILES_LETTERS = [
|
|
|
1326
1323
|
]
|
|
1327
1324
|
|
|
1328
1325
|
OTHER_FILES_PROPERTY = [
|
|
1329
|
-
DocCfg(id='026759', author='Great Bay Condominium Owners Association', description=f'{PRESS_RELEASE}
|
|
1326
|
+
DocCfg(id='026759', author='Great Bay Condominium Owners Association', description=f'{PRESS_RELEASE} about Hurricane Irma damage', date='2017-09-13'),
|
|
1330
1327
|
DocCfg(id='016602', author=PALM_BEACH_CODE_ENFORCEMENT, description='board minutes', date='2008-04-17'),
|
|
1331
1328
|
DocCfg(id='016554', author=PALM_BEACH_CODE_ENFORCEMENT, description='board minutes', date='2008-07-17', duplicate_ids=['016616', '016574']),
|
|
1332
1329
|
DocCfg(id='027068', author=THE_REAL_DEAL, description=f"{THE_REAL_DEAL_ARTICLE} Palm House Hotel Bankruptcy and EB-5 Visa Fraud Allegations"),
|
|
@@ -1379,8 +1376,8 @@ OTHER_FILES_SOCIAL = [
|
|
|
1379
1376
|
]
|
|
1380
1377
|
|
|
1381
1378
|
OTHER_FILES_POLITICS = [
|
|
1382
|
-
DocCfg(id='029918', author=DIANA_DEGETTE_CAMPAIGN, description=f"bio", date='2012-
|
|
1383
|
-
DocCfg(id='031184', author=DIANA_DEGETTE_CAMPAIGN, description=f"fundraiser
|
|
1379
|
+
DocCfg(id='029918', author=DIANA_DEGETTE_CAMPAIGN, description=f"bio", date='2012-09-27'),
|
|
1380
|
+
DocCfg(id='031184', author=DIANA_DEGETTE_CAMPAIGN, description=f"invitation to fundraiser hosted by {BARBRO_C_EHNBOM}", date='2012-09-27'),
|
|
1384
1381
|
DocCfg(id='026827', author='Scowcroft Group', description=f'report on ISIS', date='2015-11-14'),
|
|
1385
1382
|
DocCfg(id='024294', author=STACEY_PLASKETT, description=f"campaign flier", date='2016-10-01'),
|
|
1386
1383
|
DocCfg(
|
|
@@ -1482,6 +1479,11 @@ OTHER_FILES_ARTS = [
|
|
|
1482
1479
|
OTHER_FILES_MISC = [
|
|
1483
1480
|
DocCfg(id='022780', category=FLIGHT_LOGS),
|
|
1484
1481
|
DocCfg(id='022816', category=FLIGHT_LOGS),
|
|
1482
|
+
DocCfg(id='032206', category=SKYPE_LOG, author=LAWRENCE_KRAUSS),
|
|
1483
|
+
DocCfg(id='032208', category=SKYPE_LOG, author=LAWRENCE_KRAUSS),
|
|
1484
|
+
DocCfg(id='032209', category=SKYPE_LOG, author=LAWRENCE_KRAUSS),
|
|
1485
|
+
DocCfg(id='018224', category=SKYPE_LOG, author=LAWRENCE_KRAUSS, description=f'conversations with linkspirit (French?) and {LAWRENCE_KRAUSS}'),
|
|
1486
|
+
DocCfg(id='032210', category=SKYPE_LOG, description=f'conversation with linkspirit'),
|
|
1485
1487
|
DocCfg(
|
|
1486
1488
|
id='025147',
|
|
1487
1489
|
author=BROCKMAN_INC,
|
|
@@ -1496,7 +1498,6 @@ OTHER_FILES_MISC = [
|
|
|
1496
1498
|
DocCfg(id='027074', author=FEMALE_HEALTH_COMPANY, description=f"pitch deck (USAID was a customer)"),
|
|
1497
1499
|
DocCfg(id='032735', author=GORDON_GETTY, description=f"on Trump", date='2018-03-20'), # Dated based on concurrent emails from Getty
|
|
1498
1500
|
DocCfg(id='025540', author=JEFFREY_EPSTEIN, description=f"rough draft of Epstein's side of the story?"),
|
|
1499
|
-
DocCfg(id='018224', author=LAWRENCE_KRAUSS, description=f"Skype conversation log"),
|
|
1500
1501
|
DocCfg(id='026634', author='Michael Carrier', description=f"comments about an Apollo linked hedge fund 'DE Fund VIII'"),
|
|
1501
1502
|
DocCfg(id='031425', author=SCOTT_J_LINK, description=f'completely redacted email from'),
|
|
1502
1503
|
DocCfg(id='020447', author='Working Group on Chinese Influence Activities in the U.S.', description=f'Promoting Constructive Vigilance'),
|
|
@@ -1589,8 +1590,8 @@ SENT_FROM_REGEX = re.compile(r'^(?:(Please forgive|Sorry for all the) typos.{1,4
|
|
|
1589
1590
|
|
|
1590
1591
|
|
|
1591
1592
|
# Error checking.
|
|
1592
|
-
if len(OTHER_FILES_CONFIG) !=
|
|
1593
|
-
logger.warning(f"
|
|
1593
|
+
if len(OTHER_FILES_CONFIG) != 442:
|
|
1594
|
+
logger.warning(f"Found {len(OTHER_FILES_CONFIG)} configured other files!")
|
|
1594
1595
|
|
|
1595
1596
|
encountered_file_ids = set()
|
|
1596
1597
|
|
epstein_files/util/doc_cfg.py
CHANGED
|
@@ -109,7 +109,9 @@ class DocCfg:
|
|
|
109
109
|
|
|
110
110
|
def info_str(self) -> str | None:
|
|
111
111
|
"""String that summarizes what is known about this document."""
|
|
112
|
-
if self.category
|
|
112
|
+
if self.category and not self.description:
|
|
113
|
+
return self.category
|
|
114
|
+
elif self.category == REPUTATION:
|
|
113
115
|
return f"{REPUTATION_MGMT}: {self.description}"
|
|
114
116
|
elif self.author and self.description:
|
|
115
117
|
if self.category in [ACADEMIA, BOOK]:
|
epstein_files/util/env.py
CHANGED
|
@@ -6,36 +6,41 @@ from sys import argv
|
|
|
6
6
|
|
|
7
7
|
from epstein_files.util.logging import datefinder_logger, env_log_level, logger
|
|
8
8
|
|
|
9
|
-
COUNT_WORDS_SCRIPT = '
|
|
10
|
-
DEFAULT_WIDTH =
|
|
11
|
-
HTML_SCRIPTS = ['epstein_generate',
|
|
9
|
+
COUNT_WORDS_SCRIPT = 'epstein_word_count'
|
|
10
|
+
DEFAULT_WIDTH = 145
|
|
11
|
+
HTML_SCRIPTS = ['epstein_generate', COUNT_WORDS_SCRIPT]
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML page.")
|
|
15
|
-
parser.add_argument('--build', '-b', action='store_true', help='write output to file')
|
|
16
|
-
parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
|
|
17
|
-
parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just interesting ones')
|
|
18
|
-
parser.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
|
|
19
15
|
parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
|
|
20
|
-
parser.add_argument('--
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
parser.
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
16
|
+
parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='overwrite cached EpsteinFiles')
|
|
17
|
+
|
|
18
|
+
output = parser.add_argument_group('OUTPUT')
|
|
19
|
+
output.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
|
|
20
|
+
output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
|
|
21
|
+
output.add_argument('--build', '-b', action='store_true', help='write output to HTML file')
|
|
22
|
+
output.add_argument('--make-clean', action='store_true', help='delete all HTML build artifacts and write latest URLs to .urls.env')
|
|
23
|
+
output.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
|
|
24
|
+
output.add_argument('--output-json-files', action='store_true', help='pretty print all the raw JSON data files in the collection')
|
|
25
|
+
output.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
|
|
26
|
+
output.add_argument('--output-texts', '-ot', action='store_true', help='generate text messages section')
|
|
27
|
+
output.add_argument('--suppress-output', action='store_true', help='no output to terminal (use with --build)')
|
|
28
|
+
output.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use (in characters)')
|
|
29
|
+
output.add_argument('--use-epstein-web-links', action='store_true', help='use epsteinweb.org links instead of epstein.media')
|
|
30
|
+
|
|
31
|
+
scripts = parser.add_argument_group('SCRIPTS', 'Arguments used only by epstein_search, epstein_show, epstein_diff')
|
|
32
|
+
scripts.add_argument('positional_args', nargs='*', help='strings to search for, file IDs to show or diff, etc.')
|
|
33
|
+
scripts.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (only used by scripts)')
|
|
34
|
+
scripts.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (only used by epstein_search)')
|
|
35
|
+
|
|
36
|
+
debug = parser.add_argument_group('DEBUG')
|
|
37
|
+
debug.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
|
|
38
|
+
debug.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
|
|
39
|
+
debug.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
|
|
40
|
+
debug.add_argument('--json-metadata', '-jm', action='store_true', help='dump JSON metadata for all files')
|
|
41
|
+
debug.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats at the end')
|
|
42
|
+
debug.add_argument('--sort-alphabetical', action='store_true', help='sort emailers alphabetically in counts table')
|
|
43
|
+
debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
|
|
39
44
|
args = parser.parse_args()
|
|
40
45
|
|
|
41
46
|
current_script = Path(argv[0]).name
|
|
@@ -45,7 +50,7 @@ is_html_script = current_script in HTML_SCRIPTS
|
|
|
45
50
|
args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
|
|
46
51
|
args.output_emails = args.output_emails or args.all_emails
|
|
47
52
|
args.output_other_files = args.output_other_files or args.all_other_files
|
|
48
|
-
args.
|
|
53
|
+
args.overwrite_pickle = args.overwrite_pickle or (is_env_var_set('OVERWRITE_PICKLE') and not is_env_var_set('PICKLED'))
|
|
49
54
|
args.width = args.width if is_html_script else None
|
|
50
55
|
specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]
|
|
51
56
|
|
|
@@ -66,8 +71,8 @@ datefinder_logger.setLevel(logger.level)
|
|
|
66
71
|
|
|
67
72
|
# Massage args that depend on other args to the appropriate state
|
|
68
73
|
if not (args.json_metadata or args.output_texts or args.output_emails or args.output_other_files):
|
|
69
|
-
if is_html_script and current_script != COUNT_WORDS_SCRIPT and not args.make_clean:
|
|
70
|
-
logger.warning(f"No output section chosen; outputting default of texts, selected emails, and other files...")
|
|
74
|
+
if is_html_script and current_script != COUNT_WORDS_SCRIPT and not args.make_clean and not args.colors_only:
|
|
75
|
+
logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
|
|
71
76
|
|
|
72
77
|
args.output_texts = True
|
|
73
78
|
args.output_emails = True
|
|
@@ -77,4 +82,4 @@ if args.use_epstein_web_links:
|
|
|
77
82
|
# NOTE(review): the --use-epstein-web-links help text names epstein.media as the default link target, not epsteinify.com; confirm which is current.
logger.warning(f"Using epsteinweb.org links instead of epsteinify.com...")
|
|
78
83
|
|
|
79
84
|
if args.debug:
|
|
80
|
-
logger.warning(f"Invocation args:\nis_html_script={is_html_script},\nspecified_names={specified_names},\nargs={args}")
|
|
85
|
+
logger.warning(f"Invocation args:\ncurrent_script={current_script}\nis_html_script={is_html_script},\nspecified_names={specified_names},\nargs={args}")
|
|
@@ -159,7 +159,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
159
159
|
pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
|
|
160
160
|
emailers = {
|
|
161
161
|
ALIREZA_ITTIHADIEH: 'CEO Freestream Aircraft Limited',
|
|
162
|
-
BARBRO_C_EHNBOM: 'Swedish pharmaceuticals',
|
|
162
|
+
BARBRO_C_EHNBOM: 'Swedish pharmaceuticals, SALSS',
|
|
163
163
|
FRED_HADDAD: "co-founder of Heck's in West Virginia",
|
|
164
164
|
GERALD_BARTON: "Maryland property developer Landmark Land Company, fan of Trump's Irish golf course",
|
|
165
165
|
GORDON_GETTY: 'heir of oil tycoon J. Paul Getty',
|
|
@@ -296,6 +296,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
296
296
|
emailers = {
|
|
297
297
|
DAVID_STERN: f'emailed Epstein from Moscow, appears to know chairman of {DEUTSCHE_BANK}',
|
|
298
298
|
JONATHAN_FARKAS: "heir to the Alexander's department store fortune",
|
|
299
|
+
'linkspirit': "Skype username of someone Epstein communicated with",
|
|
299
300
|
'Peter Thomas Roth': 'student of Epstein at Dalton, skincare company founder',
|
|
300
301
|
STEPHEN_HANSON: None,
|
|
301
302
|
TOM_BARRACK: 'long time friend of Trump',
|
|
@@ -304,7 +305,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
304
305
|
HighlightedNames(
|
|
305
306
|
label='finance',
|
|
306
307
|
style='green',
|
|
307
|
-
pattern=r'Apollo|Ari\s*Glass|(Bernie\s*)?Madoff|Black(rock|stone)|
|
|
308
|
+
# The lookahead after "Powell" allows whitespace so "Powell M. Cabot" is actually excluded; without it the exclusion never fires.
# NOTE(review): 'Goldman(\s*Sachs)' requires "Sachs" (no trailing '?'), unlike 'Merrill(\s*Lynch)?'; confirm that is intended.
pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!\s*M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
|
|
308
309
|
emailers={
|
|
309
310
|
AMANDA_ENS: 'Citigroup',
|
|
310
311
|
DANIEL_SABBA: 'UBS Investment Bank',
|
|
@@ -587,7 +588,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
587
588
|
HighlightedText(
|
|
588
589
|
label='phone_number',
|
|
589
590
|
style='bright_green',
|
|
590
|
-
pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})
|
|
591
|
+
pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|\b[\d+]{10,12}\b",
|
|
591
592
|
),
|
|
592
593
|
]
|
|
593
594
|
|
epstein_files/util/output.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
1
3
|
from rich.padding import Padding
|
|
2
4
|
|
|
3
5
|
from epstein_files.documents.email import Email
|
|
4
6
|
from epstein_files.documents.messenger_log import MessengerLog
|
|
5
7
|
from epstein_files.epstein_files import EpsteinFiles, count_by_month
|
|
6
|
-
from epstein_files.util.constant
|
|
7
|
-
from epstein_files.util.constant import urls
|
|
8
|
+
from epstein_files.util.constant import output_files
|
|
8
9
|
from epstein_files.util.constant.html import *
|
|
9
10
|
from epstein_files.util.constant.names import *
|
|
10
|
-
from epstein_files.util.constant.
|
|
11
|
+
from epstein_files.util.constant.output_files import JSON_FILES_JSON_PATH, JSON_METADATA_PATH
|
|
11
12
|
from epstein_files.util.data import dict_sets_to_lists
|
|
12
13
|
from epstein_files.util.env import args, specified_names
|
|
13
14
|
from epstein_files.util.logging import log_file_write, logger
|
|
@@ -108,6 +109,20 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
|
|
|
108
109
|
return len(already_printed_emails)
|
|
109
110
|
|
|
110
111
|
|
|
112
|
+
def print_json_files(epstein_files: EpsteinFiles):
|
|
113
|
+
if args.build:
|
|
114
|
+
json_data = {json_file.url_slug: json_file.json_data() for json_file in epstein_files.json_files}
|
|
115
|
+
|
|
116
|
+
with open(JSON_FILES_JSON_PATH, 'w') as f:
|
|
117
|
+
f.write(json.dumps(json_data, sort_keys=True))
|
|
118
|
+
log_file_write(JSON_FILES_JSON_PATH)
|
|
119
|
+
else:
|
|
120
|
+
for json_file in epstein_files.json_files:
|
|
121
|
+
console.line(2)
|
|
122
|
+
console.print(json_file.description_panel())
|
|
123
|
+
console.print_json(json_file.json_str(), indent=4, sort_keys=False)
|
|
124
|
+
|
|
125
|
+
|
|
111
126
|
def print_json_metadata(epstein_files: EpsteinFiles) -> None:
|
|
112
127
|
json_str = epstein_files.json_metadata()
|
|
113
128
|
|
|
@@ -122,9 +137,9 @@ def print_json_metadata(epstein_files: EpsteinFiles) -> None:
|
|
|
122
137
|
def print_json_stats(epstein_files: EpsteinFiles) -> None:
|
|
123
138
|
console.line(5)
|
|
124
139
|
console.print(Panel('JSON Stats Dump', expand=True, style='reverse bold'), '\n')
|
|
125
|
-
print_json(f"
|
|
126
|
-
print_json(f"
|
|
127
|
-
print_json(f"
|
|
140
|
+
print_json(f"MessengerLog Sender Counts", MessengerLog.count_authors(epstein_files.imessage_logs), skip_falsey=True)
|
|
141
|
+
print_json(f"Email Author Counts", epstein_files.email_author_counts, skip_falsey=True)
|
|
142
|
+
print_json(f"Email Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
|
|
128
143
|
print_json("Email signature_substitution_counts", epstein_files.email_signature_substitution_counts(), skip_falsey=True)
|
|
129
144
|
print_json("email_author_device_signatures", dict_sets_to_lists(epstein_files.email_authors_to_device_signatures))
|
|
130
145
|
print_json("email_sent_from_devices", dict_sets_to_lists(epstein_files.email_device_signatures_to_authors))
|
|
@@ -147,16 +162,12 @@ def print_text_messages(epstein_files: EpsteinFiles) -> None:
|
|
|
147
162
|
|
|
148
163
|
def write_urls() -> None:
|
|
149
164
|
"""Write _URL style constant variables to a file bash scripts can load as env vars."""
|
|
150
|
-
if
|
|
151
|
-
logger.warning(f"Can't write env vars to '{args.output_file}', writing to '{URLS_ENV}' instead.\n")
|
|
152
|
-
args.output_file = URLS_ENV
|
|
165
|
+
url_vars = {k: v for k, v in vars(output_files).items() if k.endswith('URL') and not k.startswith('GH')}
|
|
153
166
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
if isinstance(v, str) and k.split('_')[-1] in ['URL'] and 'github.io' in v and 'BASE' not in k
|
|
157
|
-
}
|
|
167
|
+
if not args.suppress_output:
|
|
168
|
+
console.line()
|
|
158
169
|
|
|
159
|
-
with open(
|
|
170
|
+
with open(URLS_ENV, 'w') as f:
|
|
160
171
|
for var_name, url in url_vars.items():
|
|
161
172
|
key_value = f"{var_name}='{url}'"
|
|
162
173
|
|
|
@@ -165,8 +176,10 @@ def write_urls() -> None:
|
|
|
165
176
|
|
|
166
177
|
f.write(f"{key_value}\n")
|
|
167
178
|
|
|
168
|
-
|
|
169
|
-
|
|
179
|
+
if not args.suppress_output:
|
|
180
|
+
console.line()
|
|
181
|
+
|
|
182
|
+
logger.warning(f"Wrote {len(url_vars)} URL variables to '{URLS_ENV}'\n")
|
|
170
183
|
|
|
171
184
|
|
|
172
185
|
def _verify_all_emails_were_printed(epstein_files: EpsteinFiles, already_printed_emails: list[Email]) -> None:
|
epstein_files/util/rich.py
CHANGED
|
@@ -14,7 +14,8 @@ from rich.theme import Theme
|
|
|
14
14
|
|
|
15
15
|
from epstein_files.util.constant.html import CONSOLE_HTML_FORMAT, HTML_TERMINAL_THEME, PAGE_TITLE
|
|
16
16
|
from epstein_files.util.constant.names import UNKNOWN
|
|
17
|
-
from epstein_files.util.constant.
|
|
17
|
+
from epstein_files.util.constant.output_files import SITE_URLS
|
|
18
|
+
from epstein_files.util.constant.strings import DEFAULT, EMAIL, NA, QUESTION_MARKS, TEXT_MESSAGE, SiteType
|
|
18
19
|
from epstein_files.util.constant.urls import *
|
|
19
20
|
from epstein_files.util.constants import FALLBACK_TIMESTAMP, HEADER_ABBREVIATIONS
|
|
20
21
|
from epstein_files.util.data import json_safe
|
|
@@ -31,11 +32,22 @@ GREY_NUMBERS = [58, 39, 39, 35, 30, 27, 23, 23, 19, 19, 15, 15, 15]
|
|
|
31
32
|
DEFAULT_NAME_STYLE = 'gray46'
|
|
32
33
|
KEY_STYLE='honeydew2 bold'
|
|
33
34
|
SECTION_HEADER_STYLE = 'bold white on blue3'
|
|
34
|
-
SOCIAL_MEDIA_LINK_STYLE = '
|
|
35
|
+
SOCIAL_MEDIA_LINK_STYLE = 'pale_turquoise4'
|
|
35
36
|
SUBSTACK_POST_LINK_STYLE = 'bright_cyan'
|
|
36
37
|
SYMBOL_STYLE = 'grey70'
|
|
38
|
+
TABLE_BORDER_STYLE = 'grey46'
|
|
39
|
+
TABLE_TITLE_STYLE = f"gray85 italic"
|
|
37
40
|
TITLE_STYLE = 'black on bright_white bold'
|
|
38
41
|
|
|
42
|
+
AUX_SITE_LINK_STYLE = 'dark_orange3'
|
|
43
|
+
OTHER_SITE_LINK_STYLE = 'dark_goldenrod'
|
|
44
|
+
|
|
45
|
+
DEFAULT_TABLE_KWARGS = {
|
|
46
|
+
'border_style': TABLE_BORDER_STYLE,
|
|
47
|
+
'header_style': "bold",
|
|
48
|
+
'title_style': TABLE_TITLE_STYLE,
|
|
49
|
+
}
|
|
50
|
+
|
|
39
51
|
HIGHLIGHTED_GROUP_COLOR_KEYS = [
|
|
40
52
|
Text(highlight_group.label.replace('_', ' '), style=highlight_group.style)
|
|
41
53
|
for highlight_group in sorted(HIGHLIGHTED_NAMES, key=lambda hg: hg.label)
|
|
@@ -79,7 +91,11 @@ def build_highlighter(pattern: str) -> EpsteinHighlighter:
|
|
|
79
91
|
return TempHighlighter()
|
|
80
92
|
|
|
81
93
|
|
|
82
|
-
def
|
|
94
|
+
def build_table(title: str | None, **kwargs) -> Table:
|
|
95
|
+
return Table(title=title, **{**DEFAULT_TABLE_KWARGS, **kwargs})
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def join_texts(txts: list[Text], join: str = ' ', encloser: str = '', encloser_style: str = 'wheat4') -> Text:
|
|
83
99
|
"""Join rich.Text objs into one."""
|
|
84
100
|
if encloser:
|
|
85
101
|
if len(encloser) != 2:
|
|
@@ -91,8 +107,9 @@ def join_texts(txts: list[Text], join: str = ' ', encloser: str = '') -> Text:
|
|
|
91
107
|
|
|
92
108
|
txt = Text('')
|
|
93
109
|
|
|
94
|
-
for i,
|
|
95
|
-
txt.append(join if i >= 1 else '').append(enclose_start)
|
|
110
|
+
for i, _txt in enumerate(txts):
|
|
111
|
+
txt.append(join if i >= 1 else '').append(enclose_start, style=encloser_style)
|
|
112
|
+
txt.append(_txt).append(enclose_end, style=encloser_style)
|
|
96
113
|
|
|
97
114
|
return txt
|
|
98
115
|
|
|
@@ -132,7 +149,7 @@ def print_centered_link(url: str, link_text: str, style: str | None = None) -> N
|
|
|
132
149
|
|
|
133
150
|
|
|
134
151
|
def print_color_key() -> None:
|
|
135
|
-
color_table =
|
|
152
|
+
color_table = build_table('Rough Guide to Highlighted Colors', show_header=False)
|
|
136
153
|
num_colors = len(HIGHLIGHTED_GROUP_COLOR_KEYS)
|
|
137
154
|
row_number = 0
|
|
138
155
|
|
|
@@ -164,7 +181,7 @@ def print_header(epstein_files: 'EpsteinFiles') -> None:
|
|
|
164
181
|
print_centered(f"if you think there's an attribution error or can deanonymize an {UNKNOWN} contact {CRYPTADAMUS_TWITTER}", 'grey46')
|
|
165
182
|
print_centered('note this site is based on the OCR text provided by Congress which is not always the greatest', 'grey23')
|
|
166
183
|
print_centered(f"(thanks to {link_markup('https://x.com/ImDrinknWyn', '@ImDrinknWyn', 'dodger_blue3')} + others for help attributing redacted emails)")
|
|
167
|
-
print_centered_link(
|
|
184
|
+
print_centered_link(JSON_METADATA_URL, "(explanations of author attributions)", style='magenta')
|
|
168
185
|
|
|
169
186
|
|
|
170
187
|
def print_json(label: str, obj: object, skip_falsey: bool = False) -> None:
|
|
@@ -231,24 +248,28 @@ def print_other_site_link(is_header: bool = True) -> None:
|
|
|
231
248
|
other_site_msg += f" Epstein's {other_site_type}s also generated by this code"
|
|
232
249
|
markup_msg = link_markup(SITE_URLS[other_site_type], other_site_msg, OTHER_SITE_LINK_STYLE)
|
|
233
250
|
print_centered(parenthesize(Text.from_markup(markup_msg)), style='bold')
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
251
|
+
|
|
252
|
+
if is_header:
|
|
253
|
+
word_count_link = link_text_obj(WORD_COUNT_URL, 'most frequently used words in the emails and texts', AUX_SITE_LINK_STYLE)
|
|
254
|
+
print_centered(parenthesize(word_count_link))
|
|
255
|
+
metadata_link = link_text_obj(JSON_METADATA_URL, 'author attribution explanations', AUX_SITE_LINK_STYLE)
|
|
256
|
+
print_centered(parenthesize(metadata_link))
|
|
257
|
+
# NOTE(review): this link is labelled "epstein's json files" but reuses WORD_COUNT_URL, the same URL as word_count_link above; looks like a copy-paste slip. Confirm the intended JSON files URL constant.
json_link = link_text_obj(WORD_COUNT_URL, "epstein's json files", AUX_SITE_LINK_STYLE)
|
|
258
|
+
print_centered(parenthesize(json_link))
|
|
238
259
|
|
|
239
260
|
|
|
240
261
|
def print_page_title(expand: bool = True, width: int | None = None) -> None:
|
|
241
262
|
title_panel = Panel(Text(PAGE_TITLE, justify='center'), expand=expand, style=TITLE_STYLE, width=width)
|
|
242
263
|
console.print(Align.center(vertically_pad(title_panel)))
|
|
243
|
-
|
|
264
|
+
_print_social_media_links()
|
|
244
265
|
console.line(2)
|
|
245
266
|
|
|
246
267
|
|
|
247
268
|
def print_panel(msg: str, style: str = 'black on white', padding: tuple | None = None, centered: bool = False) -> None:
|
|
248
269
|
_padding: list[int] = list(padding or [0, 0, 0, 0])
|
|
249
270
|
_padding[2] += 1 # Bottom pad
|
|
250
|
-
panel = Panel(Text.from_markup(msg, justify='center'), width=70, style=style)
|
|
251
271
|
actual_padding: tuple[int, int, int, int] = tuple(_padding)
|
|
272
|
+
panel = Panel(Text.from_markup(msg, justify='center'), width=70, style=style)
|
|
252
273
|
|
|
253
274
|
if centered:
|
|
254
275
|
console.print(Align.center(Padding(panel, actual_padding)))
|
|
@@ -262,19 +283,6 @@ def print_section_header(msg: str, style: str = SECTION_HEADER_STYLE, is_centere
|
|
|
262
283
|
console.print(Padding(panel, (3, 0, 1, 0)))
|
|
263
284
|
|
|
264
285
|
|
|
265
|
-
def print_social_media_links() -> None:
|
|
266
|
-
print_centered_link(SUBSTACK_URL, "I Made Epstein's Text Messages Great Again (And You Should Read Them)", style=f'{SUBSTACK_POST_LINK_STYLE} bold')
|
|
267
|
-
print_centered_link(SUBSTACK_URL, SUBSTACK_URL.removeprefix('https://'), style=f'{SUBSTACK_POST_LINK_STYLE} dim')
|
|
268
|
-
|
|
269
|
-
social_links = [
|
|
270
|
-
link_text_obj('https://x.com/Cryptadamist/status/1990866804630036988', '@cryptadamist', style=SOCIAL_MEDIA_LINK_STYLE),
|
|
271
|
-
link_text_obj('https://cryptadamus.substack.com/', 'substack', style=SOCIAL_MEDIA_LINK_STYLE),
|
|
272
|
-
link_text_obj('https://universeodon.com/@cryptadamist/115572634993386057', 'mastodon', style=SOCIAL_MEDIA_LINK_STYLE),
|
|
273
|
-
]
|
|
274
|
-
|
|
275
|
-
print_centered(join_texts(social_links, join=' ', encloser='[]'))
|
|
276
|
-
|
|
277
|
-
|
|
278
286
|
def print_starred_header(msg: str, num_stars: int = 7, num_spaces: int = 2, style: str = TITLE_STYLE) -> None:
|
|
279
287
|
stars = '*' * num_stars
|
|
280
288
|
spaces = ' ' * num_spaces
|
|
@@ -314,7 +322,7 @@ def write_html(output_path: Path) -> None:
|
|
|
314
322
|
|
|
315
323
|
|
|
316
324
|
def _print_abbreviations_table() -> None:
|
|
317
|
-
table =
|
|
325
|
+
table = build_table(title="Abbreviations Used Frequently In These Conversations", show_header=False)
|
|
318
326
|
table.add_column("Abbreviation", justify="center", style='bold')
|
|
319
327
|
table.add_column("Translation", style="white", justify="center")
|
|
320
328
|
|
|
@@ -326,7 +334,7 @@ def _print_abbreviations_table() -> None:
|
|
|
326
334
|
|
|
327
335
|
def _print_external_links() -> None:
|
|
328
336
|
console.line()
|
|
329
|
-
|
|
337
|
+
print_centered(Text('External Links', style=TABLE_TITLE_STYLE))
|
|
330
338
|
presser_link = link_text_obj(OVERSIGHT_REPUBLICANS_PRESSER_URL, 'Official Oversight Committee Press Release')
|
|
331
339
|
raw_docs_link = join_texts([link_text_obj(RAW_OVERSIGHT_DOCS_GOOGLE_DRIVE_URL, 'raw files', style=f"{ARCHIVE_LINK_COLOR} dim")], encloser='()')
|
|
332
340
|
print_centered(join_texts([presser_link, raw_docs_link]))
|
|
@@ -335,6 +343,26 @@ def _print_external_links() -> None:
|
|
|
335
343
|
print_centered(link_markup(COURIER_NEWSROOM_ARCHIVE_URL, 'Searchable Archive') + " (Courier Newsroom)")
|
|
336
344
|
print_centered(link_markup(EPSTEINIFY_URL) + " (raw document images)")
|
|
337
345
|
print_centered(link_markup(EPSTEIN_WEB_URL) + " (character summaries)")
|
|
346
|
+
print_centered(link_markup(EPSTEIN_MEDIA_URL) + " (raw document images)")
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def _print_social_media_links() -> None:
|
|
350
|
+
print_centered_link(
|
|
351
|
+
SUBSTACK_URL,
|
|
352
|
+
"I Made Epstein's Text Messages Great Again (And You Should Read Them)",
|
|
353
|
+
style=f'{SUBSTACK_POST_LINK_STYLE} bold'
|
|
354
|
+
)
|
|
355
|
+
|
|
356
|
+
print_centered_link(SUBSTACK_URL, SUBSTACK_URL.removeprefix('https://'), style=f'{SUBSTACK_POST_LINK_STYLE} dim')
|
|
357
|
+
|
|
358
|
+
social_links = [
|
|
359
|
+
link_text_obj('https://universeodon.com/@cryptadamist/115572634993386057', '@mastodon', style=SOCIAL_MEDIA_LINK_STYLE),
|
|
360
|
+
link_text_obj(SUBSTACK_URL, '@substack', style=SOCIAL_MEDIA_LINK_STYLE),
|
|
361
|
+
link_text_obj('https://x.com/Cryptadamist/status/1990866804630036988', '@twitter', style=SOCIAL_MEDIA_LINK_STYLE),
|
|
362
|
+
link_text_obj('https://github.com/michelcrypt4d4mus/epstein_text_messages', '@github', style=SOCIAL_MEDIA_LINK_STYLE)
|
|
363
|
+
]
|
|
364
|
+
|
|
365
|
+
print_centered(join_texts(social_links, join=' / '))#, encloser='()'))#, encloser='‹›'))
|
|
338
366
|
|
|
339
367
|
|
|
340
368
|
# if args.deep_debug:
|
epstein_files/util/word_count.py
CHANGED
|
@@ -20,6 +20,14 @@ from epstein_files.util.search_result import SearchResult
|
|
|
20
20
|
FIRST_AND_LAST_NAMES = flatten([n.split() for n in ALL_NAMES])
|
|
21
21
|
FIRST_AND_LAST_NAMES = [n.lower() for n in FIRST_AND_LAST_NAMES] + OTHER_NAMES
|
|
22
22
|
|
|
23
|
+
HTML_REGEX = re.compile(r"com/|cae-v2w=|content-(transfe|type)|font(/|-(family|size))|http|\.html?\??|margin-bottom|padding-left|quoted-printable|region=|text-decoration|ttps|www|\.(gif|jpe?g|png);?$")
|
|
24
|
+
HYPHENATED_WORD_REGEX = re.compile(r"[a-z]+-[a-z]+", re.IGNORECASE)
|
|
25
|
+
OK_SYMBOL_WORDS = ['mar-a-lago', 'p/e', 's&p', ':)', ':).', ';)', ':-)', ';-)']
|
|
26
|
+
ONLY_SYMBOLS_REGEX = re.compile(r"^[^a-zA-Z0-9]+$")
|
|
27
|
+
SYMBOL_WORD_REGEX = re.compile(r"^[-—–@%/?.,&=]+$")
|
|
28
|
+
SPLIT_WORDS_BY = ['@', '/']
|
|
29
|
+
FLAGGED_WORDS = [] # For debugging, log extra info when one of these is encountered
|
|
30
|
+
|
|
23
31
|
NON_SINGULARIZABLE = UNSINGULARIZABLE_WORDS + [n for n in FIRST_AND_LAST_NAMES if n.endswith('s')]
|
|
24
32
|
SKIP_WORDS_REGEX = re.compile(r"^(asmallworld@|enwiki|http|imagepng|nymagcomnymetro|addresswww|mailto|www|/font|colordu|classdms|targetdblank|nymagcom|palmbeachdailynews)|jee[vy]acation|fontfamily|(gif|html?|jpe?g|utm)$")
|
|
25
33
|
BAD_CHARS_REGEX = re.compile(r"[-–=+()$€£©°«—^&%!#_`,.;:'‘’\"„“”?\d\\]")
|
|
@@ -100,21 +108,13 @@ SINGULARIZATIONS = {
|
|
|
100
108
|
'twittercom': 'twitter',
|
|
101
109
|
}
|
|
102
110
|
|
|
103
|
-
HTML_REGEX = re.compile(r"com/|cae-v2w=|content-(transfe|type)|font(/|-(family|size))|http|\.html?\??|margin-bottom|padding-left|quoted-printable|region=|text-decoration|ttps|www|\.(gif|jpe?g|png);?$")
|
|
104
|
-
HYPHENATED_WORD_REGEX = re.compile(r"[a-z]+-[a-z]+", re.IGNORECASE)
|
|
105
|
-
OK_SYMBOL_WORDS = ['mar-a-lago', 'p/e', 's&p', ':)', ':).', ';)', ':-)', ';-)']
|
|
106
|
-
SYMBOL_WORD_REGEX = re.compile(r"^[-—–@%/?.,&=]+$")
|
|
107
|
-
ONLY_SYMBOLS_REGEX = re.compile(r"^[^a-zA-Z0-9]+$")
|
|
108
|
-
SPLIT_WORDS_BY = ['@', '/']
|
|
109
|
-
FLAGGED_WORDS = [] # For debugging, log extra info when one of these is encountered
|
|
110
|
-
|
|
111
111
|
|
|
112
112
|
@dataclass
|
|
113
113
|
class WordCount:
|
|
114
114
|
count: dict[str, int] = field(default_factory=lambda: defaultdict(int))
|
|
115
115
|
singularized: dict[str, int] = field(default_factory=lambda: defaultdict(int))
|
|
116
116
|
|
|
117
|
-
def
|
|
117
|
+
def tally_word(self, word: str, document_line: SearchResult) -> None:
|
|
118
118
|
word = EmailHeader.cleanup_str(word).lower().strip()
|
|
119
119
|
raw_word = word
|
|
120
120
|
|
|
@@ -148,7 +148,7 @@ class WordCount:
|
|
|
148
148
|
continue
|
|
149
149
|
|
|
150
150
|
for w in word.split(symbol):
|
|
151
|
-
self.
|
|
151
|
+
self.tally_word(w, document_line)
|
|
152
152
|
|
|
153
153
|
logger.info(f" Split word with '{symbol}' in it '{word}'...")
|
|
154
154
|
return
|