epstein-files 1.1.5__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +12 -21
- epstein_files/documents/communication.py +0 -3
- epstein_files/documents/document.py +68 -21
- epstein_files/documents/email.py +54 -70
- epstein_files/documents/emails/email_header.py +14 -4
- epstein_files/documents/imessage/text_message.py +5 -4
- epstein_files/documents/messenger_log.py +7 -7
- epstein_files/documents/other_file.py +16 -34
- epstein_files/epstein_files.py +133 -141
- epstein_files/person.py +324 -0
- epstein_files/util/constant/names.py +46 -15
- epstein_files/util/constant/output_files.py +1 -0
- epstein_files/util/constant/strings.py +3 -3
- epstein_files/util/constant/urls.py +15 -2
- epstein_files/util/constants.py +75 -21
- epstein_files/util/data.py +1 -20
- epstein_files/util/doc_cfg.py +27 -17
- epstein_files/util/env.py +5 -3
- epstein_files/util/highlighted_group.py +248 -203
- epstein_files/util/logging.py +1 -1
- epstein_files/util/output.py +113 -157
- epstein_files/util/rich.py +20 -35
- epstein_files/util/timer.py +14 -0
- epstein_files/util/word_count.py +1 -1
- {epstein_files-1.1.5.dist-info → epstein_files-1.2.1.dist-info}/METADATA +6 -2
- epstein_files-1.2.1.dist-info/RECORD +34 -0
- epstein_files-1.1.5.dist-info/RECORD +0 -33
- {epstein_files-1.1.5.dist-info → epstein_files-1.2.1.dist-info}/LICENSE +0 -0
- {epstein_files-1.1.5.dist-info → epstein_files-1.2.1.dist-info}/WHEEL +0 -0
- {epstein_files-1.1.5.dist-info → epstein_files-1.2.1.dist-info}/entry_points.txt +0 -0
epstein_files/util/constants.py
CHANGED
|
@@ -19,6 +19,7 @@ HEADER_ABBREVIATIONS = {
|
|
|
19
19
|
'bgC3': 'Bill Gates Ventures (renamed in 2018)',
|
|
20
20
|
"Brock": 'Brock Pierce (crypto bro with a very sordid past)',
|
|
21
21
|
"DB": "Deutsche Bank (maybe??)",
|
|
22
|
+
"GRAT": "Grantor Retained Annuity Trust (tax shelter)",
|
|
22
23
|
'HBJ': "Sheikh Hamad bin Jassim (former Qatari prime minister)",
|
|
23
24
|
'Jabor': '"an influential man in Qatar"',
|
|
24
25
|
'Jared': "Jared Kushner",
|
|
@@ -62,6 +63,7 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
|
|
|
62
63
|
BARBRO_C_EHNBOM: re.compile(r'behnbom@aol.com|(Barbro\s.*)?Ehnbom', re.IGNORECASE),
|
|
63
64
|
BARRY_J_COHEN: re.compile(r'barry\s*((j.?|james)\s*)?cohen?', re.IGNORECASE),
|
|
64
65
|
BENNET_MOSKOWITZ: re.compile(r'Moskowitz.*Bennet|Bennet.*Moskowitz', re.IGNORECASE),
|
|
66
|
+
BOB_CROWE: re.compile(r"[BR]ob Crowe", re.IGNORECASE),
|
|
65
67
|
BORIS_NIKOLIC: re.compile(r'(boris )?nikolic?', re.IGNORECASE),
|
|
66
68
|
BRAD_EDWARDS: re.compile(r'Brad(ley)?(\s*J(.?|ames))?\s*Edwards', re.IGNORECASE),
|
|
67
69
|
BRAD_KARP: re.compile(r'Brad (S.? )?Karp|Karp, Brad', re.IGNORECASE),
|
|
@@ -83,6 +85,7 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
|
|
|
83
85
|
JACKIE_PERCZEK: re.compile(r'jackie percze[kl]?', re.IGNORECASE),
|
|
84
86
|
JABOR_Y: re.compile(r'[ji]abor\s*y?', re.IGNORECASE),
|
|
85
87
|
JAMES_HILL: re.compile(r"hill, james e.|james.e.hill@abc.com", re.IGNORECASE),
|
|
88
|
+
JANUSZ_BANASIAK: re.compile(r"Janu[is]z Banasiak", re.IGNORECASE),
|
|
86
89
|
JEAN_LUC_BRUNEL: re.compile(r'Jean[- ]Luc Brunel?', re.IGNORECASE),
|
|
87
90
|
JEFF_FULLER: re.compile(r"jeff@mc2mm.com|Jeff Fuller", re.IGNORECASE),
|
|
88
91
|
JEFFREY_EPSTEIN: re.compile(r'[djl]\s?ee[vy]acation[©@]?g?(mail.com)?|Epstine|\bJEE?\b|Jeffrey E((sp|ps)tein?)?( VI Foundation)?|jeeproject@yahoo.com|J Jep|Jeffery Edwards|(?<!(Mark L.|ard Jay) )Epstein', re.IGNORECASE),
|
|
@@ -157,6 +160,7 @@ EMAILERS = [
|
|
|
157
160
|
BILL_GATES,
|
|
158
161
|
BILL_SIEGEL,
|
|
159
162
|
BRAD_WECHSLER,
|
|
163
|
+
CHRISTINA_GALBRAITH,
|
|
160
164
|
DANIEL_SABBA,
|
|
161
165
|
'Danny Goldberg',
|
|
162
166
|
DAVID_SCHOEN,
|
|
@@ -302,11 +306,10 @@ TEXTS_CONFIG = CONFIRMED_TEXTS_CONFIG + UNCONFIRMED_TEXTS_CONFIG
|
|
|
302
306
|
################################################ EMAILS ################################################
|
|
303
307
|
########################################################################################################
|
|
304
308
|
|
|
305
|
-
MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT = f"draft of an unpublished article about Epstein by {MICHAEL_WOLFF} written ca. 2014/2015"
|
|
306
|
-
|
|
307
309
|
# Some emails have a lot of uninteresting CCs
|
|
308
|
-
|
|
309
|
-
|
|
310
|
+
FLIGHT_IN_2012_PEOPLE: list[Name] = ['Francis Derby', JANUSZ_BANASIAK, 'Louella Rabuyo', 'Richard Barnnet']
|
|
311
|
+
IRAN_DEAL_RECIPIENTS: list[Name] = ['Allen West', 'Rafael Bardaji', 'Philip Kafka', 'Herb Goodman', 'Grant Seeger', 'Lisa Albert', 'Janet Kafka', 'James Ramsey', 'ACT for America', 'John Zouzelka', 'Joel Dunn', 'Nate McClain', 'Bennet Greenwald', 'Taal Safdie', 'Uri Fouzailov', 'Neil Anderson', 'Nate White', 'Rita Hortenstine', 'Henry Hortenstine', 'Gary Gross', 'Forrest Miller', 'Bennett Schmidt', 'Val Sherman', 'Marcie Brown', 'Michael Horowitz', 'Marshall Funk']
|
|
312
|
+
MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT = f"draft of an unpublished article about Epstein by {MICHAEL_WOLFF} written ca. 2014/2015"
|
|
310
313
|
|
|
311
314
|
EMAILS_CONFIG = [
|
|
312
315
|
# 026294 and 026296 might also be Ittihadieh based on timing
|
|
@@ -409,11 +412,11 @@ EMAILS_CONFIG = [
|
|
|
409
412
|
dupe_type='redacted'
|
|
410
413
|
),
|
|
411
414
|
EmailCfg(id='026547', author=GERALD_BARTON, recipients=[JEFFREY_EPSTEIN]), # Bad OCR # TODO: email header is really jacked up
|
|
412
|
-
EmailCfg(id='029969', author=GWENDOLYN_BECK, attribution_reason='
|
|
413
|
-
EmailCfg(id='029968', author=GWENDOLYN_BECK, attribution_reason='
|
|
415
|
+
EmailCfg(id='029969', author=GWENDOLYN_BECK, attribution_reason='signature "Longevity & Successful Aging"'),
|
|
416
|
+
EmailCfg(id='029968', author=GWENDOLYN_BECK, attribution_reason='signature "beckresearchlabs.com"', duplicate_ids=['031120']),
|
|
414
417
|
EmailCfg(id='029970', author=GWENDOLYN_BECK, attribution_reason='signed "Longevity & Successful Agin"'),
|
|
415
|
-
EmailCfg(id='029960', author=GWENDOLYN_BECK, attribution_reason='
|
|
416
|
-
EmailCfg(id='029959', author=GWENDOLYN_BECK, attribution_reason='"Longevity & Aging"'),
|
|
418
|
+
EmailCfg(id='029960', author=GWENDOLYN_BECK, attribution_reason='signature "Beck Center for Longevity & Aging"'),
|
|
419
|
+
EmailCfg(id='029959', author=GWENDOLYN_BECK, attribution_reason='signature "Beck Center for Longevity & Aging"'),
|
|
417
420
|
EmailCfg(id='033360', author=HENRY_HOLT, attribution_reason='in signature'), # Henry Holt is a company not a person
|
|
418
421
|
EmailCfg(id='033384', author=JACK_GOLDBERGER, attribution_reason='Might be Paul Prosperi?', is_attribution_uncertain=True),
|
|
419
422
|
EmailCfg(id='026024', author=JEAN_HUGUEN, attribution_reason='Signature'),
|
|
@@ -474,7 +477,7 @@ EMAILS_CONFIG = [
|
|
|
474
477
|
EmailCfg(
|
|
475
478
|
id='029977',
|
|
476
479
|
author=LAWRANCE_VISOSKI,
|
|
477
|
-
recipients=
|
|
480
|
+
recipients=[JEFFREY_EPSTEIN, DARREN_INDYKE, LESLEY_GROFF, RICHARD_KAHN] + FLIGHT_IN_2012_PEOPLE,
|
|
478
481
|
attribution_reason=LARRY_REASON,
|
|
479
482
|
duplicate_ids=['031129'],
|
|
480
483
|
),
|
|
@@ -491,14 +494,12 @@ EMAILS_CONFIG = [
|
|
|
491
494
|
EmailCfg(id='032606', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
|
|
492
495
|
EmailCfg(id='032607', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
|
|
493
496
|
EmailCfg(id='032609', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
|
|
494
|
-
# 032581, 032604, 033025 may also be Masha based on timing, subject (interviews/articles), and sequential ID
|
|
495
497
|
EmailCfg(id='032604', author=MASHA_DROKOVA, attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
|
|
496
498
|
EmailCfg(id='032581', author=MASHA_DROKOVA, attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
|
|
497
|
-
EmailCfg(id='033025', author=MASHA_DROKOVA, attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
|
|
498
499
|
EmailCfg(id='030235', author=MELANIE_WALKER, attribution_reason='In fwd'),
|
|
499
500
|
EmailCfg(id='032343', author=MELANIE_WALKER, attribution_reason='Name seen in later reply 032346'),
|
|
500
501
|
EmailCfg(id='032212', author=MIROSLAV_LAJCAK, attribution_reason='signature'),
|
|
501
|
-
EmailCfg(id='021814', author=NADIA_MARCINKO, attribution_reason='reply'),
|
|
502
|
+
EmailCfg(id='021814', author=NADIA_MARCINKO, attribution_reason='reply'), #, actual_text="I'm a pilot...I prefer sex slave to copilot ;)"),
|
|
502
503
|
EmailCfg(id='021808', author=NADIA_MARCINKO, attribution_reason='reply'),
|
|
503
504
|
EmailCfg(id='022190', author=NADIA_MARCINKO, attribution_reason='reply'),
|
|
504
505
|
EmailCfg(id='021818', author=NADIA_MARCINKO, attribution_reason='reply'),
|
|
@@ -540,11 +541,12 @@ EMAILS_CONFIG = [
|
|
|
540
541
|
author=SEAN_BANNON,
|
|
541
542
|
attribution_reason="From protonmail, Bannon wrote 'just sent from my protonmail' in 027067",
|
|
542
543
|
),
|
|
543
|
-
EmailCfg(id='029003', author=SOON_YI_PREVIN, attribution_reason="
|
|
544
|
-
EmailCfg(id='029005', author=SOON_YI_PREVIN, attribution_reason="
|
|
545
|
-
EmailCfg(id='029007', author=SOON_YI_PREVIN, attribution_reason="
|
|
546
|
-
EmailCfg(id='029010', author=SOON_YI_PREVIN, attribution_reason="
|
|
547
|
-
EmailCfg(id='032296', author=SOON_YI_PREVIN, attribution_reason="
|
|
544
|
+
EmailCfg(id='029003', author=SOON_YI_PREVIN, attribution_reason='"Sent from Soon-Yi\'s iPhone"'),
|
|
545
|
+
EmailCfg(id='029005', author=SOON_YI_PREVIN, attribution_reason='"Sent from Soon-Yi\'s iPhone"'),
|
|
546
|
+
EmailCfg(id='029007', author=SOON_YI_PREVIN, attribution_reason='"Sent from Soon-Yi\'s iPhone"'),
|
|
547
|
+
EmailCfg(id='029010', author=SOON_YI_PREVIN, attribution_reason='"Sent from Soon-Yi\'s iPhone"'),
|
|
548
|
+
EmailCfg(id='032296', author=SOON_YI_PREVIN, attribution_reason='"Sent from Soon-Yi\'s iPhone"'),
|
|
549
|
+
EmailCfg(id='033292', author=SOON_YI_PREVIN, attribution_reason='mentions "Woody\'s movie"', is_attribution_uncertain=True),
|
|
548
550
|
EmailCfg(
|
|
549
551
|
id='019109',
|
|
550
552
|
author=STEVEN_HOFFENBERG,
|
|
@@ -557,7 +559,7 @@ EMAILS_CONFIG = [
|
|
|
557
559
|
attribution_reason='ends with "Respectfully, terry"',
|
|
558
560
|
author=TERRY_KAFKA,
|
|
559
561
|
fwded_text_after='From: Mike Cohen',
|
|
560
|
-
recipients=
|
|
562
|
+
recipients=[JEFFREY_EPSTEIN, MARK_EPSTEIN, MICHAEL_BUCHHOLTZ] + IRAN_DEAL_RECIPIENTS,
|
|
561
563
|
subject='Fw: The Iran Nuclear Deal',
|
|
562
564
|
duplicate_ids=['028482'],
|
|
563
565
|
),
|
|
@@ -620,6 +622,7 @@ EMAILS_CONFIG = [
|
|
|
620
622
|
EmailCfg(id='022250', recipients=[LESLEY_GROFF], attribution_reason='Reply'),
|
|
621
623
|
EmailCfg(id='030242', recipients=[MARIANA_IDZKOWSKA], duplicate_ids=['032048'], dupe_type='redacted'),
|
|
622
624
|
EmailCfg(id='033027', recipients=[MASHA_DROKOVA], attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
|
|
625
|
+
EmailCfg(id='033025', recipients=[MASHA_DROKOVA], attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
|
|
623
626
|
EmailCfg(id='030368', recipients=[MELANIE_SPINELLA], attribution_reason='Actually a self fwd from jeffrey to jeffrey'),
|
|
624
627
|
EmailCfg(id='030369', recipients=[MELANIE_SPINELLA], attribution_reason='Actually a self fwd from jeffrey to jeffrey'),
|
|
625
628
|
EmailCfg(id='030371', recipients=[MELANIE_SPINELLA], attribution_reason='Actually a self fwd from jeffrey to jeffrey'),
|
|
@@ -658,6 +661,7 @@ EMAILS_CONFIG = [
|
|
|
658
661
|
EmailCfg(id='032023', is_fwded_article=True, duplicate_ids=['032012']), # American-Israeli Cooperative Enterprise Newsletter
|
|
659
662
|
EmailCfg(id='021758', is_fwded_article=True, duplicate_ids=['030616']), # Radar Online article about Epstein's early prison release
|
|
660
663
|
EmailCfg(id='033297', is_fwded_article=True, duplicate_ids=['033586']), # Sultan Sulayem fwding article about Trump and Russia
|
|
664
|
+
EmailCfg(id='030983', is_fwded_article=True), # Power Line blog Alex Acosta and Jeffrey Epstein Plea Deal Analysis
|
|
661
665
|
EmailCfg(id='031774', is_fwded_article=True), # Krassner fwd of Palmer Report article
|
|
662
666
|
EmailCfg(id='033345', is_fwded_article=True), # Krassner fwd of Palmer Report article
|
|
663
667
|
EmailCfg(id='029903', is_fwded_article=True), # Krassner fwd of Ann Coulter article about Epstein
|
|
@@ -711,6 +715,7 @@ EMAILS_CONFIG = [
|
|
|
711
715
|
EmailCfg(id='030373', timestamp=parse('2018-10-03 01:49:27')),
|
|
712
716
|
|
|
713
717
|
# Configure duplicates
|
|
718
|
+
EmailCfg(id='026631', duplicate_ids=['026632'], dupe_type='quoted'),
|
|
714
719
|
EmailCfg(id='028768', duplicate_ids=['026563'], dupe_type='redacted'),
|
|
715
720
|
EmailCfg(id='027056', duplicate_ids=['028762'], dupe_type='redacted'),
|
|
716
721
|
EmailCfg(id='032248', duplicate_ids=['032246'], dupe_type='redacted'),
|
|
@@ -870,7 +875,7 @@ TWEET = 'tweet'
|
|
|
870
875
|
# Legal cases
|
|
871
876
|
BRUNEL_V_EPSTEIN = f"{JEAN_LUC_BRUNEL} v. {JEFFREY_EPSTEIN} and Tyler McDonald d/b/a YI.org"
|
|
872
877
|
EDWARDS_V_DERSHOWITZ = f"{BRAD_EDWARDS} & {PAUL_G_CASSELL} v. {ALAN_DERSHOWITZ}"
|
|
873
|
-
EPSTEIN_V_ROTHSTEIN_EDWARDS = f"Epstein v. Scott Rothstein, {BRAD_EDWARDS},
|
|
878
|
+
EPSTEIN_V_ROTHSTEIN_EDWARDS = f"Epstein v. Scott Rothstein, {BRAD_EDWARDS}, & L.M."
|
|
874
879
|
GIUFFRE_V_DERSHOWITZ = f"{VIRGINIA_GIUFFRE} v. {ALAN_DERSHOWITZ}"
|
|
875
880
|
GIUFFRE_V_EPSTEIN = f"{VIRGINIA_GIUFFRE} v. {JEFFREY_EPSTEIN}"
|
|
876
881
|
GIUFFRE_V_MAXWELL = f"{VIRGINIA_GIUFFRE} v. {GHISLAINE_MAXWELL}"
|
|
@@ -1379,8 +1384,6 @@ OTHER_FILES_LETTERS = [
|
|
|
1379
1384
|
description=f"letter about algorithmic trading",
|
|
1380
1385
|
date='2016-06-24', # date is based on Brexit reference but he could be backtesting,
|
|
1381
1386
|
),
|
|
1382
|
-
DocCfg(id='029304', author=DONALD_TRUMP, description=f"recommendation letter for recently departed {TRUMP_ORG} lawyer {MICHAEL_J_BOCCIO}"),
|
|
1383
|
-
DocCfg(id='029301', author=MICHAEL_J_BOCCIO, description=f"letter from former lawyer at the {TRUMP_ORG}", date='2011-08-07'),
|
|
1384
1387
|
DocCfg(id='026134', description=f'letter to someone named George about investment opportunities in the Ukraine banking sector'),
|
|
1385
1388
|
]
|
|
1386
1389
|
|
|
@@ -1531,13 +1534,27 @@ OTHER_FILES_ACADEMIA = [
|
|
|
1531
1534
|
|
|
1532
1535
|
# resumes and application letters
|
|
1533
1536
|
OTHER_FILES_RESUMES = [
|
|
1537
|
+
DocCfg(
|
|
1538
|
+
id='029304',
|
|
1539
|
+
attached_to_email_id='029299',
|
|
1540
|
+
author=DONALD_TRUMP,
|
|
1541
|
+
description=f"recommendation letter for recently departed {TRUMP_ORG} lawyer {MICHAEL_J_BOCCIO}",
|
|
1542
|
+
),
|
|
1534
1543
|
DocCfg(id='022367', author='Jack J Grynberg', description=RESUME_OF, date='2014-07-01'),
|
|
1535
1544
|
DocCfg(
|
|
1536
1545
|
id='029302',
|
|
1546
|
+
attached_to_email_id='029299',
|
|
1537
1547
|
author=MICHAEL_J_BOCCIO,
|
|
1538
1548
|
description=f"{RESUME_OF} (former lawyer at the {TRUMP_ORG})",
|
|
1539
1549
|
date='2011-08-07',
|
|
1540
1550
|
),
|
|
1551
|
+
DocCfg(
|
|
1552
|
+
id='029301',
|
|
1553
|
+
attached_to_email_id='029299',
|
|
1554
|
+
author=MICHAEL_J_BOCCIO,
|
|
1555
|
+
description=f"letter from former lawyer at the {TRUMP_ORG}",
|
|
1556
|
+
date='2011-08-07',
|
|
1557
|
+
),
|
|
1541
1558
|
DocCfg(id='029102', author=NERIO_ALESSANDRI, description=HBS_APPLICATION),
|
|
1542
1559
|
DocCfg(id='029104', author=NERIO_ALESSANDRI, description=HBS_APPLICATION),
|
|
1543
1560
|
DocCfg(id='015671', author='Robin Solomon', description=RESUME_OF, date='2015-06-02'), # She left Mount Sinai at some point in 2015,
|
|
@@ -1679,3 +1696,40 @@ REPLY_LINE_ON_DATE_PATTERN = fr"^On (\d+ )?((Mon|Tues?|Wed(nes)?|Thu(rs)?|Fri|Sa
|
|
|
1679
1696
|
REPLY_LINE_PATTERN = rf"({REPLY_LINE_IN_A_MSG_PATTERN}|{REPLY_LINE_ON_NUMERIC_DATE_PATTERN}|{REPLY_LINE_ON_DATE_PATTERN}|{FORWARDED_LINE_PATTERN})"
|
|
1680
1697
|
REPLY_REGEX = re.compile(REPLY_LINE_PATTERN, re.IGNORECASE | re.MULTILINE)
|
|
1681
1698
|
SENT_FROM_REGEX = re.compile(r'^(?:(Please forgive|Sorry for all the) typos.{1,4})?((Envoyé de mon|Sent (from|via)).*(and string|AT&T|Droid|iPad|Phone|Mail|BlackBerry(.*(smartphone|device|Handheld|AT&T|T- ?Mobile))?)\.?)', re.M | re.I)
|
|
1699
|
+
|
|
1700
|
+
|
|
1701
|
+
# No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
|
|
1702
|
+
UNINTERESTING_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + [
|
|
1703
|
+
'Alan Dlugash', # CCed with Richard Kahn
|
|
1704
|
+
'Alan Rogers', # Random CC
|
|
1705
|
+
'Andrew Friendly', # Presumably some relation of Kelly Friendly
|
|
1706
|
+
'BS Stern', # A random fwd of email we have
|
|
1707
|
+
'Cheryl Kleen', # Single email from Anne Boyles, displayed under Anne Boyles
|
|
1708
|
+
'Connie Zaguirre', # Random CC
|
|
1709
|
+
'Dan Fleuette', # CC from sean bannon
|
|
1710
|
+
'Danny Goldberg', # Random Paul Krassner emails
|
|
1711
|
+
GERALD_LEFCOURT, # Single CC
|
|
1712
|
+
GORDON_GETTY, # Random CC
|
|
1713
|
+
JEFF_FULLER, # Random Jean Luc Brunel CC
|
|
1714
|
+
'Jojo Fontanilla', # Random CC
|
|
1715
|
+
'Joseph Vinciguerra', # Random CC
|
|
1716
|
+
'Larry Cohen', # Random Bill Gates CC
|
|
1717
|
+
'Lyn Fontanilla', # Random CC
|
|
1718
|
+
'Mark Albert', # Random CC
|
|
1719
|
+
'Matthew Schafer', # Random CC
|
|
1720
|
+
MICHAEL_BUCHHOLTZ, # Terry Kafka CC
|
|
1721
|
+
'Nancy Dahl', # covered by Lawrence Krauss (her husband)
|
|
1722
|
+
'Michael Simmons', # Random CC
|
|
1723
|
+
'Nancy Portland', # Lawrence Krauss CC
|
|
1724
|
+
'Oliver Goodenough', # Robert Trivers CC
|
|
1725
|
+
'Peter Aldhous', # Lawrence Krauss CC
|
|
1726
|
+
'Players2', # Hoffenberg CC
|
|
1727
|
+
'Police Code Enforcement', # Kirk Blouin / John Page CC
|
|
1728
|
+
'Sam Harris', # Lawrence Krauss CC
|
|
1729
|
+
SAMUEL_LEFF, # Random CC
|
|
1730
|
+
'Sean T Lehane', # Random CC
|
|
1731
|
+
'Stephen Rubin', # Random CC
|
|
1732
|
+
'Tim Kane', # Random CC
|
|
1733
|
+
'Travis Pangburn', # Random CC
|
|
1734
|
+
'Vahe Stepanian', # Random CC
|
|
1735
|
+
]
|
epstein_files/util/data.py
CHANGED
|
@@ -29,7 +29,7 @@ escape_single_quotes = lambda text: text.replace("'", r"\'")
|
|
|
29
29
|
def iso_timestamp(dt):
    """Return ``dt.isoformat()`` with the 'T' date/time separator replaced by a space."""
    iso = dt.isoformat()
    return iso.replace('T', ' ')
|
|
30
30
|
def days_between(dt1, dt2):
    """Return the ``.days`` of ``dt2 - dt1`` plus one, so identical inputs give 1."""
    delta = dt2 - dt1
    return delta.days + 1
|
|
31
31
|
def days_between_str(dt1, dt2):
    """Format the days_between() count as '<n> day', appending 's' when the count exceeds 1."""
    num_days = days_between(dt1, dt2)
    plural_suffix = 's' if num_days > 1 else ''
    return f"{num_days} day" + plural_suffix
|
|
32
|
-
|
|
32
|
+
def remove_zero_time(dt):
    """Return ``dt.isoformat()`` with a trailing 'T00:00:00' (midnight time part) stripped, if present."""
    iso = dt.isoformat()
    return iso.removesuffix('T00:00:00')
|
|
33
33
|
def uniquify(_list):
    """Return a new list holding each distinct element of ``_list`` once (set-based, so order is not preserved)."""
    distinct = set(_list)
    return list(distinct)
|
|
34
34
|
def without_falsey(_list):
    """Return a new list containing only the truthy elements of ``_list``, in their original order."""
    return list(filter(None, _list))
|
|
35
35
|
|
|
@@ -38,25 +38,6 @@ def dict_sets_to_lists(d: dict[str, set]) -> dict[str, list]:
|
|
|
38
38
|
return {k: sorted(list(v)) for k, v in d.items()}
|
|
39
39
|
|
|
40
40
|
|
|
41
|
-
def extract_last_name(name: str) -> str:
|
|
42
|
-
if ' ' not in name:
|
|
43
|
-
return name
|
|
44
|
-
|
|
45
|
-
names = name.removesuffix(QUESTION_MARKS).strip().split()
|
|
46
|
-
|
|
47
|
-
if names[-1].startswith('Jr') and len(names[-1]) <= 3:
|
|
48
|
-
return ' '.join(names[-2:])
|
|
49
|
-
else:
|
|
50
|
-
return names[-1]
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def extract_first_name(name: str) -> str:
|
|
54
|
-
if ' ' not in name:
|
|
55
|
-
return name
|
|
56
|
-
|
|
57
|
-
return name.removesuffix(f" {extract_last_name(name)}")
|
|
58
|
-
|
|
59
|
-
|
|
60
41
|
def flatten(_list: list[list[T]]) -> list[T]:
|
|
61
42
|
return list(itertools.chain.from_iterable(_list))
|
|
62
43
|
|
epstein_files/util/doc_cfg.py
CHANGED
|
@@ -8,7 +8,7 @@ from dateutil.parser import parse
|
|
|
8
8
|
|
|
9
9
|
from epstein_files.util.constant.names import *
|
|
10
10
|
from epstein_files.util.constant.strings import *
|
|
11
|
-
from epstein_files.util.data import
|
|
11
|
+
from epstein_files.util.data import remove_zero_time, without_falsey
|
|
12
12
|
|
|
13
13
|
DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
|
|
14
14
|
Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]
|
|
@@ -62,7 +62,7 @@ class DocCfg:
|
|
|
62
62
|
|
|
63
63
|
Attributes:
|
|
64
64
|
id (str): ID of file
|
|
65
|
-
author (
|
|
65
|
+
author (Name): Author of the document (if any)
|
|
66
66
|
category (str | None): Type of file
|
|
67
67
|
date (str | None): If passed will be immediately parsed into the 'timestamp' field
|
|
68
68
|
dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
|
|
@@ -74,13 +74,14 @@ class DocCfg:
|
|
|
74
74
|
"""
|
|
75
75
|
id: str
|
|
76
76
|
attached_to_email_id: str | None = None
|
|
77
|
-
author:
|
|
77
|
+
author: Name = None
|
|
78
78
|
category: str | None = None
|
|
79
79
|
date: str | None = None
|
|
80
80
|
description: str | None = None
|
|
81
81
|
dupe_type: DuplicateType | None = None
|
|
82
82
|
duplicate_ids: list[str] = field(default_factory=list)
|
|
83
83
|
duplicate_of_id: str | None = None
|
|
84
|
+
is_attribution_uncertain: bool = False
|
|
84
85
|
is_interesting: bool | None = None
|
|
85
86
|
is_synthetic: bool = False
|
|
86
87
|
timestamp: datetime | None = None
|
|
@@ -94,30 +95,40 @@ class DocCfg:
|
|
|
94
95
|
|
|
95
96
|
def complete_description(self) -> str | None:
|
|
96
97
|
"""String that summarizes what is known about this document."""
|
|
98
|
+
description = ''
|
|
99
|
+
|
|
97
100
|
if self.category and not self.description and not self.author:
|
|
98
101
|
if self.category == JUNK:
|
|
99
102
|
return None
|
|
100
103
|
else:
|
|
101
|
-
|
|
104
|
+
description = self.category
|
|
102
105
|
elif self.category == REPUTATION:
|
|
103
106
|
author_str = f"{self.author} " if self.author else ''
|
|
104
|
-
|
|
107
|
+
description = f"{REPUTATION_MGMT}: {author_str}{self.description}"
|
|
105
108
|
elif self.category == SKYPE_LOG:
|
|
106
109
|
msg = f"{self.category} of conversation with {self.author}" if self.author else self.category
|
|
107
|
-
|
|
110
|
+
description = f"{msg} {self.description}" if self.description else msg
|
|
108
111
|
elif self.author and self.description:
|
|
109
112
|
if self.category in [ACADEMIA, BOOK]:
|
|
110
113
|
title = self.description if '"' in self.description else f'"{self.description}"'
|
|
111
|
-
|
|
114
|
+
description = f"{title} by {self.author}"
|
|
112
115
|
elif self.category == FINANCE and self.author in FINANCIAL_REPORTS_AUTHORS:
|
|
113
|
-
|
|
116
|
+
description = f'{self.author} report: "{self.description}"'
|
|
114
117
|
elif self.category == LEGAL and 'v.' in self.author:
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
+
description = f"{self.author}: {self.description}"
|
|
119
|
+
|
|
120
|
+
if not description:
|
|
121
|
+
pieces = without_falsey([self.author, self.description])
|
|
122
|
+
|
|
123
|
+
if pieces:
|
|
124
|
+
description = ' '.join(pieces)
|
|
125
|
+
else:
|
|
126
|
+
return None
|
|
118
127
|
|
|
119
|
-
|
|
120
|
-
|
|
128
|
+
if self.attached_to_email_id:
|
|
129
|
+
description += f" attached to email {self.attached_to_email_id}"
|
|
130
|
+
|
|
131
|
+
return description
|
|
121
132
|
|
|
122
133
|
def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
|
|
123
134
|
"""Create synthetic DocCfg objects that set the 'duplicate_of_id' field to point back to this object."""
|
|
@@ -152,7 +163,7 @@ class DocCfg:
|
|
|
152
163
|
elif _field.name == 'timestamp' and self.date is not None:
|
|
153
164
|
continue # Don't print both timestamp and date
|
|
154
165
|
elif isinstance(value, datetime):
|
|
155
|
-
value_str =
|
|
166
|
+
value_str = remove_zero_time(value)
|
|
156
167
|
add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
|
|
157
168
|
elif isinstance(value, str):
|
|
158
169
|
if "'" in value:
|
|
@@ -196,7 +207,6 @@ class CommunicationCfg(DocCfg):
|
|
|
196
207
|
is_attribution_uncertain (bool): True if we have a good idea of who the author is but are not 100% certain
|
|
197
208
|
"""
|
|
198
209
|
attribution_reason: str | None = None
|
|
199
|
-
is_attribution_uncertain: bool = False
|
|
200
210
|
|
|
201
211
|
def __repr__(self) -> str:
|
|
202
212
|
return super().__repr__()
|
|
@@ -209,13 +219,13 @@ class EmailCfg(CommunicationCfg):
|
|
|
209
219
|
actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
|
|
210
220
|
fwded_text_after (str | None): If set, any text after this is a fwd of an article or similar
|
|
211
221
|
is_fwded_article (bool): True if this is a newspaper article someone fwded. Used to exclude articles from word counting.
|
|
212
|
-
recipients (list[
|
|
222
|
+
recipients (list[Name]): Who received the email
|
|
213
223
|
subject (str): Subject line
|
|
214
224
|
"""
|
|
215
225
|
actual_text: str | None = None
|
|
216
226
|
fwded_text_after: str | None = None
|
|
217
227
|
is_fwded_article: bool = False
|
|
218
|
-
recipients: list[
|
|
228
|
+
recipients: list[Name] = field(default_factory=list)
|
|
219
229
|
subject: str | None = None
|
|
220
230
|
|
|
221
231
|
# This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
|
epstein_files/util/env.py
CHANGED
|
@@ -8,7 +8,7 @@ from rich_argparse_plus import RichHelpFormatterPlus
|
|
|
8
8
|
from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, CHRONOLOGICAL_EMAILS_PATH, TEXT_MSGS_HTML_PATH
|
|
9
9
|
from epstein_files.util.logging import env_log_level, exit_with_error, logger
|
|
10
10
|
|
|
11
|
-
DEFAULT_WIDTH =
|
|
11
|
+
DEFAULT_WIDTH = 155
|
|
12
12
|
DEFAULT_FILE = 'default_file'
|
|
13
13
|
EPSTEIN_GENERATE = 'epstein_generate'
|
|
14
14
|
HTML_SCRIPTS = [EPSTEIN_GENERATE, 'epstein_word_count']
|
|
@@ -38,6 +38,7 @@ output.add_argument('--all-emails', '-ae', action='store_true', help='all the em
|
|
|
38
38
|
output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
|
|
39
39
|
parser.add_argument('--build', '-b', nargs="?", default=None, const=DEFAULT_FILE, help='write output to HTML file')
|
|
40
40
|
output.add_argument('--email-timeline', action='store_true', help='print a table of all emails in chronological order')
|
|
41
|
+
# Flag for writing the emailers info table out as a .png image
output.add_argument('--emailers-info', action='store_true', help='write a .png of the emailers info table')
|
|
41
42
|
output.add_argument('--json-files', action='store_true', help='pretty print all the raw JSON data files in the collection and exit')
|
|
42
43
|
output.add_argument('--json-metadata', action='store_true', help='dump JSON metadata for all files and exit')
|
|
43
44
|
output.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
|
|
@@ -66,8 +67,9 @@ debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debu
|
|
|
66
67
|
args = parser.parse_args()
|
|
67
68
|
is_html_script = parser.prog in HTML_SCRIPTS
|
|
68
69
|
|
|
70
|
+
# NOTE(review): self-assignment is a no-op as written — confirm whether a transformation of args.build was intended
args.build = args.build
|
|
69
71
|
args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
|
|
70
|
-
args.names = [None if n == 'None' else n for n in (args.names or [])]
|
|
72
|
+
args.names = [None if n == 'None' else n.strip() for n in (args.names or [])]
|
|
71
73
|
args.output_emails = args.output_emails or args.all_emails
|
|
72
74
|
args.output_other = args.output_other or args.all_other_files or args.uninteresting
|
|
73
75
|
args.overwrite_pickle = args.overwrite_pickle or (is_env_var_set('OVERWRITE_PICKLE') and not is_env_var_set('PICKLED'))
|
|
@@ -81,7 +83,7 @@ if is_html_script:
|
|
|
81
83
|
if any([is_output_arg(arg) and val for arg, val in vars(args).items()]):
|
|
82
84
|
if args.email_timeline:
|
|
83
85
|
# --email-timeline cannot be combined with any other output option
exit_with_error("--email-timeline option is mutually exclusive with other output options")
|
|
84
|
-
elif not args.email_timeline:
|
|
86
|
+
elif not args.email_timeline and not args.emailers_info:
|
|
85
87
|
logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
|
|
86
88
|
args.output_texts = args.output_emails = args.output_other = True
|
|
87
89
|
|