epstein-files 1.1.5__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +5 -1
- epstein_files/documents/document.py +7 -3
- epstein_files/documents/email.py +43 -65
- epstein_files/documents/emails/email_header.py +4 -2
- epstein_files/documents/imessage/text_message.py +3 -3
- epstein_files/documents/messenger_log.py +7 -7
- epstein_files/epstein_files.py +117 -115
- epstein_files/person.py +350 -0
- epstein_files/util/constant/names.py +35 -11
- epstein_files/util/constant/output_files.py +1 -0
- epstein_files/util/constant/strings.py +3 -2
- epstein_files/util/constant/urls.py +14 -2
- epstein_files/util/constants.py +72 -20
- epstein_files/util/data.py +0 -19
- epstein_files/util/doc_cfg.py +24 -14
- epstein_files/util/env.py +3 -1
- epstein_files/util/highlighted_group.py +154 -127
- epstein_files/util/output.py +84 -152
- epstein_files/util/rich.py +6 -21
- epstein_files/util/word_count.py +1 -1
- {epstein_files-1.1.5.dist-info → epstein_files-1.2.0.dist-info}/METADATA +2 -1
- epstein_files-1.2.0.dist-info/RECORD +34 -0
- epstein_files-1.1.5.dist-info/RECORD +0 -33
- {epstein_files-1.1.5.dist-info → epstein_files-1.2.0.dist-info}/LICENSE +0 -0
- {epstein_files-1.1.5.dist-info → epstein_files-1.2.0.dist-info}/WHEEL +0 -0
- {epstein_files-1.1.5.dist-info → epstein_files-1.2.0.dist-info}/entry_points.txt +0 -0
epstein_files/util/constants.py
CHANGED
|
@@ -19,6 +19,7 @@ HEADER_ABBREVIATIONS = {
|
|
|
19
19
|
'bgC3': 'Bill Gates Ventures (renamed in 2018)',
|
|
20
20
|
"Brock": 'Brock Pierce (crypto bro with a very sordid past)',
|
|
21
21
|
"DB": "Deutsche Bank (maybe??)",
|
|
22
|
+
"GRAT": "Grantor Retained Annuity Trust (tax shelter)",
|
|
22
23
|
'HBJ': "Sheikh Hamad bin Jassim (former Qatari prime minister)",
|
|
23
24
|
'Jabor': '"an influential man in Qatar"',
|
|
24
25
|
'Jared': "Jared Kushner",
|
|
@@ -62,6 +63,7 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
|
|
|
62
63
|
BARBRO_C_EHNBOM: re.compile(r'behnbom@aol.com|(Barbro\s.*)?Ehnbom', re.IGNORECASE),
|
|
63
64
|
BARRY_J_COHEN: re.compile(r'barry\s*((j.?|james)\s*)?cohen?', re.IGNORECASE),
|
|
64
65
|
BENNET_MOSKOWITZ: re.compile(r'Moskowitz.*Bennet|Bennet.*Moskowitz', re.IGNORECASE),
|
|
66
|
+
BOB_CROWE: re.compile(r"[BR]ob Crowe", re.IGNORECASE),
|
|
65
67
|
BORIS_NIKOLIC: re.compile(r'(boris )?nikolic?', re.IGNORECASE),
|
|
66
68
|
BRAD_EDWARDS: re.compile(r'Brad(ley)?(\s*J(.?|ames))?\s*Edwards', re.IGNORECASE),
|
|
67
69
|
BRAD_KARP: re.compile(r'Brad (S.? )?Karp|Karp, Brad', re.IGNORECASE),
|
|
@@ -83,6 +85,7 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
|
|
|
83
85
|
JACKIE_PERCZEK: re.compile(r'jackie percze[kl]?', re.IGNORECASE),
|
|
84
86
|
JABOR_Y: re.compile(r'[ji]abor\s*y?', re.IGNORECASE),
|
|
85
87
|
JAMES_HILL: re.compile(r"hill, james e.|james.e.hill@abc.com", re.IGNORECASE),
|
|
88
|
+
JANUSZ_BANASIAK: re.compile(r"Janu[is]z Banasiak", re.IGNORECASE),
|
|
86
89
|
JEAN_LUC_BRUNEL: re.compile(r'Jean[- ]Luc Brunel?', re.IGNORECASE),
|
|
87
90
|
JEFF_FULLER: re.compile(r"jeff@mc2mm.com|Jeff Fuller", re.IGNORECASE),
|
|
88
91
|
JEFFREY_EPSTEIN: re.compile(r'[djl]\s?ee[vy]acation[©@]?g?(mail.com)?|Epstine|\bJEE?\b|Jeffrey E((sp|ps)tein?)?( VI Foundation)?|jeeproject@yahoo.com|J Jep|Jeffery Edwards|(?<!(Mark L.|ard Jay) )Epstein', re.IGNORECASE),
|
|
@@ -157,6 +160,7 @@ EMAILERS = [
|
|
|
157
160
|
BILL_GATES,
|
|
158
161
|
BILL_SIEGEL,
|
|
159
162
|
BRAD_WECHSLER,
|
|
163
|
+
CHRISTINA_GALBRAITH,
|
|
160
164
|
DANIEL_SABBA,
|
|
161
165
|
'Danny Goldberg',
|
|
162
166
|
DAVID_SCHOEN,
|
|
@@ -302,11 +306,10 @@ TEXTS_CONFIG = CONFIRMED_TEXTS_CONFIG + UNCONFIRMED_TEXTS_CONFIG
|
|
|
302
306
|
################################################ EMAILS ################################################
|
|
303
307
|
########################################################################################################
|
|
304
308
|
|
|
305
|
-
MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT = f"draft of an unpublished article about Epstein by {MICHAEL_WOLFF} written ca. 2014/2015"
|
|
306
|
-
|
|
307
309
|
# Some emails have a lot of uninteresting CCs
|
|
308
|
-
|
|
309
|
-
|
|
310
|
+
FLIGHT_IN_2012_PEOPLE: list[Name] = ['Francis Derby', JANUSZ_BANASIAK, 'Louella Rabuyo', 'Richard Barnnet']
|
|
311
|
+
IRAN_DEAL_RECIPIENTS: list[Name] = ['Allen West', 'Rafael Bardaji', 'Philip Kafka', 'Herb Goodman', 'Grant Seeger', 'Lisa Albert', 'Janet Kafka', 'James Ramsey', 'ACT for America', 'John Zouzelka', 'Joel Dunn', 'Nate McClain', 'Bennet Greenwald', 'Taal Safdie', 'Uri Fouzailov', 'Neil Anderson', 'Nate White', 'Rita Hortenstine', 'Henry Hortenstine', 'Gary Gross', 'Forrest Miller', 'Bennett Schmidt', 'Val Sherman', 'Marcie Brown', 'Michael Horowitz', 'Marshall Funk']
|
|
312
|
+
MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT = f"draft of an unpublished article about Epstein by {MICHAEL_WOLFF} written ca. 2014/2015"
|
|
310
313
|
|
|
311
314
|
EMAILS_CONFIG = [
|
|
312
315
|
# 026294 and 026296 might also be Ittihadieh based on timing
|
|
@@ -409,11 +412,11 @@ EMAILS_CONFIG = [
|
|
|
409
412
|
dupe_type='redacted'
|
|
410
413
|
),
|
|
411
414
|
EmailCfg(id='026547', author=GERALD_BARTON, recipients=[JEFFREY_EPSTEIN]), # Bad OCR # TODO: email header is really jacked up
|
|
412
|
-
EmailCfg(id='029969', author=GWENDOLYN_BECK, attribution_reason='
|
|
413
|
-
EmailCfg(id='029968', author=GWENDOLYN_BECK, attribution_reason='
|
|
415
|
+
EmailCfg(id='029969', author=GWENDOLYN_BECK, attribution_reason='signature "Longevity & Successful Aging"'),
|
|
416
|
+
EmailCfg(id='029968', author=GWENDOLYN_BECK, attribution_reason='signature "beckresearchlabs.com"', duplicate_ids=['031120']),
|
|
414
417
|
EmailCfg(id='029970', author=GWENDOLYN_BECK, attribution_reason='signed "Longevity & Successful Agin"'),
|
|
415
|
-
EmailCfg(id='029960', author=GWENDOLYN_BECK, attribution_reason='
|
|
416
|
-
EmailCfg(id='029959', author=GWENDOLYN_BECK, attribution_reason='"Longevity & Aging"'),
|
|
418
|
+
EmailCfg(id='029960', author=GWENDOLYN_BECK, attribution_reason='signature "Beck Center for Longevity & Aging"'),
|
|
419
|
+
EmailCfg(id='029959', author=GWENDOLYN_BECK, attribution_reason='signature "Beck Center for Longevity & Aging"'),
|
|
417
420
|
EmailCfg(id='033360', author=HENRY_HOLT, attribution_reason='in signature'), # Henry Holt is a company not a person
|
|
418
421
|
EmailCfg(id='033384', author=JACK_GOLDBERGER, attribution_reason='Might be Paul Prosperi?', is_attribution_uncertain=True),
|
|
419
422
|
EmailCfg(id='026024', author=JEAN_HUGUEN, attribution_reason='Signature'),
|
|
@@ -474,7 +477,7 @@ EMAILS_CONFIG = [
|
|
|
474
477
|
EmailCfg(
|
|
475
478
|
id='029977',
|
|
476
479
|
author=LAWRANCE_VISOSKI,
|
|
477
|
-
recipients=
|
|
480
|
+
recipients=[JEFFREY_EPSTEIN, DARREN_INDYKE, LESLEY_GROFF, RICHARD_KAHN] + FLIGHT_IN_2012_PEOPLE,
|
|
478
481
|
attribution_reason=LARRY_REASON,
|
|
479
482
|
duplicate_ids=['031129'],
|
|
480
483
|
),
|
|
@@ -491,14 +494,12 @@ EMAILS_CONFIG = [
|
|
|
491
494
|
EmailCfg(id='032606', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
|
|
492
495
|
EmailCfg(id='032607', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
|
|
493
496
|
EmailCfg(id='032609', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
|
|
494
|
-
# 032581, 032604, 033025 may also be Masha based on timing, subject (interviews/articles), and sequential ID
|
|
495
497
|
EmailCfg(id='032604', author=MASHA_DROKOVA, attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
|
|
496
498
|
EmailCfg(id='032581', author=MASHA_DROKOVA, attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
|
|
497
|
-
EmailCfg(id='033025', author=MASHA_DROKOVA, attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
|
|
498
499
|
EmailCfg(id='030235', author=MELANIE_WALKER, attribution_reason='In fwd'),
|
|
499
500
|
EmailCfg(id='032343', author=MELANIE_WALKER, attribution_reason='Name seen in later reply 032346'),
|
|
500
501
|
EmailCfg(id='032212', author=MIROSLAV_LAJCAK, attribution_reason='signature'),
|
|
501
|
-
EmailCfg(id='021814', author=NADIA_MARCINKO, attribution_reason='reply'),
|
|
502
|
+
EmailCfg(id='021814', author=NADIA_MARCINKO, attribution_reason='reply'), #, actual_text="I'm a pilot...I prefer sex slave to copilot ;)"),
|
|
502
503
|
EmailCfg(id='021808', author=NADIA_MARCINKO, attribution_reason='reply'),
|
|
503
504
|
EmailCfg(id='022190', author=NADIA_MARCINKO, attribution_reason='reply'),
|
|
504
505
|
EmailCfg(id='021818', author=NADIA_MARCINKO, attribution_reason='reply'),
|
|
@@ -540,11 +541,12 @@ EMAILS_CONFIG = [
|
|
|
540
541
|
author=SEAN_BANNON,
|
|
541
542
|
attribution_reason="From protonmail, Bannon wrote 'just sent from my protonmail' in 027067",
|
|
542
543
|
),
|
|
543
|
-
EmailCfg(id='029003', author=SOON_YI_PREVIN, attribution_reason="
|
|
544
|
-
EmailCfg(id='029005', author=SOON_YI_PREVIN, attribution_reason="
|
|
545
|
-
EmailCfg(id='029007', author=SOON_YI_PREVIN, attribution_reason="
|
|
546
|
-
EmailCfg(id='029010', author=SOON_YI_PREVIN, attribution_reason="
|
|
547
|
-
EmailCfg(id='032296', author=SOON_YI_PREVIN, attribution_reason="
|
|
544
|
+
EmailCfg(id='029003', author=SOON_YI_PREVIN, attribution_reason='"Sent from Soon-Yi\'s iPhone"'),
|
|
545
|
+
EmailCfg(id='029005', author=SOON_YI_PREVIN, attribution_reason='"Sent from Soon-Yi\'s iPhone"'),
|
|
546
|
+
EmailCfg(id='029007', author=SOON_YI_PREVIN, attribution_reason='"Sent from Soon-Yi\'s iPhone"'),
|
|
547
|
+
EmailCfg(id='029010', author=SOON_YI_PREVIN, attribution_reason='"Sent from Soon-Yi\'s iPhone"'),
|
|
548
|
+
EmailCfg(id='032296', author=SOON_YI_PREVIN, attribution_reason='"Sent from Soon-Yi\'s iPhone"'),
|
|
549
|
+
EmailCfg(id='033292', author=SOON_YI_PREVIN, attribution_reason='mentions "Woody\'s movie"', is_attribution_uncertain=True),
|
|
548
550
|
EmailCfg(
|
|
549
551
|
id='019109',
|
|
550
552
|
author=STEVEN_HOFFENBERG,
|
|
@@ -557,7 +559,7 @@ EMAILS_CONFIG = [
|
|
|
557
559
|
attribution_reason='ends with "Respectfully, terry"',
|
|
558
560
|
author=TERRY_KAFKA,
|
|
559
561
|
fwded_text_after='From: Mike Cohen',
|
|
560
|
-
recipients=
|
|
562
|
+
recipients=[JEFFREY_EPSTEIN, MARK_EPSTEIN, MICHAEL_BUCHHOLTZ] + IRAN_DEAL_RECIPIENTS,
|
|
561
563
|
subject='Fw: The Iran Nuclear Deal',
|
|
562
564
|
duplicate_ids=['028482'],
|
|
563
565
|
),
|
|
@@ -620,6 +622,7 @@ EMAILS_CONFIG = [
|
|
|
620
622
|
EmailCfg(id='022250', recipients=[LESLEY_GROFF], attribution_reason='Reply'),
|
|
621
623
|
EmailCfg(id='030242', recipients=[MARIANA_IDZKOWSKA], duplicate_ids=['032048'], dupe_type='redacted'),
|
|
622
624
|
EmailCfg(id='033027', recipients=[MASHA_DROKOVA], attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
|
|
625
|
+
EmailCfg(id='033025', recipients=[MASHA_DROKOVA], attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
|
|
623
626
|
EmailCfg(id='030368', recipients=[MELANIE_SPINELLA], attribution_reason='Actually a self fwd from jeffrey to jeffrey'),
|
|
624
627
|
EmailCfg(id='030369', recipients=[MELANIE_SPINELLA], attribution_reason='Actually a self fwd from jeffrey to jeffrey'),
|
|
625
628
|
EmailCfg(id='030371', recipients=[MELANIE_SPINELLA], attribution_reason='Actually a self fwd from jeffrey to jeffrey'),
|
|
@@ -711,6 +714,7 @@ EMAILS_CONFIG = [
|
|
|
711
714
|
EmailCfg(id='030373', timestamp=parse('2018-10-03 01:49:27')),
|
|
712
715
|
|
|
713
716
|
# Configure duplicates
|
|
717
|
+
EmailCfg(id='026631', duplicate_ids=['026632'], dupe_type='quoted'),
|
|
714
718
|
EmailCfg(id='028768', duplicate_ids=['026563'], dupe_type='redacted'),
|
|
715
719
|
EmailCfg(id='027056', duplicate_ids=['028762'], dupe_type='redacted'),
|
|
716
720
|
EmailCfg(id='032248', duplicate_ids=['032246'], dupe_type='redacted'),
|
|
@@ -1379,8 +1383,6 @@ OTHER_FILES_LETTERS = [
|
|
|
1379
1383
|
description=f"letter about algorithmic trading",
|
|
1380
1384
|
date='2016-06-24', # date is based on Brexit reference but he could be backtesting,
|
|
1381
1385
|
),
|
|
1382
|
-
DocCfg(id='029304', author=DONALD_TRUMP, description=f"recommendation letter for recently departed {TRUMP_ORG} lawyer {MICHAEL_J_BOCCIO}"),
|
|
1383
|
-
DocCfg(id='029301', author=MICHAEL_J_BOCCIO, description=f"letter from former lawyer at the {TRUMP_ORG}", date='2011-08-07'),
|
|
1384
1386
|
DocCfg(id='026134', description=f'letter to someone named George about investment opportunities in the Ukraine banking sector'),
|
|
1385
1387
|
]
|
|
1386
1388
|
|
|
@@ -1531,13 +1533,27 @@ OTHER_FILES_ACADEMIA = [
|
|
|
1531
1533
|
|
|
1532
1534
|
# resumes and application letters
|
|
1533
1535
|
OTHER_FILES_RESUMES = [
|
|
1536
|
+
DocCfg(
|
|
1537
|
+
id='029304',
|
|
1538
|
+
attached_to_email_id='029299',
|
|
1539
|
+
author=DONALD_TRUMP,
|
|
1540
|
+
description=f"recommendation letter for recently departed {TRUMP_ORG} lawyer {MICHAEL_J_BOCCIO}",
|
|
1541
|
+
),
|
|
1534
1542
|
DocCfg(id='022367', author='Jack J Grynberg', description=RESUME_OF, date='2014-07-01'),
|
|
1535
1543
|
DocCfg(
|
|
1536
1544
|
id='029302',
|
|
1545
|
+
attached_to_email_id='029299',
|
|
1537
1546
|
author=MICHAEL_J_BOCCIO,
|
|
1538
1547
|
description=f"{RESUME_OF} (former lawyer at the {TRUMP_ORG})",
|
|
1539
1548
|
date='2011-08-07',
|
|
1540
1549
|
),
|
|
1550
|
+
DocCfg(
|
|
1551
|
+
id='029301',
|
|
1552
|
+
attached_to_email_id='029299',
|
|
1553
|
+
author=MICHAEL_J_BOCCIO,
|
|
1554
|
+
description=f"letter from former lawyer at the {TRUMP_ORG}",
|
|
1555
|
+
date='2011-08-07',
|
|
1556
|
+
),
|
|
1541
1557
|
DocCfg(id='029102', author=NERIO_ALESSANDRI, description=HBS_APPLICATION),
|
|
1542
1558
|
DocCfg(id='029104', author=NERIO_ALESSANDRI, description=HBS_APPLICATION),
|
|
1543
1559
|
DocCfg(id='015671', author='Robin Solomon', description=RESUME_OF, date='2015-06-02'), # She left Mount Sinai at some point in 2015,
|
|
@@ -1679,3 +1695,39 @@ REPLY_LINE_ON_DATE_PATTERN = fr"^On (\d+ )?((Mon|Tues?|Wed(nes)?|Thu(rs)?|Fri|Sa
|
|
|
1679
1695
|
REPLY_LINE_PATTERN = rf"({REPLY_LINE_IN_A_MSG_PATTERN}|{REPLY_LINE_ON_NUMERIC_DATE_PATTERN}|{REPLY_LINE_ON_DATE_PATTERN}|{FORWARDED_LINE_PATTERN})"
|
|
1680
1696
|
REPLY_REGEX = re.compile(REPLY_LINE_PATTERN, re.IGNORECASE | re.MULTILINE)
|
|
1681
1697
|
SENT_FROM_REGEX = re.compile(r'^(?:(Please forgive|Sorry for all the) typos.{1,4})?((Envoyé de mon|Sent (from|via)).*(and string|AT&T|Droid|iPad|Phone|Mail|BlackBerry(.*(smartphone|device|Handheld|AT&T|T- ?Mobile))?)\.?)', re.M | re.I)
|
|
1698
|
+
|
|
1699
|
+
|
|
1700
|
+
# No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
|
|
1701
|
+
UNINTERESTING_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + [
|
|
1702
|
+
'Alan Dlugash', # CCed with Richard Kahn
|
|
1703
|
+
'Alan Rogers', # Random CC
|
|
1704
|
+
'Andrew Friendly', # Presumably some relation of Kelly Friendly
|
|
1705
|
+
'BS Stern', # A random fwd of email we have
|
|
1706
|
+
'Cheryl Kleen', # Single email from Anne Boyles, displayed under Anne Boyles
|
|
1707
|
+
'Connie Zaguirre', # Random CC
|
|
1708
|
+
'Dan Fleuette', # CC from sean bannon
|
|
1709
|
+
'Danny Goldberg', # Random Paul Krassner emails
|
|
1710
|
+
GERALD_LEFCOURT, # Single CC
|
|
1711
|
+
GORDON_GETTY, # Random CC
|
|
1712
|
+
JEFF_FULLER, # Random Jean Luc Brunel CC
|
|
1713
|
+
'Jojo Fontanilla', # Random CC
|
|
1714
|
+
'Joseph Vinciguerra', # Random CC
|
|
1715
|
+
'Larry Cohen', # Random Bill Gates CC
|
|
1716
|
+
'Lyn Fontanilla', # Random CC
|
|
1717
|
+
'Mark Albert', # Random CC
|
|
1718
|
+
'Matthew Schafer', # Random CC
|
|
1719
|
+
MICHAEL_BUCHHOLTZ, # Terry Kafka CC
|
|
1720
|
+
'Nancy Dahl', # covered by Lawrence Krauss (her husband)
|
|
1721
|
+
'Michael Simmons', # Random CC
|
|
1722
|
+
'Nancy Portland', # Lawrence Krauss CC
|
|
1723
|
+
'Oliver Goodenough', # Robert Trivers CC
|
|
1724
|
+
'Peter Aldhous', # Lawrence Krauss CC
|
|
1725
|
+
'Players2', # Hoffenberg CC
|
|
1726
|
+
'Sam Harris', # Lawrence Krauss CC
|
|
1727
|
+
SAMUEL_LEFF, # Random CC
|
|
1728
|
+
'Sean T Lehane', # Random CC
|
|
1729
|
+
'Stephen Rubin', # Random CC
|
|
1730
|
+
'Tim Kane', # Random CC
|
|
1731
|
+
'Travis Pangburn', # Random CC
|
|
1732
|
+
'Vahe Stepanian', # Random CC
|
|
1733
|
+
]
|
epstein_files/util/data.py
CHANGED
|
@@ -38,25 +38,6 @@ def dict_sets_to_lists(d: dict[str, set]) -> dict[str, list]:
|
|
|
38
38
|
return {k: sorted(list(v)) for k, v in d.items()}
|
|
39
39
|
|
|
40
40
|
|
|
41
|
-
def extract_last_name(name: str) -> str:
|
|
42
|
-
if ' ' not in name:
|
|
43
|
-
return name
|
|
44
|
-
|
|
45
|
-
names = name.removesuffix(QUESTION_MARKS).strip().split()
|
|
46
|
-
|
|
47
|
-
if names[-1].startswith('Jr') and len(names[-1]) <= 3:
|
|
48
|
-
return ' '.join(names[-2:])
|
|
49
|
-
else:
|
|
50
|
-
return names[-1]
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def extract_first_name(name: str) -> str:
|
|
54
|
-
if ' ' not in name:
|
|
55
|
-
return name
|
|
56
|
-
|
|
57
|
-
return name.removesuffix(f" {extract_last_name(name)}")
|
|
58
|
-
|
|
59
|
-
|
|
60
41
|
def flatten(_list: list[list[T]]) -> list[T]:
|
|
61
42
|
return list(itertools.chain.from_iterable(_list))
|
|
62
43
|
|
epstein_files/util/doc_cfg.py
CHANGED
|
@@ -62,7 +62,7 @@ class DocCfg:
|
|
|
62
62
|
|
|
63
63
|
Attributes:
|
|
64
64
|
id (str): ID of file
|
|
65
|
-
author (
|
|
65
|
+
author (Name): Author of the document (if any)
|
|
66
66
|
category (str | None): Type of file
|
|
67
67
|
date (str | None): If passed will be immediately parsed into the 'timestamp' field
|
|
68
68
|
dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
|
|
@@ -74,7 +74,7 @@ class DocCfg:
|
|
|
74
74
|
"""
|
|
75
75
|
id: str
|
|
76
76
|
attached_to_email_id: str | None = None
|
|
77
|
-
author:
|
|
77
|
+
author: Name = None
|
|
78
78
|
category: str | None = None
|
|
79
79
|
date: str | None = None
|
|
80
80
|
description: str | None = None
|
|
@@ -94,30 +94,40 @@ class DocCfg:
|
|
|
94
94
|
|
|
95
95
|
def complete_description(self) -> str | None:
|
|
96
96
|
"""String that summarizes what is known about this document."""
|
|
97
|
+
description = ''
|
|
98
|
+
|
|
97
99
|
if self.category and not self.description and not self.author:
|
|
98
100
|
if self.category == JUNK:
|
|
99
101
|
return None
|
|
100
102
|
else:
|
|
101
|
-
|
|
103
|
+
description = self.category
|
|
102
104
|
elif self.category == REPUTATION:
|
|
103
105
|
author_str = f"{self.author} " if self.author else ''
|
|
104
|
-
|
|
106
|
+
description = f"{REPUTATION_MGMT}: {author_str}{self.description}"
|
|
105
107
|
elif self.category == SKYPE_LOG:
|
|
106
108
|
msg = f"{self.category} of conversation with {self.author}" if self.author else self.category
|
|
107
|
-
|
|
109
|
+
description = f"{msg} {self.description}" if self.description else msg
|
|
108
110
|
elif self.author and self.description:
|
|
109
111
|
if self.category in [ACADEMIA, BOOK]:
|
|
110
112
|
title = self.description if '"' in self.description else f'"{self.description}"'
|
|
111
|
-
|
|
113
|
+
description = f"{title} by {self.author}"
|
|
112
114
|
elif self.category == FINANCE and self.author in FINANCIAL_REPORTS_AUTHORS:
|
|
113
|
-
|
|
115
|
+
description = f'{self.author} report: "{self.description}"'
|
|
114
116
|
elif self.category == LEGAL and 'v.' in self.author:
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
117
|
+
description = f"{self.author}: {self.description}"
|
|
118
|
+
|
|
119
|
+
if not description:
|
|
120
|
+
pieces = without_falsey([self.author, self.description])
|
|
121
|
+
|
|
122
|
+
if pieces:
|
|
123
|
+
description = ' '.join(pieces)
|
|
124
|
+
else:
|
|
125
|
+
return None
|
|
126
|
+
|
|
127
|
+
if self.attached_to_email_id:
|
|
128
|
+
description += f" attached to email {self.attached_to_email_id}"
|
|
118
129
|
|
|
119
|
-
|
|
120
|
-
return ' '.join(pieces) if pieces else None
|
|
130
|
+
return description
|
|
121
131
|
|
|
122
132
|
def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
|
|
123
133
|
"""Create synthetic DocCfg objects that set the 'duplicate_of_id' field to point back to this object."""
|
|
@@ -209,13 +219,13 @@ class EmailCfg(CommunicationCfg):
|
|
|
209
219
|
actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
|
|
210
220
|
fwded_text_after (str | None): If set, any text after this is a fwd of an article or similar
|
|
211
221
|
is_fwded_article (bool): True if this is a newspaper article someone fwded. Used to exclude articles from word counting.
|
|
212
|
-
recipients (list[
|
|
222
|
+
recipients (list[Name]): Who received the email
|
|
213
223
|
subject (str): Subject line
|
|
214
224
|
"""
|
|
215
225
|
actual_text: str | None = None
|
|
216
226
|
fwded_text_after: str | None = None
|
|
217
227
|
is_fwded_article: bool = False
|
|
218
|
-
recipients: list[
|
|
228
|
+
recipients: list[Name] = field(default_factory=list)
|
|
219
229
|
subject: str | None = None
|
|
220
230
|
|
|
221
231
|
# This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
|
epstein_files/util/env.py
CHANGED
|
@@ -8,7 +8,7 @@ from rich_argparse_plus import RichHelpFormatterPlus
|
|
|
8
8
|
from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, CHRONOLOGICAL_EMAILS_PATH, TEXT_MSGS_HTML_PATH
|
|
9
9
|
from epstein_files.util.logging import env_log_level, exit_with_error, logger
|
|
10
10
|
|
|
11
|
-
DEFAULT_WIDTH =
|
|
11
|
+
DEFAULT_WIDTH = 155
|
|
12
12
|
DEFAULT_FILE = 'default_file'
|
|
13
13
|
EPSTEIN_GENERATE = 'epstein_generate'
|
|
14
14
|
HTML_SCRIPTS = [EPSTEIN_GENERATE, 'epstein_word_count']
|
|
@@ -38,6 +38,7 @@ output.add_argument('--all-emails', '-ae', action='store_true', help='all the em
|
|
|
38
38
|
output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
|
|
39
39
|
parser.add_argument('--build', '-b', nargs="?", default=None, const=DEFAULT_FILE, help='write output to HTML file')
|
|
40
40
|
output.add_argument('--email-timeline', action='store_true', help='print a table of all emails in chronological order')
|
|
41
|
+
output.add_argument('--emailers-info-png', action='store_true', help='write a .png of the emeailers info table')
|
|
41
42
|
output.add_argument('--json-files', action='store_true', help='pretty print all the raw JSON data files in the collection and exit')
|
|
42
43
|
output.add_argument('--json-metadata', action='store_true', help='dump JSON metadata for all files and exit')
|
|
43
44
|
output.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
|
|
@@ -66,6 +67,7 @@ debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debu
|
|
|
66
67
|
args = parser.parse_args()
|
|
67
68
|
is_html_script = parser.prog in HTML_SCRIPTS
|
|
68
69
|
|
|
70
|
+
args.build = args.build or args.emailers_info_png
|
|
69
71
|
args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
|
|
70
72
|
args.names = [None if n == 'None' else n for n in (args.names or [])]
|
|
71
73
|
args.output_emails = args.output_emails or args.all_emails
|