epstein-files 1.0.10__py3-none-any.whl → 1.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +4 -6
- epstein_files/documents/document.py +92 -49
- epstein_files/documents/email.py +7 -4
- epstein_files/documents/imessage/text_message.py +3 -12
- epstein_files/documents/json_file.py +13 -1
- epstein_files/documents/messenger_log.py +32 -19
- epstein_files/documents/other_file.py +66 -43
- epstein_files/epstein_files.py +22 -15
- epstein_files/util/constant/names.py +2 -2
- epstein_files/util/constants.py +84 -78
- epstein_files/util/doc_cfg.py +17 -25
- epstein_files/util/env.py +29 -17
- epstein_files/util/file_helper.py +13 -24
- epstein_files/util/highlighted_group.py +22 -14
- epstein_files/util/logging.py +0 -6
- epstein_files/util/output.py +12 -7
- epstein_files/util/rich.py +15 -10
- epstein_files/util/word_count.py +65 -5
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.11.dist-info}/METADATA +1 -1
- epstein_files-1.0.11.dist-info/RECORD +33 -0
- epstein_files/count_words.py +0 -72
- epstein_files-1.0.10.dist-info/RECORD +0 -34
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.11.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.11.dist-info}/WHEEL +0 -0
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.11.dist-info}/entry_points.txt +0 -0
epstein_files/util/constants.py
CHANGED
|
@@ -56,7 +56,7 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
|
|
|
56
56
|
AMANDA_ENS: re.compile(r'ens, amanda?|Amanda.Ens', re.IGNORECASE),
|
|
57
57
|
ANAS_ALRASHEED: re.compile(r'anas\s*al\s*rashee[cd]', re.IGNORECASE),
|
|
58
58
|
ANIL_AMBANI: re.compile(r'Anil.Ambani', re.IGNORECASE),
|
|
59
|
-
ANN_MARIE_VILLAFANA: re.compile(r'Villafana, Ann Marie|(A(\.|nn) Marie )?Villafa(n|ri)a', re.IGNORECASE),
|
|
59
|
+
ANN_MARIE_VILLAFANA: re.compile(r'Villafana, Ann Marie|(A(\.|nn) Marie )?Villafa(c|n|ri)a', re.IGNORECASE),
|
|
60
60
|
ANTHONY_SCARAMUCCI: re.compile(r"mooch|(Anthony ('The Mooch' )?)?Scaramucci", re.IGNORECASE),
|
|
61
61
|
ARIANE_DE_ROTHSCHILD: re.compile(r'AdeR|((Ariane|Edmond) de )?Rothschild|Ariane', re.IGNORECASE),
|
|
62
62
|
BARBRO_C_EHNBOM: re.compile(r'behnbom@aol.com|(Barbro\s.*)?Ehnbom', re.IGNORECASE),
|
|
@@ -238,14 +238,14 @@ BOFA_MERRILL = f'{BOFA} / Merrill Lynch Report'
|
|
|
238
238
|
BOFA_WEALTH_MGMT = f'{BOFA} Wealth Management'
|
|
239
239
|
BROCKMAN_INC = 'Brockman, Inc.'
|
|
240
240
|
CVRA = "Crime Victims' Rights Act [CVRA]"
|
|
241
|
+
CVRA_LEXIS_SEARCH = f"Lexis Nexis search for case law around the {CVRA}"
|
|
241
242
|
DAVID_BLAINE_VISA_LETTER = f"letter of recommendation for visa for a model"
|
|
242
|
-
DAVID_SCHOEN_CVRA_LEXIS_SEARCH = f"Lexis Nexis search for case law around the {CVRA} by {DAVID_SCHOEN}"
|
|
243
243
|
DERSH_GIUFFRE_TWEET = f"{TWEET} about {VIRGINIA_GIUFFRE}"
|
|
244
244
|
DEUTSCHE_BANK_TAX_TOPICS = f'{DEUTSCHE_BANK} Wealth Management Tax Topics'
|
|
245
245
|
DIANA_DEGETTE_CAMPAIGN = "Colorado legislator Diana DeGette's campaign"
|
|
246
246
|
EPSTEIN_FOUNDATION = 'Jeffrey Epstein VI Foundation'
|
|
247
|
-
FBI_REPORT = f"
|
|
248
|
-
FBI_SEIZED_PROPERTY = f"
|
|
247
|
+
FBI_REPORT = f"report on Epstein investigation (redacted)"
|
|
248
|
+
FBI_SEIZED_PROPERTY = f"seized property inventory (redacted)"
|
|
249
249
|
FEMALE_HEALTH_COMPANY = 'Female Health Company (FHX)'
|
|
250
250
|
FIRE_AND_FURY = f"Fire And Fury"
|
|
251
251
|
HARVARD_POETRY = f'{HARVARD} poetry stuff from {LISA_NEW}'
|
|
@@ -256,7 +256,7 @@ JOHN_BOLTON_PRESS_CLIPPING = 'John Bolton press clipping'
|
|
|
256
256
|
JP_MORGAN_EYE_ON_THE_MARKET = f"Eye On The Market"
|
|
257
257
|
LAWRENCE_KRAUSS_ASU_ORIGINS = f"{LAWRENCE_KRAUSS}'s ASU Origins Project"
|
|
258
258
|
KEN_STARR_LETTER = f"letter to judge overseeing Epstein's criminal prosecution, mentions Alex Acosta"
|
|
259
|
-
|
|
259
|
+
MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT = f"draft of an unpublished article about Epstein by {MICHAEL_WOLFF} written ca. 2014/2015"
|
|
260
260
|
NERIO_ALESSANDRI = 'Nerio Alessandri (Founder and Chairman of Technogym S.p.A. Italy)'
|
|
261
261
|
NIGHT_FLIGHT_BOOK = f'"Night Flight" (draft)'
|
|
262
262
|
NOBEL_CHARITABLE_TRUST = 'Nobel Charitable Trust'
|
|
@@ -646,10 +646,11 @@ EMAILS_CONFIG = [
|
|
|
646
646
|
EmailCfg(id='021106', recipients=[STEVE_BANNON], attribution_reason='Reply'),
|
|
647
647
|
|
|
648
648
|
# Misc configs
|
|
649
|
-
EmailCfg(id='033050', actual_text='schwartman'),
|
|
650
649
|
EmailCfg(id='029344', actual_text='I thought of you when I read this article. Was this your idea? Alan'),
|
|
650
|
+
EmailCfg(id='032358', actual_text=REDACTED), # Completely redacted
|
|
651
|
+
EmailCfg(id='033050', actual_text='schwartman'),
|
|
651
652
|
EmailCfg(id='022219', description="discussion of attempts to clean up Epstein's Google search results"),
|
|
652
|
-
EmailCfg(id='023627', is_fwded_article=True, description=
|
|
653
|
+
EmailCfg(id='023627', is_fwded_article=True, description=MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT),
|
|
653
654
|
EmailCfg(id='031333', is_fwded_article=True, description='looks like a Russian disinfo article'), # Russia Says IMF Chief Jailed For Discovering All US Gold is Gone
|
|
654
655
|
EmailCfg(id='031335', is_fwded_article=True, description='looks like a Russian disinfo article'), # DOMINQUE STRAUSS-KAHN ARRESTED, NOT BECAUSE HE RAPED A MAID, BUT BECAUSE HE HAD EVIDENCE US HAS NO GOLD IN FORT KNOX.
|
|
655
656
|
EmailCfg(id='026298', is_fwded_article=True, duplicate_ids=['026499']), # Written by someone else?
|
|
@@ -661,8 +662,9 @@ EMAILS_CONFIG = [
|
|
|
661
662
|
EmailCfg(id='025041', is_fwded_article=True, duplicate_ids=['028675']), # Obama agenda
|
|
662
663
|
EmailCfg(id='031136', is_fwded_article=True, duplicate_ids=['028791']), # 'Smart Money is Fleeing US Stocks'
|
|
663
664
|
EmailCfg(id='031779', is_fwded_article=True, duplicate_ids=['026938']), # Sarah Silverman on AI
|
|
664
|
-
EmailCfg(id='029849', is_fwded_article=True, duplicate_ids=['033482']),
|
|
665
|
-
EmailCfg(id='032023', is_fwded_article=True, duplicate_ids=['032012']),
|
|
665
|
+
EmailCfg(id='029849', is_fwded_article=True, duplicate_ids=['033482']), # Fareed Zakaria: Trump sells America short),
|
|
666
|
+
EmailCfg(id='032023', is_fwded_article=True, duplicate_ids=['032012']), # American-Israeli Cooperative Enterprise Newsletter
|
|
667
|
+
EmailCfg(id='021758', is_fwded_article=True, duplicate_ids=['030616']), # Radar Online article about Epstein's early prison release
|
|
666
668
|
EmailCfg(id='030868', is_fwded_article=True), # 'He doesn't like this sh*t': Trump reportedly hates his job and his staff after 1 month
|
|
667
669
|
EmailCfg(id='026755', is_fwded_article=True), # HuffPo
|
|
668
670
|
EmailCfg(id='016218', is_fwded_article=True), # AT&T confirms it paid Trump lawyer Cohen for insights on Trump
|
|
@@ -707,6 +709,8 @@ EMAILS_CONFIG = [
|
|
|
707
709
|
EmailCfg(id='026924', is_fwded_article=True), # The Onion
|
|
708
710
|
EmailCfg(id='033311', is_fwded_article=True), # 2016 election polls
|
|
709
711
|
EmailCfg(id='026580', is_fwded_article=True), # NPR: Antigua: Land Of Sun, Sand, And Super Cheap
|
|
712
|
+
EmailCfg(id='031340', is_fwded_article=True), # Article about Alex Jones threatening Robert Mueller
|
|
713
|
+
EmailCfg(id='033297', is_fwded_article=True, duplicate_ids=['033586']), # Sultan Sulayem fwding article about Trump and Russia
|
|
710
714
|
EmailCfg(id='032475', timestamp=parse('2017-02-15 13:31:25')),
|
|
711
715
|
EmailCfg(id='030373', timestamp=parse('2018-10-03 01:49:27')),
|
|
712
716
|
|
|
@@ -748,8 +752,6 @@ EMAILS_CONFIG = [
|
|
|
748
752
|
EmailCfg(id='031118', duplicate_ids=['019465']),
|
|
749
753
|
EmailCfg(id='031912', duplicate_ids=['032158']),
|
|
750
754
|
EmailCfg(id='030587', duplicate_ids=['030514']),
|
|
751
|
-
EmailCfg(id='029773', duplicate_ids=['012685'], fwded_text_after='Omar Quadhafi'),
|
|
752
|
-
EmailCfg(id='033297', duplicate_ids=['033586']),
|
|
753
755
|
EmailCfg(id='031089', duplicate_ids=['018084']),
|
|
754
756
|
EmailCfg(id='031088', duplicate_ids=['030885']),
|
|
755
757
|
EmailCfg(id='030238', duplicate_ids=['031130']),
|
|
@@ -803,7 +805,6 @@ EMAILS_CONFIG = [
|
|
|
803
805
|
EmailCfg(id='026618', duplicate_ids=['028485']),
|
|
804
806
|
EmailCfg(id='030609', duplicate_ids=['030495']),
|
|
805
807
|
EmailCfg(id='029831', duplicate_ids=['028972']),
|
|
806
|
-
EmailCfg(id='021758', duplicate_ids=['030616']),
|
|
807
808
|
EmailCfg(id='033498', duplicate_ids=['029884']),
|
|
808
809
|
EmailCfg(id='028620', duplicate_ids=['027094']),
|
|
809
810
|
EmailCfg(id='032456', duplicate_ids=['033579']),
|
|
@@ -851,7 +852,9 @@ EMAILS_CONFIG = [
|
|
|
851
852
|
EmailCfg(id='030015', fwded_text_after='Bill Clinton reportedly'),
|
|
852
853
|
EmailCfg(id='026312', fwded_text_after='Steve Bannon trying to get on disgraced'),
|
|
853
854
|
EmailCfg(id='031742', fwded_text_after="Trump's former campaign manager Paul Manafort"),
|
|
854
|
-
|
|
855
|
+
EmailCfg(id='012197_4', fwded_text_after="Thanks -- Jay"),
|
|
856
|
+
EmailCfg(id='028925', fwded_text_after='> on Jan 4, 2015'),
|
|
857
|
+
EmailCfg(id='029773', fwded_text_after='Omar Quadhafi', duplicate_ids=['012685']),
|
|
855
858
|
]
|
|
856
859
|
|
|
857
860
|
|
|
@@ -925,14 +928,14 @@ OTHER_FILES_ARTICLES = [
|
|
|
925
928
|
DocCfg(id='031776', author='Law360', description=f"article about Michael Avenatti by Andrew Strickler"),
|
|
926
929
|
DocCfg(id='023102', author=f'Litigation Daily', description=f"article about {REID_WEINGARTEN}", date='2015-09-04'),
|
|
927
930
|
DocCfg(id='029340', author=f'MarketWatch', description=f'article about estate taxes, particularly Epstein\'s favoured GRATs'),
|
|
928
|
-
DocCfg(id='022707', author=MICHAEL_WOLFF, description=
|
|
929
|
-
DocCfg(id='022727', author=MICHAEL_WOLFF, description=
|
|
930
|
-
DocCfg(id='022746', author=MICHAEL_WOLFF, description=
|
|
931
|
-
DocCfg(id='022844', author=MICHAEL_WOLFF, description=
|
|
932
|
-
DocCfg(id='022863', author=MICHAEL_WOLFF, description=
|
|
933
|
-
DocCfg(id='022894', author=MICHAEL_WOLFF, description=
|
|
934
|
-
DocCfg(id='022952', author=MICHAEL_WOLFF, description=
|
|
935
|
-
DocCfg(id='024229', author=MICHAEL_WOLFF, description=
|
|
931
|
+
DocCfg(id='022707', author=MICHAEL_WOLFF, description=MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT),
|
|
932
|
+
DocCfg(id='022727', author=MICHAEL_WOLFF, description=MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT),
|
|
933
|
+
DocCfg(id='022746', author=MICHAEL_WOLFF, description=MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT),
|
|
934
|
+
DocCfg(id='022844', author=MICHAEL_WOLFF, description=MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT),
|
|
935
|
+
DocCfg(id='022863', author=MICHAEL_WOLFF, description=MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT),
|
|
936
|
+
DocCfg(id='022894', author=MICHAEL_WOLFF, description=MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT),
|
|
937
|
+
DocCfg(id='022952', author=MICHAEL_WOLFF, description=MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT),
|
|
938
|
+
DocCfg(id='024229', author=MICHAEL_WOLFF, description=MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT),
|
|
936
939
|
DocCfg(id='031198', author='Morning News USA', description=f"article about identify of Jane Doe in {JANE_DOE_V_EPSTEIN_TRUMP}"),
|
|
937
940
|
DocCfg(id='031972', author=NYT, description=f"article about #MeToo allegations against {LAWRENCE_KRAUSS}", date='2018-03-07'),
|
|
938
941
|
DocCfg(id='032435', author=NYT, description=f'article about Chinese butlers'),
|
|
@@ -1016,22 +1019,12 @@ OTHER_FILES_ARTICLES = [
|
|
|
1016
1019
|
date='2019-02-06',
|
|
1017
1020
|
duplicate_ids=['031415'],
|
|
1018
1021
|
),
|
|
1019
|
-
|
|
1020
|
-
DocCfg(
|
|
1021
|
-
id='030199',
|
|
1022
|
-
description=f'article about allegations Trump raped a 13 year old girl {JANE_DOE_V_EPSTEIN_TRUMP}',
|
|
1023
|
-
date='2017-11-16',
|
|
1024
|
-
),
|
|
1022
|
+
DocCfg(id='030199', description=f'article about Trump rape allegations in {JANE_DOE_V_EPSTEIN_TRUMP}', date='2017-11-16'),
|
|
1025
1023
|
DocCfg(id='031725', description=f"article about Gloria Allred and Trump allegations", date='2016-10-10'),
|
|
1026
1024
|
DocCfg(id='026648', description=f'article about {JASTA} lawsuit against Saudi Arabia by 9/11 victims (Russian propaganda?)', date='2017-05-13'),
|
|
1027
1025
|
DocCfg(id='032159', description=f"article about microfinance and cell phones in Zimbabwe, Strive Masiyiwa (Econet Wireless)"),
|
|
1028
1026
|
DocCfg(id='033468', description=f'{ARTICLE_DRAFT} Rod Rosenstein', date='2018-09-24'),
|
|
1029
1027
|
DocCfg(id='030825', description=f'{ARTICLE_DRAFT} Syria'),
|
|
1030
|
-
DocCfg(
|
|
1031
|
-
id='019233',
|
|
1032
|
-
description=f"Freedom House: 'Breaking Down Democracy: Goals, Strategies, and Methods of Modern Authoritarians'",
|
|
1033
|
-
date='2017-06-02',
|
|
1034
|
-
),
|
|
1035
1028
|
DocCfg(id='027051', description=f"German language article about the 2013 Lifeball / AIDS Gala", date='2013-01-01'),
|
|
1036
1029
|
DocCfg(id='033480', description=f"{JOHN_BOLTON_PRESS_CLIPPING}", date='2018-04-06', duplicate_ids=['033481']),
|
|
1037
1030
|
DocCfg(id='013403', description=f"Lexis Nexis result from The Evening Standard about Bernie Madoff", date='2009-12-24'),
|
|
@@ -1045,14 +1038,16 @@ OTHER_FILES_ARTICLES = [
|
|
|
1045
1038
|
date='2017-05-13',
|
|
1046
1039
|
),
|
|
1047
1040
|
DocCfg(id='025094', description=f'{TRANSLATION} Spanish article about Cuba', date='2015-11-08'),
|
|
1048
|
-
DocCfg(id='031794', description=f"very short French magazine clipping"),
|
|
1041
|
+
DocCfg(id='031794', description=f"very short French magazine clipping", is_interesting=False),
|
|
1049
1042
|
]
|
|
1050
1043
|
|
|
1051
1044
|
OTHER_FILES_LEGAL = [
|
|
1045
|
+
DocCfg(id='017789', author=ALAN_DERSHOWITZ, description=f'letter to {HARVARD} Crimson complaining he was defamed'),
|
|
1052
1046
|
DocCfg(id='011908', author=BRUNEL_V_EPSTEIN, description=f"court filing"),
|
|
1053
|
-
DocCfg(id='
|
|
1054
|
-
DocCfg(id='
|
|
1055
|
-
DocCfg(id='
|
|
1047
|
+
DocCfg(id='017603', author=DAVID_SCHOEN, description=CVRA_LEXIS_SEARCH, date='2019-02-28'),
|
|
1048
|
+
DocCfg(id='017635', author=DAVID_SCHOEN, description=CVRA_LEXIS_SEARCH, date='2019-02-28'),
|
|
1049
|
+
DocCfg(id='016509', author=DAVID_SCHOEN, description=CVRA_LEXIS_SEARCH, date='2019-02-28'),
|
|
1050
|
+
DocCfg(id='017714', author=DAVID_SCHOEN, description=CVRA_LEXIS_SEARCH, date='2019-02-28'),
|
|
1056
1051
|
DocCfg(id='021824', author=EDWARDS_V_DERSHOWITZ, description=f"deposition of {PAUL_G_CASSELL}"),
|
|
1057
1052
|
DocCfg(
|
|
1058
1053
|
id='010757',
|
|
@@ -1084,10 +1079,10 @@ OTHER_FILES_LEGAL = [
|
|
|
1084
1079
|
DocCfg(id='017488', author=EPSTEIN_V_ROTHSTEIN_EDWARDS, description=f"Deposition of Scott Rothstein", date='2012-06-22'),
|
|
1085
1080
|
DocCfg(id='029315', author=EPSTEIN_V_ROTHSTEIN_EDWARDS, description=f"Plaintiff Motion for Summary Judgment by {JACK_SCAROLA}", date='2013-09-13'),
|
|
1086
1081
|
DocCfg(id='013304', author=EPSTEIN_V_ROTHSTEIN_EDWARDS, description=f"Plaintiff Response to Epstein's Motion for Summary Judgment", date='2014-04-17'),
|
|
1087
|
-
DocCfg(id='019352', description=FBI_REPORT,),
|
|
1088
|
-
DocCfg(id='021434', description=FBI_REPORT,),
|
|
1089
|
-
DocCfg(id='018872', description=FBI_SEIZED_PROPERTY,),
|
|
1090
|
-
DocCfg(id='021569', description=FBI_SEIZED_PROPERTY,),
|
|
1082
|
+
DocCfg(id='019352', author=FBI, description=FBI_REPORT,),
|
|
1083
|
+
DocCfg(id='021434', author=FBI, description=FBI_REPORT,),
|
|
1084
|
+
DocCfg(id='018872', author=FBI, description=FBI_SEIZED_PROPERTY,),
|
|
1085
|
+
DocCfg(id='021569', author=FBI, description=FBI_SEIZED_PROPERTY,),
|
|
1091
1086
|
DocCfg(id='017792', author=GIUFFRE_V_DERSHOWITZ, description=f"article about {ALAN_DERSHOWITZ}'s appearance on Wolf Blitzer"),
|
|
1092
1087
|
DocCfg(id='017767', author=GIUFFRE_V_DERSHOWITZ, description=f"article about {ALAN_DERSHOWITZ} working with {JEFFREY_EPSTEIN}"),
|
|
1093
1088
|
DocCfg(id='017796', author=GIUFFRE_V_DERSHOWITZ, description=f"article about {ALAN_DERSHOWITZ}"),
|
|
@@ -1147,6 +1142,9 @@ OTHER_FILES_LEGAL = [
|
|
|
1147
1142
|
DocCfg(id='017830', author=JASTA_SAUDI_LAWSUIT, description=f"legal text and court documents"),
|
|
1148
1143
|
DocCfg(id='017904', author=JASTA_SAUDI_LAWSUIT, description=f"Westlaw search results", date='2019-01-01'),
|
|
1149
1144
|
DocCfg(id='014037', author='Journal of Criminal Law and Criminology', description=f"article on {CVRA}"),
|
|
1145
|
+
DocCfg(id='025353', author=KEN_STARR, description=KEN_STARR_LETTER, date='2008-05-19', duplicate_ids=['010723', '019224'], dupe_type='redacted'),
|
|
1146
|
+
DocCfg(id='025704', author=KEN_STARR, description=KEN_STARR_LETTER, date='2008-05-27', duplicate_ids=['010732', '019221'], dupe_type='redacted'),
|
|
1147
|
+
DocCfg(id='012130', author=KEN_STARR, description=KEN_STARR_LETTER, date='2008-06-19', duplicate_ids=['012135']),
|
|
1150
1148
|
DocCfg(
|
|
1151
1149
|
id='031447',
|
|
1152
1150
|
author=MARTIN_WEINBERG,
|
|
@@ -1158,6 +1156,17 @@ OTHER_FILES_LEGAL = [
|
|
|
1158
1156
|
description=f"letter from to ABC / Good Morning America threatening libel lawsuit",
|
|
1159
1157
|
duplicate_ids=['028928']
|
|
1160
1158
|
),
|
|
1159
|
+
DocCfg(
|
|
1160
|
+
id='026793',
|
|
1161
|
+
author='Mintz Fraade',
|
|
1162
|
+
description=f"letter from {STEVEN_HOFFENBERG}'s lawyers offering to take over Epstein's business and resolve his legal issues",
|
|
1163
|
+
date='2018-03-23',
|
|
1164
|
+
),
|
|
1165
|
+
DocCfg(
|
|
1166
|
+
id='020662',
|
|
1167
|
+
author='Mishcon de Reya',
|
|
1168
|
+
description=f"letter from {ALAN_DERSHOWITZ}'s British lawyers to Daily Mail threatening libel suit",
|
|
1169
|
+
),
|
|
1161
1170
|
DocCfg(
|
|
1162
1171
|
id='029416',
|
|
1163
1172
|
author="National Enquirer / Radar Online v. FBI",
|
|
@@ -1173,12 +1182,6 @@ OTHER_FILES_LEGAL = [
|
|
|
1173
1182
|
),
|
|
1174
1183
|
DocCfg(id='028540', author='SCOTUS', description=f"decision in Budha Ismail Jam et al. v. INTERNATIONAL FINANCE CORP"),
|
|
1175
1184
|
DocCfg(id='012197', author='SDFL', description=f"Response to {JAY_LEFKOWITZ} on Epstein Plea Agreement Compliance"),
|
|
1176
|
-
DocCfg(id='020662', author='Mishcon de Reya', description=f"letter from {ALAN_DERSHOWITZ}'s British lawyers to Daily Mail threatening libel suit"),
|
|
1177
|
-
DocCfg(
|
|
1178
|
-
id='026793',
|
|
1179
|
-
description=f"letter from {STEVEN_HOFFENBERG}'s lawyers at Mintz Fraade offering to take over Epstein's business and resolve his legal issues",
|
|
1180
|
-
date='2018-03-23',
|
|
1181
|
-
),
|
|
1182
1185
|
DocCfg(id='022277', description=f"{TEXT_OF_US_LAW} National Labour Relations Board (NLRB)"),
|
|
1183
1186
|
]
|
|
1184
1187
|
|
|
@@ -1202,10 +1205,6 @@ OTHER_FILES_CONFERENCES = [
|
|
|
1202
1205
|
description=f'schedule including "Presidents Private Dinner - Jeffrey Epstine (sic)"',
|
|
1203
1206
|
date='2012-09-21',
|
|
1204
1207
|
),
|
|
1205
|
-
DocCfg(id='017603', description=DAVID_SCHOEN_CVRA_LEXIS_SEARCH, date='2019-02-28'),
|
|
1206
|
-
DocCfg(id='017635', description=DAVID_SCHOEN_CVRA_LEXIS_SEARCH, date='2019-02-28'),
|
|
1207
|
-
DocCfg(id='016509', description=DAVID_SCHOEN_CVRA_LEXIS_SEARCH, date='2019-02-28'),
|
|
1208
|
-
DocCfg(id='017714', description=DAVID_SCHOEN_CVRA_LEXIS_SEARCH, date='2019-02-28'),
|
|
1209
1208
|
DocCfg(id='017526', description=f'Intellectual Jazz conference brochure f. {DAVID_BLAINE}'),
|
|
1210
1209
|
DocCfg(id='029427', description=f"seems related to an IRL meeting about concerns China will attempt to absorb Mongolia"),
|
|
1211
1210
|
DocCfg(id='025797', description=f'someone\'s notes from Aspen Strategy Group', date='2013-05-29'),
|
|
@@ -1218,6 +1217,7 @@ OTHER_FILES_CONFERENCES = [
|
|
|
1218
1217
|
|
|
1219
1218
|
# All authors of documents in this category will be marked uninteresting
|
|
1220
1219
|
OTHER_FILES_FINANCE = [
|
|
1220
|
+
DocCfg(id='024631', author='Ackrell Capital', description=f"Cannabis Investment Report 2018", is_interesting=True),
|
|
1221
1221
|
DocCfg(id='016111', author=BOFA_MERRILL, description=f"GEMs Paper #26 Saudi Arabia: beyond oil but not so fast", date='2016-06-30'),
|
|
1222
1222
|
DocCfg(id='010609', author=BOFA_MERRILL, description=f"Liquid Insight Trump\'s effect on MXN", date='2016-09-22'),
|
|
1223
1223
|
DocCfg(id='025978', author=BOFA_MERRILL, description=f"Understanding when risk parity risk Increases", date='2016-08-09'),
|
|
@@ -1236,6 +1236,7 @@ OTHER_FILES_FINANCE = [
|
|
|
1236
1236
|
DocCfg(id='023575', author=BOFA_MERRILL, description=f"Global Equity Volatility Insights", date='2017-06-01'),
|
|
1237
1237
|
DocCfg(id='014518', author=BOFA_WEALTH_MGMT, description=f'tax alert', date='2016-05-02'),
|
|
1238
1238
|
DocCfg(id='029438', author=BOFA_WEALTH_MGMT, description=f'tax report', date='2018-01-02'),
|
|
1239
|
+
DocCfg(id='026668', author="Boothbay Fund Management", description=f"2016-Q4 earnings report signed by Ari Glass"),
|
|
1239
1240
|
DocCfg(id='024302', author='Carvana', description=f"form 14A SEC filing proxy statement", date='2019-04-23'),
|
|
1240
1241
|
DocCfg(id='029305', author='CCH Tax', description=f"Briefing on end of Defense of Marriage Act", date='2013-06-27'),
|
|
1241
1242
|
DocCfg(id='026794', author=DEUTSCHE_BANK, description=f"Global Political and Regulatory Risk in 2015/2016"),
|
|
@@ -1273,24 +1274,26 @@ OTHER_FILES_FINANCE = [
|
|
|
1273
1274
|
DocCfg(id='025296', author='Laffer Associates', description=f'report predicting Trump win', date='2016-07-06'),
|
|
1274
1275
|
DocCfg(id='020824', author='Mary Meeker', description=f"USA Inc: A Basic Summary of America's Financial Statements compiled", date='2011-02-01'),
|
|
1275
1276
|
DocCfg(id='025551', author='Morgan Stanley', description=f'report about alternative asset managers', date='2018-01-30'),
|
|
1277
|
+
DocCfg(id='019856', author='Sadis Goldberg LLP', description=f"report on SCOTUS ruling about insider trading", is_interesting=True),
|
|
1276
1278
|
DocCfg(id='025763', author='S&P', description=f"Economic Research: How Increasing Income Inequality Is Dampening U.S. Growth", date='2014-08-05'),
|
|
1277
1279
|
DocCfg(id='024135', author=UBS, description=UBS_CIO_REPORT, date='2012-06-29'),
|
|
1278
1280
|
DocCfg(id='025247', author=UBS, description=UBS_CIO_REPORT, date='2012-10-25'),
|
|
1279
|
-
DocCfg(id='024631', description=f"Ackrell Capital report: Cannabis Investment Report 2018"),
|
|
1280
1281
|
DocCfg(id='026584', description=f"article about tax implications of disregarded entities", date='2009-07-01'),
|
|
1281
|
-
DocCfg(
|
|
1282
|
+
DocCfg(
|
|
1283
|
+
id='024271',
|
|
1284
|
+
description=f"Blockchain Capital and Brock Pierce pitch deck",
|
|
1285
|
+
date='2015-10-01',
|
|
1286
|
+
is_interesting=True,
|
|
1287
|
+
),
|
|
1282
1288
|
DocCfg(id='024817', description=f"Cowen's Collective View of CBD / Cannabis report"),
|
|
1283
1289
|
DocCfg(id='012048', description=f"{PRESS_RELEASE} 'Rockefeller Partners with Gregory J. Fleming to Create Independent Financial Services Firm' and other articles"),
|
|
1284
|
-
DocCfg(id='019856', description=f"Sadis Goldberg LLP report on SCOTUS ruling about insider trading"),
|
|
1285
1290
|
|
|
1286
1291
|
# private placement memoranda
|
|
1287
1292
|
DocCfg(id='024432', description=f"Michael Milken's Knowledge Universe Education (KUE) $1,000,000 corporate share placement notice (SEC filing?)"),
|
|
1288
|
-
DocCfg(id='024003', description=f"New Leaf Ventures private placement memorandum"),
|
|
1293
|
+
DocCfg(id='024003', description=f"New Leaf Ventures ($375 million biotech fund) private placement memorandum"),
|
|
1289
1294
|
]
|
|
1290
1295
|
|
|
1291
1296
|
OTHER_FILES_LETTERS = [
|
|
1292
|
-
DocCfg(id='017789', author=ALAN_DERSHOWITZ, description=f'letter to {HARVARD} Crimson complaining he was defamed'),
|
|
1293
|
-
DocCfg(id='026668', author="Boothbay Fund Management", description=f"2016-Q4 earnings report signed by Ari Glass"),
|
|
1294
1297
|
DocCfg(
|
|
1295
1298
|
id='019086',
|
|
1296
1299
|
author=DAVID_BLAINE,
|
|
@@ -1315,17 +1318,23 @@ OTHER_FILES_LETTERS = [
|
|
|
1315
1318
|
description=f"letter about algorithmic trading",
|
|
1316
1319
|
date='2016-06-24', # date is based on Brexit reference but he could be backtesting,
|
|
1317
1320
|
),
|
|
1318
|
-
DocCfg(id='026248', author='Don McGahn', description=f'letter from Trump lawyer to Devin Nunes (R-CA) about FISA courts and Trump'),
|
|
1319
1321
|
DocCfg(id='029304', author=DONALD_TRUMP, description=f"recommendation letter for recently departed {TRUMP_ORG} lawyer {MICHAEL_J_BOCCIO}"),
|
|
1320
1322
|
DocCfg(id='029301', author=MICHAEL_J_BOCCIO, description=f"letter from former lawyer at the {TRUMP_ORG}", date='2011-08-07'),
|
|
1321
|
-
DocCfg(id='022405', author=NOAM_CHOMSKY, description=f"letter attesting to Epstein's good character"),
|
|
1322
1323
|
DocCfg(id='026134', description=f'letter to someone named George about investment opportunities in the Ukraine banking sector'),
|
|
1323
1324
|
]
|
|
1324
1325
|
|
|
1325
1326
|
OTHER_FILES_PROPERTY = [
|
|
1326
|
-
DocCfg(
|
|
1327
|
+
DocCfg(
|
|
1328
|
+
id='026759',
|
|
1329
|
+
author='Great Bay Condominium Owners Association',
|
|
1330
|
+
description=f'{PRESS_RELEASE} about Hurricane Irma damage',
|
|
1331
|
+
date='2017-09-13',
|
|
1332
|
+
is_interesting=False,
|
|
1333
|
+
),
|
|
1327
1334
|
DocCfg(id='016602', author=PALM_BEACH_CODE_ENFORCEMENT, description='board minutes', date='2008-04-17'),
|
|
1328
1335
|
DocCfg(id='016554', author=PALM_BEACH_CODE_ENFORCEMENT, description='board minutes', date='2008-07-17', duplicate_ids=['016616', '016574']),
|
|
1336
|
+
DocCfg(id='016636', author=PALM_BEACH_WATER_COMMITTEE, description=f"Meeting on January 29, 2009"),
|
|
1337
|
+
DocCfg(id='022417', author='Park Partners NYC', description=f"letter to partners in real estate project with architectural plans"),
|
|
1329
1338
|
DocCfg(id='027068', author=THE_REAL_DEAL, description=f"{THE_REAL_DEAL_ARTICLE} Palm House Hotel Bankruptcy and EB-5 Visa Fraud Allegations"),
|
|
1330
1339
|
DocCfg(id='029520', author=THE_REAL_DEAL, description=f"{THE_REAL_DEAL_ARTICLE} 'Lost Paradise at the Palm House'", date='2019-06-17'),
|
|
1331
1340
|
DocCfg(id='016597', author='Trump Properties LLC', description=f'appeal of some decision about Mar-a-Lago by {PALM_BEACH} authorities'),
|
|
@@ -1339,8 +1348,6 @@ OTHER_FILES_PROPERTY = [
|
|
|
1339
1348
|
DocCfg(id='016552', description=f"{PALM_BEACH_TSV} info"),
|
|
1340
1349
|
DocCfg(id='016698', description=f"{PALM_BEACH_TSV} info (broken?)"),
|
|
1341
1350
|
DocCfg(id='016696', description=f"{PALM_BEACH_TSV} info (water quality?"),
|
|
1342
|
-
DocCfg(id='016636', description=f"{PALM_BEACH_WATER_COMMITTEE} Meeting on January 29, 2009"),
|
|
1343
|
-
DocCfg(id='022417', description=f"Park Partners NYC letter to partners in real estate project with architectural plans"),
|
|
1344
1351
|
DocCfg(
|
|
1345
1352
|
id='018727',
|
|
1346
1353
|
description=f"{VIRGIN_ISLANDS} property deal pitch deck, building will be leased to the U.S. govt GSA",
|
|
@@ -1378,6 +1385,13 @@ OTHER_FILES_SOCIAL = [
|
|
|
1378
1385
|
OTHER_FILES_POLITICS = [
|
|
1379
1386
|
DocCfg(id='029918', author=DIANA_DEGETTE_CAMPAIGN, description=f"bio", date='2012-09-27'),
|
|
1380
1387
|
DocCfg(id='031184', author=DIANA_DEGETTE_CAMPAIGN, description=f"invitation to fundraiser hosted by {BARBRO_C_EHNBOM}", date='2012-09-27'),
|
|
1388
|
+
DocCfg(id='026248', author='Don McGahn', description=f'letter from Trump lawyer to Devin Nunes (R-CA) about FISA courts and Trump'),
|
|
1389
|
+
DocCfg(
|
|
1390
|
+
id='019233',
|
|
1391
|
+
author='Freedom House',
|
|
1392
|
+
description=f"'Breaking Down Democracy: Goals, Strategies, and Methods of Modern Authoritarians'",
|
|
1393
|
+
date='2017-06-02',
|
|
1394
|
+
),
|
|
1381
1395
|
DocCfg(id='026827', author='Scowcroft Group', description=f'report on ISIS', date='2015-11-14'),
|
|
1382
1396
|
DocCfg(id='024294', author=STACEY_PLASKETT, description=f"campaign flier", date='2016-10-01'),
|
|
1383
1397
|
DocCfg(
|
|
@@ -1417,6 +1431,7 @@ OTHER_FILES_ACADEMIA = [
|
|
|
1417
1431
|
author=f"{MOSHE_HOFFMAN}, Erez Yoeli, and {MARTIN_NOWAK}",
|
|
1418
1432
|
description=f"Cooperating Without Looking: Game Theory Model of Trust and Reciprocal Cooperation"
|
|
1419
1433
|
),
|
|
1434
|
+
DocCfg(id='022405', author=NOAM_CHOMSKY, description=f"letter attesting to Epstein's good character"),
|
|
1420
1435
|
DocCfg(id='025143', author=ROBERT_TRIVERS, description=f"Africa, Parasites, Intelligence", date='2018-06-25'),
|
|
1421
1436
|
DocCfg(id='029155', author=ROBERT_TRIVERS, description=f'response sent to the Gruterites ({GORDON_GETTY} fans)', date='2018-03-19'),
|
|
1422
1437
|
DocCfg(
|
|
@@ -1482,8 +1497,13 @@ OTHER_FILES_MISC = [
|
|
|
1482
1497
|
DocCfg(id='032206', category=SKYPE_LOG, author=LAWRENCE_KRAUSS),
|
|
1483
1498
|
DocCfg(id='032208', category=SKYPE_LOG, author=LAWRENCE_KRAUSS),
|
|
1484
1499
|
DocCfg(id='032209', category=SKYPE_LOG, author=LAWRENCE_KRAUSS),
|
|
1485
|
-
DocCfg(
|
|
1486
|
-
|
|
1500
|
+
DocCfg(
|
|
1501
|
+
id='018224',
|
|
1502
|
+
category=SKYPE_LOG,
|
|
1503
|
+
description=f'Skype conversations with linkspirit (French?) and {LAWRENCE_KRAUSS}',
|
|
1504
|
+
is_interesting=True, # we don't know who linkspirit is yet
|
|
1505
|
+
),
|
|
1506
|
+
DocCfg(id='032210', category=SKYPE_LOG, description=f'Skype conversation with linkspirit', is_interesting=True),
|
|
1487
1507
|
DocCfg(
|
|
1488
1508
|
id='025147',
|
|
1489
1509
|
author=BROCKMAN_INC,
|
|
@@ -1532,6 +1552,7 @@ OTHER_FILES_JUNK = [
|
|
|
1532
1552
|
DocCfg(id='029352', description=OBAMA_JOKE, date='2013-07-26'),
|
|
1533
1553
|
DocCfg(id='029351', description=OBAMA_JOKE, date='2013-07-26'),
|
|
1534
1554
|
DocCfg(id='029354', description=OBAMA_JOKE, date='2013-07-26'),
|
|
1555
|
+
DocCfg(id='031293'),
|
|
1535
1556
|
]
|
|
1536
1557
|
|
|
1537
1558
|
OTHER_FILES_CATEGORIES = [
|
|
@@ -1587,18 +1608,3 @@ REPLY_LINE_ON_DATE_PATTERN = fr"^On (\d+ )?((Mon|Tues?|Wed(nes)?|Thu(rs)?|Fri|Sa
|
|
|
1587
1608
|
REPLY_LINE_PATTERN = rf"({REPLY_LINE_IN_A_MSG_PATTERN}|{REPLY_LINE_ON_NUMERIC_DATE_PATTERN}|{REPLY_LINE_ON_DATE_PATTERN}|{FORWARDED_LINE_PATTERN})"
|
|
1588
1609
|
REPLY_REGEX = re.compile(REPLY_LINE_PATTERN, re.IGNORECASE | re.MULTILINE)
|
|
1589
1610
|
SENT_FROM_REGEX = re.compile(r'^(?:(Please forgive|Sorry for all the) typos.{1,4})?(Sent (from|via).*(and string|AT&T|Droid|iPad|Phone|Mail|BlackBerry(.*(smartphone|device|Handheld|AT&T|T- ?Mobile))?)\.?)', re.M | re.I)
|
|
1590
|
-
|
|
1591
|
-
|
|
1592
|
-
# Error checking.
|
|
1593
|
-
if len(OTHER_FILES_CONFIG) != 442:
|
|
1594
|
-
logger.warning(f"Found {len(OTHER_FILES_CONFIG)} configured other files!")
|
|
1595
|
-
|
|
1596
|
-
encountered_file_ids = set()
|
|
1597
|
-
|
|
1598
|
-
for cfg in ALL_CONFIGS:
|
|
1599
|
-
if cfg.id in encountered_file_ids:
|
|
1600
|
-
raise ValueError(f"{cfg.id} configured twice!\n\n{cfg}\n")
|
|
1601
|
-
elif cfg.dupe_of_id and cfg.dupe_of_id == cfg.id:
|
|
1602
|
-
raise ValueError(f"Invalid config!\n\n{cfg}\n")
|
|
1603
|
-
|
|
1604
|
-
encountered_file_ids.add(cfg.id)
|
epstein_files/util/doc_cfg.py
CHANGED
|
@@ -92,22 +92,7 @@ class DocCfg:
|
|
|
92
92
|
if self.dupe_of_id or self.duplicate_ids:
|
|
93
93
|
self.dupe_type = self.dupe_type or SAME
|
|
94
94
|
|
|
95
|
-
def
|
|
96
|
-
if self.dupe_type is not None:
|
|
97
|
-
return DUPE_TYPE_STRS[self.dupe_type]
|
|
98
|
-
|
|
99
|
-
def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
|
|
100
|
-
"""Create synthetic DocCfg objects that set the 'dupe_of_id' field to point back to this object."""
|
|
101
|
-
for id in self.duplicate_ids:
|
|
102
|
-
dupe_cfg = deepcopy(self)
|
|
103
|
-
dupe_cfg.id = id
|
|
104
|
-
dupe_cfg.dupe_of_id = self.id
|
|
105
|
-
dupe_cfg.duplicate_ids = []
|
|
106
|
-
dupe_cfg.dupe_type = self.dupe_type
|
|
107
|
-
dupe_cfg.was_generated = True
|
|
108
|
-
yield dupe_cfg
|
|
109
|
-
|
|
110
|
-
def info_str(self) -> str | None:
|
|
95
|
+
def complete_description(self) -> str | None:
|
|
111
96
|
"""String that summarizes what is known about this document."""
|
|
112
97
|
if self.category and not self.description:
|
|
113
98
|
return self.category
|
|
@@ -119,13 +104,28 @@ class DocCfg:
|
|
|
119
104
|
elif self.category == FINANCE and self.author in FINANCIAL_REPORTS_AUTHORS:
|
|
120
105
|
return f"{self.author} report: '{self.description}'"
|
|
121
106
|
elif self.category == LEGAL and 'v.' in self.author:
|
|
122
|
-
return f"{self.author}:
|
|
107
|
+
return f"{self.author}: {self.description}"
|
|
123
108
|
elif self.category and self.author is None and self.description is None:
|
|
124
109
|
return self.category
|
|
125
110
|
|
|
126
111
|
pieces = without_falsey([self.author, self.description])
|
|
127
112
|
return ' '.join(pieces) if pieces else None
|
|
128
113
|
|
|
114
|
+
def duplicate_reason(self) -> str | None:
|
|
115
|
+
if self.dupe_type is not None:
|
|
116
|
+
return DUPE_TYPE_STRS[self.dupe_type]
|
|
117
|
+
|
|
118
|
+
def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
|
|
119
|
+
"""Create synthetic DocCfg objects that set the 'dupe_of_id' field to point back to this object."""
|
|
120
|
+
for id in self.duplicate_ids:
|
|
121
|
+
dupe_cfg = deepcopy(self)
|
|
122
|
+
dupe_cfg.id = id
|
|
123
|
+
dupe_cfg.dupe_of_id = self.id
|
|
124
|
+
dupe_cfg.duplicate_ids = []
|
|
125
|
+
dupe_cfg.dupe_type = self.dupe_type
|
|
126
|
+
dupe_cfg.was_generated = True
|
|
127
|
+
yield dupe_cfg
|
|
128
|
+
|
|
129
129
|
def metadata(self) -> Metadata:
|
|
130
130
|
non_null_fields = {k: v for k, v in asdict(self).items() if v and k not in INVALID_FOR_METADATA}
|
|
131
131
|
|
|
@@ -229,10 +229,6 @@ class EmailCfg(CommunicationCfg):
|
|
|
229
229
|
is_fwded_article: bool = False
|
|
230
230
|
recipients: list[str | None] = field(default_factory=list)
|
|
231
231
|
|
|
232
|
-
def __post_init__(self):
|
|
233
|
-
super().__post_init__()
|
|
234
|
-
self.category = EMAIL
|
|
235
|
-
|
|
236
232
|
@classmethod
|
|
237
233
|
def from_doc_cfg(cls, cfg: DocCfg) -> 'EmailCfg':
|
|
238
234
|
return cls(**asdict(cfg))
|
|
@@ -244,10 +240,6 @@ class EmailCfg(CommunicationCfg):
|
|
|
244
240
|
|
|
245
241
|
@dataclass(kw_only=True)
|
|
246
242
|
class TextCfg(CommunicationCfg):
|
|
247
|
-
def __post_init__(self):
|
|
248
|
-
super().__post_init__()
|
|
249
|
-
self.category = TEXT_MESSAGE
|
|
250
|
-
|
|
251
243
|
# This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
|
|
252
244
|
def __repr__(self) -> str:
|
|
253
245
|
return super().__repr__()
|
epstein_files/util/env.py
CHANGED
|
@@ -2,7 +2,7 @@ import logging
|
|
|
2
2
|
from argparse import ArgumentParser
|
|
3
3
|
from os import environ
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from sys import argv
|
|
5
|
+
from sys import argv, exit
|
|
6
6
|
|
|
7
7
|
from rich_argparse_plus import RichHelpFormatterPlus
|
|
8
8
|
|
|
@@ -11,28 +11,30 @@ from epstein_files.util.logging import env_log_level, logger
|
|
|
11
11
|
COUNT_WORDS_SCRIPT = 'epstein_word_count'
|
|
12
12
|
DEFAULT_WIDTH = 145
|
|
13
13
|
HTML_SCRIPTS = ['epstein_generate', COUNT_WORDS_SCRIPT]
|
|
14
|
+
EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
|
|
15
|
+
|
|
14
16
|
|
|
15
17
|
RichHelpFormatterPlus.choose_theme('morning_glory')
|
|
16
18
|
parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML pages.", formatter_class=RichHelpFormatterPlus)
|
|
19
|
+
parser.add_argument('--make-clean', action='store_true', help='delete all HTML build artifact and write latest URLs to .urls.env')
|
|
17
20
|
parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
|
|
18
|
-
parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='ovewrite cached
|
|
21
|
+
parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='re-parse the files and ovewrite cached data')
|
|
19
22
|
|
|
20
|
-
output = parser.add_argument_group('OUTPUT')
|
|
23
|
+
output = parser.add_argument_group('OUTPUT', 'Options used by epstein_generate.')
|
|
21
24
|
output.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
|
|
22
25
|
output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
|
|
23
|
-
output.add_argument('--build', '-b', action='store_true', help='write output to
|
|
24
|
-
output.add_argument('--json-
|
|
25
|
-
output.add_argument('--
|
|
26
|
-
output.add_argument('--output-emails', '-oe', action='store_true', help='generate
|
|
27
|
-
output.add_argument('--output-
|
|
28
|
-
output.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
|
|
26
|
+
output.add_argument('--build', '-b', action='store_true', help='write HTML output to a file')
|
|
27
|
+
output.add_argument('--json-files', action='store_true', help='pretty print all the raw JSON data files in the collection and exit')
|
|
28
|
+
output.add_argument('--json-metadata', action='store_true', help='dump JSON metadata for all files and exit')
|
|
29
|
+
output.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
|
|
30
|
+
output.add_argument('--output-other', '-oo', action='store_true', help='generate other files section')
|
|
29
31
|
output.add_argument('--output-texts', '-ot', action='store_true', help='generate text messages section')
|
|
30
32
|
output.add_argument('--sort-alphabetical', action='store_true', help='sort emailers alphabetically intead of by email count')
|
|
31
33
|
output.add_argument('--suppress-output', action='store_true', help='no output to terminal (use with --build)')
|
|
32
34
|
output.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use (in characters)')
|
|
33
|
-
output.add_argument('--use-epstein-web
|
|
35
|
+
output.add_argument('--use-epstein-web', action='store_true', help='use epsteinweb.org links instead of epstein.media')
|
|
34
36
|
|
|
35
|
-
scripts = parser.add_argument_group('SCRIPTS', '
|
|
37
|
+
scripts = parser.add_argument_group('SCRIPTS', 'Options used by epstein_search, epstein_show, and epstein_diff.')
|
|
36
38
|
scripts.add_argument('positional_args', nargs='*', help='strings to searchs for, file IDs to show or diff, etc.')
|
|
37
39
|
scripts.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (used by epstein_show)')
|
|
38
40
|
scripts.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (used by epstein_search)')
|
|
@@ -42,23 +44,35 @@ debug.add_argument('--colors-only', '-c', action='store_true', help='print heade
|
|
|
42
44
|
debug.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
|
|
43
45
|
debug.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
|
|
44
46
|
debug.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats about the files')
|
|
47
|
+
debug.add_argument('--skip-other-files', '-sof', action='store_true', help='skip parsing non email/text files')
|
|
45
48
|
debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
|
|
46
49
|
args = parser.parse_args()
|
|
47
50
|
|
|
51
|
+
|
|
52
|
+
# Verify Epstein docs can be found
|
|
53
|
+
DOCS_DIR_ENV = environ.get(EPSTEIN_DOCS_DIR_ENV_VAR_NAME)
|
|
54
|
+
DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
|
|
55
|
+
|
|
56
|
+
if not DOCS_DIR_ENV:
|
|
57
|
+
print(f"\n ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!\n")
|
|
58
|
+
exit(1)
|
|
59
|
+
elif not DOCS_DIR.exists():
|
|
60
|
+
print(f"\n ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!\n")
|
|
61
|
+
exit(1)
|
|
62
|
+
|
|
48
63
|
current_script = Path(argv[0]).name
|
|
49
64
|
is_env_var_set = lambda s: len(environ.get(s) or '') > 0
|
|
50
65
|
is_html_script = current_script in HTML_SCRIPTS
|
|
51
66
|
|
|
52
67
|
args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
|
|
53
68
|
args.output_emails = args.output_emails or args.all_emails
|
|
54
|
-
args.
|
|
69
|
+
args.output_other = args.output_other or args.all_other_files
|
|
55
70
|
args.overwrite_pickle = args.overwrite_pickle or (is_env_var_set('OVERWRITE_PICKLE') and not is_env_var_set('PICKLED'))
|
|
56
71
|
args.width = args.width if is_html_script else None
|
|
57
72
|
is_output_selected = any([arg.startswith('output_') and value for arg, value in vars(args).items()])
|
|
58
73
|
is_output_selected = is_output_selected or args.json_metadata or args.colors_only
|
|
59
74
|
specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]
|
|
60
75
|
|
|
61
|
-
|
|
62
76
|
# Log level args
|
|
63
77
|
if args.deep_debug:
|
|
64
78
|
logger.setLevel(logging.DEBUG)
|
|
@@ -74,11 +88,9 @@ logger.info(f'Log level set to {logger.level}...')
|
|
|
74
88
|
# Massage args that depend on other args to the appropriate state
|
|
75
89
|
if current_script == 'epstein_generate' and not (is_output_selected or args.make_clean):
|
|
76
90
|
logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
|
|
77
|
-
args.output_texts = True
|
|
78
|
-
args.output_emails = True
|
|
79
|
-
args.output_other_files = True
|
|
91
|
+
args.output_texts = args.output_emails = args.output_other = True
|
|
80
92
|
|
|
81
|
-
if args.
|
|
93
|
+
if args.use_epstein_web:
|
|
82
94
|
logger.warning(f"Using links to epsteinweb.org links instead of epsteinify.com...")
|
|
83
95
|
|
|
84
96
|
if args.debug:
|
|
@@ -1,20 +1,9 @@
|
|
|
1
1
|
import re
|
|
2
|
-
from os import environ
|
|
3
2
|
from pathlib import Path
|
|
4
|
-
from sys import exit
|
|
5
3
|
|
|
6
4
|
from epstein_files.util.constant.strings import FILE_NAME_REGEX, FILE_STEM_REGEX, HOUSE_OVERSIGHT_PREFIX
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
DOCS_DIR_ENV = environ[EPSTEIN_DOCS_DIR_ENV_VAR_NAME]
|
|
10
|
-
DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
|
|
11
|
-
|
|
12
|
-
if not DOCS_DIR_ENV:
|
|
13
|
-
print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!")
|
|
14
|
-
exit(1)
|
|
15
|
-
elif not DOCS_DIR.exists():
|
|
16
|
-
print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!")
|
|
17
|
-
exit(1)
|
|
5
|
+
from epstein_files.util.env import DOCS_DIR
|
|
6
|
+
from epstein_files.util.logging import logger
|
|
18
7
|
|
|
19
8
|
EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
|
|
20
9
|
FILE_ID_REGEX = re.compile(fr".*{FILE_NAME_REGEX.pattern}")
|
|
@@ -23,9 +12,10 @@ KB = 1024
|
|
|
23
12
|
MB = KB * KB
|
|
24
13
|
|
|
25
14
|
|
|
26
|
-
#
|
|
15
|
+
# Coerce methods hands both string and int arguments.
|
|
16
|
+
coerce_file_name = lambda filename_or_id: coerce_file_stem(filename_or_id) + '.txt'
|
|
17
|
+
coerce_file_path = lambda filename_or_id: DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
|
|
27
18
|
id_str = lambda id: f"{int(id):06d}"
|
|
28
|
-
filename_for_id = lambda id: file_stem_for_id(id) + '.txt'
|
|
29
19
|
|
|
30
20
|
|
|
31
21
|
def coerce_file_stem(filename_or_id: int | str) -> str:
|
|
@@ -42,14 +32,6 @@ def coerce_file_stem(filename_or_id: int | str) -> str:
|
|
|
42
32
|
return file_stem
|
|
43
33
|
|
|
44
34
|
|
|
45
|
-
def coerce_file_name(filename_or_id: int | str) -> str:
|
|
46
|
-
return coerce_file_stem(filename_or_id) + '.txt'
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def coerce_file_path(filename_or_id: int | str) -> Path:
|
|
50
|
-
return DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
|
|
51
|
-
|
|
52
|
-
|
|
53
35
|
def extract_file_id(filename_or_id: int | str | Path) -> str:
|
|
54
36
|
if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
|
|
55
37
|
return id_str(filename_or_id)
|
|
@@ -67,7 +49,10 @@ def file_size(file_path: str | Path) -> int:
|
|
|
67
49
|
|
|
68
50
|
|
|
69
51
|
def file_size_str(file_path: str | Path) -> str:
|
|
70
|
-
|
|
52
|
+
return file_size_to_str(file_size(file_path))
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def file_size_to_str(size: int) -> str:
|
|
71
56
|
digits = 2
|
|
72
57
|
|
|
73
58
|
if size > MB:
|
|
@@ -96,3 +81,7 @@ def is_local_extract_file(filename) -> bool:
|
|
|
96
81
|
"""Return true if filename is of form 'HOUSE_OVERSIGHT_029835_1.txt'."""
|
|
97
82
|
file_match = FILE_ID_REGEX.match(str(filename))
|
|
98
83
|
return True if file_match and file_match.group(2) else False
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def log_file_write(file_path: str | Path) -> None:
|
|
87
|
+
logger.warning(f"Wrote {file_size_str(file_path)} to '{file_path}'")
|