epstein-files 1.2.0__py3-none-any.whl → 1.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +42 -30
- epstein_files/documents/communication.py +0 -3
- epstein_files/documents/document.py +66 -19
- epstein_files/documents/email.py +203 -208
- epstein_files/documents/emails/email_header.py +10 -2
- epstein_files/documents/imessage/text_message.py +3 -2
- epstein_files/documents/other_file.py +16 -34
- epstein_files/epstein_files.py +24 -35
- epstein_files/person.py +67 -73
- epstein_files/util/constant/names.py +21 -12
- epstein_files/util/constant/output_files.py +8 -5
- epstein_files/util/constant/strings.py +2 -2
- epstein_files/util/constant/urls.py +14 -2
- epstein_files/util/constants.py +38 -12
- epstein_files/util/data.py +2 -1
- epstein_files/util/doc_cfg.py +3 -3
- epstein_files/util/env.py +10 -7
- epstein_files/util/highlighted_group.py +366 -202
- epstein_files/util/logging.py +1 -1
- epstein_files/util/output.py +54 -21
- epstein_files/util/rich.py +21 -16
- epstein_files/util/timer.py +14 -0
- epstein_files/util/word_count.py +1 -1
- {epstein_files-1.2.0.dist-info → epstein_files-1.2.5.dist-info}/METADATA +5 -2
- epstein_files-1.2.5.dist-info/RECORD +34 -0
- epstein_files-1.2.0.dist-info/RECORD +0 -34
- {epstein_files-1.2.0.dist-info → epstein_files-1.2.5.dist-info}/LICENSE +0 -0
- {epstein_files-1.2.0.dist-info → epstein_files-1.2.5.dist-info}/WHEEL +0 -0
- {epstein_files-1.2.0.dist-info → epstein_files-1.2.5.dist-info}/entry_points.txt +0 -0
epstein_files/documents/email.py
CHANGED
@@ -1,6 +1,7 @@
 import json
 import logging
 import re
+from collections import defaultdict
 from copy import deepcopy
 from dataclasses import asdict, dataclass, field
 from datetime import datetime
@@ -31,8 +32,9 @@ from epstein_files.util.rich import *
 BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
 BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
 DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
-LINK_LINE_REGEX = re.compile(f"
-
+LINK_LINE_REGEX = re.compile(f"^>? ?htt")
+LINK_LINE2_REGEX = re.compile(r"^[-\w.%&=/]{5,}$")
+QUOTED_REPLY_LINE_REGEX = re.compile(r'(\nFrom:(.*)|wrote:)\n', re.IGNORECASE)
 REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)

 BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
@@ -42,11 +44,13 @@ LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")

 SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
 REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
-URL_SIGNIFIERS = ['gclid', 'htm', 'ref=', 'utm']
+URL_SIGNIFIERS = ['amp?', 'cd=', 'click', 'ft=', 'gclid', 'htm', 'keywords=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'usg=', 'utm']
 APPEARS_IN = 'appears in'
-
+
 MAX_NUM_HEADER_LINES = 14
 MAX_QUOTED_REPLIES = 2
+MAX_CHARS_TO_PRINT = 4000
+TRUNCATED_CHARS = int(MAX_CHARS_TO_PRINT / 3)

 REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
     '********************************',
@@ -72,12 +76,13 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
     # Signatures
     'BlackBerry by AT &T': 'BlackBerry by AT&T',
     'BlackBerry from T- Mobile': 'BlackBerry from T-Mobile',
-    'Envoy& de
+    'Envoy& de': 'Envoyé de',
     "from my 'Phone": 'from my iPhone',
     'from Samsung Mob.le': 'from Samsung Mobile',
     'gJeremyRubin': '@JeremyRubin',
     'Sent from Mabfl': 'Sent from Mobile', # NADIA_MARCINKO signature bad OCR
     'twitter glhsummers': 'twitter @lhsummers',
+    re.compile(r"[cC]o-authored with i ?Phone auto-correct"): "Co-authored with iPhone auto-correct",
     re.compile(r"twitter\.com[i/][lI]krauss[1lt]"): "twitter.com/lkrauss1",
     re.compile(r'from my BlackBerry[0°] wireless device'): 'from my BlackBerry® wireless device',
     re.compile(r'^INW$', re.MULTILINE): REDACTED,
@@ -109,22 +114,28 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
 EMAIL_SIGNATURE_REGEXES = {
     ARIANE_DE_ROTHSCHILD: re.compile(r"Ensemble.*\nCe.*\ndestinataires.*\nremercions.*\nautorisee.*\nd.*\nLe.*\ncontenues.*\nEdmond.*\nRoth.*\nlo.*\nRoth.*\ninfo.*\nFranc.*\n.2.*", re.I),
     BARBRO_C_EHNBOM: re.compile(r"Barbro C.? Ehn.*\nChairman, Swedish-American.*\n((Office|Cell|Sweden):.*\n)*(360.*\nNew York.*)?"),
+    BRAD_KARP: re.compile(r"This message is intended only for the use of the Addressee and may contain information.*\nnot the intended recipient, you are hereby notified.*\nreceived this communication in error.*"),
+    DANIEL_SIAD: re.compile(r"Confidentiality Notice: The information contained in this electronic message is PRIVILEGED and confidential information intended only for the use of the individual entity or entities named as recipient or recipients. If the reader is not the intended recipient, be hereby notified that any dissemination, distribution or copy of this communication is strictly prohibited. If you have received this communication in error, please notify me immediately by electronic mail or by telephone and permanently delete this message from your computer system. Thank you.".replace(' ', r'\s*'), re.IGNORECASE),
     DANNY_FROST: re.compile(r"Danny Frost\nDirector.*\nManhattan District.*\n212.*", re.IGNORECASE),
     DARREN_INDYKE: re.compile(r"DARREN K. INDYKE.*?\**\nThe information contained in this communication.*?Darren K.[\n\s]+?[Il]ndyke(, PLLC)? — All rights reserved\.? ?\n\*{50,120}(\n\**)?", re.DOTALL),
     DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
     DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
+    EDUARDO_ROBLES: re.compile(fr"(• )?email:.*\n(• )?email:\n(• )?website: www.creativekingdom.com\n(• )?address: 5th Floor Office No:504 Aspect Tower,\nBusiness Bay, Dubai United Arab Emirates."),
     JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
-    JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*", re.IGNORECASE),
+    JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*(\nTel:.*)?(\nEmail:.*)?", re.IGNORECASE),
     KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
     LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
     LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
-    MARTIN_WEINBERG: re.compile(r"(Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61
-
+    MARTIN_WEINBERG: re.compile(r"(Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*?)?(\n.*?([cC]ell|Office))*\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
+    NICHOLAS_RIBIS: re.compile(r"60 Morris Turnpike 2FL\nSummit,? NJ.*\n0:\nF:\n\*{20,}\nCONFIDENTIALITY NOTICE.*\nattachments.*\ncopying.*\nIf you have.*\nthe copy.*\nThank.*\n\*{20,}"),
     PETER_MANDELSON: re.compile(r'Disclaimer This email and any attachments to it may be.*?with[ \n]+number(.*?EC4V[ \n]+6BJ)?', re.DOTALL | re.IGNORECASE),
     PAUL_BARRETT: re.compile(r"Paul Barrett[\n\s]+Alpha Group Capital LLC[\n\s]+(142 W 57th Street, 11th Floor, New York, NY 10019?[\n\s]+)?(al?[\n\s]*)?ALPHA GROUP[\n\s]+CAPITAL"),
-    RICHARD_KAHN: re.compile(
+    RICHARD_KAHN: re.compile(fr'Richard Kahn[\n\s]+HBRK Associates Inc.?[\n\s]+((301 East 66th Street, Suite 1OF|575 Lexington Avenue,? 4th Floor,?)[\n\s]+)?New York, (NY|New York) 100(22|65)(\s+(Tel?|Phone)( I|{REDACTED})?\s+Fa[x",]?(_|{REDACTED})*\s+[Ce]el?l?)?', re.IGNORECASE),
+    ROSS_GOW: re.compile(r"Ross Gow\nManaging Partner\nACUITY Reputation Limited\n23 Berkeley Square\nLondon.*\nMobile.*\nTel"),
+    STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
     'Susan Edelman': re.compile(r'Susan Edel.*\nReporter\n1211.*\n917.*\nsedelman.*', re.IGNORECASE),
     TERRY_KAFKA: re.compile(r"((>|I) )?Terry B.? Kafka.*\n(> )?Impact Outdoor.*\n(> )?5454.*\n(> )?Dallas.*\n((> )?c?ell.*\n)?(> )?Impactoutdoor.*(\n(> )?cell.*)?", re.IGNORECASE),
+    TOM_PRITZKER: re.compile(r"The contents of this email message.*\ncontain confidential.*\n(not )?the intended.*\n(error|please).*\n(you )?(are )?not the.*\n(this )?message.*"),
     TONJA_HADDAD_COLEMAN: re.compile(fr"Tonja Haddad Coleman.*\nTonja Haddad.*\nAdvocate Building\n315 SE 7th.*(\nSuite.*)?\nFort Lauderdale.*(\n({REDACTED} )?facsimile)?(\nwww.tonjahaddad.com?)?(\nPlease add this efiling.*\nThe information.*\nyou are not.*\nyou are not.*)?", re.IGNORECASE),
     UNKNOWN: re.compile(r"(This message is directed to and is for the use of the above-noted addressee only.*\nhereon\.)", re.DOTALL),
 }
@@ -136,118 +147,107 @@ MAILING_LISTS = [
     JP_MORGAN_USGIO,
 ]

-
+BCC_LISTS = JUNK_EMAILERS + MAILING_LISTS
+
+TRUNCATE_EMAILS_FROM_OR_TO = [
+    AMANDA_ENS,
+    ANTHONY_BARRETT,
+    DIANE_ZIMAN,
+    JOSCHA_BACH,
+    KATHERINE_KEATING,
+    LAWRENCE_KRAUSS,
+    LISA_NEW,
+    NILI_PRIELL_BARAK,
+    PAUL_KRASSNER,
+]

-
+TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
     'Alan S Halperin',
+    'Alain Forget',
+    ARIANE_DE_ROTHSCHILD,
+    AZIZA_ALAHMADI,
+    BILL_SIEGEL,
+    DAVID_HAIG,
+    EDWARD_ROD_LARSEN,
+    JOHNNY_EL_HACHEM,
+    MELANIE_WALKER,
     'Mitchell Bard',
+    PEGGY_SIEGAL,
+    ROBERT_LAWRENCE_KUHN,
+    ROBERT_TRIVERS,
     'Skip Rimer',
+    'Steven Elkman',
+    STEVEN_PFEIFFER,
     'Steven Victor MD',
+    TERRY_KAFKA,
 ]

-
+# These IDs will be appended to INTERESTING_EMAIL_IDS
+INTERESTING_TRUNCATION_LENGTHS = {
     '023627': 16_800, # Micheal Wolff article with brock pierce
     '030245': None, # Epstein rationalizes his behavior in an open letter to the world
     '030781': None, # Bannon email about crypto coin issues
     '032906': None, # David Blaine email
     '026036': 6000, # Gino Yu blockchain mention
-    '023208': None, # Long discussion about leon black's finances
     '029609': None, # Joi Ito
     '025233': None, # Reputation.com discussion
+    '017827': None, # Bannon / Peggy Siegal email about netflix doc on Epstein
+    '030222': None, # Ross Gow / Ghislaine correspondence
+    '026028': None, # Larry Summers / Karim Wade intro
+    '029545': None, # Tyler Shears reputation
+    '025812': None, # Tyler Shears reputation
+    '029914': 4500, # Lord Mandelson russian investments
+    '033453': None, # "Just heard you were telling people that you heard I asked Trump for a million dollars"
+    '031320': None, # Epstein Gratitude foundation
+    '031036': None, # Barbro Ehnbom talking about Swedish girl
+    '023454': 1878, # Email invitation sent to tech CEOs + Epstein
+    '029342': 2000, # Hakeem Jeffries
+}
+
+TRUNCATION_LENGTHS = {
+    **INTERESTING_TRUNCATION_LENGTHS,
+    '031791': None, # First email in Jessica Cadwell chain about service of legal documents
+    '023208': None, # Long discussion about leon black's finances
+    '028589': None, # Long thread with Reid Weingarten
+    '029433': TRUNCATED_CHARS, # Kahn taxes
+    '026778': TRUNCATED_CHARS, # Kahn taxes
+    '033311': TRUNCATED_CHARS, # Kahn taxes
+    '024251': TRUNCATED_CHARS, # Kahn taxes
+    '026755': TRUNCATED_CHARS, # Epstein self fwd
 }

 # These are long forwarded articles so we force a trim to 1,333 chars if these strings exist
 TRUNCATE_TERMS = [
-    'The rebuilding of Indonesia',
+    'The rebuilding of Indonesia', # Vikcy ward article
     'Dominique Strauss-Kahn',
     'THOMAS L. FRIEDMAN',
-    'a sleek, briskly paced film whose title suggests a heist movie',
-    'quote from The Colbert Report distinguishes',
-    'co-inventor of the GTX Smart Shoe',
-    'my latest Washington Post column',
-    'supported my humanities work at Harvard',
+    'a sleek, briskly paced film whose title suggests a heist movie', # Inside Job
     'Calendar of Major Events, Openings, and Fundraisers',
-    'Nuclear Operator Raises Alarm on Crisis',
-    'as responsible for the democratisation of computing and',
-    'AROUND 1,000 operational satellites are circling the Earth',
     "In recent months, China's BAT collapse",
     'President Obama introduces Jim Yong Kim as his nominee',
     'Trump appears with mobster-affiliated felon at New',
-    'Lead Code Enforcement Walton presented the facts',
-    "Is UNRWA vital for the Palestinians' future",
-    'The New York company, led by Stephen Ross',
-    'I spent some time mulling additional aspects of a third choice presidential',
-    'you are referring to duplication of a gene',
-    'i am writing you both because i am attaching a still not-quite-complete response',
-    'Learn to meditate and discover what truly nourishes your entire being',
     'Congratulations to the 2019 Hillman Prize recipients',
-    'This much we know - the Fall elections are shaping up',
     "Special counsel Robert Mueller's investigation may face a serious legal obstacle",
     "nearly leak-proof since its inception more than a year ago",
-
-
-
-    '
-
-    '
-
-    'We remain positive on banks that can make acceptable returns',
-    'David Woo (BAML head of FX, Rates and EM Strategy, very highly regarded',
-    "Please let me know if you're interested in joining a small group meeting",
-    'Erika Najarian, BAML financials research analyst, just returned',
-    'We can also discuss single stock and Topix banks',
-    'We are recording unprecedented divergences in falling equity vol',
-    'As previously discussed between you and Ariane',
-    'no evidence you got the latest so i have sent you just the key message',
-    # Joscha Bach
-    'Cells seem to be mostly indistinguishable (except',
-    'gender differenece. unlikely motivational, every cell is different',
-    'Some thoughts I meant to send back for a long time',
-    # Krassner
-    'My friend Michael Simmons, who has been the editor of National Lampoon',
-    "In the premiere episode of 'The Last Laugh' podcast, Sarah Silverman",
-    'Thanks so much for sharing both your note to Steven and your latest Manson essay',
-    # Edward Larson
-    'Coming from an international background, and having lived in Oslo, Tel Aviv',
-    # Katherine Keating
-    'Paul Keating is aware that many people see him as a puzzle and contradiction',
-    'his panoramic view of world affairs sharper than ever, Paul Keating blames',
-    # melanie
-    'Some years ago when I worked at the libertarian Cato Institute'
-    # rich kahn
-    'House and Senate Republicans on their respective tax overhaul',
-    'The Tax Act contains changes to the treatment of "carried interests"',
-    'General Election: Trump vs. Clinton LA Times/USC Tracking',
-    'Location: Quicken Loans Arena in Cleveland, OH',
-    'A friendly discussion about Syria with a former US State Department',
-    # Robert Kuhn
-    'The US trade war against China: The view from Beijing',
-    # Tom / Paul Krassner
-    'I forgot to post my cartoon from week before last, about Howard Schultz',
+    # Nikolic
+    'Nuclear Operator Raises Alarm on Crisis',
+    'as responsible for the democratisation of computing and',
+    'AROUND 1,000 operational satellites are circling the Earth',
+    # Sultan Sulayem
+    'co-inventor of the GTX Smart Shoe',
+    'my latest Washington Post column',
     # Bannon
     "Bannon the European: He's opening the populist fort in Brussels",
     "Steve Bannon doesn't do subtle.",
     'The Department of Justice lost its latest battle with Congress',
-
-
-
-    #
-
-    '
-    '
-    '
-    'lecture in Heidelberg Oct 14 but they had to cancel',
-    # Nikolic
-    'people from LifeBall',
-    # Epstein
-    'David Ben Gurion was asked why he, after 2000',
-    # Lisa New
-    'The raw materials for that period include interviews',
-    'Whether you donated to Poetry in America through',
-    # Random
-    'Little Hodiaki',
-    "It began with deep worries regarding China's growth path",
-    'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
+    # lawyers
+    'recuses itself from Jeffrey Epstein case',
+    # Misc
+    'people from LifeBall', # Nikolic
+    "It began with deep worries regarding China's growth path", # Paul Morris
+    'A friendly discussion about Syria with a former US State Department', # Fabrice Aidan
+    'The US trade war against China: The view from Beijing', # Robert Kuhn / Groff
+    'This much we know - the Fall elections are shaping up', # Juleanna Glover / Bannon
 ]

 METADATA_FIELDS = [
@@ -258,56 +258,78 @@ METADATA_FIELDS = [
     'subject',
 ]

-# Note the line repair happens *after* 'Importance: High' is removed
+# Arguments to _merge_lines(). Note the line repair happens *after* 'Importance: High' is removed
 LINE_REPAIR_MERGES = {
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
-    '
+    '013405': [[4]] * 2,
+    '013415': [[4]] * 2,
+    '014397': [[4]] * 2,
+    '014860': [[3], [4], [4]],
+    '017523': [[4]],
+    '019105': [[5]] * 4,
+    '019407': [[2, 4]],
+    '021729': [[2]],
+    '022673': [[9]],
+    '022684': [[9]],
+    '022695': [[4]],
+    '022977': [[9]] * 10,
+    '023001': [[5]] * 3,
+    '023067': [[3]],
+    '025233': [[4]] * 2,
+    '025329': [[2]] * 9,
+    '025790': [[2]],
+    '025812': [[3]] * 2,
+    '026345': [[3]],
+    '026609': [[4]],
+    '026829': [[3]],
+    '026924': [[2, 4]],
+    '028728': [[3]],
+    '028931': [[3, 6]],
+    '029154': [[2, 5]],
+    '029163': [[2, 5]],
+    '029282': [[2]],
+    '029402': [[5]],
+    '029433': [[3]],
+    '029458': [[4]] * 3,
+    '029498': [[2], [2, 4]],
+    '029501': [[2]],
+    '029545': [[3, 5]],
+    '029773': [[2, 5]],
+    '029831': [[3, 6]],
+    '029835': [[2, 4]],
+    '029841': [[3]],
+    '029889': [[2], [2, 5]],
+    '029976': [[3]],
+    '029977': ([[2]] * 4) + [[4], [2, 4]],
+    '030299': [[7, 10]],
+    '030315': [[3, 5]],
+    '030381': [[2, 4]],
+    '030384': [[2, 4]],
+    '030626': [[2], [4]],
+    '030999': [[2, 4]],
+    '031384': [[2]],
+    '031428': [[2], [2, 4]],
+    '031442': [[0]],
+    '031748': [[3]] * 2,
+    '031764': [[3]],
+    '031980': [[2, 4]],
+    '032063': [[3, 5]],
+    '032272': [[3]],
+    '032405': [[4]],
+    '032637': [[9]] * 3,
+    '033097': [[2]],
+    '033144': [[2, 4]],
+    '033217': [[3]],
+    '033228': [[3, 5]],
+    '033252': [[9]] * 2,
+    '033271': [[3]],
+    '033299': [[3]],
+    '033357': [[2, 4]],
+    '033486': [[7, 9]],
+    '033512': [[2]],
+    '033568': [[5]] * 5,
+    '033575': [[2, 4]],
+    '033576': [[3]],
+    '033583': [[2]],
 }

@@ -328,6 +350,7 @@ class Email(Communication):
     recipients: list[Name] = field(default_factory=list)
     sent_from_device: str | None = None
     signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
+    _line_merge_arguments: list[tuple[int] | tuple[int, int]] = field(default_factory=list)

     # For logging how many headers we prettified while printing, kind of janky
     rewritten_header_ids: ClassVar[set[str]] = set([])
@@ -353,7 +376,7 @@
         self.recipients.extend(self._extract_emailer_names(recipient))

         # Assume mailing list emails are to Epstein
-        if self.author in
+        if self.author in BCC_LISTS and (self.is_note_to_self() or not self.recipients):
             self.recipients = [JEFFREY_EPSTEIN]

         # Remove self CCs but preserve self emails
@@ -390,6 +413,9 @@
     def is_note_to_self(self) -> bool:
         return self.recipients == [self.author]

+    def is_with(self, name: str) -> bool:
+        return name in [self.author] + self.recipients
+
     def metadata(self) -> Metadata:
         local_metadata = asdict(self)
         local_metadata['is_junk_mail'] = self.is_junk_mail()
@@ -436,9 +462,9 @@
         elif self.header.num_header_rows == 0:
             return self.text

-        reply_text_match = REPLY_TEXT_REGEX.search(text)
         self.log_top_lines(20, "Raw text:", logging.DEBUG)
         self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
+        reply_text_match = REPLY_TEXT_REGEX.search(text)

         if reply_text_match:
             actual_num_chars = len(reply_text_match.group(1))
@@ -550,13 +576,24 @@

     def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
         """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
-
+        if text is None:
+            header_offset = len(self.header.header_chars)
+            text = self.text[header_offset:]
+        else:
+            header_offset = 0
+
+        for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text)):
             if i >= n:
-                return match.end() - 1
+                return match.end() + header_offset - 1

     def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
         """Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
-
+        if idx2 is None:
+            self._line_merge_arguments.append((idx,))
+            idx2 = idx + 1
+        else:
+            self._line_merge_arguments.append((idx, idx2))
+
         lines = self.lines[0:idx]

         if idx2 <= idx:
@@ -599,68 +636,15 @@
         old_text = self.text

         if self.file_id in LINE_REPAIR_MERGES:
-
-
-                self._merge_lines(*merge_args)
-
-        # These already had 2nd line merged
-        if self.file_id in ['030626']: # Merge 6th and 7th (now 5th and 6th) rows
-            self._merge_lines(4)
-        elif self.file_id == '029889':
-            self._merge_lines(2, 5)
-        elif self.file_id in ['029498', '031428']:
-            self._merge_lines(2, 4)
-
-        # Multiline
-        if self.file_id == '013415':
-            for _i in range(2):
-                self._merge_lines(4)
-        elif self.file_id == '013405':
-            for _i in range(2):
-                self._merge_lines(4)
-        elif self.file_id == '029458':
-            for _i in range(3):
-                self._merge_lines(4)
-        elif self.file_id in ['025233']:
-            for _i in range(2):
-                self._merge_lines(4)
+            for merge_args in LINE_REPAIR_MERGES[self.file_id]:
+                self._merge_lines(*merge_args)

+        if self.file_id in ['025233']:
             self.lines[4] = f"Attachments: {self.lines[4]}"
             self._set_computed_fields(lines=self.lines)
-        elif self.file_id in ['023001']:
-            for _i in range(3):
-                self._merge_lines(5)
-        elif self.file_id in ['019105']:
-            for _i in range(4):
-                self._merge_lines(5)
-        elif self.file_id in ['033568']:
-            for _i in range(5):
-                self._merge_lines(5)
-        elif self.file_id in ['025329']:
-            for _i in range(9):
-                self._merge_lines(2)
-        elif self.file_id in ['025812']:
-            for _i in range(2):
-                self._merge_lines(3)
-        elif self.file_id == '014860':
-            self._merge_lines(3)
-            self._merge_lines(4)
-            self._merge_lines(4)
         elif self.file_id == '029977':
             self._set_computed_fields(text=self.text.replace('Sent 9/28/2012 2:41:02 PM', 'Sent: 9/28/2012 2:41:02 PM'))

-            for _i in range(4):
-                self._merge_lines(2)
-
-            self._merge_lines(4)
-            self._merge_lines(2, 4)
-        elif self.file_id in ['033252']:
-            for _i in range(2):
-                self._merge_lines(9)
-        elif self.file_id in ['032637']:
-            for _i in range(3):
-                self._merge_lines(9)
-
         # Bad line removal
         if self.file_id == '025041':
             self._remove_line(4)
@@ -681,14 +665,17 @@
         line = lines[i]

         if LINK_LINE_REGEX.search(line):
-
-
-
+            while i < (len(lines) - 1) \
+                    and 'http' not in lines[i + 1] \
+                    and (lines[i + 1].endswith('/') or any(s in lines[i + 1] for s in URL_SIGNIFIERS) or LINK_LINE2_REGEX.match(lines[i + 1])):
                 logger.debug(f"{self.filename}: Joining link lines\n 1. {line}\n 2. {lines[i + 1]}\n")
                 line += lines[i + 1]
                 i += 1

             line = line.replace(' ', '')
+        elif ' http' in line and line.endswith('html'):
+            pre_link, post_link = line.split(' http', 1)
+            line = f"{pre_link} http{post_link.replace(' ', '')}"

         new_lines.append(line)

@@ -739,10 +726,12 @@

         if args.whole_file:
             num_chars = len(self.text)
+        elif args.truncate:
+            num_chars = args.truncate
         elif self.file_id in TRUNCATION_LENGTHS:
             num_chars = TRUNCATION_LENGTHS[self.file_id] or self.file_size()
-        elif self.author in
-            num_chars =
+        elif self.author in TRUNCATE_EMAILS_FROM or any([self.is_with(n) for n in TRUNCATE_EMAILS_FROM_OR_TO]) or includes_truncate_term:
+            num_chars = min(quote_cutoff or MAX_CHARS_TO_PRINT, TRUNCATED_CHARS)
         elif quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
             num_chars = quote_cutoff
         else:
@@ -751,15 +740,14 @@
         if num_chars != MAX_CHARS_TO_PRINT and not self.is_duplicate():
             log_args = {
                 'num_chars': num_chars,
-                'author_truncate': self.author in
+                'author_truncate': self.author in TRUNCATE_EMAILS_FROM,
                 'is_fwded_article': self.is_fwded_article(),
                 'is_quote_cutoff': quote_cutoff == num_chars,
                 'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
                 'quote_cutoff': quote_cutoff,
             }

-            if
-                logger.debug(f'{self.summary()} truncating: ' + ', '.join([f"{k}={v}" for k, v in log_args.items() if v]) + '\n')
+            logger.debug(f'{self.summary()} truncating: ' + ', '.join([f"{k}={v}" for k, v in log_args.items() if v]) + '\n')

         return num_chars

@@ -793,6 +781,13 @@ class Email(Communication):
         text = _add_line_breaks(text) # This was skipped when _prettify_text() w/a broken header so we do it now
         self.rewritten_header_ids.add(self.file_id)

+        lines = [
+            Text.from_markup(f"[link={line}]{line}[/link]") if line.startswith('http') else Text(line)
+            for line in text.split('\n')
+        ]
+
+        text = join_texts(lines, '\n')
+
         email_txt_panel = Panel(
             highlighter(text).append('\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
             border_style=self._border_style(),
epstein_files/documents/emails/email_header.py
CHANGED
@@ -9,7 +9,6 @@ from epstein_files.util.logging import logger
 from epstein_files.util.rich import UNKNOWN

 FIELD_NAMES = ['Date', 'From', 'Sent', 'Subject']
-NON_HEADER_FIELDS = ['field_names', 'num_header_rows', 'was_initially_empty']
 ON_BEHALF_OF = 'on behalf of'
 TO_FIELDS = ['bcc', 'cc', 'to']
 EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
@@ -28,10 +27,18 @@ CONFIGURED_ACTUAL_TEXTS = [
     if isinstance(cfg, EmailCfg) and cfg.actual_text is not None
 ]

+NON_HEADER_FIELDS = [
+    'field_names',
+    'header_chars',
+    'num_header_rows',
+    'was_initially_empty',
+]
+

 @dataclass(kw_only=True)
 class EmailHeader:
     field_names: list[str] # Order is same as the order header fields appear in the email file text
+    header_chars: str = ''
     num_header_rows: int = field(init=False)
     was_initially_empty: bool = False

@@ -101,6 +108,7 @@
             setattr(self, field_name, value)

         self.num_header_rows = len(self.field_names) + num_headers
+        self.header_chars = '\n'.join(email_lines[0:self.num_header_rows])
         log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
         logger.debug(f"{log_msg}{self}\n\nTop lines:\n\n%s", '\n'.join(email_lines[0:(num_headers + 1) * 2]))

@@ -163,7 +171,7 @@
         if should_log_header:
             logger.debug(f"Header being parsed was this:\n\n{header}\n")

-        return
+        return cls(field_names=field_names, header_chars=header, **kw_args)

     @staticmethod
     def cleanup_str(_str: str) -> str: