epstein-files 1.2.1__py3-none-any.whl → 1.2.5__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  import json
2
2
  import logging
3
3
  import re
4
+ from collections import defaultdict
4
5
  from copy import deepcopy
5
6
  from dataclasses import asdict, dataclass, field
6
7
  from datetime import datetime
@@ -31,7 +32,8 @@ from epstein_files.util.rich import *
31
32
  BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
32
33
  BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
33
34
  DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
34
- LINK_LINE_REGEX = re.compile(f"^(> )?htt")
35
+ LINK_LINE_REGEX = re.compile(f"^>? ?htt")
36
+ LINK_LINE2_REGEX = re.compile(r"^[-\w.%&=/]{5,}$")
35
37
  QUOTED_REPLY_LINE_REGEX = re.compile(r'(\nFrom:(.*)|wrote:)\n', re.IGNORECASE)
36
38
  REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
37
39
 
@@ -42,11 +44,13 @@ LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
42
44
 
43
45
  SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
44
46
  REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
45
- URL_SIGNIFIERS = ['gclid', 'htm', 'ref=', 'utm']
47
+ URL_SIGNIFIERS = ['amp?', 'cd=', 'click', 'ft=', 'gclid', 'htm', 'keywords=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'usg=', 'utm']
46
48
  APPEARS_IN = 'appears in'
47
- MAX_CHARS_TO_PRINT = 4000
49
+
48
50
  MAX_NUM_HEADER_LINES = 14
49
51
  MAX_QUOTED_REPLIES = 2
52
+ MAX_CHARS_TO_PRINT = 4000
53
+ TRUNCATED_CHARS = int(MAX_CHARS_TO_PRINT / 3)
50
54
 
51
55
  REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
52
56
  '********************************',
@@ -72,12 +76,13 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
72
76
  # Signatures
73
77
  'BlackBerry by AT &T': 'BlackBerry by AT&T',
74
78
  'BlackBerry from T- Mobile': 'BlackBerry from T-Mobile',
75
- 'Envoy& de mon iPhone': 'Envoyé de mon iPhone',
79
+ 'Envoy& de': 'Envoyé de',
76
80
  "from my 'Phone": 'from my iPhone',
77
81
  'from Samsung Mob.le': 'from Samsung Mobile',
78
82
  'gJeremyRubin': '@JeremyRubin',
79
83
  'Sent from Mabfl': 'Sent from Mobile', # NADIA_MARCINKO signature bad OCR
80
84
  'twitter glhsummers': 'twitter @lhsummers',
85
+ re.compile(r"[cC]o-authored with i ?Phone auto-correct"): "Co-authored with iPhone auto-correct",
81
86
  re.compile(r"twitter\.com[i/][lI]krauss[1lt]"): "twitter.com/lkrauss1",
82
87
  re.compile(r'from my BlackBerry[0°] wireless device'): 'from my BlackBerry® wireless device',
83
88
  re.compile(r'^INW$', re.MULTILINE): REDACTED,
@@ -109,22 +114,28 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
109
114
  EMAIL_SIGNATURE_REGEXES = {
110
115
  ARIANE_DE_ROTHSCHILD: re.compile(r"Ensemble.*\nCe.*\ndestinataires.*\nremercions.*\nautorisee.*\nd.*\nLe.*\ncontenues.*\nEdmond.*\nRoth.*\nlo.*\nRoth.*\ninfo.*\nFranc.*\n.2.*", re.I),
111
116
  BARBRO_C_EHNBOM: re.compile(r"Barbro C.? Ehn.*\nChairman, Swedish-American.*\n((Office|Cell|Sweden):.*\n)*(360.*\nNew York.*)?"),
117
+ BRAD_KARP: re.compile(r"This message is intended only for the use of the Addressee and may contain information.*\nnot the intended recipient, you are hereby notified.*\nreceived this communication in error.*"),
118
+ DANIEL_SIAD: re.compile(r"Confidentiality Notice: The information contained in this electronic message is PRIVILEGED and confidential information intended only for the use of the individual entity or entities named as recipient or recipients. If the reader is not the intended recipient, be hereby notified that any dissemination, distribution or copy of this communication is strictly prohibited. If you have received this communication in error, please notify me immediately by electronic mail or by telephone and permanently delete this message from your computer system. Thank you.".replace(' ', r'\s*'), re.IGNORECASE),
112
119
  DANNY_FROST: re.compile(r"Danny Frost\nDirector.*\nManhattan District.*\n212.*", re.IGNORECASE),
113
120
  DARREN_INDYKE: re.compile(r"DARREN K. INDYKE.*?\**\nThe information contained in this communication.*?Darren K.[\n\s]+?[Il]ndyke(, PLLC)? — All rights reserved\.? ?\n\*{50,120}(\n\**)?", re.DOTALL),
114
121
  DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
115
122
  DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
123
+ EDUARDO_ROBLES: re.compile(fr"(• )?email:.*\n(• )?email:\n(• )?website: www.creativekingdom.com\n(• )?address: 5th Floor Office No:504 Aspect Tower,\nBusiness Bay, Dubai United Arab Emirates."),
116
124
  JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
117
125
  JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*(\nTel:.*)?(\nEmail:.*)?", re.IGNORECASE),
118
126
  KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
119
127
  LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
120
128
  LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
121
- MARTIN_WEINBERG: re.compile(r"(Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*)?(\n.*([cC]ell|Office))*\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
122
- STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
129
+ MARTIN_WEINBERG: re.compile(r"(Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*?)?(\n.*?([cC]ell|Office))*\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
130
+ NICHOLAS_RIBIS: re.compile(r"60 Morris Turnpike 2FL\nSummit,? NJ.*\n0:\nF:\n\*{20,}\nCONFIDENTIALITY NOTICE.*\nattachments.*\ncopying.*\nIf you have.*\nthe copy.*\nThank.*\n\*{20,}"),
123
131
  PETER_MANDELSON: re.compile(r'Disclaimer This email and any attachments to it may be.*?with[ \n]+number(.*?EC4V[ \n]+6BJ)?', re.DOTALL | re.IGNORECASE),
124
132
  PAUL_BARRETT: re.compile(r"Paul Barrett[\n\s]+Alpha Group Capital LLC[\n\s]+(142 W 57th Street, 11th Floor, New York, NY 10019?[\n\s]+)?(al?[\n\s]*)?ALPHA GROUP[\n\s]+CAPITAL"),
125
- RICHARD_KAHN: re.compile(r'Richard Kahn[\n\s]+HBRK Associates Inc.?[\n\s]+((301 East 66th Street, Suite 1OF|575 Lexington Avenue,? 4th Floor,?)[\n\s]+)?New York, (NY|New York) 100(22|65)([\n\s]+(Tel?|Phone)( I)?[\n\s]+Fa[x"]?[\n\s]+[Ce]el?l?)?', re.IGNORECASE),
133
+ RICHARD_KAHN: re.compile(fr'Richard Kahn[\n\s]+HBRK Associates Inc.?[\n\s]+((301 East 66th Street, Suite 1OF|575 Lexington Avenue,? 4th Floor,?)[\n\s]+)?New York, (NY|New York) 100(22|65)(\s+(Tel?|Phone)( I|{REDACTED})?\s+Fa[x",]?(_|{REDACTED})*\s+[Ce]el?l?)?', re.IGNORECASE),
134
+ ROSS_GOW: re.compile(r"Ross Gow\nManaging Partner\nACUITY Reputation Limited\n23 Berkeley Square\nLondon.*\nMobile.*\nTel"),
135
+ STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
126
136
  'Susan Edelman': re.compile(r'Susan Edel.*\nReporter\n1211.*\n917.*\nsedelman.*', re.IGNORECASE),
127
137
  TERRY_KAFKA: re.compile(r"((>|I) )?Terry B.? Kafka.*\n(> )?Impact Outdoor.*\n(> )?5454.*\n(> )?Dallas.*\n((> )?c?ell.*\n)?(> )?Impactoutdoor.*(\n(> )?cell.*)?", re.IGNORECASE),
138
+ TOM_PRITZKER: re.compile(r"The contents of this email message.*\ncontain confidential.*\n(not )?the intended.*\n(error|please).*\n(you )?(are )?not the.*\n(this )?message.*"),
128
139
  TONJA_HADDAD_COLEMAN: re.compile(fr"Tonja Haddad Coleman.*\nTonja Haddad.*\nAdvocate Building\n315 SE 7th.*(\nSuite.*)?\nFort Lauderdale.*(\n({REDACTED} )?facsimile)?(\nwww.tonjahaddad.com?)?(\nPlease add this efiling.*\nThe information.*\nyou are not.*\nyou are not.*)?", re.IGNORECASE),
129
140
  UNKNOWN: re.compile(r"(This message is directed to and is for the use of the above-noted addressee only.*\nhereon\.)", re.DOTALL),
130
141
  }
@@ -136,118 +147,107 @@ MAILING_LISTS = [
136
147
  JP_MORGAN_USGIO,
137
148
  ]
138
149
 
139
- BBC_LISTS = JUNK_EMAILERS + MAILING_LISTS
150
+ BCC_LISTS = JUNK_EMAILERS + MAILING_LISTS
151
+
152
+ TRUNCATE_EMAILS_FROM_OR_TO = [
153
+ AMANDA_ENS,
154
+ ANTHONY_BARRETT,
155
+ DIANE_ZIMAN,
156
+ JOSCHA_BACH,
157
+ KATHERINE_KEATING,
158
+ LAWRENCE_KRAUSS,
159
+ LISA_NEW,
160
+ NILI_PRIELL_BARAK,
161
+ PAUL_KRASSNER,
162
+ ]
140
163
 
141
- TRUNCATE_ALL_EMAILS_FROM = BBC_LISTS + [
164
+ TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
142
165
  'Alan S Halperin',
166
+ 'Alain Forget',
167
+ ARIANE_DE_ROTHSCHILD,
168
+ AZIZA_ALAHMADI,
169
+ BILL_SIEGEL,
170
+ DAVID_HAIG,
171
+ EDWARD_ROD_LARSEN,
172
+ JOHNNY_EL_HACHEM,
173
+ MELANIE_WALKER,
143
174
  'Mitchell Bard',
175
+ PEGGY_SIEGAL,
176
+ ROBERT_LAWRENCE_KUHN,
177
+ ROBERT_TRIVERS,
144
178
  'Skip Rimer',
179
+ 'Steven Elkman',
180
+ STEVEN_PFEIFFER,
145
181
  'Steven Victor MD',
182
+ TERRY_KAFKA,
146
183
  ]
147
184
 
148
- TRUNCATION_LENGTHS = {
185
+ # These IDs will be appended to INTERESTING_EMAIL_IDS
186
+ INTERESTING_TRUNCATION_LENGTHS = {
149
187
  '023627': 16_800, # Michael Wolff article with Brock Pierce
150
188
  '030245': None, # Epstein rationalizes his behavior in an open letter to the world
151
189
  '030781': None, # Bannon email about crypto coin issues
152
190
  '032906': None, # David Blaine email
153
191
  '026036': 6000, # Gino Yu blockchain mention
154
- '023208': None, # Long discussion about leon black's finances
155
192
  '029609': None, # Joi Ito
156
193
  '025233': None, # Reputation.com discussion
194
+ '017827': None, # Bannon / Peggy Siegal email about netflix doc on Epstein
195
+ '030222': None, # Ross Gow / Ghislaine correspondence
196
+ '026028': None, # Larry Summers / Karim Wade intro
197
+ '029545': None, # Tyler Shears reputation
198
+ '025812': None, # Tyler Shears reputation
199
+ '029914': 4500, # Lord Mandelson russian investments
200
+ '033453': None, # "Just heard you were telling people that you heard I asked Trump for a million dollars"
201
+ '031320': None, # Epstein Gratitude foundation
202
+ '031036': None, # Barbro Ehnbom talking about Swedish girl
203
+ '023454': 1878, # Email invitation sent to tech CEOs + Epstein
204
+ '029342': 2000, # Hakeem Jeffries
205
+ }
206
+
207
+ TRUNCATION_LENGTHS = {
208
+ **INTERESTING_TRUNCATION_LENGTHS,
209
+ '031791': None, # First email in Jessica Cadwell chain about service of legal documents
210
+ '023208': None, # Long discussion about leon black's finances
211
+ '028589': None, # Long thread with Reid Weingarten
212
+ '029433': TRUNCATED_CHARS, # Kahn taxes
213
+ '026778': TRUNCATED_CHARS, # Kahn taxes
214
+ '033311': TRUNCATED_CHARS, # Kahn taxes
215
+ '024251': TRUNCATED_CHARS, # Kahn taxes
216
+ '026755': TRUNCATED_CHARS, # Epstein self fwd
157
217
  }
158
218
 
159
219
  # These are long forwarded articles so we force a trim to 1,333 chars if these strings exist
160
220
  TRUNCATE_TERMS = [
161
- 'The rebuilding of Indonesia',
221
+ 'The rebuilding of Indonesia', # Vicky Ward article
162
222
  'Dominique Strauss-Kahn',
163
223
  'THOMAS L. FRIEDMAN',
164
- 'a sleek, briskly paced film whose title suggests a heist movie',
165
- 'quote from The Colbert Report distinguishes',
166
- 'co-inventor of the GTX Smart Shoe',
167
- 'my latest Washington Post column',
168
- 'supported my humanities work at Harvard',
224
+ 'a sleek, briskly paced film whose title suggests a heist movie', # Inside Job
169
225
  'Calendar of Major Events, Openings, and Fundraisers',
170
- 'Nuclear Operator Raises Alarm on Crisis',
171
- 'as responsible for the democratisation of computing and',
172
- 'AROUND 1,000 operational satellites are circling the Earth',
173
226
  "In recent months, China's BAT collapse",
174
227
  'President Obama introduces Jim Yong Kim as his nominee',
175
228
  'Trump appears with mobster-affiliated felon at New',
176
- 'Lead Code Enforcement Walton presented the facts',
177
- "Is UNRWA vital for the Palestinians' future",
178
- 'The New York company, led by Stephen Ross',
179
- 'I spent some time mulling additional aspects of a third choice presidential',
180
- 'you are referring to duplication of a gene',
181
- 'i am writing you both because i am attaching a still not-quite-complete response',
182
- 'Learn to meditate and discover what truly nourishes your entire being',
183
229
  'Congratulations to the 2019 Hillman Prize recipients',
184
- 'This much we know - the Fall elections are shaping up',
185
230
  "Special counsel Robert Mueller's investigation may face a serious legal obstacle",
186
231
  "nearly leak-proof since its inception more than a year ago",
187
- "I appreciate the opportunity to respond to your email",
188
- "Hello Peter. I am currently on a plane. I sent you earlier",
189
- "I appreciate the opportunity to respond to your email",
190
- 'I just wanted to follow up on a couple of notes. I have been coordinating with Richard Kahn',
191
- 'So, Peggy, if you could just let me know what info to include on the donation',
192
- 'Consult a lawyer beforehand, if possible, but be cooperative/nice at this stage',
193
- # Amanda Ens
194
- 'We remain positive on banks that can make acceptable returns',
195
- 'David Woo (BAML head of FX, Rates and EM Strategy, very highly regarded',
196
- "Please let me know if you're interested in joining a small group meeting",
197
- 'Erika Najarian, BAML financials research analyst, just returned',
198
- 'We can also discuss single stock and Topix banks',
199
- 'We are recording unprecedented divergences in falling equity vol',
200
- 'As previously discussed between you and Ariane',
201
- 'no evidence you got the latest so i have sent you just the key message',
202
- # Joscha Bach
203
- 'Cells seem to be mostly indistinguishable (except',
204
- 'gender differenece. unlikely motivational, every cell is different',
205
- 'Some thoughts I meant to send back for a long time',
206
- # Krassner
207
- 'My friend Michael Simmons, who has been the editor of National Lampoon',
208
- "In the premiere episode of 'The Last Laugh' podcast, Sarah Silverman",
209
- 'Thanks so much for sharing both your note to Steven and your latest Manson essay',
210
- # Edward Larson
211
- 'Coming from an international background, and having lived in Oslo, Tel Aviv',
212
- # Katherine Keating
213
- 'Paul Keating is aware that many people see him as a puzzle and contradiction',
214
- 'his panoramic view of world affairs sharper than ever, Paul Keating blames',
215
- # melanie
216
- 'Some years ago when I worked at the libertarian Cato Institute'
217
- # rich kahn
218
- 'House and Senate Republicans on their respective tax overhaul',
219
- 'The Tax Act contains changes to the treatment of "carried interests"',
220
- 'General Election: Trump vs. Clinton LA Times/USC Tracking',
221
- 'Location: Quicken Loans Arena in Cleveland, OH',
222
- 'A friendly discussion about Syria with a former US State Department',
223
- # Robert Kuhn
224
- 'The US trade war against China: The view from Beijing',
225
- # Tom / Paul Krassner
226
- 'I forgot to post my cartoon from week before last, about Howard Schultz',
232
+ # Nikolic
233
+ 'Nuclear Operator Raises Alarm on Crisis',
234
+ 'as responsible for the democratisation of computing and',
235
+ 'AROUND 1,000 operational satellites are circling the Earth',
236
+ # Sultan Sulayem
237
+ 'co-inventor of the GTX Smart Shoe',
238
+ 'my latest Washington Post column',
227
239
  # Bannon
228
240
  "Bannon the European: He's opening the populist fort in Brussels",
229
241
  "Steve Bannon doesn't do subtle.",
230
242
  'The Department of Justice lost its latest battle with Congress',
231
- "Donald Trump's newly named chief strategist and senior counselor",
232
- # Diane Ziman
233
- 'I was so proud to see him speak at the Women',
234
- # Krauss
235
- 'On confronting dogma, I of course agree',
236
- 'I did neck with that woman, but never forced myself on her',
237
- 'It is hard to know how to respond to a list of false',
238
- 'The Women in the World Summit opens April 12',
239
- 'lecture in Heidelberg Oct 14 but they had to cancel',
240
- # Nikolic
241
- 'people from LifeBall',
242
- # Epstein
243
- 'David Ben Gurion was asked why he, after 2000',
244
- # Lisa New
245
- 'The raw materials for that period include interviews',
246
- 'Whether you donated to Poetry in America through',
247
- # Random
248
- 'Little Hodiaki',
249
- "It began with deep worries regarding China's growth path",
250
- 'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
243
+ # lawyers
244
+ 'recuses itself from Jeffrey Epstein case',
245
+ # Misc
246
+ 'people from LifeBall', # Nikolic
247
+ "It began with deep worries regarding China's growth path", # Paul Morris
248
+ 'A friendly discussion about Syria with a former US State Department', # Fabrice Aidan
249
+ 'The US trade war against China: The view from Beijing', # Robert Kuhn / Groff
250
+ 'This much we know - the Fall elections are shaping up', # Juleanna Glover / Bannon
251
251
  ]
252
252
 
253
253
  METADATA_FIELDS = [
@@ -258,56 +258,78 @@ METADATA_FIELDS = [
258
258
  'subject',
259
259
  ]
260
260
 
261
- # Note the line repair happens *after* 'Importance: High' is removed
261
+ # Each value is a list of argument lists, one per _merge_lines() call, applied in order. Note the line repair happens *after* 'Importance: High' is removed
262
262
  LINE_REPAIR_MERGES = {
263
- '017523': 4,
264
- '019407': [2, 4],
265
- '021729': 2,
266
- '022673': 9,
267
- '022684': 9,
268
- '022695': 4,
269
- '029773': [2, 5],
270
- '023067': 3,
271
- '025790': 2,
272
- '029841': 3,
273
- '026345': 3,
274
- '026609': 4,
275
- '033299': 3,
276
- '026829': 3,
277
- '026924': [2, 4],
278
- '028931': [3, 6],
279
- '029154': [2, 5],
280
- '029163': [2, 5],
281
- '029282': 2,
282
- '029402': 5,
283
- '029498': 2,
284
- '029501': 2,
285
- '029835': [2, 4],
286
- '029889': 2,
287
- '029545': [3, 5],
288
- '029976': 3,
289
- '030299': [7, 10],
290
- '030381': [2, 4],
291
- '030384': [2, 4],
292
- '030626': 2,
293
- '030999': [2, 4],
294
- '031384': 2,
295
- '031428': 2,
296
- '031442': 0,
297
- '031980': [2, 4],
298
- '032063': [3, 5],
299
- '032272': 3,
300
- '032405': 4,
301
- '033097': 2,
302
- '033144': [2, 4],
303
- '033217': 3,
304
- '033228': [3, 5],
305
- '033357': [2, 4],
306
- '033486': [7, 9],
307
- '033512': 2,
308
- '033575': [2, 4],
309
- '033576': 3,
310
- '033583': 2,
263
+ '013405': [[4]] * 2,
264
+ '013415': [[4]] * 2,
265
+ '014397': [[4]] * 2,
266
+ '014860': [[3], [4], [4]],
267
+ '017523': [[4]],
268
+ '019105': [[5]] * 4,
269
+ '019407': [[2, 4]],
270
+ '021729': [[2]],
271
+ '022673': [[9]],
272
+ '022684': [[9]],
273
+ '022695': [[4]],
274
+ '022977': [[9]] * 10,
275
+ '023001': [[5]] * 3,
276
+ '023067': [[3]],
277
+ '025233': [[4]] * 2,
278
+ '025329': [[2]] * 9,
279
+ '025790': [[2]],
280
+ '025812': [[3]] * 2,
281
+ '026345': [[3]],
282
+ '026609': [[4]],
283
+ '026829': [[3]],
284
+ '026924': [[2, 4]],
285
+ '028728': [[3]],
286
+ '028931': [[3, 6]],
287
+ '029154': [[2, 5]],
288
+ '029163': [[2, 5]],
289
+ '029282': [[2]],
290
+ '029402': [[5]],
291
+ '029433': [[3]],
292
+ '029458': [[4]] * 3,
293
+ '029498': [[2], [2, 4]],
294
+ '029501': [[2]],
295
+ '029545': [[3, 5]],
296
+ '029773': [[2, 5]],
297
+ '029831': [[3, 6]],
298
+ '029835': [[2, 4]],
299
+ '029841': [[3]],
300
+ '029889': [[2], [2, 5]],
301
+ '029976': [[3]],
302
+ '029977': ([[2]] * 4) + [[4], [2, 4]],
303
+ '030299': [[7, 10]],
304
+ '030315': [[3, 5]],
305
+ '030381': [[2, 4]],
306
+ '030384': [[2, 4]],
307
+ '030626': [[2], [4]],
308
+ '030999': [[2, 4]],
309
+ '031384': [[2]],
310
+ '031428': [[2], [2, 4]],
311
+ '031442': [[0]],
312
+ '031748': [[3]] * 2,
313
+ '031764': [[3]],
314
+ '031980': [[2, 4]],
315
+ '032063': [[3, 5]],
316
+ '032272': [[3]],
317
+ '032405': [[4]],
318
+ '032637': [[9]] * 3,
319
+ '033097': [[2]],
320
+ '033144': [[2, 4]],
321
+ '033217': [[3]],
322
+ '033228': [[3, 5]],
323
+ '033252': [[9]] * 2,
324
+ '033271': [[3]],
325
+ '033299': [[3]],
326
+ '033357': [[2, 4]],
327
+ '033486': [[7, 9]],
328
+ '033512': [[2]],
329
+ '033568': [[5]] * 5,
330
+ '033575': [[2, 4]],
331
+ '033576': [[3]],
332
+ '033583': [[2]],
311
333
  }
312
334
 
313
335
 
@@ -328,6 +350,7 @@ class Email(Communication):
328
350
  recipients: list[Name] = field(default_factory=list)
329
351
  sent_from_device: str | None = None
330
352
  signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
353
+ _line_merge_arguments: list[tuple[int] | tuple[int, int]] = field(default_factory=list)
331
354
 
332
355
  # For logging how many headers we prettified while printing, kind of janky
333
356
  rewritten_header_ids: ClassVar[set[str]] = set([])
@@ -353,7 +376,7 @@ class Email(Communication):
353
376
  self.recipients.extend(self._extract_emailer_names(recipient))
354
377
 
355
378
  # Assume mailing list emails are to Epstein
356
- if self.author in BBC_LISTS and (self.is_note_to_self() or not self.recipients):
379
+ if self.author in BCC_LISTS and (self.is_note_to_self() or not self.recipients):
357
380
  self.recipients = [JEFFREY_EPSTEIN]
358
381
 
359
382
  # Remove self CCs but preserve self emails
@@ -390,6 +413,9 @@ class Email(Communication):
390
413
  def is_note_to_self(self) -> bool:
391
414
  return self.recipients == [self.author]
392
415
 
416
+ def is_with(self, name: str) -> bool:
417
+ return name in [self.author] + self.recipients
418
+
393
419
  def metadata(self) -> Metadata:
394
420
  local_metadata = asdict(self)
395
421
  local_metadata['is_junk_mail'] = self.is_junk_mail()
@@ -562,7 +588,12 @@ class Email(Communication):
562
588
 
563
589
  def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
564
590
  """Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
565
- idx2 = idx2 if idx2 is not None else (idx + 1)
591
+ if idx2 is None:
592
+ self._line_merge_arguments.append((idx,))
593
+ idx2 = idx + 1
594
+ else:
595
+ self._line_merge_arguments.append((idx, idx2))
596
+
566
597
  lines = self.lines[0:idx]
567
598
 
568
599
  if idx2 <= idx:
@@ -605,68 +636,15 @@ class Email(Communication):
605
636
  old_text = self.text
606
637
 
607
638
  if self.file_id in LINE_REPAIR_MERGES:
608
- merge = LINE_REPAIR_MERGES[self.file_id]
609
- merge_args = merge if isinstance(merge, list) else [merge]
610
- self._merge_lines(*merge_args)
611
-
612
- # These already had 2nd line merged
613
- if self.file_id in ['030626']: # Merge 6th and 7th (now 5th and 6th) rows
614
- self._merge_lines(4)
615
- elif self.file_id == '029889':
616
- self._merge_lines(2, 5)
617
- elif self.file_id in ['029498', '031428']:
618
- self._merge_lines(2, 4)
619
-
620
- # Multiline
621
- if self.file_id == '013415':
622
- for _i in range(2):
623
- self._merge_lines(4)
624
- elif self.file_id == '013405':
625
- for _i in range(2):
626
- self._merge_lines(4)
627
- elif self.file_id == '029458':
628
- for _i in range(3):
629
- self._merge_lines(4)
630
- elif self.file_id in ['025233']:
631
- for _i in range(2):
632
- self._merge_lines(4)
639
+ for merge_args in LINE_REPAIR_MERGES[self.file_id]:
640
+ self._merge_lines(*merge_args)
633
641
 
642
+ if self.file_id in ['025233']:
634
643
  self.lines[4] = f"Attachments: {self.lines[4]}"
635
644
  self._set_computed_fields(lines=self.lines)
636
- elif self.file_id in ['023001']:
637
- for _i in range(3):
638
- self._merge_lines(5)
639
- elif self.file_id in ['019105']:
640
- for _i in range(4):
641
- self._merge_lines(5)
642
- elif self.file_id in ['033568']:
643
- for _i in range(5):
644
- self._merge_lines(5)
645
- elif self.file_id in ['025329']:
646
- for _i in range(9):
647
- self._merge_lines(2)
648
- elif self.file_id in ['025812']:
649
- for _i in range(2):
650
- self._merge_lines(3)
651
- elif self.file_id == '014860':
652
- self._merge_lines(3)
653
- self._merge_lines(4)
654
- self._merge_lines(4)
655
645
  elif self.file_id == '029977':
656
646
  self._set_computed_fields(text=self.text.replace('Sent 9/28/2012 2:41:02 PM', 'Sent: 9/28/2012 2:41:02 PM'))
657
647
 
658
- for _i in range(4):
659
- self._merge_lines(2)
660
-
661
- self._merge_lines(4)
662
- self._merge_lines(2, 4)
663
- elif self.file_id in ['033252']:
664
- for _i in range(2):
665
- self._merge_lines(9)
666
- elif self.file_id in ['032637']:
667
- for _i in range(3):
668
- self._merge_lines(9)
669
-
670
648
  # Bad line removal
671
649
  if self.file_id == '025041':
672
650
  self._remove_line(4)
@@ -687,14 +665,17 @@ class Email(Communication):
687
665
  line = lines[i]
688
666
 
689
667
  if LINK_LINE_REGEX.search(line):
690
- if 'htm' not in line \
691
- and i < (len(lines) - 1) \
692
- and (lines[i + 1].endswith('/') or any(s in lines[i + 1] for s in URL_SIGNIFIERS)):
668
+ while i < (len(lines) - 1) \
669
+ and 'http' not in lines[i + 1] \
670
+ and (lines[i + 1].endswith('/') or any(s in lines[i + 1] for s in URL_SIGNIFIERS) or LINK_LINE2_REGEX.match(lines[i + 1])):
693
671
  logger.debug(f"{self.filename}: Joining link lines\n 1. {line}\n 2. {lines[i + 1]}\n")
694
672
  line += lines[i + 1]
695
673
  i += 1
696
674
 
697
675
  line = line.replace(' ', '')
676
+ elif ' http' in line and line.endswith('html'):
677
+ pre_link, post_link = line.split(' http', 1)
678
+ line = f"{pre_link} http{post_link.replace(' ', '')}"
698
679
 
699
680
  new_lines.append(line)
700
681
 
@@ -745,10 +726,12 @@ class Email(Communication):
745
726
 
746
727
  if args.whole_file:
747
728
  num_chars = len(self.text)
729
+ elif args.truncate:
730
+ num_chars = args.truncate
748
731
  elif self.file_id in TRUNCATION_LENGTHS:
749
732
  num_chars = TRUNCATION_LENGTHS[self.file_id] or self.file_size()
750
- elif self.author in TRUNCATE_ALL_EMAILS_FROM or includes_truncate_term:
751
- num_chars = int(MAX_CHARS_TO_PRINT / 3)
733
+ elif self.author in TRUNCATE_EMAILS_FROM or any([self.is_with(n) for n in TRUNCATE_EMAILS_FROM_OR_TO]) or includes_truncate_term:
734
+ num_chars = min(quote_cutoff or MAX_CHARS_TO_PRINT, TRUNCATED_CHARS)
752
735
  elif quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
753
736
  num_chars = quote_cutoff
754
737
  else:
@@ -757,15 +740,14 @@ class Email(Communication):
757
740
  if num_chars != MAX_CHARS_TO_PRINT and not self.is_duplicate():
758
741
  log_args = {
759
742
  'num_chars': num_chars,
760
- 'author_truncate': self.author in TRUNCATE_ALL_EMAILS_FROM,
743
+ 'author_truncate': self.author in TRUNCATE_EMAILS_FROM,
761
744
  'is_fwded_article': self.is_fwded_article(),
762
745
  'is_quote_cutoff': quote_cutoff == num_chars,
763
746
  'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
764
747
  'quote_cutoff': quote_cutoff,
765
748
  }
766
749
 
767
- if quote_cutoff != num_chars:
768
- logger.debug(f'{self.summary()} truncating: ' + ', '.join([f"{k}={v}" for k, v in log_args.items() if v]) + '\n')
750
+ logger.debug(f'{self.summary()} truncating: ' + ', '.join([f"{k}={v}" for k, v in log_args.items() if v]) + '\n')
769
751
 
770
752
  return num_chars
771
753
 
@@ -799,6 +781,13 @@ class Email(Communication):
799
781
  text = _add_line_breaks(text) # This was skipped when _prettify_text() w/a broken header so we do it now
800
782
  self.rewritten_header_ids.add(self.file_id)
801
783
 
784
+ lines = [
785
+ Text.from_markup(f"[link={line}]{line}[/link]") if line.startswith('http') else Text(line)
786
+ for line in text.split('\n')
787
+ ]
788
+
789
+ text = join_texts(lines, '\n')
790
+
802
791
  email_txt_panel = Panel(
803
792
  highlighter(text).append('\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
804
793
  border_style=self._border_style(),
@@ -297,7 +297,7 @@ class EpsteinFiles:
297
297
 
298
298
  def _set_uninteresting_ccs(self) -> None:
299
299
  for id in EMAILS_WITH_UNINTERESTING_BCCS:
300
- self.uninteresting_ccs += copy(cast(list[Name], self.email_for_id(id).header.bcc))
300
+ self.uninteresting_ccs += [bcc.lower() for bcc in cast(list[str], self.email_for_id(id).header.bcc)]
301
301
 
302
302
  for id in EMAILS_WITH_UNINTERESTING_CCS:
303
303
  self.uninteresting_ccs += self.email_for_id(id).recipients
@@ -334,5 +334,4 @@ def document_cls(doc: Document) -> Type[Document]:
334
334
 
335
335
 
336
336
  def _sorted_metadata(docs: Sequence[Document]) -> list[Metadata]:
337
- docs_sorted_by_id = sorted(docs, key=lambda d: d.file_id)
338
- return [json_safe(d.metadata()) for d in docs_sorted_by_id]
337
+ return [json_safe(d.metadata()) for d in Document.sort_by_id(docs)]