epstein-files 1.2.5__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,12 +17,12 @@ from rich.text import Text
17
17
  from epstein_files.documents.communication import Communication
18
18
  from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, INFO_INDENT
19
19
  from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAIL_SIMPLE_HEADER_REGEX,
20
- EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, TIME_REGEX, EmailHeader)
20
+ EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, FIELDS_COLON_PATTERN, TIME_REGEX, EmailHeader)
21
+ from epstein_files.documents.other_file import OtherFile
21
22
  from epstein_files.util.constant.names import *
22
23
  from epstein_files.util.constant.strings import REDACTED
23
24
  from epstein_files.util.constants import *
24
- from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes,
25
- flatten, listify, remove_timezone, uniquify)
25
+ from epstein_files.util.data import TIMEZONE_INFO, collapse_newlines, escape_single_quotes, remove_timezone
26
26
  from epstein_files.util.doc_cfg import EmailCfg, Metadata
27
27
  from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
28
28
  from epstein_files.util.highlighted_group import JUNK_EMAILERS, get_style_for_name
@@ -30,9 +30,11 @@ from epstein_files.util.logging import logger
30
30
  from epstein_files.util.rich import *
31
31
 
32
32
  BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
33
- BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
33
+ BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Hide caption|Importance:?\s*High|[iI,•]|[1i] (_ )?[il]|, [-,]|L\._|_filtered|.*(yiv0232|font-family:|margin-bottom:).*)$')
34
+ BAD_SUBJECT_CONTINUATIONS = ['orwarded', 'Hi ', 'Sent ', 'AmLaw', 'Original Message', 'Privileged', 'Sorry', '---']
34
35
  DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
35
- LINK_LINE_REGEX = re.compile(f"^>? ?htt")
36
+ FIELDS_COLON_REGEX = re.compile(FIELDS_COLON_PATTERN)
37
+ LINK_LINE_REGEX = re.compile(f"^[>• ]*htt")
36
38
  LINK_LINE2_REGEX = re.compile(r"^[-\w.%&=/]{5,}$")
37
39
  QUOTED_REPLY_LINE_REGEX = re.compile(r'(\nFrom:(.*)|wrote:)\n', re.IGNORECASE)
38
40
  REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
@@ -44,13 +46,12 @@ LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
44
46
 
45
47
  SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
46
48
  REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
47
- URL_SIGNIFIERS = ['amp?', 'cd=', 'click', 'ft=', 'gclid', 'htm', 'keywords=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'usg=', 'utm']
49
+ URL_SIGNIFIERS = ['?amp', 'amp?', 'cd=', 'click', 'CMP=', 'contentId', 'ft=', 'gclid', 'htm', 'mp=', 'keywords=', 'Id=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'sp=', 'usg=', 'utm']
48
50
  APPEARS_IN = 'appears in'
49
51
 
50
52
  MAX_NUM_HEADER_LINES = 14
51
- MAX_QUOTED_REPLIES = 2
52
- MAX_CHARS_TO_PRINT = 4000
53
- TRUNCATED_CHARS = int(MAX_CHARS_TO_PRINT / 3)
53
+ MAX_QUOTED_REPLIES = 1
54
+ NUM_WORDS_IN_LAST_QUOTE = 6
54
55
 
55
56
  REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
56
57
  '********************************',
@@ -88,7 +89,13 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
88
89
  re.compile(r'^INW$', re.MULTILINE): REDACTED,
89
90
  # links
90
91
  'Imps ://': 'https://',
92
+ 'on-accusers-rose-\nmcgowan/ ': 'on-accusers-rose-\nmcgowan/\n',
93
+ 'the-truth-\nabout-the-bitcoin-foundation/ )': 'the-truth-about-the-bitcoin-foundation/ )\n',
94
+ 'woody-allen-jeffrey-epsteins-\nsociety-friends-close-ranks/ ---': 'woody-allen-jeffrey-epsteins-society-friends-close_ranks/\n',
95
+ ' https://www.theguardian.com/world/2017/may/29/close-friend-trump-thomas-barrack-\nalleged-tax-evasion-italy-sardinia?CMP=share btn fb': '\nhttps://www.theguardian.com/world/2017/may/29/close-friend-trump-thomas-barrack-alleged-tax-evasion-italy-sardinia?CMP=share_btn_fb',
91
96
  re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
97
+ re.compile(r" http ?://www. ?dailymail. ?co ?.uk/news/article-\d+/Troub ?led-woman-history-drug-\n?us ?e-\n?.*html"): '\nhttp://www.dailymail.co.uk/news/article-3914012/Troubled-woman-history-drug-use-claimed-assaulted-Donald-Trump-Jeffrey-Epstein-sex-party-age-13-FABRICATED-story.html',
98
+ re.compile(r"http.*steve-bannon-trump-tower-\n?interview-\n?trumps-\n?strategist-plots-\n?new-political-movement-948747"): "\nhttp://www.hollywoodreporter.com/news/steve-bannon-trump-tower-interview-trumps-strategist-plots-new-political-movement-948747",
92
99
  # Subject lines
93
100
  "Arrested in\nInauguration Day Riot": "Arrested in Inauguration Day Riot",
94
101
  "as Putin Mayhem Tests President's Grip\non GOP": "as Putin Mayhem Tests President's Grip on GOP",
@@ -99,6 +106,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
99
106
  "COVER UP SEX ABUSE CRIMES\nBY THE WHITE HOUSE": "COVER UP SEX ABUSE CRIMES BY THE WHITE HOUSE",
100
107
  'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
101
108
  "War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
109
+ "Subject; RE": "Subject: RE",
102
110
  re.compile(r"deadline re Mr Bradley Edwards vs Mr\s*Jeffrey Epstein", re.I): "deadline re Mr Bradley Edwards vs Mr Jeffrey Epstein",
103
111
  re.compile(r"Following Plea That Implicated Trump -\s*https://www.npr.org/676040070", re.I): "Following Plea That Implicated Trump - https://www.npr.org/676040070",
104
112
  re.compile(r"for Attorney General -\s+Wikisource, the"): r"for Attorney General - Wikisource, the",
@@ -109,6 +117,8 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
109
117
  re.compile(r"Subject:\s*Fwd: Trending Now: Friends for three decades"): "Subject: Fwd: Trending Now: Friends for three decades",
110
118
  # Misc
111
119
  'AVG°': 'AVGO',
120
+ 'Saw Matt C with DTF at golf': 'Saw Matt C with DJT at golf',
121
+ re.compile(r"[i. ]*Privileged[- ]*Redacted[i. ]*"): '<PRIVILEGED - REDACTED>',
112
122
  }
113
123
 
114
124
  EMAIL_SIGNATURE_REGEXES = {
@@ -118,20 +128,28 @@ EMAIL_SIGNATURE_REGEXES = {
118
128
  DANIEL_SIAD: re.compile(r"Confidentiality Notice: The information contained in this electronic message is PRIVILEGED and confidential information intended only for the use of the individual entity or entities named as recipient or recipients. If the reader is not the intended recipient, be hereby notified that any dissemination, distribution or copy of this communication is strictly prohibited. If you have received this communication in error, please notify me immediately by electronic mail or by telephone and permanently delete this message from your computer system. Thank you.".replace(' ', r'\s*'), re.IGNORECASE),
119
129
  DANNY_FROST: re.compile(r"Danny Frost\nDirector.*\nManhattan District.*\n212.*", re.IGNORECASE),
120
130
  DARREN_INDYKE: re.compile(r"DARREN K. INDYKE.*?\**\nThe information contained in this communication.*?Darren K.[\n\s]+?[Il]ndyke(, PLLC)? — All rights reserved\.? ?\n\*{50,120}(\n\**)?", re.DOTALL),
131
+ DAVID_FISZEL: re.compile(r"This e-mail and any file.*\nmail and/or any file.*\nmail or any.*\nreceived.*\nmisdirected.*"),
121
132
  DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
122
133
  DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
123
- EDUARDO_ROBLES: re.compile(fr"(• )?email:.*\n(• )?email:\n(• )?website: www.creativekingdom.com\n(• )?address: 5th Floor Office No:504 Aspect Tower,\nBusiness Bay, Dubai United Arab Emirates."),
134
+ EDUARDO_ROBLES: re.compile(r"(• )?email:.*\n(• )?email:\n(• )?website: www.creativekingdom.com\n(• )?address: 5th Floor Office No:504 Aspect Tower,\nBusiness Bay, Dubai United Arab Emirates."),
135
+ ERIC_ROTH: re.compile(r"2221 Smithtown Avenue\nLong Island.*\nRonkonkoma.*\n(.1. )?Phone\nFax\nCell\ne-mail"),
136
+ GHISLAINE_MAXWELL: re.compile(r"FACEBOOK\nTWITTER\nG\+\nPINTEREST\nINSTAGRAM\nPLEDGE\nTHE DAILY CATCH"),
124
137
  JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
125
138
  JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*(\nTel:.*)?(\nEmail:.*)?", re.IGNORECASE),
126
139
  KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
127
140
  LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
128
141
  LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
142
+ LEON_BLACK: re.compile(r"This email and any files transmitted with it are confidential and intended solely.*\n(they|whom).*\ndissemination.*\nother.*\nand delete.*"),
143
+ LISA_NEW: re.compile(r"Elisa New\nPowell M. Cabot.*\n(Director.*\n)?Harvard.*\n148.*\n([1I] )?12.*\nCambridge.*\n([1I] )?02138"),
129
144
  MARTIN_WEINBERG: re.compile(r"(Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*?)?(\n.*?([cC]ell|Office))*\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
145
+ MICHAEL_MILLER: re.compile(r"Michael C. Miller\nPartner\nwww.steptoe.com/mmiller\nSteptoe\n(Privileged.*\n)?(\+1\s+)?direct.*\n(\+1\s+)?(\+1\s+)?fax.*\n(\+1.*)?cell.*\n(www.steptoe.com\n)?This message and any.*\nyou are not.*\nnotify the sender.*"),
130
146
  NICHOLAS_RIBIS: re.compile(r"60 Morris Turnpike 2FL\nSummit,? NJ.*\n0:\nF:\n\*{20,}\nCONFIDENTIALITY NOTICE.*\nattachments.*\ncopying.*\nIf you have.*\nthe copy.*\nThank.*\n\*{20,}"),
131
147
  PETER_MANDELSON: re.compile(r'Disclaimer This email and any attachments to it may be.*?with[ \n]+number(.*?EC4V[ \n]+6BJ)?', re.DOTALL | re.IGNORECASE),
132
148
  PAUL_BARRETT: re.compile(r"Paul Barrett[\n\s]+Alpha Group Capital LLC[\n\s]+(142 W 57th Street, 11th Floor, New York, NY 10019?[\n\s]+)?(al?[\n\s]*)?ALPHA GROUP[\n\s]+CAPITAL"),
149
+ PETER_ATTIA: re.compile(r"The information contained in this transmission may contain.*\n(laws|patient).*\n(distribution|named).*\n(distribution.*\nplease.*|copies.*)"),
133
150
  RICHARD_KAHN: re.compile(fr'Richard Kahn[\n\s]+HBRK Associates Inc.?[\n\s]+((301 East 66th Street, Suite 1OF|575 Lexington Avenue,? 4th Floor,?)[\n\s]+)?New York, (NY|New York) 100(22|65)(\s+(Tel?|Phone)( I|{REDACTED})?\s+Fa[x",]?(_|{REDACTED})*\s+[Ce]el?l?)?', re.IGNORECASE),
134
151
  ROSS_GOW: re.compile(r"Ross Gow\nManaging Partner\nACUITY Reputation Limited\n23 Berkeley Square\nLondon.*\nMobile.*\nTel"),
152
+ STEPHEN_HANSON: re.compile(r"(> )?Confidentiality Notice: This e-mail transmission.*\n(which it is addressed )?and may contain.*\n(applicable law. If you are not the intended )?recipient you are hereby.*\n(information contained in or attached to this transmission is )?STRICTLY PROHIBITED.*"),
135
153
  STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
136
154
  'Susan Edelman': re.compile(r'Susan Edel.*\nReporter\n1211.*\n917.*\nsedelman.*', re.IGNORECASE),
137
155
  TERRY_KAFKA: re.compile(r"((>|I) )?Terry B.? Kafka.*\n(> )?Impact Outdoor.*\n(> )?5454.*\n(> )?Dallas.*\n((> )?c?ell.*\n)?(> )?Impactoutdoor.*(\n(> )?cell.*)?", re.IGNORECASE),
@@ -152,13 +170,19 @@ BCC_LISTS = JUNK_EMAILERS + MAILING_LISTS
152
170
  TRUNCATE_EMAILS_FROM_OR_TO = [
153
171
  AMANDA_ENS,
154
172
  ANTHONY_BARRETT,
173
+ DANIEL_SABBA,
155
174
  DIANE_ZIMAN,
156
175
  JOSCHA_BACH,
157
176
  KATHERINE_KEATING,
177
+ LAWRANCE_VISOSKI,
158
178
  LAWRENCE_KRAUSS,
159
179
  LISA_NEW,
180
+ MOSHE_HOFFMAN,
160
181
  NILI_PRIELL_BARAK,
161
182
  PAUL_KRASSNER,
183
+ PAUL_PROSPERI,
184
+ 'Susan Edelman',
185
+ TERRY_KAFKA,
162
186
  ]
163
187
 
164
188
  TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
@@ -170,6 +194,7 @@ TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
170
194
  DAVID_HAIG,
171
195
  EDWARD_ROD_LARSEN,
172
196
  JOHNNY_EL_HACHEM,
197
+ 'Mark Green',
173
198
  MELANIE_WALKER,
174
199
  'Mitchell Bard',
175
200
  PEGGY_SIEGAL,
@@ -182,47 +207,12 @@ TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
182
207
  TERRY_KAFKA,
183
208
  ]
184
209
 
185
- # These IDs will be appended to INTERESTING_EMAIL_IDS
186
- INTERESTING_TRUNCATION_LENGTHS = {
187
- '023627': 16_800, # Micheal Wolff article with brock pierce
188
- '030245': None, # Epstein rationalizes his behavior in an open letter to the world
189
- '030781': None, # Bannon email about crypto coin issues
190
- '032906': None, # David Blaine email
191
- '026036': 6000, # Gino Yu blockchain mention
192
- '029609': None, # Joi Ito
193
- '025233': None, # Reputation.com discussion
194
- '017827': None, # Bannon / Peggy Siegal email about netflix doc on Epstein
195
- '030222': None, # Ross Gow / Ghislaine correspondence
196
- '026028': None, # Larry Summers / Karim Wade intro
197
- '029545': None, # Tyler Shears reputation
198
- '025812': None, # Tyler Shears reputation
199
- '029914': 4500, # Lord Mandelson russian investments
200
- '033453': None, # "Just heard you were telling people that you heard I asked Trump for a million dollars"
201
- '031320': None, # Epstein Gratitude foundation
202
- '031036': None, # Barbro Ehnbom talking about Swedish girl
203
- '023454': 1878, # Email invitation sent to tech CEOs + Epstein
204
- '029342': 2000, # Hakeem Jeffries
205
- }
206
-
207
- TRUNCATION_LENGTHS = {
208
- **INTERESTING_TRUNCATION_LENGTHS,
209
- '031791': None, # First email in Jessica Cadwell chain about service of legal documents
210
- '023208': None, # Long discussion about leon black's finances
211
- '028589': None, # Long thread with Reid Weingarten
212
- '029433': TRUNCATED_CHARS, # Kahn taxes
213
- '026778': TRUNCATED_CHARS, # Kahn taxes
214
- '033311': TRUNCATED_CHARS, # Kahn taxes
215
- '024251': TRUNCATED_CHARS, # Kahn taxes
216
- '026755': TRUNCATED_CHARS, # Epstein self fwd
217
- }
218
-
219
210
  # These are long forwarded articles so we force a trim to 1,333 chars if these strings exist
220
211
  TRUNCATE_TERMS = [
221
212
  'The rebuilding of Indonesia', # Vikcy ward article
222
- 'Dominique Strauss-Kahn',
223
- 'THOMAS L. FRIEDMAN',
224
213
  'a sleek, briskly paced film whose title suggests a heist movie', # Inside Job
225
214
  'Calendar of Major Events, Openings, and Fundraisers',
215
+ 'sent over from Marshall Heyman at the WSJ',
226
216
  "In recent months, China's BAT collapse",
227
217
  'President Obama introduces Jim Yong Kim as his nominee',
228
218
  'Trump appears with mobster-affiliated felon at New',
@@ -237,9 +227,11 @@ TRUNCATE_TERMS = [
237
227
  'co-inventor of the GTX Smart Shoe',
238
228
  'my latest Washington Post column',
239
229
  # Bannon
230
+ 'As Steve Bannon continues his tour of Europe',
240
231
  "Bannon the European: He's opening the populist fort in Brussels",
241
232
  "Steve Bannon doesn't do subtle.",
242
233
  'The Department of Justice lost its latest battle with Congress',
234
+ 'pedophile Jeffrey Epstein bought his way out',
243
235
  # lawyers
244
236
  'recuses itself from Jeffrey Epstein case',
245
237
  # Misc
@@ -265,11 +257,23 @@ LINE_REPAIR_MERGES = {
265
257
  '014397': [[4]] * 2,
266
258
  '014860': [[3], [4], [4]],
267
259
  '017523': [[4]],
260
+ '030367': [[1, 4], [2, 4]],
268
261
  '019105': [[5]] * 4,
269
262
  '019407': [[2, 4]],
263
+ '022187': [[1, 8], [2, 8], [3, 8], [4, 8]],
270
264
  '021729': [[2]],
265
+ '032896': [[2]],
266
+ '033050': [[0, 6], [1, 6], [2, 6], [3, 6], [4, 6]],
267
+ '022949': [[0, 4], [1, 4]],
268
+ '022197': [[0, 5], [1, 5], [3, 5]],
269
+ '021814': [[1, 6], [2, 6], [3, 6], [4, 6]],
270
+ '022190': [[1, 7], [0, 6], [3, 6], [4, 6]],
271
+ '029582': [[0, 5], [1, 5], [3, 5], [3, 5]],
271
272
  '022673': [[9]],
272
273
  '022684': [[9]],
274
+ '026625': [[0, 7], [1, 7], [2, 7], [3, 7], [4, 7], [5, 7]],
275
+ '026659': [[0, 5], [1, 5]],
276
+ '026764': [[0, 6], [1, 6]],
273
277
  '022695': [[4]],
274
278
  '022977': [[9]] * 10,
275
279
  '023001': [[5]] * 3,
@@ -278,11 +282,15 @@ LINE_REPAIR_MERGES = {
278
282
  '025329': [[2]] * 9,
279
283
  '025790': [[2]],
280
284
  '025812': [[3]] * 2,
285
+ '025589': [[3]] * 12,
281
286
  '026345': [[3]],
282
287
  '026609': [[4]],
288
+ '028921': [[5, 4], [4, 5]],
289
+ '026620': ([[20]] * 4) + [[3, 2]] + ([[2]] * 15) + [[2, 4]],
283
290
  '026829': [[3]],
284
291
  '026924': [[2, 4]],
285
292
  '028728': [[3]],
293
+ '026451': [[3, 5]] * 2,
286
294
  '028931': [[3, 6]],
287
295
  '029154': [[2, 5]],
288
296
  '029163': [[2, 5]],
@@ -302,18 +310,22 @@ LINE_REPAIR_MERGES = {
302
310
  '029977': ([[2]] * 4) + [[4], [2, 4]],
303
311
  '030299': [[7, 10]],
304
312
  '030315': [[3, 5]],
313
+ '030318': [[3, 5]],
305
314
  '030381': [[2, 4]],
306
315
  '030384': [[2, 4]],
307
316
  '030626': [[2], [4]],
317
+ '030861': [[3, 8]],
308
318
  '030999': [[2, 4]],
309
319
  '031384': [[2]],
310
320
  '031428': [[2], [2, 4]],
311
321
  '031442': [[0]],
322
+ '031489': [[2, 4], [3, 4], [3, 4], [10]],
323
+ '031619': [[7], [17], [17]],
312
324
  '031748': [[3]] * 2,
313
- '031764': [[3]],
325
+ '031764': [[3], [8]], # 8 is just for style fix internally, not header
314
326
  '031980': [[2, 4]],
315
327
  '032063': [[3, 5]],
316
- '032272': [[3]],
328
+ '032272': [[2, 10], [3]],
317
329
  '032405': [[4]],
318
330
  '032637': [[9]] * 3,
319
331
  '033097': [[2]],
@@ -326,6 +338,8 @@ LINE_REPAIR_MERGES = {
326
338
  '033357': [[2, 4]],
327
339
  '033486': [[7, 9]],
328
340
  '033512': [[2]],
341
+ '026024': [[1, 3], [2, 3]],
342
+ '024923': [[0, 5], [2]],
329
343
  '033568': [[5]] * 5,
330
344
  '033575': [[2, 4]],
331
345
  '033576': [[3]],
@@ -344,12 +358,14 @@ class Email(Communication):
344
358
  sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
345
359
  signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
346
360
  """
361
+ attached_docs: list[OtherFile] = field(default_factory=list)
347
362
  actual_text: str = field(init=False)
348
363
  config: EmailCfg | None = None
349
364
  header: EmailHeader = field(init=False)
350
365
  recipients: list[Name] = field(default_factory=list)
351
366
  sent_from_device: str | None = None
352
367
  signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
368
+ _is_first_for_user: bool = False # Only set when printing
353
369
  _line_merge_arguments: list[tuple[int] | tuple[int, int]] = field(default_factory=list)
354
370
 
355
371
  # For logging how many headers we prettified while printing, kind of janky
@@ -389,6 +405,7 @@ class Email(Communication):
389
405
  self.sent_from_device = self._sent_from_device()
390
406
 
391
407
  def attachments(self) -> list[str]:
408
+ """Returns the string in the header."""
392
409
  return (self.header.attachments or '').split(';')
393
410
 
394
411
  def info_txt(self) -> Text:
@@ -402,7 +419,12 @@ class Email(Communication):
402
419
  return txt.append(highlighter(f" probably sent at {self.timestamp}"))
403
420
 
404
421
  def is_fwded_article(self) -> bool:
405
- return bool(self.config and self.config.is_fwded_article)
422
+ if self.config is None:
423
+ return False
424
+ elif self.config.fwded_text_after:
425
+ return self.config.is_fwded_article is not False
426
+ else:
427
+ return bool(self.config.is_fwded_article)
406
428
 
407
429
  def is_junk_mail(self) -> bool:
408
430
  return self.author in JUNK_EMAILERS
@@ -413,9 +435,15 @@ class Email(Communication):
413
435
  def is_note_to_self(self) -> bool:
414
436
  return self.recipients == [self.author]
415
437
 
416
- def is_with(self, name: str) -> bool:
438
+ def is_from_or_to(self, name: str) -> bool:
417
439
  return name in [self.author] + self.recipients
418
440
 
441
+ def is_word_count_worthy(self) -> bool:
442
+ if self.is_fwded_article():
443
+ return bool(self.config.fwded_text_after) or len(self.actual_text) < 150
444
+ else:
445
+ return not self.is_mailing_list()
446
+
419
447
  def metadata(self) -> Metadata:
420
448
  local_metadata = asdict(self)
421
449
  local_metadata['is_junk_mail'] = self.is_junk_mail()
@@ -462,8 +490,9 @@ class Email(Communication):
462
490
  elif self.header.num_header_rows == 0:
463
491
  return self.text
464
492
 
493
+ # import pdb;pdb.set_trace()
465
494
  self.log_top_lines(20, "Raw text:", logging.DEBUG)
466
- self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
495
+ self.log(f"With {self.header.num_header_rows} header lines removed:\n{text[0:500]}\n\n", logging.DEBUG)
467
496
  reply_text_match = REPLY_TEXT_REGEX.search(text)
468
497
 
469
498
  if reply_text_match:
@@ -542,8 +571,8 @@ class Email(Communication):
542
571
  logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
543
572
 
544
573
  def _extract_timestamp(self) -> datetime:
545
- if self.config and self.config.timestamp:
546
- return self.config.timestamp
574
+ if self.config and self.config.timestamp():
575
+ return self.config.timestamp()
547
576
  elif self.header.sent_at:
548
577
  timestamp = _parse_timestamp(self.header.sent_at)
549
578
 
@@ -572,36 +601,41 @@ class Email(Communication):
572
601
  logger.debug(f"Fell back to timestamp {timestamp} in line '{line}'...")
573
602
  return timestamp
574
603
 
575
- raise RuntimeError(f"No timestamp found in '{self.file_path.name}' top lines:\n{searchable_text}")
604
+ no_timestamp_msg = f"No timestamp found in '{self.file_path.name}'"
576
605
 
577
- def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
578
- """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
579
- if text is None:
580
- header_offset = len(self.header.header_chars)
581
- text = self.text[header_offset:]
606
+ if self.is_duplicate():
607
+ logger.warning(f"{no_timestamp_msg} but timestamp should be copied from {self.duplicate_of_id()}")
582
608
  else:
583
- header_offset = 0
609
+ raise RuntimeError(f"{no_timestamp_msg}, top lines:\n{searchable_text}")
610
+
611
+ def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES) -> int | None:
612
+ """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
613
+ header_offset = len(self.header.header_chars)
614
+ text = self.text[header_offset:]
584
615
 
585
616
  for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text)):
586
617
  if i >= n:
587
618
  return match.end() + header_offset - 1
588
619
 
589
- def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
620
+ def _merge_lines(self, idx1: int, idx2: int | None = None) -> None:
590
621
  """Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
591
622
  if idx2 is None:
592
- self._line_merge_arguments.append((idx,))
593
- idx2 = idx + 1
623
+ self._line_merge_arguments.append((idx1,))
624
+ idx2 = idx1 + 1
594
625
  else:
595
- self._line_merge_arguments.append((idx, idx2))
626
+ self._line_merge_arguments.append((idx1, idx2))
596
627
 
597
- lines = self.lines[0:idx]
598
-
599
- if idx2 <= idx:
600
- raise RuntimeError(f"idx2 ({idx2}) must be greater than idx ({idx})")
601
- elif idx2 == (idx + 1):
602
- lines += [self.lines[idx] + ' ' + self.lines[idx + 1]] + self.lines[idx + 2:]
628
+ if idx2 < idx1:
629
+ lines = self.lines[0:idx2] + self.lines[idx2 + 1:idx1] + [self.lines[idx1] + ' ' + self.lines[idx2]] + self.lines[idx1 + 1:]
630
+ elif idx2 == idx1:
631
+ raise RuntimeError(f"idx2 ({idx2}) must be greater or less than idx ({idx1})")
603
632
  else:
604
- lines += [self.lines[idx] + ' ' + self.lines[idx2]] + self.lines[idx + 1:idx2] + self.lines[idx2 + 1:]
633
+ lines = self.lines[0:idx1]
634
+
635
+ if idx2 == (idx1 + 1):
636
+ lines += [self.lines[idx1] + ' ' + self.lines[idx1 + 1]] + self.lines[idx1 + 2:]
637
+ else:
638
+ lines += [self.lines[idx1] + ' ' + self.lines[idx2]] + self.lines[idx1 + 1:idx2] + self.lines[idx2 + 1:]
605
639
 
606
640
  self._set_computed_fields(lines=lines)
607
641
 
@@ -617,6 +651,10 @@ class Email(Communication):
617
651
  self.signature_substitution_counts[name] = self.signature_substitution_counts.get(name, 0)
618
652
  self.signature_substitution_counts[name] += num_replaced
619
653
 
654
+ # Share / Tweet lines
655
+ if self.author == KATHRYN_RUEMMLER:
656
+ text = '\n'.join([l for l in text.split('\n') if l not in ['Share', 'Tweet', 'Bookmark it']])
657
+
620
658
  return collapse_newlines(text).strip()
621
659
 
622
660
  def _remove_line(self, idx: int) -> None:
@@ -657,17 +695,21 @@ class Email(Communication):
657
695
  self.log_top_lines(12, 'Result of modifications')
658
696
 
659
697
  lines = self.repair_ocr_text(OCR_REPAIRS, self.text).split('\n')
698
+ subject_line = next((line for line in lines if line.startswith('Subject:')), None) or ''
699
+ subject = subject_line.split(':')[1].strip() if subject_line else ''
660
700
  new_lines = []
661
701
  i = 0
662
702
 
663
- # Fix links (remove spaces, merge multiline links to a single line)
703
+ # Fix links and quoted subjects (remove spaces, merge multiline links to a single line)
664
704
  while i < len(lines):
665
705
  line = lines[i]
666
706
 
667
707
  if LINK_LINE_REGEX.search(line):
668
708
  while i < (len(lines) - 1) \
669
- and 'http' not in lines[i + 1] \
670
- and (lines[i + 1].endswith('/') or any(s in lines[i + 1] for s in URL_SIGNIFIERS) or LINK_LINE2_REGEX.match(lines[i + 1])):
709
+ and not lines[i + 1].startswith('htt') \
710
+ and (lines[i + 1].endswith('/') \
711
+ or any(s in lines[i + 1] for s in URL_SIGNIFIERS) \
712
+ or LINK_LINE2_REGEX.match(lines[i + 1])):
671
713
  logger.debug(f"{self.filename}: Joining link lines\n 1. {line}\n 2. {lines[i + 1]}\n")
672
714
  line += lines[i + 1]
673
715
  i += 1
@@ -676,6 +718,17 @@ class Email(Communication):
676
718
  elif ' http' in line and line.endswith('html'):
677
719
  pre_link, post_link = line.split(' http', 1)
678
720
  line = f"{pre_link} http{post_link.replace(' ', '')}"
721
+ elif line.startswith('Subject:') and i < (len(lines) - 2) and len(line) >= 40:
722
+ next_line = lines[i + 1]
723
+ next_next = lines[i + 2]
724
+
725
+ if len(next_line) <= 1 or any([cont in next_line for cont in BAD_SUBJECT_CONTINUATIONS]):
726
+ pass
727
+ elif (subject.endswith(next_line) and next_line != subject) \
728
+ or (FIELDS_COLON_REGEX.search(next_next) and not FIELDS_COLON_REGEX.search(next_line)):
729
+ self.warn(f"Fixing broken subject line\n line: '{line}'\n next: '{next_line}'\n next: '{next_next}'\nsubject='{subject}'\n")
730
+ line += f" {next_line}"
731
+ i += 1
679
732
 
680
733
  new_lines.append(line)
681
734
 
@@ -699,7 +752,7 @@ class Email(Communication):
699
752
  """Copy info from original config for file this document was extracted from."""
700
753
  if self.file_id in ALL_FILE_CONFIGS:
701
754
  self.config = cast(EmailCfg, deepcopy(ALL_FILE_CONFIGS[self.file_id]))
702
- self.warn(f"Merging existing cfg for '{self.file_id}' with cfg for extracted document...")
755
+ self.log(f"Merging existing cfg for '{self.file_id}' with cfg for extracted document...")
703
756
  else:
704
757
  self.config = EmailCfg(id=self.file_id)
705
758
 
@@ -721,34 +774,58 @@ class Email(Communication):
721
774
 
722
775
  def _truncate_to_length(self) -> int:
723
776
  """When printing truncate this email to this length."""
724
- quote_cutoff = self._idx_of_nth_quoted_reply(text=self.text) # Trim if there's many quoted replies
777
+ quote_cutoff = self._idx_of_nth_quoted_reply() # Trim if there's many quoted replies
725
778
  includes_truncate_term = next((term for term in TRUNCATE_TERMS if term in self.text), None)
726
779
 
727
780
  if args.whole_file:
728
781
  num_chars = len(self.text)
729
782
  elif args.truncate:
730
783
  num_chars = args.truncate
731
- elif self.file_id in TRUNCATION_LENGTHS:
732
- num_chars = TRUNCATION_LENGTHS[self.file_id] or self.file_size()
733
- elif self.author in TRUNCATE_EMAILS_FROM or any([self.is_with(n) for n in TRUNCATE_EMAILS_FROM_OR_TO]) or includes_truncate_term:
784
+ elif self.config and self.config.truncate_to is not None:
785
+ num_chars = len(self.text) if self.config.truncate_to == NO_TRUNCATE else self.config.truncate_to
786
+ elif self.is_interesting():
787
+ num_chars = len(self.text)
788
+ elif self.author in TRUNCATE_EMAILS_FROM \
789
+ or any([self.is_from_or_to(n) for n in TRUNCATE_EMAILS_FROM_OR_TO]) \
790
+ or self.is_fwded_article() \
791
+ or includes_truncate_term:
734
792
  num_chars = min(quote_cutoff or MAX_CHARS_TO_PRINT, TRUNCATED_CHARS)
735
- elif quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
736
- num_chars = quote_cutoff
737
793
  else:
738
- num_chars = MAX_CHARS_TO_PRINT
739
-
740
- if num_chars != MAX_CHARS_TO_PRINT and not self.is_duplicate():
741
- log_args = {
742
- 'num_chars': num_chars,
743
- 'author_truncate': self.author in TRUNCATE_EMAILS_FROM,
744
- 'is_fwded_article': self.is_fwded_article(),
745
- 'is_quote_cutoff': quote_cutoff == num_chars,
746
- 'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
747
- 'quote_cutoff': quote_cutoff,
748
- }
749
-
750
- logger.debug(f'{self.summary()} truncating: ' + ', '.join([f"{k}={v}" for k, v in log_args.items() if v]) + '\n')
751
-
794
+ if quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
795
+ trimmed_words = self.text[quote_cutoff:].split()
796
+
797
+ if '<...snipped' in trimmed_words[:NUM_WORDS_IN_LAST_QUOTE]:
798
+ num_trailing_words = 0
799
+ elif trimmed_words and trimmed_words[0] in ['From:', 'Sent:']:
800
+ num_trailing_words = NUM_WORDS_IN_LAST_QUOTE
801
+ else:
802
+ num_trailing_words = NUM_WORDS_IN_LAST_QUOTE
803
+
804
+ if trimmed_words:
805
+ last_quoted_text = ' '.join(trimmed_words[:num_trailing_words])
806
+ num_chars = quote_cutoff + len(last_quoted_text) + 1 # Give a hint of the next line
807
+ else:
808
+ num_chars = quote_cutoff
809
+ else:
810
+ num_chars = min(self.file_size(), MAX_CHARS_TO_PRINT)
811
+
812
+ # Always print whole email for 1st email for user
813
+ if self._is_first_for_user and num_chars < self.file_size() and not self.is_duplicate():
814
+ logger.info(f"{self} Overriding cutoff {num_chars} for first email")
815
+ num_chars = self.file_size()
816
+
817
+ log_args = {
818
+ 'num_chars': num_chars,
819
+ '_is_first_for_user': self._is_first_for_user,
820
+ 'author_truncate': self.author in TRUNCATE_EMAILS_FROM,
821
+ 'is_fwded_article': self.is_fwded_article(),
822
+ 'is_quote_cutoff': quote_cutoff == num_chars,
823
+ 'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
824
+ 'quote_cutoff': quote_cutoff,
825
+ }
826
+
827
+ log_args_str = ', '.join([f"{k}={v}" for k, v in log_args.items() if v])
828
+ logger.debug(f"Truncate determination: {log_args_str}")
752
829
  return num_chars
753
830
 
754
831
  def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
@@ -762,7 +839,7 @@ class Email(Communication):
762
839
  if len(text) > num_chars:
763
840
  text = text[0:num_chars]
764
841
  doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style())
765
- trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
842
+ trim_note = f"<...trimmed to {num_chars:,} characters of {self.length():,}, read the rest at {doc_link_markup}...>"
766
843
  trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
767
844
 
768
845
  # Rewrite broken headers where the values are on separate lines from the field names
@@ -789,7 +866,7 @@ class Email(Communication):
789
866
  text = join_texts(lines, '\n')
790
867
 
791
868
  email_txt_panel = Panel(
792
- highlighter(text).append('\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
869
+ highlighter(text).append('...\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
793
870
  border_style=self._border_style(),
794
871
  expand=False,
795
872
  subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
@@ -798,6 +875,11 @@ class Email(Communication):
798
875
  yield self.file_info_panel()
799
876
  yield Padding(email_txt_panel, (0, 0, 1, INFO_INDENT))
800
877
 
878
+ if self.attached_docs:
879
+ attachments_table_title = f" {self.url_slug} Email Attachments:"
880
+ attachments_table = OtherFile.files_preview_table(self.attached_docs, title=attachments_table_title)
881
+ yield Padding(attachments_table, (0, 0, 1, 12))
882
+
801
883
  if should_rewrite_header:
802
884
  self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
803
885
 
@@ -2,7 +2,7 @@ import json
2
2
  import re
3
3
  from dataclasses import asdict, dataclass, field
4
4
 
5
- from epstein_files.util.constant.strings import AUTHOR, REDACTED
5
+ from epstein_files.util.constant.strings import AUTHOR, REDACTED, indented
6
6
  from epstein_files.util.constants import ALL_CONFIGS
7
7
  from epstein_files.util.doc_cfg import EmailCfg
8
8
  from epstein_files.util.logging import logger
@@ -13,7 +13,10 @@ ON_BEHALF_OF = 'on behalf of'
13
13
  TO_FIELDS = ['bcc', 'cc', 'to']
14
14
  EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
15
15
 
16
- HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments|Classification|Flag):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
16
+ FIELD_PATTERNS = ['Date', 'From', 'Sent', 'To', r"C[cC]", r"B[cC][cC]", 'Importance', 'Subject', 'Attachments', 'Classification', 'Flag', 'Reply-To']
17
+ FIELDS_PATTERN = '|'.join(FIELD_PATTERNS)
18
+ FIELDS_COLON_PATTERN = fr"^({FIELDS_PATTERN}):"
19
+ HEADER_REGEX_STR = fr"(((?:(?:{FIELDS_PATTERN}|Bee):|on behalf of ?)(?! +(by |from my|via )).*\n){{3,}})"
17
20
  EMAIL_SIMPLE_HEADER_REGEX = re.compile(rf'^{HEADER_REGEX_STR}')
18
21
  EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX = re.compile(HEADER_REGEX_STR)
19
22
  EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTALL) # Match up to the next email header section
@@ -53,6 +56,7 @@ class EmailHeader:
53
56
  importance: str | None = None
54
57
  attachments: str | None = None
55
58
  to: list[str] | None = None
59
+ reply_to: str | None = None
56
60
 
57
61
  def __post_init__(self):
58
62
  self.num_header_rows = len(self.field_names)
@@ -95,13 +99,10 @@ class EmailHeader:
95
99
  logger.info(f"{log_prefix}, trying next line...")
96
100
  num_headers += 1
97
101
  value = email_lines[i + num_headers]
98
- elif BAD_EMAILER_REGEX.match(value):
102
+ elif BAD_EMAILER_REGEX.match(value) or value.startswith('http'):
99
103
  logger.info(f"{log_prefix}, decrementing num_headers and skipping...")
100
104
  num_headers -= 1
101
105
  continue
102
- elif value.startswith('http'):
103
- logger.info(f"{log_prefix}, using empty string instead...")
104
- value = ''
105
106
 
106
107
  value = [v.strip() for v in value.split(';') if len(v.strip()) > 0]
107
108
 
@@ -110,7 +111,12 @@ class EmailHeader:
110
111
  self.num_header_rows = len(self.field_names) + num_headers
111
112
  self.header_chars = '\n'.join(email_lines[0:self.num_header_rows])
112
113
  log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
113
- logger.debug(f"{log_msg}{self}\n\nTop lines:\n\n%s", '\n'.join(email_lines[0:(num_headers + 1) * 2]))
114
+
115
+ logger.warning(
116
+ f"{log_msg}{self}\n\n[top lines]:\n\n%s\n\n[body_lines]:\n\n%s\n\n",
117
+ indented('\n'.join(email_lines[0:(num_headers + 1) * 2]), prefix='> '),
118
+ indented('\n'.join(email_lines[self.num_header_rows:self.num_header_rows + 5]), prefix='> '),
119
+ )
114
120
 
115
121
  def rewrite_header(self) -> str:
116
122
  header_fields = {}
@@ -151,7 +157,7 @@ class EmailHeader:
151
157
  #logger.debug(f"extracting header line: '{line}'")
152
158
  key, value = [element.strip() for element in line.split(':', 1)]
153
159
  value = value.rstrip('_')
154
- key = AUTHOR if key == 'From' else ('sent_at' if key in ['Date', 'Sent'] else key.lower())
160
+ key = AUTHOR if key == 'From' else ('sent_at' if key in ['Date', 'Sent'] else key.lower().replace('-', '_'))
155
161
  key = 'bcc' if key == 'bee' else key
156
162
 
157
163
  if kw_args.get(key):
@@ -161,6 +167,9 @@ class EmailHeader:
161
167
 
162
168
  field_names.append(key)
163
169
 
170
+ if key == 'reply_to':
171
+ logger.warning(f"Found value for Reply-To field: '{value}'")
172
+
164
173
  if key in TO_FIELDS:
165
174
  recipients = [element.strip() for element in value.split(';')]
166
175
  recipients = [r for r in recipients if len(r) > 0]