epstein-files 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,655 @@
1
+ import logging
2
+ import re
3
+ from collections import defaultdict
4
+ from dataclasses import dataclass, field
5
+ from datetime import datetime
6
+ from typing import ClassVar, cast
7
+
8
+ from dateutil.parser import parse
9
+ from rich.console import Console, ConsoleOptions, RenderResult
10
+ from rich.padding import Padding
11
+ from rich.panel import Panel
12
+ from rich.text import Text
13
+
14
+ from epstein_files.documents.communication import Communication
15
+ from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, INFO_INDENT
16
+ from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAIL_SIMPLE_HEADER_REGEX,
17
+ EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, TIME_REGEX, EmailHeader)
18
+ from epstein_files.util.constant.names import *
19
+ from epstein_files.util.constant.strings import REDACTED, URL_SIGNIFIERS
20
+ from epstein_files.util.constants import *
21
+ from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
22
+ remove_timezone, uniquify)
23
+ from epstein_files.util.env import logger
24
+ from epstein_files.util.highlighted_group import get_style_for_name
25
+ from epstein_files.util.rich import *
26
+
27
# First lines matching this are dropped by Email._repair()
BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|L\._|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
# Any line matching this is dropped by Email._repair()
BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,])$')
# 'From:' within the first three lines of the text
DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
# Lines starting with a (possibly quoted) link; a raw string because there is nothing to interpolate
LINK_LINE_REGEX = re.compile(r"^(> )?htt")
# End of an '[SOMEONE] wrote:' line that introduces a quoted reply
QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
# Fallback markers Email._actual_text() splits on when no reply line was found
REPLY_SPLITTERS = [f"{field_name}:" for field_name in FIELD_NAMES] + ['********************************']
# Group 1 captures everything before the first reply line
REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)

# Substrings removed from date strings before they are handed to the date parser
# NOTE(review): REDACTED is interpolated unescaped - confirm it contains no regex metacharacters
BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
# Group 1 captures the value of a 'Date' / 'Sent' header line
DATE_HEADER_REGEX = re.compile(r'(?:Date|Sent):? +(?!by|from|to|via)([^\n]{6,})\n')
# Only lines containing something like 'HH:MM' are tried in the last resort timestamp search
TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")

# Unparseable emailer strings that should not trigger a warning in Email._get_names()
SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
MAX_CHARS_TO_PRINT = 4000
MAX_QUOTED_REPLIES = 2
# Number of lines at the top of the file that are searched for a timestamp
VALID_HEADER_LINES = 14
44
+
45
# Maps a bad OCR pattern (plain string or compiled regex) to its replacement text.
# Insertion order matters for some entries (see the note on the 'wrote:' repairs).
# NOTE(review): this dict is passed to self.repair_ocr_text(), which is not defined in this file;
# plain string keys are presumably applied as literal substrings - confirm against that method.
OCR_REPAIRS: dict[str | re.Pattern, str] = {
    re.compile(r'grnail\.com'): 'gmail.com',
    re.compile(r"^(From|To)(: )?[_1.]{5,}", re.MULTILINE): rf"\1: {REDACTED}",  # Redacted email addresses
    # These 3 must come in this order!
    re.compile(r'([/vkT]|Ai|li|(I|7)v)rote:'): 'wrote:',
    re.compile(r"([<>.=_HIM][<>.=_HIM14]{5,}[<>.=_HIM]|MOMMINNEMUMMIN) *(wrote:?)?"): rf"{REDACTED} \2",
    re.compile(r"([,<>_]|AM|PM)\n(>)? ?wrote:?"): r'\1\2 wrote:',
    # Names / email addresses
    'Alireza lttihadieh': ALIREZA_ITTIHADIEH,
    'Miroslav Laj6ak': MIROSLAV_LAJCAK,
    'Ross G°w': ROSS_GOW,
    'Torn Pritzker': TOM_PRITZKER,
    re.compile(r' Banno(r]?|\b)'): ' Bannon',
    re.compile(r'gmax ?[1l] ?[@g]ellmax.c ?om'): 'gmax1@ellmax.com',
    re.compile(r"[ijlp']ee[vy]acation[©@a(&,P ]{1,3}g?mail.com"): 'jeevacation@gmail.com',
    # Signatures
    'BlackBerry by AT &T': 'BlackBerry by AT&T',
    'BlackBerry from T- Mobile': 'BlackBerry from T-Mobile',
    "from my 'Phone": 'from my iPhone',
    'from Samsung Mob.le': 'from Samsung Mobile',
    'gJeremyRubin': '@JeremyRubin',
    'Sent from Mabfl': 'Sent from Mobile',  # NADIA_MARCINKO signature bad OCR
    'twitter glhsummers': 'twitter @lhsummers',
    re.compile(r"twitter\.com[i/][lI]krauss[1lt]"): "twitter.com/lkrauss1",
    re.compile(r'from my BlackBerry[0°] wireless device'): 'from my BlackBerry® wireless device',
    # links
    'Imps ://': 'https://',
    re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
    # Subject lines
    # The plain string keys below are deliberately NOT raw strings: '\n' has to be a real newline
    # character for the key to match a subject that was broken across two lines.
    "as Putin Mayhem Tests President's Grip\non GOP": "as Putin Mayhem Tests President's Grip on GOP",
    "avoids testimony from alleged\nvictims": "avoids testimony from alleged victims",
    "but\nwatchdogs say probe is tainted": "but watchdogs say probe is tainted",
    "COVER UP SEX ABUSE CRIMES\nBY THE WHITE HOUSE": "COVER UP SEX ABUSE CRIMES BY THE WHITE HOUSE",
    'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
    "War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
    re.compile(r"deadline re Mr Bradley Edwards vs Mr\s*Jeffrey Epstein", re.I): "deadline re Mr Bradley Edwards vs Mr Jeffrey Epstein",
    re.compile(r"Following Plea That Implicated Trump -\s*https://www.npr.org/676040070", re.I): "Following Plea That Implicated Trump - https://www.npr.org/676040070",
    re.compile(r"for Attorney General -\s+Wikisource, the"): r"for Attorney General - Wikisource, the",
    re.compile(r"JUDGE SWEET\s+ALLOWING\s+STEVEN\s+HOFFENBERG\s+TO\s+TALK\s+WITH\s+THE\s+TOWERS\s+VICTIMS\s+TO\s+EXPLAIN\s+THE\s+VICTIMS\s+SUI\n?T\s+FILING\s+AGAINST\s+JEFF\s+EPSTEIN"): "JUDGE SWEET ALLOWING STEVEN HOFFENBERG TO TALK WITH THE TOWERS VICTIMS TO EXPLAIN THE VICTIMS SUIT FILING AGAINST JEFF EPSTEIN",
    re.compile(r"Lawyer for Susan Rice: Obama administration '?justifiably concerned' about sharing Intel with\s*Trump team -\s*POLITICO", re.I): "Lawyer for Susan Rice: Obama administration 'justifiably concerned' about sharing Intel with Trump team - POLITICO",
    re.compile(r"PATTERSON NEW\s+BOOK\s+TELLING\s+FEDS\s+COVER\s+UP\s+OF\s+BILLIONAIRE\s+JEFF\s+EPSTEIN\s+CHILD\s+RAPES\s+RELEASE\s+DATE\s+OCT\s+10\s+2016\s+STEVEN\s+HOFFENBERG\s+IS\s+ON\s+THE\s+BOOK\s+WRITING\s+TEAM\s*!!!!"): "PATTERSON NEW BOOK TELLING FEDS COVER UP OF BILLIONAIRE JEFF EPSTEIN CHILD RAPES RELEASE DATE OCT 10 2016 STEVEN HOFFENBERG IS ON THE BOOK WRITING TEAM !!!!",
    re.compile(r"PROCEEDINGS FOR THE ST THOMAS ATTACHMENT OF\s*ALL JEFF EPSTEIN ASSETS"): "PROCEEDINGS FOR THE ST THOMAS ATTACHMENT OF ALL JEFF EPSTEIN ASSETS",
    re.compile(r"Subject:\s*Fwd: Trending Now: Friends for three decades"): "Subject: Fwd: Trending Now: Friends for three decades",
    # Misc
    'AVG°': 'AVGO',
}
91
+
92
# Address block portion of Martin Weinberg's signature; embedded in his EMAIL_SIGNATURES entry below
MARTIN_WEINBERG_SIGNATURE_PATTERN = r"Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*)?(\n.*([cC]ell|Office))*"

# Boilerplate signature / legal disclaimer patterns keyed by the name they belong to.
# Email._cleaned_up_text() replaces each match with a '<...snipped NAME legal signature...>' marker,
# iterating in the order the entries are declared here.
EMAIL_SIGNATURES = {
    ARIANE_DE_ROTHSCHILD: re.compile(r"Ensemble.*\nCe.*\ndestinataires.*\nremercions.*\nautorisee.*\nd.*\nLe.*\ncontenues.*\nEdmond.*\nRoth.*\nlo.*\nRoth.*\ninfo.*\nFranc.*\n.2.*", re.IGNORECASE),
    BARBRO_C_EHNBOM: re.compile(r"Barbro C.? Ehn.*\nChairman, Swedish-American.*\n((Office|Cell|Sweden):.*\n)*(360.*\nNew York.*)?"),
    DANNY_FROST: re.compile(r"Danny Frost\nDirector.*\nManhattan District.*\n212.*", re.IGNORECASE),
    DARREN_INDYKE: re.compile(r"DARREN K. INDYKE.*?\**\nThe information contained in this communication.*?Darren K.[\n\s]+?[Il]ndyke(, PLLC)? — All rights reserved\.? ?\n\*{50,120}(\n\**)?", re.DOTALL),
    DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
    DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
    JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
    JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*", re.IGNORECASE),
    KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
    LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
    LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
    MARTIN_WEINBERG: re.compile(fr"({MARTIN_WEINBERG_SIGNATURE_PATTERN}\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
    STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
    PETER_MANDELSON: re.compile(r'Disclaimer This email and any attachments to it may be.*?with[ \n]+number(.*?EC4V[ \n]+6BJ)?', re.DOTALL | re.IGNORECASE),
    PAUL_BARRETT: re.compile(r"Paul Barrett[\n\s]+Alpha Group Capital LLC[\n\s]+(142 W 57th Street, 11th Floor, New York, NY 10019?[\n\s]+)?(al?[\n\s]*)?ALPHA GROUP[\n\s]+CAPITAL"),
    RICHARD_KAHN: re.compile(r'Richard Kahn[\n\s]+HBRK Associates Inc.?[\n\s]+((301 East 66th Street, Suite 1OF|575 Lexington Avenue,? 4th Floor,?)[\n\s]+)?New York, (NY|New York) 100(22|65)([\n\s]+(Tel?|Phone)( I)?[\n\s]+Fa[x"]?[\n\s]+[Ce]el?l?)?', re.IGNORECASE),
    'Susan Edelman': re.compile(r'Susan Edel.*\nReporter\n1211.*\n917.*\nsedelman.*', re.IGNORECASE),
    TERRY_KAFKA: re.compile(r"((>|I) )?Terry B.? Kafka.*\n(> )?Impact Outdoor.*\n(> )?5454.*\n(> )?Dallas.*\n((> )?c?ell.*\n)?(> )?Impactoutdoor.*(\n(> )?cell.*)?", re.IGNORECASE),
    TONJA_HADDAD_COLEMAN: re.compile(fr"Tonja Haddad Coleman.*\nTonja Haddad.*\nAdvocate Building\n315 SE 7th.*(\nSuite.*)?\nFort Lauderdale.*(\n({REDACTED} )?facsimile)?(\nwww.tonjahaddad.com?)?(\nPlease add this efiling.*\nThe information.*\nyou are not.*\nyou are not.*)?", re.IGNORECASE),
    UNKNOWN: re.compile(r"(This message is directed to and is for the use of the above-noted addressee only.*\nhereon\.)", re.DOTALL),
}
116
+
117
# Invalid for links to EpsteinWeb. Email.is_junk_mail is True for emails authored by these.
JUNK_EMAILERS = [
    'asmallworld@travel.asmallworld.net',
    'editorialstaff@flipboard.com',
    'How To Academy',
    'Jokeland',
    JP_MORGAN_USGIO,
    'Saved by Internet Explorer 11',
]

# Emails from these authors are always cut to a third of MAX_CHARS_TO_PRINT when printed
TRUNCATE_ALL_EMAILS_FROM = [
    *JUNK_EMAILERS,
    'Alan S Halperin',
    'middle.east.update@hotmail.com',
    'Mitchell Bard',
    'Skip Rimer',
]

# Per-file overrides (keyed by file ID) of the number of characters printed
TRUNCATION_LENGTHS = {
    '023627': 16_800,  # Michael Wolff article with brock pierce
    '030245': 7_500,   # Epstein rationalizes his behavior in an open letter to the world
    '030781': 1_700,   # Bannon email about crypto coin issues
    '032906': 750,     # David Blaine email
}
140
+
141
# These are long forwarded articles so we force a trim to 1,333 chars if these strings exist.
# Every entry must end with a comma: adjacent string literals are silently concatenated by Python,
# which turns two terms into one long term that never matches anything.
TRUNCATE_TERMS = [
    'The rebuilding of Indonesia',
    'Dominique Strauss-Kahn',
    'THOMAS L. FRIEDMAN',
    'a sleek, briskly paced film whose title suggests a heist movie',
    'quote from The Colbert Report distinguishes',
    'co-inventor of the GTX Smart Shoe',
    'my latest Washington Post column',
    'Whether you donated to Poetry in America through',
    'supported my humanities work at Harvard',
    'Calendar of Major Events, Openings, and Fundraisers',
    'Nuclear Operator Raises Alarm on Crisis',
    'as responsible for the democratisation of computing and',
    'AROUND 1,000 operational satellites are circling the Earth',
    "In recent months, China's BAT collapse",
    'President Obama introduces Jim Yong Kim as his nominee',
    'Trump appears with mobster-affiliated felon at New',
    'Lead Code Enforcement Walton presented the facts',
    "Is UNRWA vital for the Palestinians' future",
    'The New York company, led by Stephen Ross',
    'I spent some time mulling additional aspects of a third choice presidential',
    'you are referring to duplication of a gene',
    'i am writing you both because i am attaching a still not-quite-complete response',
    'Learn to meditate and discover what truly nourishes your entire being',
    'Congratulations to the 2019 Hillman Prize recipients',
    'This much we know - the Fall elections are shaping up',
    "Special counsel Robert Mueller's investigation may face a serious legal obstacle",
    "nearly leak-proof since its inception more than a year ago",
    "I appreciate the opportunity to respond to your email",
    "Hello Peter. I am currently on a plane. I sent you earlier",
    'I just wanted to follow up on a couple of notes. I have been coordinating with Richard Kahn',
    'So, Peggy, if you could just let me know what info to include on the donation',
    'Consult a lawyer beforehand, if possible, but be cooperative/nice at this stage',
    # Amanda Ens
    'We remain positive on banks that can make acceptable returns',
    'David Woo (BAML head of FX, Rates and EM Strategy, very highly regarded',
    "Please let me know if you're interested in joining a small group meeting",
    'Erika Najarian, BAML financials research analyst, just returned',
    'We can also discuss single stock and Topix banks',
    'We are recording unprecedented divergences in falling equity vol',
    'As previously discussed between you and Ariane',
    'The US trade war against China: The view from Beijing',
    'no evidence you got the latest so i have sent you just the key message',
    # Joscha Bach
    'Cells seem to be mostly indistinguishable (except',
    'gender differenece. unlikely motivational, every cell is different',
    'Some thoughts I meant to send back for a long time',
    # Krassner
    'My friend Michael Simmons, who has been the editor of National Lampoon',
    "In the premiere episode of 'The Last Laugh' podcast, Sarah Silverman",
    'Thanks so much for sharing both your note to Steven and your latest Manson essay',
    # Edward Larson
    'Coming from an international background, and having lived in Oslo, Tel Aviv',
    # Katherine Keating
    'Paul Keating is aware that many people see him as a puzzle and contradiction',
    'his panoramic view of world affairs sharper than ever, Paul Keating blames',
    # melanie
    'Some years ago when I worked at the libertarian Cato Institute',
    # rich kahn
    'House and Senate Republicans on their respective tax overhaul',
    'The Tax Act contains changes to the treatment of "carried interests"',
    'General Election: Trump vs. Clinton LA Times/USC Tracking',
    'Location: Quicken Loans Arena in Cleveland, OH',
    'A friendly discussion about Syria with a former US State Department',
    # Tom / Paul Krassner
    'I forgot to post my cartoon from week before last, about Howard Schultz',
    # Bannon
    "Bannon the European: He's opening the populist fort in Brussels",
    "Steve Bannon doesn't do subtle.",
    'The Department of Justice lost its latest battle with Congress',
    "Donald Trump's newly named chief strategist and senior counselor",
    # Diane Ziman
    'I was so proud to see him speak at the Women',
    # Krauss
    'On confronting dogma, I of course agree',
    'I did neck with that woman, but never forced myself on her',
    'It is hard to know how to respond to a list of false',
    'The Women in the World Summit opens April 12',
    'lecture in Heidelberg Oct 14 but they had to cancel',
    # Nikolic
    'people from LifeBall',
    # Random
    'Little Hodiaki',
    "It began with deep worries regarding China's growth path",
    'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
    # Epstein
    'David Ben Gurion was asked why he, after 2000',
    # Lisa New
    'The raw materials for that period include interviews',
]
233
+
234
# All recipients of the three Krassner recipient lists, passed through uniquify()
KRASSNER_RECIPIENTS = uniquify(KRASSNER_MANSON_RECIPIENTS + KRASSNER_024923_RECIPIENTS + KRASSNER_033568_RECIPIENTS)

# No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
USELESS_EMAILERS = [
    *IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS,
    *KRASSNER_RECIPIENTS,
    *FLIGHT_IN_2012_PEOPLE,
    'Alan Rogers',                   # Random CC
    'BS Stern',                      # A random fwd of email we have
    'Cheryl Kleen',                  # Single email from Anne Boyles, displayed under Anne Boyles
    'Connie Zaguirre',               # Random CC
    'Dan Fleuette',                  # CC from sean bannon
    'Danny Goldberg',                # Random Paul Krassner emails
    GERALD_LEFCOURT,                 # Single CC
    GORDON_GETTY,                    # Random CC
    JEFF_FULLER,                     # Random Jean Luc Brunel CC
    'Jojo Fontanilla',               # Random CC
    'Joseph Vinciguerra',            # Random CC
    'Larry Cohen',                   # Random Bill Gates CC
    'Lyn Fontanilla',                # Random CC
    'Mark Albert',                   # Random CC
    'Matthew Schafer',               # Random CC
    'Michael Simmons',               # Random CC
    'Nancy Portland',                # Lawrence Krauss CC
    'Oliver Goodenough',             # Robert Trivers CC
    'Owen Blicksilver',              # Landon Thomas CC
    'Peter Aldhous',                 # Lawrence Krauss CC
    'Sam Harris',                    # Lawrence Krauss CC
    SAMUEL_LEFF,                     # Random CC
    "Saved by Internet Explorer 11",
    'Sean T Lehane',                 # Random CC
    'Stephen Rubin',                 # Random CC
    'Tim Kane',                      # Random CC
    'Travis Pangburn',               # Random CC
    'Vahe Stepanian',                # Random CC
]

# Emails sent by epstein to himself that are just notes (file IDs).
# For these files the author is not stripped from the recipient list.
NOTES_TO_SELF = [
    '026677',
    '029752',
    '030238',
    # '033274',  # TODO: Epstein's note to self doesn't get printed if we don't set the recipients to [None]
]
277
+
278
+
279
@dataclass
class Email(Communication):
    """
    An email reconstructed from OCR text: header parsing, OCR repairs, and rich rendering.

    Attributes:
        actual_text: Text that comes before likely quoted replies / forwards (see _actual_text()).
        header: EmailHeader extracted from the OCR text (see _extract_header()).
        is_junk_mail: True when the author is listed in JUNK_EMAILERS.
        recipients: De-duplicated recipient names in first-seen order; None entries are rendered as UNKNOWN.
        sent_from_device: 'Sent from my iPhone' style line found in actual_text, if any.
        signature_substitution_counts: How many boilerplate signatures were snipped, keyed by name.

    NOTE(review): _extract_author(), _extract_timestamp() and _repair() are never called in this
    class; presumably the parent class invokes them while constructing the document.
    """
    actual_text: str = field(init=False)
    header: EmailHeader = field(init=False)
    is_junk_mail: bool = False
    recipients: list[str | None] = field(default_factory=list)
    sent_from_device: str | None = None
    signature_substitution_counts: dict[str, int] = field(default_factory=lambda: defaultdict(int))

    # Just for logging how many headers we rewrote
    rewritten_header_ids: ClassVar[set[str]] = set()

    def __post_init__(self):
        """Resolve the recipients, clean up the text, and compute the derived fields."""
        super().__post_init__()
        self.is_junk_mail = self.author in JUNK_EMAILERS

        # Manually configured recipients take precedence over whatever is in the header
        if self.config and self.config.recipients:
            self.recipients = cast(list[str | None], self.config.recipients)
        else:
            for recipient in self.header.recipients():
                self.recipients.extend(self._get_names(recipient))

        # Remove self CCs (except for notes to self, where the author is the only recipient)
        recipients = [r for r in self.recipients if r != self.author or self.file_id in NOTES_TO_SELF]
        # De-duplicate while keeping first-seen order. A set() would order names by their randomized
        # string hashes, so recipients[0] (used for the border color) would change from run to run.
        self.recipients = list(dict.fromkeys(recipients))
        self.text = self._cleaned_up_text()
        self.actual_text = self._actual_text()
        self.sent_from_device = self._sent_from_device()
        logger.debug(f"Constructed {self.description()}")

    def description(self) -> Text:
        """One line summary mostly for logging."""
        txt = self._description()

        if self.recipients:
            txt.append(', ').append(key_value_txt('recipients', self._recipients_txt()))

        return txt.append(CLOSE_PROPERTIES_CHAR)

    def idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
        """
        Get the position of an 'On June 12th, 1985 [SOMEONE] wrote:' style line.

        Searches 'text' (or self.text if 'text' is empty / None) and returns the index of the newline
        ending the 'wrote:' match at zero-based position 'n', or None if there are not that many matches.
        """
        for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
            if i >= n:
                return match.end() - 1

        return None

    def info_txt(self) -> Text:
        """Sentence describing who sent the email to whom and when it was probably sent."""
        txt = Text("OCR text of email from ", style='grey46').append(self.author_txt).append(' to ')
        return txt.append(self._recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))

    def subject(self) -> str:
        """The header's subject, or an empty string if there is none."""
        return self.header.subject or ''

    def _actual_text(self) -> str:
        """The text that comes before likely quoted replies and forwards etc."""
        if self.config and self.config.actual_text is not None:
            return self.config.actual_text
        elif self.header.num_header_rows == 0:
            return self.text

        # Drop the header rows from the top of the text
        text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
        reply_text_match = REPLY_TEXT_REGEX.search(text)

        # This file is returned with only its header removed
        if self.file_id in ['024624']:
            return text

        if reply_text_match:
            actual_num_chars = len(reply_text_match.group(1))
            actual_text_pct = f"{(100 * float(actual_num_chars) / len(text)):.1f}%"
            logger.debug(f"'{self.url_slug}': actual_text() reply_text_match is {actual_num_chars:,} chars ({actual_text_pct} of {len(text):,})")
            text = reply_text_match.group(1)

        # If all else fails look for lines like 'From: blah', 'Subject: blah', and split on that.
        for field_name in REPLY_SPLITTERS:
            field_string = f'\n{field_name}'

            if field_string not in text:
                continue

            logger.debug(f"'{self.url_slug}': Splitting based on '{field_string.strip()}'")
            pre_from_text = text.split(field_string)[0]
            actual_num_chars = len(pre_from_text)
            actual_text_pct = f"{(100 * float(actual_num_chars) / len(text)):.1f}%"
            logger.debug(f"'{self.url_slug}': actual_text() fwd_text_match is {actual_num_chars:,} chars ({actual_text_pct} of {len(text):,})")
            text = pre_from_text
            break  # Only the first splitter found in the text is applied

        return text.strip()

    def _border_style(self) -> str:
        """Color emails from epstein to others with the color for the first recipient."""
        has_known_recipient = bool(self.recipients) and self.recipients != [None]

        if self.author == JEFFREY_EPSTEIN and has_known_recipient:
            style = get_style_for_name(self.recipients[0])
        else:
            style = self.author_style

        return style.replace('bold', '').strip()

    def _cleaned_up_text(self) -> str:
        """Add newline after headers in text if actual header wasn't empty, remove bad lines, etc."""
        # Insert line breaks now unless header is broken, in which case we'll do it later after fixing header
        text = self.text if self.header.was_initially_empty else _add_line_breaks(self.text)
        text = REPLY_REGEX.sub(r'\n\1', text)  # Newlines between quoted replies

        for name, signature_regex in EMAIL_SIGNATURES.items():
            signature_replacement = f'<...snipped {name.lower()} legal signature...>'
            text, num_replaced = signature_regex.subn(signature_replacement, text)
            self.signature_substitution_counts[name] += num_replaced

        return collapse_newlines(text).strip()

    def _debug_info(self) -> str:
        """Multi-line string of identifying properties, one 'key=value' per line."""
        info = [
            f"id={self.file_id}",
            f"url_slug={self.url_slug}",
            f"file_path='{self.file_path}'",
            f"is_local_extract_file={self.is_local_extract_file()}",
        ]

        return " " + "\n ".join(info)

    def _extract_author(self) -> None:
        """Extract the header, then fall back to the header's author if the parent found none."""
        self._extract_header()
        super()._extract_author()

        if not self.author and self.header.author:
            authors = self._get_names(self.header.author)
            self.author = authors[0] if (authors and authors[0]) else None

    def _extract_header(self) -> None:
        """Extract an EmailHeader object from the OCR text."""
        header_match = EMAIL_SIMPLE_HEADER_REGEX.search(self.text)

        if header_match:
            self.header = EmailHeader.from_header_lines(header_match.group(0))

            if self.header.is_empty():
                self.header.repair_empty_header(self.lines)
        else:
            msg = f"No header match found in '{self.filename}'! Top lines:\n\n{self.top_lines()}"
            # A missing header is only logged at INFO level for files that have a config
            log_fxn = logger.info if self.config else logger.warning
            log_fxn(msg)
            self.header = EmailHeader(field_names=[])

    def _extract_timestamp(self) -> datetime:
        """
        Find when the email was sent.

        Tries, in order: the configured timestamp, the header's sent_at field, a 'Date:' / 'Sent:'
        line in the first VALID_HEADER_LINES lines, and finally any of those lines that looks like
        it contains a time.

        Raises:
            RuntimeError: If none of those yield a parseable timestamp.
        """
        if self.config and self.config.timestamp:
            return self.config.timestamp
        elif self.header.sent_at:
            timestamp = _parse_timestamp(self.header.sent_at)

            if timestamp:
                return timestamp

        searchable_lines = self.lines[0:VALID_HEADER_LINES]
        searchable_text = '\n'.join(searchable_lines)
        date_match = DATE_HEADER_REGEX.search(searchable_text)

        if date_match:
            timestamp = _parse_timestamp(date_match.group(1))

            if timestamp:
                return timestamp

        logger.debug(f"Failed to find timestamp, falling back to parsing {VALID_HEADER_LINES} lines...")

        for line in searchable_lines:
            if not TIMESTAMP_LINE_REGEX.search(line):
                continue

            timestamp = _parse_timestamp(line)

            if timestamp:
                logger.debug(f"Fell back to timestamp {timestamp} in line '{line}'...")
                return timestamp

        raise RuntimeError(f"No timestamp found in '{self.file_path.name}' top lines:\n{searchable_text}")

    def _get_names(self, emailer_str: str) -> list[str]:
        """Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
        emailer_str = EmailHeader.cleanup_str(emailer_str)

        if len(emailer_str) == 0:
            return []

        names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]

        # Semi-invalid strings are never used verbatim as a name: only known names are returned
        if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
            if not names_found and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
                logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
            else:
                logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")

            return names_found

        # No known name matched so use the cleaned up string itself as the name
        names_found = names_found or [emailer_str]
        return [_reverse_first_and_last_names(name) for name in names_found]

    def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
        """
        Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1).

        The merged line takes the place of line 'idx'; any lines between the two keep their order
        and follow the merged line.

        Raises:
            RuntimeError: If idx2 is not greater than idx.
        """
        idx2 = idx2 if idx2 is not None else (idx + 1)

        if idx2 <= idx:
            raise RuntimeError(f"idx2 ({idx2}) must be greater than idx ({idx})")

        merged_line = self.lines[idx] + ' ' + self.lines[idx2]
        # The middle slice is empty when the two lines are adjacent
        lines = self.lines[0:idx] + [merged_line] + self.lines[idx + 1:idx2] + self.lines[idx2 + 1:]
        self._set_computed_fields(lines=lines)

    def _recipients_txt(self) -> Text:
        """Text object with comma separated colored versions of all recipients."""
        recipients = [r or UNKNOWN for r in self.recipients] or [UNKNOWN]

        # Use just the last name for each recipient if there's 3 or more recipients
        return join_texts([
            Text(r if len(recipients) < 3 else extract_last_name(r), style=get_style_for_name(r))
            for r in recipients
        ], join=', ')

    def _repair(self) -> None:
        """Repair particularly janky files."""
        # Guard against empty documents, which have no first line to inspect
        if self.lines and BAD_FIRST_LINE_REGEX.match(self.lines[0]):
            self._set_computed_fields(lines=self.lines[1:])

        self._set_computed_fields(lines=[line for line in self.lines if not BAD_LINE_REGEX.match(line)])
        old_text = self.text

        # Hand tuned line merges for specific files (the indexes are 0-based)
        if self.file_id in ['031442']:
            self._merge_lines(0)  # Merge 1st and 2nd rows
        elif self.file_id in '021729 029501 029282 030626 031384 033512'.split():
            self._merge_lines(2)  # Merge 3rd and 4th rows

            if self.file_id in ['030626']:  # Merge 6th and 7th (now 5th and 6th) rows
                self._merge_lines(4)
        elif self.file_id in ['029976']:
            self._merge_lines(3)  # Merge 4th and 5th rows
        elif self.file_id in '026609 029402 032405'.split():
            self._merge_lines(4)  # Merge 5th and 6th rows
        elif self.file_id in ['033568']:
            for _i in range(5):
                self._merge_lines(5)
        elif self.file_id in ['025329']:
            for _i in range(9):
                self._merge_lines(2)
        elif self.file_id == '029977':
            self._set_computed_fields(text=self.text.replace('Sent 9/28/2012 2:41:02 PM', 'Sent: 9/28/2012 2:41:02 PM'))

            for _i in range(4):
                self._merge_lines(2)

            self._merge_lines(4)
            self._merge_lines(2, 4)

        if old_text != self.text:
            self.log("Modified text, old:\n\n" + '\n'.join(old_text.split('\n')[0:12]) + '\n', logging.INFO)
            self.log_top_lines(12, 'Result of modifications', logging.INFO)
            self.log('', logging.INFO)

        lines = self.repair_ocr_text(OCR_REPAIRS, self.text).split('\n')
        new_lines = []
        i = 0

        # Fix links (remove spaces, merge multiline links to a single line)
        while i < len(lines):
            line = lines[i]

            if LINK_LINE_REGEX.search(line):
                # A link with no 'htm' in it may continue on the next line
                if 'htm' not in line \
                        and i < (len(lines) - 1) \
                        and (lines[i + 1].endswith('/') or any(s in lines[i + 1] for s in URL_SIGNIFIERS)):
                    logger.debug(f"{self.filename}: Joining link lines\n 1. {line}\n 2. {lines[i + 1]}\n")
                    line += lines[i + 1]
                    i += 1  # The next line was consumed so skip it

                line = line.replace(' ', '')

            new_lines.append(line)

            # TODO: hacky workaround to get a working link for HOUSE_OVERSIGHT_032564
            if self.file_id == '032564' and line == 'http://m.huffpost.com/us/entry/us_599f532ae4b0dOef9f1c129d':
                new_lines.append('(ed. note: an archived version of the above link is here: https://archive.is/hJxT3 )')

            i += 1

        self._set_computed_fields(lines=new_lines)

    def _sent_from_device(self) -> str | None:
        """Find any 'Sent from my iPhone' style lines if they exist."""
        sent_from_match = SENT_FROM_REGEX.search(self.actual_text)

        if not sent_from_match:
            return None

        sent_from = sent_from_match.group(0)
        # Capitalize a leading lowercase 'sent'
        return 'S' + sent_from[1:] if sent_from.startswith('sent') else sent_from

    def __rich_console__(self, _console: Console, _options: ConsoleOptions) -> RenderResult:
        """Yield the file info panel followed by a padded panel of the (possibly truncated) text."""
        logger.debug(f"Printing '{self.filename}'...")
        yield self.file_info_panel()
        text = self.text
        should_rewrite_header = self.header.was_initially_empty and self.header.num_header_rows > 0
        quote_cutoff = self.idx_of_nth_quoted_reply(text=text)  # Trim if there's many quoted replies
        num_chars = MAX_CHARS_TO_PRINT
        trim_footer_txt = None

        # Per-file overrides win over author / term based truncation, which wins over the quote cutoff
        if self.file_id in TRUNCATION_LENGTHS:
            num_chars = TRUNCATION_LENGTHS[self.file_id]
        elif self.author in TRUNCATE_ALL_EMAILS_FROM or any((term in self.text) for term in TRUNCATE_TERMS):
            num_chars = int(MAX_CHARS_TO_PRINT / 3)
        elif quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
            num_chars = quote_cutoff

        # Truncate long emails but leave a note explaining what happened w/link to source document
        if len(text) > num_chars:
            text = text[0:num_chars]
            doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
            trim_note = f"<...trimmed to {num_chars} characters of {self.length}, read the rest at {doc_link_markup}...>"
            trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))

        # Rewrite broken headers where the values are on separate lines from the field names
        if should_rewrite_header:
            configured_actual_text = self.config.actual_text if self.config and self.config.actual_text else None
            num_lines_to_skip = self.header.num_header_rows
            lines = []

            # Emails w/configured 'actual_text' are particularly broken; need to shuffle some lines
            if configured_actual_text is not None:
                num_lines_to_skip += 1
                lines += [cast(str, configured_actual_text), '\n']

            lines += text.split('\n')[num_lines_to_skip:]
            text = self.header.rewrite_header() + '\n' + '\n'.join(lines)
            text = _add_line_breaks(text)  # This was skipped when _cleaned_up_text() w/a broken header so we do it now
            self.rewritten_header_ids.add(self.file_id)

        panel_txt = highlighter(text)

        email_txt_panel = Panel(
            panel_txt.append('\n\n').append(trim_footer_txt) if trim_footer_txt else panel_txt,
            border_style=self._border_style(),
            expand=False,
            subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
        )

        yield Padding(email_txt_panel, (0, 0, 1, INFO_INDENT))

        if should_rewrite_header:
            self.log_top_lines(self.header.num_header_rows + 4, 'Original header:', logging.INFO)
630
+
631
+
632
def _add_line_breaks(email_text: str) -> str:
    """Surround group 1 of each EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX match with newlines, then strip."""
    with_breaks = EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX.sub(r'\n\1\n', email_text)
    return with_breaks.strip()
634
+
635
+
636
def _parse_timestamp(timestamp_str: str) -> None | datetime:
    """
    Best effort conversion of a date string found in an email to a datetime.

    '(GMT-05:00)' is rewritten to 'EST' and anything matching BAD_TIMEZONE_REGEX is blanked out
    before parsing. The parsed value is passed through remove_timezone() before it is returned.

    Returns:
        The parsed timestamp, or None if anything went wrong (failures are only logged at DEBUG level).
    """
    try:
        timestamp_str = timestamp_str.replace('(GMT-05:00)', 'EST')
        timestamp_str = BAD_TIMEZONE_REGEX.sub(' ', timestamp_str).strip()
        timestamp = parse(timestamp_str, tzinfos=TIMEZONE_INFO)
        logger.debug('Parsed timestamp "%s" from string "%s"', timestamp, timestamp_str)
        return remove_timezone(timestamp)
    except Exception:
        # Deliberately broad: callers treat None as "try the next candidate string"
        logger.debug(f'Failed to parse "{timestamp_str}" to timestamp!')
        return None
645
+
646
+
647
+ def _reverse_first_and_last_names(name: str) -> str:
648
+ if '@' in name:
649
+ return name.lower()
650
+
651
+ if ', ' in name:
652
+ names = name.split(', ')
653
+ return f"{names[1]} {names[0]}"
654
+ else:
655
+ return name