epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. epstein_files/__init__.py +55 -23
  2. epstein_files/documents/communication.py +9 -5
  3. epstein_files/documents/document.py +231 -135
  4. epstein_files/documents/doj_file.py +242 -0
  5. epstein_files/documents/doj_files/full_text.py +166 -0
  6. epstein_files/documents/email.py +289 -232
  7. epstein_files/documents/emails/email_header.py +35 -16
  8. epstein_files/documents/emails/emailers.py +223 -0
  9. epstein_files/documents/imessage/text_message.py +2 -3
  10. epstein_files/documents/json_file.py +18 -14
  11. epstein_files/documents/messenger_log.py +23 -39
  12. epstein_files/documents/other_file.py +54 -48
  13. epstein_files/epstein_files.py +65 -29
  14. epstein_files/person.py +151 -94
  15. epstein_files/util/constant/names.py +37 -10
  16. epstein_files/util/constant/output_files.py +2 -0
  17. epstein_files/util/constant/strings.py +14 -7
  18. epstein_files/util/constant/urls.py +17 -0
  19. epstein_files/util/constants.py +556 -391
  20. epstein_files/util/data.py +2 -0
  21. epstein_files/util/doc_cfg.py +44 -33
  22. epstein_files/util/env.py +34 -19
  23. epstein_files/util/file_helper.py +30 -6
  24. epstein_files/util/helpers/debugging_helper.py +13 -0
  25. epstein_files/util/helpers/env_helpers.py +21 -0
  26. epstein_files/util/highlighted_group.py +121 -37
  27. epstein_files/util/layout/left_bar_panel.py +26 -0
  28. epstein_files/util/logging.py +28 -13
  29. epstein_files/util/output.py +49 -40
  30. epstein_files/util/rich.py +30 -3
  31. epstein_files/util/word_count.py +7 -7
  32. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/METADATA +16 -3
  33. epstein_files-1.5.0.dist-info/RECORD +40 -0
  34. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +1 -1
  35. epstein_files-1.2.5.dist-info/RECORD +0 -34
  36. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
  37. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
@@ -1,7 +1,6 @@
1
1
  import json
2
2
  import logging
3
3
  import re
4
- from collections import defaultdict
5
4
  from copy import deepcopy
6
5
  from dataclasses import asdict, dataclass, field
7
6
  from datetime import datetime
@@ -16,13 +15,14 @@ from rich.text import Text
16
15
 
17
16
  from epstein_files.documents.communication import Communication
18
17
  from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, INFO_INDENT
19
- from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAIL_SIMPLE_HEADER_REGEX,
20
- EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, TIME_REGEX, EmailHeader)
18
+ from epstein_files.documents.emails.email_header import (EMAIL_SIMPLE_HEADER_REGEX,
19
+ EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, FIELDS_COLON_PATTERN, EmailHeader)
20
+ from epstein_files.documents.emails.emailers import extract_emailer_names
21
+ from epstein_files.documents.other_file import OtherFile
21
22
  from epstein_files.util.constant.names import *
22
23
  from epstein_files.util.constant.strings import REDACTED
23
24
  from epstein_files.util.constants import *
24
- from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes,
25
- flatten, listify, remove_timezone, uniquify)
25
+ from epstein_files.util.data import AMERICAN_TIME_REGEX, TIMEZONE_INFO, collapse_newlines, remove_timezone
26
26
  from epstein_files.util.doc_cfg import EmailCfg, Metadata
27
27
  from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
28
28
  from epstein_files.util.highlighted_group import JUNK_EMAILERS, get_style_for_name
@@ -30,9 +30,10 @@ from epstein_files.util.logging import logger
30
30
  from epstein_files.util.rich import *
31
31
 
32
32
  BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
33
- BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
34
- DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
35
- LINK_LINE_REGEX = re.compile(f"^>? ?htt")
33
+ BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Hide caption|Importance:?\s*High|[iI,•]|[1i] (_ )?[il]|, [-,]|L\._|_filtered|.*(yiv0232|font-family:|margin-bottom:).*)$')
34
+ BAD_SUBJECT_CONTINUATIONS = ['orwarded', 'Hi ', 'Sent ', 'AmLaw', 'Original Message', 'Privileged', 'Sorry', '---']
35
+ FIELDS_COLON_REGEX = re.compile(FIELDS_COLON_PATTERN)
36
+ LINK_LINE_REGEX = re.compile(f"^[>• ]*htt")
36
37
  LINK_LINE2_REGEX = re.compile(r"^[-\w.%&=/]{5,}$")
37
38
  QUOTED_REPLY_LINE_REGEX = re.compile(r'(\nFrom:(.*)|wrote:)\n', re.IGNORECASE)
38
39
  REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
@@ -42,15 +43,13 @@ DATE_HEADER_REGEX = re.compile(r'(?:Date|Sent):? +(?!by|from|to|via)([^\n]{6,})\
42
43
  TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")
43
44
  LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
44
45
 
45
- SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
46
46
  REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
47
- URL_SIGNIFIERS = ['amp?', 'cd=', 'click', 'ft=', 'gclid', 'htm', 'keywords=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'usg=', 'utm']
47
+ URL_SIGNIFIERS = ['?amp', 'amp?', 'cd=', 'click', 'CMP=', 'contentId', 'ft=', 'gclid', 'htm', 'mp=', 'keywords=', 'Id=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'sp=', 'usg=', 'utm']
48
48
  APPEARS_IN = 'appears in'
49
49
 
50
50
  MAX_NUM_HEADER_LINES = 14
51
- MAX_QUOTED_REPLIES = 2
52
- MAX_CHARS_TO_PRINT = 4000
53
- TRUNCATED_CHARS = int(MAX_CHARS_TO_PRINT / 3)
51
+ MAX_QUOTED_REPLIES = 1
52
+ NUM_WORDS_IN_LAST_QUOTE = 6
54
53
 
55
54
  REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
56
55
  '********************************',
@@ -88,7 +87,13 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
88
87
  re.compile(r'^INW$', re.MULTILINE): REDACTED,
89
88
  # links
90
89
  'Imps ://': 'https://',
90
+ 'on-accusers-rose-\nmcgowan/ ': 'on-accusers-rose-\nmcgowan/\n',
91
+ 'the-truth-\nabout-the-bitcoin-foundation/ )': 'the-truth-about-the-bitcoin-foundation/ )\n',
92
+ 'woody-allen-jeffrey-epsteins-\nsociety-friends-close-ranks/ ---': 'woody-allen-jeffrey-epsteins-society-friends-close_ranks/\n',
93
+ ' https://www.theguardian.com/world/2017/may/29/close-friend-trump-thomas-barrack-\nalleged-tax-evasion-italy-sardinia?CMP=share btn fb': '\nhttps://www.theguardian.com/world/2017/may/29/close-friend-trump-thomas-barrack-alleged-tax-evasion-italy-sardinia?CMP=share_btn_fb',
91
94
  re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
95
+ re.compile(r" http ?://www. ?dailymail. ?co ?.uk/news/article-\d+/Troub ?led-woman-history-drug-\n?us ?e-\n?.*html"): '\nhttp://www.dailymail.co.uk/news/article-3914012/Troubled-woman-history-drug-use-claimed-assaulted-Donald-Trump-Jeffrey-Epstein-sex-party-age-13-FABRICATED-story.html',
96
+ re.compile(r"http.*steve-bannon-trump-tower-\n?interview-\n?trumps-\n?strategist-plots-\n?new-political-movement-948747"): "\nhttp://www.hollywoodreporter.com/news/steve-bannon-trump-tower-interview-trumps-strategist-plots-new-political-movement-948747",
92
97
  # Subject lines
93
98
  "Arrested in\nInauguration Day Riot": "Arrested in Inauguration Day Riot",
94
99
  "as Putin Mayhem Tests President's Grip\non GOP": "as Putin Mayhem Tests President's Grip on GOP",
@@ -99,6 +104,8 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
99
104
  "COVER UP SEX ABUSE CRIMES\nBY THE WHITE HOUSE": "COVER UP SEX ABUSE CRIMES BY THE WHITE HOUSE",
100
105
  'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
101
106
  "War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
107
+ "Subject; RE": "Subject: RE",
108
+ "straining relations between UK and\nAmerica": "straining relations between UK and America",
102
109
  re.compile(r"deadline re Mr Bradley Edwards vs Mr\s*Jeffrey Epstein", re.I): "deadline re Mr Bradley Edwards vs Mr Jeffrey Epstein",
103
110
  re.compile(r"Following Plea That Implicated Trump -\s*https://www.npr.org/676040070", re.I): "Following Plea That Implicated Trump - https://www.npr.org/676040070",
104
111
  re.compile(r"for Attorney General -\s+Wikisource, the"): r"for Attorney General - Wikisource, the",
@@ -109,6 +116,8 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
109
116
  re.compile(r"Subject:\s*Fwd: Trending Now: Friends for three decades"): "Subject: Fwd: Trending Now: Friends for three decades",
110
117
  # Misc
111
118
  'AVG°': 'AVGO',
119
+ 'Saw Matt C with DTF at golf': 'Saw Matt C with DJT at golf',
120
+ re.compile(r"[i. ]*Privileged[- ]*Redacted[i. ]*"): '<PRIVILEGED - REDACTED>',
112
121
  }
113
122
 
114
123
  EMAIL_SIGNATURE_REGEXES = {
@@ -118,20 +127,28 @@ EMAIL_SIGNATURE_REGEXES = {
118
127
  DANIEL_SIAD: re.compile(r"Confidentiality Notice: The information contained in this electronic message is PRIVILEGED and confidential information intended only for the use of the individual entity or entities named as recipient or recipients. If the reader is not the intended recipient, be hereby notified that any dissemination, distribution or copy of this communication is strictly prohibited. If you have received this communication in error, please notify me immediately by electronic mail or by telephone and permanently delete this message from your computer system. Thank you.".replace(' ', r'\s*'), re.IGNORECASE),
119
128
  DANNY_FROST: re.compile(r"Danny Frost\nDirector.*\nManhattan District.*\n212.*", re.IGNORECASE),
120
129
  DARREN_INDYKE: re.compile(r"DARREN K. INDYKE.*?\**\nThe information contained in this communication.*?Darren K.[\n\s]+?[Il]ndyke(, PLLC)? — All rights reserved\.? ?\n\*{50,120}(\n\**)?", re.DOTALL),
130
+ DAVID_FISZEL: re.compile(r"This e-mail and any file.*\nmail and/or any file.*\nmail or any.*\nreceived.*\nmisdirected.*"),
121
131
  DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
122
132
  DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
123
- EDUARDO_ROBLES: re.compile(fr"(• )?email:.*\n(• )?email:\n(• )?website: www.creativekingdom.com\n(• )?address: 5th Floor Office No:504 Aspect Tower,\nBusiness Bay, Dubai United Arab Emirates."),
133
+ EDUARDO_ROBLES: re.compile(r"(• )?email:.*\n(• )?email:\n(• )?website: www.creativekingdom.com\n(• )?address: 5th Floor Office No:504 Aspect Tower,\nBusiness Bay, Dubai United Arab Emirates."),
134
+ ERIC_ROTH: re.compile(r"2221 Smithtown Avenue\nLong Island.*\nRonkonkoma.*\n(.1. )?Phone\nFax\nCell\ne-mail"),
135
+ GHISLAINE_MAXWELL: re.compile(r"FACEBOOK\nTWITTER\nG\+\nPINTEREST\nINSTAGRAM\nPLEDGE\nTHE DAILY CATCH"),
124
136
  JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
125
137
  JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*(\nTel:.*)?(\nEmail:.*)?", re.IGNORECASE),
126
138
  KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
127
139
  LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
128
140
  LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
141
+ LEON_BLACK: re.compile(r"This email and any files transmitted with it are confidential and intended solely.*\n(they|whom).*\ndissemination.*\nother.*\nand delete.*"),
142
+ LISA_NEW: re.compile(r"Elisa New\nPowell M. Cabot.*\n(Director.*\n)?Harvard.*\n148.*\n([1I] )?12.*\nCambridge.*\n([1I] )?02138"),
129
143
  MARTIN_WEINBERG: re.compile(r"(Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*?)?(\n.*?([cC]ell|Office))*\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
144
+ MICHAEL_MILLER: re.compile(r"Michael C. Miller\nPartner\nwww.steptoe.com/mmiller\nSteptoe\n(Privileged.*\n)?(\+1\s+)?direct.*\n(\+1\s+)?(\+1\s+)?fax.*\n(\+1.*)?cell.*\n(www.steptoe.com\n)?This message and any.*\nyou are not.*\nnotify the sender.*"),
130
145
  NICHOLAS_RIBIS: re.compile(r"60 Morris Turnpike 2FL\nSummit,? NJ.*\n0:\nF:\n\*{20,}\nCONFIDENTIALITY NOTICE.*\nattachments.*\ncopying.*\nIf you have.*\nthe copy.*\nThank.*\n\*{20,}"),
131
146
  PETER_MANDELSON: re.compile(r'Disclaimer This email and any attachments to it may be.*?with[ \n]+number(.*?EC4V[ \n]+6BJ)?', re.DOTALL | re.IGNORECASE),
132
147
  PAUL_BARRETT: re.compile(r"Paul Barrett[\n\s]+Alpha Group Capital LLC[\n\s]+(142 W 57th Street, 11th Floor, New York, NY 10019?[\n\s]+)?(al?[\n\s]*)?ALPHA GROUP[\n\s]+CAPITAL"),
148
+ PETER_ATTIA: re.compile(r"The information contained in this transmission may contain.*\n(laws|patient).*\n(distribution|named).*\n(distribution.*\nplease.*|copies.*)"),
133
149
  RICHARD_KAHN: re.compile(fr'Richard Kahn[\n\s]+HBRK Associates Inc.?[\n\s]+((301 East 66th Street, Suite 1OF|575 Lexington Avenue,? 4th Floor,?)[\n\s]+)?New York, (NY|New York) 100(22|65)(\s+(Tel?|Phone)( I|{REDACTED})?\s+Fa[x",]?(_|{REDACTED})*\s+[Ce]el?l?)?', re.IGNORECASE),
134
150
  ROSS_GOW: re.compile(r"Ross Gow\nManaging Partner\nACUITY Reputation Limited\n23 Berkeley Square\nLondon.*\nMobile.*\nTel"),
151
+ STEPHEN_HANSON: re.compile(r"(> )?Confidentiality Notice: This e-mail transmission.*\n(which it is addressed )?and may contain.*\n(applicable law. If you are not the intended )?recipient you are hereby.*\n(information contained in or attached to this transmission is )?STRICTLY PROHIBITED.*"),
135
152
  STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
136
153
  'Susan Edelman': re.compile(r'Susan Edel.*\nReporter\n1211.*\n917.*\nsedelman.*', re.IGNORECASE),
137
154
  TERRY_KAFKA: re.compile(r"((>|I) )?Terry B.? Kafka.*\n(> )?Impact Outdoor.*\n(> )?5454.*\n(> )?Dallas.*\n((> )?c?ell.*\n)?(> )?Impactoutdoor.*(\n(> )?cell.*)?", re.IGNORECASE),
@@ -152,13 +169,19 @@ BCC_LISTS = JUNK_EMAILERS + MAILING_LISTS
152
169
  TRUNCATE_EMAILS_FROM_OR_TO = [
153
170
  AMANDA_ENS,
154
171
  ANTHONY_BARRETT,
172
+ DANIEL_SABBA,
155
173
  DIANE_ZIMAN,
156
174
  JOSCHA_BACH,
157
175
  KATHERINE_KEATING,
176
+ LAWRANCE_VISOSKI,
158
177
  LAWRENCE_KRAUSS,
159
178
  LISA_NEW,
179
+ MOSHE_HOFFMAN,
160
180
  NILI_PRIELL_BARAK,
161
181
  PAUL_KRASSNER,
182
+ PAUL_PROSPERI,
183
+ 'Susan Edelman',
184
+ TERRY_KAFKA,
162
185
  ]
163
186
 
164
187
  TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
@@ -170,6 +193,7 @@ TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
170
193
  DAVID_HAIG,
171
194
  EDWARD_ROD_LARSEN,
172
195
  JOHNNY_EL_HACHEM,
196
+ 'Mark Green',
173
197
  MELANIE_WALKER,
174
198
  'Mitchell Bard',
175
199
  PEGGY_SIEGAL,
@@ -182,47 +206,12 @@ TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
182
206
  TERRY_KAFKA,
183
207
  ]
184
208
 
185
- # These IDs will be appended to INTERESTING_EMAIL_IDS
186
- INTERESTING_TRUNCATION_LENGTHS = {
187
- '023627': 16_800, # Michael Wolff article with Brock Pierce
188
- '030245': None, # Epstein rationalizes his behavior in an open letter to the world
189
- '030781': None, # Bannon email about crypto coin issues
190
- '032906': None, # David Blaine email
191
- '026036': 6000, # Gino Yu blockchain mention
192
- '029609': None, # Joi Ito
193
- '025233': None, # Reputation.com discussion
194
- '017827': None, # Bannon / Peggy Siegal email about netflix doc on Epstein
195
- '030222': None, # Ross Gow / Ghislaine correspondence
196
- '026028': None, # Larry Summers / Karim Wade intro
197
- '029545': None, # Tyler Shears reputation
198
- '025812': None, # Tyler Shears reputation
199
- '029914': 4500, # Lord Mandelson russian investments
200
- '033453': None, # "Just heard you were telling people that you heard I asked Trump for a million dollars"
201
- '031320': None, # Epstein Gratitude foundation
202
- '031036': None, # Barbro Ehnbom talking about Swedish girl
203
- '023454': 1878, # Email invitation sent to tech CEOs + Epstein
204
- '029342': 2000, # Hakeem Jeffries
205
- }
206
-
207
- TRUNCATION_LENGTHS = {
208
- **INTERESTING_TRUNCATION_LENGTHS,
209
- '031791': None, # First email in Jessica Cadwell chain about service of legal documents
210
- '023208': None, # Long discussion about leon black's finances
211
- '028589': None, # Long thread with Reid Weingarten
212
- '029433': TRUNCATED_CHARS, # Kahn taxes
213
- '026778': TRUNCATED_CHARS, # Kahn taxes
214
- '033311': TRUNCATED_CHARS, # Kahn taxes
215
- '024251': TRUNCATED_CHARS, # Kahn taxes
216
- '026755': TRUNCATED_CHARS, # Epstein self fwd
217
- }
218
-
219
209
  # These are long forwarded articles so we force a trim to 1,333 chars if these strings exist
220
210
  TRUNCATE_TERMS = [
221
211
  'The rebuilding of Indonesia', # Vicky Ward article
222
- 'Dominique Strauss-Kahn',
223
- 'THOMAS L. FRIEDMAN',
224
212
  'a sleek, briskly paced film whose title suggests a heist movie', # Inside Job
225
213
  'Calendar of Major Events, Openings, and Fundraisers',
214
+ 'sent over from Marshall Heyman at the WSJ',
226
215
  "In recent months, China's BAT collapse",
227
216
  'President Obama introduces Jim Yong Kim as his nominee',
228
217
  'Trump appears with mobster-affiliated felon at New',
@@ -237,9 +226,11 @@ TRUNCATE_TERMS = [
237
226
  'co-inventor of the GTX Smart Shoe',
238
227
  'my latest Washington Post column',
239
228
  # Bannon
229
+ 'As Steve Bannon continues his tour of Europe',
240
230
  "Bannon the European: He's opening the populist fort in Brussels",
241
231
  "Steve Bannon doesn't do subtle.",
242
232
  'The Department of Justice lost its latest battle with Congress',
233
+ 'pedophile Jeffrey Epstein bought his way out',
243
234
  # lawyers
244
235
  'recuses itself from Jeffrey Epstein case',
245
236
  # Misc
@@ -265,11 +256,23 @@ LINE_REPAIR_MERGES = {
265
256
  '014397': [[4]] * 2,
266
257
  '014860': [[3], [4], [4]],
267
258
  '017523': [[4]],
259
+ '030367': [[1, 4], [2, 4]],
268
260
  '019105': [[5]] * 4,
269
261
  '019407': [[2, 4]],
262
+ '022187': [[1, 8], [2, 8], [3, 8], [4, 8]],
270
263
  '021729': [[2]],
264
+ '032896': [[2]],
265
+ '033050': [[0, 6], [1, 6], [2, 6], [3, 6], [4, 6]],
266
+ '022949': [[0, 4], [1, 4]],
267
+ '022197': [[0, 5], [1, 5], [3, 5]],
268
+ '021814': [[1, 6], [2, 6], [3, 6], [4, 6]],
269
+ '022190': [[1, 7], [0, 6], [3, 6], [4, 6]],
270
+ '029582': [[0, 5], [1, 5], [3, 5], [3, 5]],
271
271
  '022673': [[9]],
272
272
  '022684': [[9]],
273
+ '026625': [[0, 7], [1, 7], [2, 7], [3, 7], [4, 7], [5, 7]],
274
+ '026659': [[0, 5], [1, 5]],
275
+ '026764': [[0, 6], [1, 6]],
273
276
  '022695': [[4]],
274
277
  '022977': [[9]] * 10,
275
278
  '023001': [[5]] * 3,
@@ -278,11 +281,15 @@ LINE_REPAIR_MERGES = {
278
281
  '025329': [[2]] * 9,
279
282
  '025790': [[2]],
280
283
  '025812': [[3]] * 2,
284
+ '025589': [[3]] * 12,
281
285
  '026345': [[3]],
282
286
  '026609': [[4]],
287
+ '028921': [[5, 4], [4, 5]],
288
+ '026620': ([[20]] * 4) + [[3, 2]] + ([[2]] * 15) + [[2, 4]],
283
289
  '026829': [[3]],
284
290
  '026924': [[2, 4]],
285
291
  '028728': [[3]],
292
+ '026451': [[3, 5]] * 2,
286
293
  '028931': [[3, 6]],
287
294
  '029154': [[2, 5]],
288
295
  '029163': [[2, 5]],
@@ -302,18 +309,22 @@ LINE_REPAIR_MERGES = {
302
309
  '029977': ([[2]] * 4) + [[4], [2, 4]],
303
310
  '030299': [[7, 10]],
304
311
  '030315': [[3, 5]],
312
+ '030318': [[3, 5]],
305
313
  '030381': [[2, 4]],
306
314
  '030384': [[2, 4]],
307
315
  '030626': [[2], [4]],
316
+ '030861': [[3, 8]],
308
317
  '030999': [[2, 4]],
309
318
  '031384': [[2]],
310
319
  '031428': [[2], [2, 4]],
311
320
  '031442': [[0]],
321
+ '031489': [[2, 4], [3, 4], [3, 4], [10]],
322
+ '031619': [[7], [17], [17]],
312
323
  '031748': [[3]] * 2,
313
- '031764': [[3]],
324
+ '031764': [[3], [8]], # 8 is just for style fix internally, not header
314
325
  '031980': [[2, 4]],
315
326
  '032063': [[3, 5]],
316
- '032272': [[3]],
327
+ '032272': [[2, 10], [3]],
317
328
  '032405': [[4]],
318
329
  '032637': [[9]] * 3,
319
330
  '033097': [[2]],
@@ -326,10 +337,16 @@ LINE_REPAIR_MERGES = {
326
337
  '033357': [[2, 4]],
327
338
  '033486': [[7, 9]],
328
339
  '033512': [[2]],
340
+ '026024': [[1, 3], [2, 3]],
341
+ '024923': [[0, 5], [2]],
329
342
  '033568': [[5]] * 5,
330
343
  '033575': [[2, 4]],
331
344
  '033576': [[3]],
332
345
  '033583': [[2]],
346
+
347
+ # Note DOJ file line adjustments happen *after* DojFile._repair() is called
348
+ 'EFTA00039689': [[4]],
349
+ 'EFTA00040118': [[2], [2], [2], [2], [2], [2], [6], [6]],
333
350
  }
334
351
 
335
352
 
@@ -337,30 +354,104 @@ LINE_REPAIR_MERGES = {
337
354
  class Email(Communication):
338
355
  """
339
356
  Attributes:
340
- actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
341
- config (EmailCfg | None) - manual config for this email (if it exists)
342
- header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
343
- recipients (list[Name]) - who this email was sent to
344
- sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
345
- signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
357
+ actual_text (str) - Best effort at the text actually sent in this email, excluding quoted replies and forwards.
358
+ config (EmailCfg, optional) - Manual config for this email (if it exists).
359
+ header (EmailHeader) - Header data extracted from the text (from/to/sent/subject etc).
360
+ recipients (list[Name]) - People to whom this email was sent.
361
+ sent_from_device (str, optional) - "Sent from my iPhone" style signature (if it exists).
362
+ signature_substitution_counts (dict[str, int]) - Number of times a signature was replaced with
363
+ <...snipped...> for each participant
346
364
  """
365
+ attached_docs: list[OtherFile] = field(default_factory=list)
347
366
  actual_text: str = field(init=False)
348
367
  config: EmailCfg | None = None
349
368
  header: EmailHeader = field(init=False)
350
369
  recipients: list[Name] = field(default_factory=list)
351
370
  sent_from_device: str | None = None
352
371
  signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
372
+ _is_first_for_user: bool = False # Only set when printing
353
373
  _line_merge_arguments: list[tuple[int] | tuple[int, int]] = field(default_factory=list)
354
374
 
355
375
  # For logging how many headers we prettified while printing, kind of janky
356
376
  rewritten_header_ids: ClassVar[set[str]] = set([])
357
377
 
378
+ @property
379
+ def attachments(self) -> list[str]:
380
+ """Return the header's attachments field split on ';' into a list of strings."""
381
+ return (self.header.attachments or '').split(';')
382
+
383
+ @property
384
+ def border_style(self) -> str:
385
+ """Color emails from Epstein to others with the color for the first recipient."""
386
+ if self.author == JEFFREY_EPSTEIN and len(self.recipients) > 0:
387
+ style = get_style_for_name(self.recipients[0])
388
+ else:
389
+ style = self.author_style
390
+
391
+ return style.replace('bold', '').strip()
392
+
393
+ @property
394
+ def info_txt(self) -> Text:
395
+ email_type = 'fwded article' if self.is_fwded_article else 'email'
396
+ txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt)
397
+
398
+ if self.config and self.config.is_attribution_uncertain:
399
+ txt.append(f" {QUESTION_MARKS}", style=self.author_style)
400
+
401
+ txt.append(' to ').append(self.recipients_txt())
402
+ return txt.append(highlighter(f" probably sent at {self.timestamp}"))
403
+
404
+ @property
405
+ def is_fwded_article(self) -> bool:
406
+ if self.config is None:
407
+ return False
408
+ elif self.config.fwded_text_after:
409
+ return self.config.is_fwded_article is not False
410
+ else:
411
+ return bool(self.config.is_fwded_article)
412
+
413
+ @property
414
+ def is_junk_mail(self) -> bool:
415
+ return self.author in JUNK_EMAILERS
416
+
417
+ @property
418
+ def is_mailing_list(self) -> bool:
419
+ return self.author in MAILING_LISTS or self.is_junk_mail
420
+
421
+ @property
422
+ def is_note_to_self(self) -> bool:
423
+ return self.recipients == [self.author]
424
+
425
+ @property
426
+ def is_word_count_worthy(self) -> bool:
427
+ if self.is_fwded_article:
428
+ return bool(self.config.fwded_text_after) or len(self.actual_text) < 150
429
+ else:
430
+ return not self.is_mailing_list
431
+
432
+ @property
433
+ def metadata(self) -> Metadata:
434
+ local_metadata = asdict(self)
435
+ local_metadata['is_junk_mail'] = self.is_junk_mail
436
+ local_metadata['is_mailing_list'] = self.is_junk_mail
437
+ local_metadata['subject'] = self.subject or None
438
+ metadata = super().metadata
439
+ metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
440
+ return metadata
441
+
442
+ @property
443
+ def subject(self) -> str:
444
+ if self.config and self.config.subject:
445
+ return self.config.subject
446
+ else:
447
+ return self.header.subject or ''
448
+
358
449
  def __post_init__(self):
359
450
  self.filename = self.file_path.name
360
451
  self.file_id = extract_file_id(self.filename)
361
452
 
362
453
  # Special handling for copying properties out of the config for the document this one was extracted from
363
- if self.is_local_extract_file():
454
+ if self.is_local_extract_file:
364
455
  self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
365
456
  extracted_from_doc_id = self.url_slug.split('_')[-1]
366
457
 
@@ -373,58 +464,24 @@ class Email(Communication):
373
464
  self.recipients = self.config.recipients
374
465
  else:
375
466
  for recipient in self.header.recipients():
376
- self.recipients.extend(self._extract_emailer_names(recipient))
467
+ self.recipients.extend(extract_emailer_names(recipient))
377
468
 
378
469
  # Assume mailing list emails are to Epstein
379
- if self.author in BCC_LISTS and (self.is_note_to_self() or not self.recipients):
470
+ if self.author in BCC_LISTS and (self.is_note_to_self or not self.recipients):
380
471
  self.recipients = [JEFFREY_EPSTEIN]
381
472
 
382
473
  # Remove self CCs but preserve self emails
383
- if not self.is_note_to_self():
474
+ if not self.is_note_to_self:
384
475
  self.recipients = [r for r in self.recipients if r != self.author]
385
476
 
386
477
  self.recipients = sorted(list(set(self.recipients)), key=lambda r: r or UNKNOWN)
387
478
  self.text = self._prettify_text()
388
- self.actual_text = self._actual_text()
479
+ self.actual_text = self._extract_actual_text()
389
480
  self.sent_from_device = self._sent_from_device()
390
481
 
391
- def attachments(self) -> list[str]:
392
- return (self.header.attachments or '').split(';')
393
-
394
- def info_txt(self) -> Text:
395
- email_type = 'fwded article' if self.is_fwded_article() else 'email'
396
- txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt())
397
-
398
- if self.config and self.config.is_attribution_uncertain:
399
- txt.append(f" {QUESTION_MARKS}", style=self.author_style())
400
-
401
- txt.append(' to ').append(self.recipients_txt())
402
- return txt.append(highlighter(f" probably sent at {self.timestamp}"))
403
-
404
- def is_fwded_article(self) -> bool:
405
- return bool(self.config and self.config.is_fwded_article)
406
-
407
- def is_junk_mail(self) -> bool:
408
- return self.author in JUNK_EMAILERS
409
-
410
- def is_mailing_list(self) -> bool:
411
- return self.author in MAILING_LISTS or self.is_junk_mail()
412
-
413
- def is_note_to_self(self) -> bool:
414
- return self.recipients == [self.author]
415
-
416
- def is_with(self, name: str) -> bool:
482
+ def is_from_or_to(self, name: str) -> bool:
417
483
  return name in [self.author] + self.recipients
418
484
 
419
- def metadata(self) -> Metadata:
420
- local_metadata = asdict(self)
421
- local_metadata['is_junk_mail'] = self.is_junk_mail()
422
- local_metadata['is_mailing_list'] = self.is_junk_mail()
423
- local_metadata['subject'] = self.subject() or None
424
- metadata = super().metadata()
425
- metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
426
- return metadata
427
-
428
485
  def recipients_txt(self, max_full_names: int = 2) -> Text:
429
486
  """Text object with comma separated colored versions of all recipients."""
430
487
  recipients = [r or UNKNOWN for r in self.recipients] if len(self.recipients) > 0 else [UNKNOWN]
@@ -435,12 +492,6 @@ class Email(Communication):
435
492
  for r in recipients
436
493
  ], join=', ')
437
494
 
438
- def subject(self) -> str:
439
- if self.config and self.config.subject:
440
- return self.config.subject
441
- else:
442
- return self.header.subject or ''
443
-
444
495
  def summary(self) -> Text:
445
496
  """One line summary mostly for logging."""
446
497
  txt = self._summary()
@@ -450,7 +501,7 @@ class Email(Communication):
450
501
 
451
502
  return txt.append(CLOSE_PROPERTIES_CHAR)
452
503
 
453
- def _actual_text(self) -> str:
504
+ def _extract_actual_text(self) -> str:
454
505
  """The text that comes before likely quoted replies and forwards etc."""
455
506
  if self.config and self.config.actual_text is not None:
456
507
  return self.config.actual_text
@@ -463,7 +514,7 @@ class Email(Communication):
463
514
  return self.text
464
515
 
465
516
  self.log_top_lines(20, "Raw text:", logging.DEBUG)
466
- self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
517
+ self.log(f"With {self.header.num_header_rows} header lines removed:\n{text[0:500]}\n\n", logging.DEBUG)
467
518
  reply_text_match = REPLY_TEXT_REGEX.search(text)
468
519
 
469
520
  if reply_text_match:
@@ -488,51 +539,24 @@ class Email(Communication):
488
539
 
489
540
  return text.strip()
490
541
 
491
- def _border_style(self) -> str:
492
- """Color emails from epstein to others with the color for the first recipient."""
493
- if self.author == JEFFREY_EPSTEIN and len(self.recipients) > 0:
494
- style = get_style_for_name(self.recipients[0])
495
- else:
496
- style = self.author_style()
497
-
498
- return style.replace('bold', '').strip()
499
-
500
542
  def _extract_author(self) -> None:
543
+ """Overloads superclass method, called at instantiation time."""
501
544
  self._extract_header()
502
545
  super()._extract_author()
503
546
 
504
547
  if not self.author and self.header.author:
505
- authors = self._extract_emailer_names(self.header.author)
548
+ authors = extract_emailer_names(self.header.author)
506
549
  self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
507
550
 
508
- def _extract_emailer_names(self, emailer_str: str) -> list[str]:
509
- """Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
510
- emailer_str = EmailHeader.cleanup_str(emailer_str)
511
-
512
- if len(emailer_str) == 0:
513
- return []
514
-
515
- names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
516
-
517
- if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
518
- if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
519
- logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
520
- else:
521
- logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
522
-
523
- return names_found
524
-
525
- names_found = names_found or [emailer_str]
526
- return [_reverse_first_and_last_names(name) for name in names_found]
527
-
528
551
  def _extract_header(self) -> None:
529
- """Extract an EmailHeader object from the OCR text."""
552
+ """Extract an `EmailHeader` from the OCR text."""
530
553
  header_match = EMAIL_SIMPLE_HEADER_REGEX.search(self.text)
531
554
 
532
555
  if header_match:
533
556
  self.header = EmailHeader.from_header_lines(header_match.group(0))
534
557
 
535
- if self.header.is_empty():
558
+ # DOJ file OCR text is broken in a less consistent way than the HOUSE_OVERSIGHT files
559
+ if self.header.is_empty() and not self.is_doj_file:
536
560
  self.header.repair_empty_header(self.lines)
537
561
  else:
538
562
  log_level = logging.INFO if self.config else logging.WARNING
@@ -542,22 +566,15 @@ class Email(Communication):
542
566
  logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
543
567
 
544
568
  def _extract_timestamp(self) -> datetime:
545
- if self.config and self.config.timestamp:
546
- return self.config.timestamp
547
- elif self.header.sent_at:
548
- timestamp = _parse_timestamp(self.header.sent_at)
549
-
550
- if timestamp:
551
- return timestamp
569
+ """Find the time this email was sent."""
570
+ if self.header.sent_at and (timestamp := _parse_timestamp(self.header.sent_at)):
571
+ return timestamp
552
572
 
553
573
  searchable_lines = self.lines[0:MAX_NUM_HEADER_LINES]
554
574
  searchable_text = '\n'.join(searchable_lines)
555
- date_match = DATE_HEADER_REGEX.search(searchable_text)
556
575
 
557
- if date_match:
558
- timestamp = _parse_timestamp(date_match.group(1))
559
-
560
- if timestamp:
576
+ if (date_match := DATE_HEADER_REGEX.search(searchable_text)):
577
+ if (timestamp := _parse_timestamp(date_match.group(1))):
561
578
  return timestamp
562
579
 
563
580
  logger.debug(f"Failed to find timestamp, falling back to parsing {MAX_NUM_HEADER_LINES} lines...")
@@ -566,42 +583,45 @@ class Email(Communication):
566
583
  if not TIMESTAMP_LINE_REGEX.search(line):
567
584
  continue
568
585
 
569
- timestamp = _parse_timestamp(line)
570
-
571
- if timestamp:
586
+ if (timestamp := _parse_timestamp(line)):
572
587
  logger.debug(f"Fell back to timestamp {timestamp} in line '{line}'...")
573
588
  return timestamp
574
589
 
575
- raise RuntimeError(f"No timestamp found in '{self.file_path.name}' top lines:\n{searchable_text}")
590
+ no_timestamp_msg = f"No timestamp found in '{self.file_path.name}'"
576
591
 
577
- def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
578
- """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
579
- if text is None:
580
- header_offset = len(self.header.header_chars)
581
- text = self.text[header_offset:]
592
+ if self.is_duplicate:
593
+ logger.warning(f"{no_timestamp_msg} but timestamp should be copied from {self.duplicate_of_id}")
582
594
  else:
583
- header_offset = 0
595
+ raise RuntimeError(f"{no_timestamp_msg}, top lines:\n" + '\n'.join(self.lines[0:MAX_NUM_HEADER_LINES + 10]))
596
+
597
def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES) -> int | None:
    """Get position of an 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text.

    Only the text after the header characters is searched. The first match whose zero-based
    position is at least `n` is used; the returned index is relative to `self.text` and points
    one character before the end of that match. Returns None if there is no such match.
    """
    skipped_chars = len(self.header.header_chars)
    body = self.text[skipped_chars:]
    quoted_reply_matches = QUOTED_REPLY_LINE_REGEX.finditer(body)

    for position, quoted_reply in enumerate(quoted_reply_matches):
        if position < n:
            continue

        # Match offsets are relative to the header-less body, so shift them back.
        return quoted_reply.end() + skipped_chars - 1

    return None
588
605
 
589
- def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
606
+ def _merge_lines(self, idx1: int, idx2: int | None = None) -> None:
590
607
  """Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
591
608
  if idx2 is None:
592
- self._line_merge_arguments.append((idx,))
593
- idx2 = idx + 1
609
+ self._line_merge_arguments.append((idx1,))
610
+ idx2 = idx1 + 1
594
611
  else:
595
- self._line_merge_arguments.append((idx, idx2))
596
-
597
- lines = self.lines[0:idx]
612
+ self._line_merge_arguments.append((idx1, idx2))
598
613
 
599
- if idx2 <= idx:
600
- raise RuntimeError(f"idx2 ({idx2}) must be greater than idx ({idx})")
601
- elif idx2 == (idx + 1):
602
- lines += [self.lines[idx] + ' ' + self.lines[idx + 1]] + self.lines[idx + 2:]
614
+ if idx2 < idx1:
615
+ lines = self.lines[0:idx2] + self.lines[idx2 + 1:idx1] + [self.lines[idx1] + ' ' + self.lines[idx2]] + self.lines[idx1 + 1:]
616
+ elif idx2 == idx1:
617
+ raise RuntimeError(f"idx2 ({idx2}) must be greater or less than idx ({idx1})")
603
618
  else:
604
- lines += [self.lines[idx] + ' ' + self.lines[idx2]] + self.lines[idx + 1:idx2] + self.lines[idx2 + 1:]
619
+ lines = self.lines[0:idx1]
620
+
621
+ if idx2 == (idx1 + 1):
622
+ lines += [self.lines[idx1] + ' ' + self.lines[idx1 + 1]] + self.lines[idx1 + 2:]
623
+ else:
624
+ lines += [self.lines[idx1] + ' ' + self.lines[idx2]] + self.lines[idx1 + 1:idx2] + self.lines[idx2 + 1:]
605
625
 
606
626
  self._set_computed_fields(lines=lines)
607
627
 
@@ -617,6 +637,10 @@ class Email(Communication):
617
637
  self.signature_substitution_counts[name] = self.signature_substitution_counts.get(name, 0)
618
638
  self.signature_substitution_counts[name] += num_replaced
619
639
 
640
+ # Share / Tweet lines
641
+ if self.author == KATHRYN_RUEMMLER:
642
+ text = '\n'.join([line for line in text.split('\n') if line not in ['Share', 'Tweet', 'Bookmark it']])
643
+
620
644
  return collapse_newlines(text).strip()
621
645
 
622
646
  def _remove_line(self, idx: int) -> None:
@@ -628,7 +652,7 @@ class Email(Communication):
628
652
  self.log_top_lines(num_lines, msg=f'after removal of line {idx}')
629
653
 
630
654
  def _repair(self) -> None:
631
- """Repair particularly janky files."""
655
+ """Repair particularly janky files. Note that OCR_REPAIRS are applied *after* other line adjustments."""
632
656
  if BAD_FIRST_LINE_REGEX.match(self.lines[0]):
633
657
  self._set_computed_fields(lines=self.lines[1:])
634
658
 
@@ -656,18 +680,26 @@ class Email(Communication):
656
680
  self.log(f"Modified text, old:\n\n" + '\n'.join(old_text.split('\n')[0:12]) + '\n')
657
681
  self.log_top_lines(12, 'Result of modifications')
658
682
 
659
- lines = self.repair_ocr_text(OCR_REPAIRS, self.text).split('\n')
683
+ repaired_text = self._repair_links_and_quoted_subjects(self.repair_ocr_text(OCR_REPAIRS, self.text))
684
+ self._set_computed_fields(text=repaired_text)
685
+
686
+ def _repair_links_and_quoted_subjects(self, text: str) -> str:
687
+ """Repair links that the OCR has broken into multiple lines as well as 'Subject:' lines."""
688
+ lines = text.split('\n')
689
+ subject_line = next((line for line in lines if line.startswith('Subject:')), None) or ''
690
+ subject = subject_line.split(':')[1].strip() if subject_line else ''
660
691
  new_lines = []
661
692
  i = 0
662
693
 
663
- # Fix links (remove spaces, merge multiline links to a single line)
664
694
  while i < len(lines):
665
695
  line = lines[i]
666
696
 
667
697
  if LINK_LINE_REGEX.search(line):
668
698
  while i < (len(lines) - 1) \
669
- and 'http' not in lines[i + 1] \
670
- and (lines[i + 1].endswith('/') or any(s in lines[i + 1] for s in URL_SIGNIFIERS) or LINK_LINE2_REGEX.match(lines[i + 1])):
699
+ and not lines[i + 1].startswith('htt') \
700
+ and (lines[i + 1].endswith('/') \
701
+ or any(s in lines[i + 1] for s in URL_SIGNIFIERS) \
702
+ or LINK_LINE2_REGEX.match(lines[i + 1])):
671
703
  logger.debug(f"{self.filename}: Joining link lines\n 1. {line}\n 2. {lines[i + 1]}\n")
672
704
  line += lines[i + 1]
673
705
  i += 1
@@ -676,22 +708,27 @@ class Email(Communication):
676
708
  elif ' http' in line and line.endswith('html'):
677
709
  pre_link, post_link = line.split(' http', 1)
678
710
  line = f"{pre_link} http{post_link.replace(' ', '')}"
711
+ elif line.startswith('Subject:') and i < (len(lines) - 2) and len(line) >= 40:
712
+ next_line = lines[i + 1]
713
+ next_next = lines[i + 2]
714
+
715
+ if len(next_line) <= 1 or any([cont in next_line for cont in BAD_SUBJECT_CONTINUATIONS]):
716
+ pass
717
+ elif (subject.endswith(next_line) and next_line != subject) \
718
+ or (FIELDS_COLON_REGEX.search(next_next) and not FIELDS_COLON_REGEX.search(next_line)):
719
+ self.log(f"Fixing broken subject line\n line: '{line}'\n next: '{next_line}'\n next: '{next_next}'\nsubject='{subject}'\n")
720
+ line += f" {next_line}"
721
+ i += 1
679
722
 
680
723
  new_lines.append(line)
681
-
682
- # TODO: hacky workaround to get a working link for HOUSE_OVERSIGHT_032564
683
- if self.file_id == '032564' and line == 'http://m.huffpost.com/us/entry/us_599f532ae4b0dOef9f1c129d':
684
- new_lines.append('(ed. note: an archived version of the above link is here: https://archive.is/hJxT3 )')
685
-
686
724
  i += 1
687
725
 
688
- self._set_computed_fields(lines=new_lines)
726
+ logger.debug(f"----after line repair---\n" + '\n'.join(new_lines[0:20]) + "\n---")
727
  return '\n'.join(new_lines)
689
728
 
690
729
  def _sent_from_device(self) -> str | None:
691
730
  """Find any 'Sent from my iPhone' style signature line if it exist in the 'actual_text'."""
692
- sent_from_match = SENT_FROM_REGEX.search(self.actual_text)
693
-
694
- if sent_from_match:
731
+ if (sent_from_match := SENT_FROM_REGEX.search(self.actual_text)):
695
732
  sent_from = sent_from_match.group(0)
696
733
  return 'S' + sent_from[1:] if sent_from.startswith('sent') else sent_from
697
734
 
@@ -699,13 +736,11 @@ class Email(Communication):
699
736
  """Copy info from original config for file this document was extracted from."""
700
737
  if self.file_id in ALL_FILE_CONFIGS:
701
738
  self.config = cast(EmailCfg, deepcopy(ALL_FILE_CONFIGS[self.file_id]))
702
- self.warn(f"Merging existing cfg for '{self.file_id}' with cfg for extracted document...")
739
+ self.log(f"Merging existing cfg for '{self.file_id}' with cfg for extracted document...")
703
740
  else:
704
741
  self.config = EmailCfg(id=self.file_id)
705
742
 
706
- extracted_from_description = extracted_from_doc_cfg.complete_description()
707
-
708
- if extracted_from_description:
743
+ if (extracted_from_description := extracted_from_doc_cfg.complete_description):
709
744
  extracted_description = f"{APPEARS_IN} {extracted_from_description}"
710
745
 
711
746
  if isinstance(extracted_from_doc_cfg, EmailCfg):
@@ -721,34 +756,58 @@ class Email(Communication):
721
756
 
722
757
  def _truncate_to_length(self) -> int:
723
758
  """When printing truncate this email to this length."""
724
- quote_cutoff = self._idx_of_nth_quoted_reply(text=self.text) # Trim if there's many quoted replies
759
+ quote_cutoff = self._idx_of_nth_quoted_reply() # Trim if there's many quoted replies
725
760
  includes_truncate_term = next((term for term in TRUNCATE_TERMS if term in self.text), None)
726
761
 
727
762
  if args.whole_file:
728
763
  num_chars = len(self.text)
729
764
  elif args.truncate:
730
765
  num_chars = args.truncate
731
- elif self.file_id in TRUNCATION_LENGTHS:
732
- num_chars = TRUNCATION_LENGTHS[self.file_id] or self.file_size()
733
- elif self.author in TRUNCATE_EMAILS_FROM or any([self.is_with(n) for n in TRUNCATE_EMAILS_FROM_OR_TO]) or includes_truncate_term:
766
+ elif self.config and self.config.truncate_to is not None:
767
+ num_chars = len(self.text) if self.config.truncate_to == NO_TRUNCATE else self.config.truncate_to
768
+ elif self.is_interesting:
769
+ num_chars = len(self.text)
770
+ elif self.author in TRUNCATE_EMAILS_FROM \
771
+ or any([self.is_from_or_to(n) for n in TRUNCATE_EMAILS_FROM_OR_TO]) \
772
+ or self.is_fwded_article \
773
+ or includes_truncate_term:
734
774
  num_chars = min(quote_cutoff or MAX_CHARS_TO_PRINT, TRUNCATED_CHARS)
735
- elif quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
736
- num_chars = quote_cutoff
737
775
  else:
738
- num_chars = MAX_CHARS_TO_PRINT
739
-
740
- if num_chars != MAX_CHARS_TO_PRINT and not self.is_duplicate():
741
- log_args = {
742
- 'num_chars': num_chars,
743
- 'author_truncate': self.author in TRUNCATE_EMAILS_FROM,
744
- 'is_fwded_article': self.is_fwded_article(),
745
- 'is_quote_cutoff': quote_cutoff == num_chars,
746
- 'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
747
- 'quote_cutoff': quote_cutoff,
748
- }
749
-
750
- logger.debug(f'{self.summary()} truncating: ' + ', '.join([f"{k}={v}" for k, v in log_args.items() if v]) + '\n')
751
-
776
+ if quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
777
+ trimmed_words = self.text[quote_cutoff:].split()
778
+
779
+ if '<...snipped' in trimmed_words[:NUM_WORDS_IN_LAST_QUOTE]:
780
+ num_trailing_words = 0
781
+ elif trimmed_words and trimmed_words[0] in ['From:', 'Sent:']:
782
+ num_trailing_words = NUM_WORDS_IN_LAST_QUOTE
783
+ else:
784
+ num_trailing_words = NUM_WORDS_IN_LAST_QUOTE
785
+
786
+ if trimmed_words:
787
+ last_quoted_text = ' '.join(trimmed_words[:num_trailing_words])
788
+ num_chars = quote_cutoff + len(last_quoted_text) + 1 # Give a hint of the next line
789
+ else:
790
+ num_chars = quote_cutoff
791
+ else:
792
+ num_chars = min(self.file_size, MAX_CHARS_TO_PRINT)
793
+
794
+ # Always print whole email for 1st email for user
795
+ if self._is_first_for_user and num_chars < self.file_size and not self.is_duplicate:
796
+ logger.info(f"{self} Overriding cutoff {num_chars} for first email")
797
+ num_chars = self.file_size
798
+
799
+ log_args = {
800
+ 'num_chars': num_chars,
801
+ '_is_first_for_user': self._is_first_for_user,
802
+ 'author_truncate': self.author in TRUNCATE_EMAILS_FROM,
803
+ 'is_fwded_article': self.is_fwded_article,
804
+ 'is_quote_cutoff': quote_cutoff == num_chars,
805
+ 'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
806
+ 'quote_cutoff': quote_cutoff,
807
+ }
808
+
809
+ log_args_str = ', '.join([f"{k}={v}" for k, v in log_args.items() if v])
810
+ logger.debug(f"Truncate determination: {log_args_str}")
752
811
  return num_chars
753
812
 
754
813
  def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
@@ -761,8 +820,8 @@ class Email(Communication):
761
820
  # Truncate long emails but leave a note explaining what happened w/link to source document
762
821
  if len(text) > num_chars:
763
822
  text = text[0:num_chars]
764
- doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style())
765
- trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
823
+ doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
824
+ trim_note = f"<...trimmed to {num_chars:,} characters of {self.length:,}, read the rest at {doc_link_markup}...>"
766
825
  trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
767
826
 
768
827
  # Rewrite broken headers where the values are on separate lines from the field names
@@ -778,7 +837,7 @@ class Email(Communication):
778
837
 
779
838
  lines += text.split('\n')[num_lines_to_skip:]
780
839
  text = self.header.rewrite_header() + '\n' + '\n'.join(lines)
781
- text = _add_line_breaks(text) # This was skipped when _prettify_text() w/a broken header so we do it now
840
+ text = _add_line_breaks(text)
782
841
  self.rewritten_header_ids.add(self.file_id)
783
842
 
784
843
  lines = [
@@ -789,8 +848,8 @@ class Email(Communication):
789
848
  text = join_texts(lines, '\n')
790
849
 
791
850
  email_txt_panel = Panel(
792
- highlighter(text).append('\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
793
- border_style=self._border_style(),
851
+ highlighter(text).append('...\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
852
+ border_style=self.border_style,
794
853
  expand=False,
795
854
  subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
796
855
  )
@@ -798,6 +857,11 @@ class Email(Communication):
798
857
  yield self.file_info_panel()
799
858
  yield Padding(email_txt_panel, (0, 0, 1, INFO_INDENT))
800
859
 
860
+ if self.attached_docs:
861
+ attachments_table_title = f" {self.url_slug} Email Attachments:"
862
+ attachments_table = OtherFile.files_preview_table(self.attached_docs, title=attachments_table_title)
863
+ yield Padding(attachments_table, (0, 0, 1, 12))
864
+
801
865
  if should_rewrite_header:
802
866
  self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
803
867
 
@@ -832,11 +896,11 @@ class Email(Communication):
832
896
 
833
897
  for email in emails:
834
898
  fields = [
835
- email.epstein_media_link(link_txt=email.timestamp_without_seconds(), style=link_style),
836
- email.author_txt(),
899
+ email.epstein_media_link(link_txt=email.timestamp_without_seconds, style=link_style),
900
+ email.author_txt,
837
901
  email.recipients_txt(max_full_names=1),
838
- f"{email.length()}",
839
- email.subject(),
902
+ f"{email.length}",
903
+ email.subject,
840
904
  ]
841
905
 
842
906
  if not show_length:
@@ -853,21 +917,14 @@ def _add_line_breaks(email_text: str) -> str:
853
917
 
854
918
  def _parse_timestamp(timestamp_str: str) -> None | datetime:
855
919
  try:
856
- timestamp_str = timestamp_str.replace('(GMT-05:00)', 'EST')
857
- timestamp_str = BAD_TIMEZONE_REGEX.sub(' ', timestamp_str).strip()
858
- timestamp = parse(timestamp_str, tzinfos=TIMEZONE_INFO)
920
+ if (american_date_match := AMERICAN_TIME_REGEX.search(timestamp_str)):
921
+ timestamp_str = american_date_match.group(1)
922
+ else:
923
+ timestamp_str = timestamp_str.replace('(GMT-05:00)', 'EST')
924
+ timestamp_str = BAD_TIMEZONE_REGEX.sub(' ', timestamp_str).strip()
925
+
926
+ timestamp = parse(timestamp_str, fuzzy=True, tzinfos=TIMEZONE_INFO)
859
927
  logger.debug(f'Parsed timestamp "%s" from string "%s"', timestamp, timestamp_str)
860
928
  return remove_timezone(timestamp)
861
929
  except Exception as e:
862
930
  logger.debug(f'Failed to parse "{timestamp_str}" to timestamp!')
863
-
864
-
865
- def _reverse_first_and_last_names(name: str) -> str:
866
- if '@' in name:
867
- return name.lower()
868
-
869
- if ', ' in name:
870
- names = name.split(', ')
871
- return f"{names[1]} {names[0]}"
872
- else:
873
- return name