epstein-files 1.2.1__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  import json
2
2
  import logging
3
3
  import re
4
+ from collections import defaultdict
4
5
  from copy import deepcopy
5
6
  from dataclasses import asdict, dataclass, field
6
7
  from datetime import datetime
@@ -16,12 +17,12 @@ from rich.text import Text
16
17
  from epstein_files.documents.communication import Communication
17
18
  from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, INFO_INDENT
18
19
  from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAIL_SIMPLE_HEADER_REGEX,
19
- EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, TIME_REGEX, EmailHeader)
20
+ EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, FIELDS_COLON_PATTERN, TIME_REGEX, EmailHeader)
21
+ from epstein_files.documents.other_file import OtherFile
20
22
  from epstein_files.util.constant.names import *
21
23
  from epstein_files.util.constant.strings import REDACTED
22
24
  from epstein_files.util.constants import *
23
- from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes,
24
- flatten, listify, remove_timezone, uniquify)
25
+ from epstein_files.util.data import TIMEZONE_INFO, collapse_newlines, escape_single_quotes, remove_timezone
25
26
  from epstein_files.util.doc_cfg import EmailCfg, Metadata
26
27
  from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
27
28
  from epstein_files.util.highlighted_group import JUNK_EMAILERS, get_style_for_name
@@ -29,9 +30,12 @@ from epstein_files.util.logging import logger
29
30
  from epstein_files.util.rich import *
30
31
 
31
32
  BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
32
- BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
33
+ BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Hide caption|Importance:?\s*High|[iI,•]|[1i] (_ )?[il]|, [-,]|L\._|_filtered|.*(yiv0232|font-family:|margin-bottom:).*)$')
34
+ BAD_SUBJECT_CONTINUATIONS = ['orwarded', 'Hi ', 'Sent ', 'AmLaw', 'Original Message', 'Privileged', 'Sorry', '---']
33
35
  DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
34
- LINK_LINE_REGEX = re.compile(f"^(> )?htt")
36
+ FIELDS_COLON_REGEX = re.compile(FIELDS_COLON_PATTERN)
37
+ LINK_LINE_REGEX = re.compile(f"^[>• ]*htt")
38
+ LINK_LINE2_REGEX = re.compile(r"^[-\w.%&=/]{5,}$")
35
39
  QUOTED_REPLY_LINE_REGEX = re.compile(r'(\nFrom:(.*)|wrote:)\n', re.IGNORECASE)
36
40
  REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
37
41
 
@@ -42,11 +46,12 @@ LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
42
46
 
43
47
  SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
44
48
  REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
45
- URL_SIGNIFIERS = ['gclid', 'htm', 'ref=', 'utm']
49
+ URL_SIGNIFIERS = ['?amp', 'amp?', 'cd=', 'click', 'CMP=', 'contentId', 'ft=', 'gclid', 'htm', 'mp=', 'keywords=', 'Id=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'sp=', 'usg=', 'utm']
46
50
  APPEARS_IN = 'appears in'
47
- MAX_CHARS_TO_PRINT = 4000
51
+
48
52
  MAX_NUM_HEADER_LINES = 14
49
- MAX_QUOTED_REPLIES = 2
53
+ MAX_QUOTED_REPLIES = 1
54
+ NUM_WORDS_IN_LAST_QUOTE = 6
50
55
 
51
56
  REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
52
57
  '********************************',
@@ -72,18 +77,25 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
72
77
  # Signatures
73
78
  'BlackBerry by AT &T': 'BlackBerry by AT&T',
74
79
  'BlackBerry from T- Mobile': 'BlackBerry from T-Mobile',
75
- 'Envoy& de mon iPhone': 'Envoyé de mon iPhone',
80
+ 'Envoy& de': 'Envoyé de',
76
81
  "from my 'Phone": 'from my iPhone',
77
82
  'from Samsung Mob.le': 'from Samsung Mobile',
78
83
  'gJeremyRubin': '@JeremyRubin',
79
84
  'Sent from Mabfl': 'Sent from Mobile', # NADIA_MARCINKO signature bad OCR
80
85
  'twitter glhsummers': 'twitter @lhsummers',
86
+ re.compile(r"[cC]o-authored with i ?Phone auto-correct"): "Co-authored with iPhone auto-correct",
81
87
  re.compile(r"twitter\.com[i/][lI]krauss[1lt]"): "twitter.com/lkrauss1",
82
88
  re.compile(r'from my BlackBerry[0°] wireless device'): 'from my BlackBerry® wireless device',
83
89
  re.compile(r'^INW$', re.MULTILINE): REDACTED,
84
90
  # links
85
91
  'Imps ://': 'https://',
92
+ 'on-accusers-rose-\nmcgowan/ ': 'on-accusers-rose-\nmcgowan/\n',
93
+ 'the-truth-\nabout-the-bitcoin-foundation/ )': 'the-truth-about-the-bitcoin-foundation/ )\n',
94
+ 'woody-allen-jeffrey-epsteins-\nsociety-friends-close-ranks/ ---': 'woody-allen-jeffrey-epsteins-society-friends-close_ranks/\n',
95
+ ' https://www.theguardian.com/world/2017/may/29/close-friend-trump-thomas-barrack-\nalleged-tax-evasion-italy-sardinia?CMP=share btn fb': '\nhttps://www.theguardian.com/world/2017/may/29/close-friend-trump-thomas-barrack-alleged-tax-evasion-italy-sardinia?CMP=share_btn_fb',
86
96
  re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
97
+ re.compile(r" http ?://www. ?dailymail. ?co ?.uk/news/article-\d+/Troub ?led-woman-history-drug-\n?us ?e-\n?.*html"): '\nhttp://www.dailymail.co.uk/news/article-3914012/Troubled-woman-history-drug-use-claimed-assaulted-Donald-Trump-Jeffrey-Epstein-sex-party-age-13-FABRICATED-story.html',
98
+ re.compile(r"http.*steve-bannon-trump-tower-\n?interview-\n?trumps-\n?strategist-plots-\n?new-political-movement-948747"): "\nhttp://www.hollywoodreporter.com/news/steve-bannon-trump-tower-interview-trumps-strategist-plots-new-political-movement-948747",
87
99
  # Subject lines
88
100
  "Arrested in\nInauguration Day Riot": "Arrested in Inauguration Day Riot",
89
101
  "as Putin Mayhem Tests President's Grip\non GOP": "as Putin Mayhem Tests President's Grip on GOP",
@@ -94,6 +106,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
94
106
  "COVER UP SEX ABUSE CRIMES\nBY THE WHITE HOUSE": "COVER UP SEX ABUSE CRIMES BY THE WHITE HOUSE",
95
107
  'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
96
108
  "War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
109
+ "Subject; RE": "Subject: RE",
97
110
  re.compile(r"deadline re Mr Bradley Edwards vs Mr\s*Jeffrey Epstein", re.I): "deadline re Mr Bradley Edwards vs Mr Jeffrey Epstein",
98
111
  re.compile(r"Following Plea That Implicated Trump -\s*https://www.npr.org/676040070", re.I): "Following Plea That Implicated Trump - https://www.npr.org/676040070",
99
112
  re.compile(r"for Attorney General -\s+Wikisource, the"): r"for Attorney General - Wikisource, the",
@@ -104,27 +117,43 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
104
117
  re.compile(r"Subject:\s*Fwd: Trending Now: Friends for three decades"): "Subject: Fwd: Trending Now: Friends for three decades",
105
118
  # Misc
106
119
  'AVG°': 'AVGO',
120
+ 'Saw Matt C with DTF at golf': 'Saw Matt C with DJT at golf',
121
+ re.compile(r"[i. ]*Privileged[- ]*Redacted[i. ]*"): '<PRIVILEGED - REDACTED>',
107
122
  }
108
123
 
109
124
  EMAIL_SIGNATURE_REGEXES = {
110
125
  ARIANE_DE_ROTHSCHILD: re.compile(r"Ensemble.*\nCe.*\ndestinataires.*\nremercions.*\nautorisee.*\nd.*\nLe.*\ncontenues.*\nEdmond.*\nRoth.*\nlo.*\nRoth.*\ninfo.*\nFranc.*\n.2.*", re.I),
111
126
  BARBRO_C_EHNBOM: re.compile(r"Barbro C.? Ehn.*\nChairman, Swedish-American.*\n((Office|Cell|Sweden):.*\n)*(360.*\nNew York.*)?"),
127
+ BRAD_KARP: re.compile(r"This message is intended only for the use of the Addressee and may contain information.*\nnot the intended recipient, you are hereby notified.*\nreceived this communication in error.*"),
128
+ DANIEL_SIAD: re.compile(r"Confidentiality Notice: The information contained in this electronic message is PRIVILEGED and confidential information intended only for the use of the individual entity or entities named as recipient or recipients. If the reader is not the intended recipient, be hereby notified that any dissemination, distribution or copy of this communication is strictly prohibited. If you have received this communication in error, please notify me immediately by electronic mail or by telephone and permanently delete this message from your computer system. Thank you.".replace(' ', r'\s*'), re.IGNORECASE),
112
129
  DANNY_FROST: re.compile(r"Danny Frost\nDirector.*\nManhattan District.*\n212.*", re.IGNORECASE),
113
130
  DARREN_INDYKE: re.compile(r"DARREN K. INDYKE.*?\**\nThe information contained in this communication.*?Darren K.[\n\s]+?[Il]ndyke(, PLLC)? — All rights reserved\.? ?\n\*{50,120}(\n\**)?", re.DOTALL),
131
+ DAVID_FISZEL: re.compile(r"This e-mail and any file.*\nmail and/or any file.*\nmail or any.*\nreceived.*\nmisdirected.*"),
114
132
  DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
115
133
  DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
134
+ EDUARDO_ROBLES: re.compile(r"(• )?email:.*\n(• )?email:\n(• )?website: www.creativekingdom.com\n(• )?address: 5th Floor Office No:504 Aspect Tower,\nBusiness Bay, Dubai United Arab Emirates."),
135
+ ERIC_ROTH: re.compile(r"2221 Smithtown Avenue\nLong Island.*\nRonkonkoma.*\n(.1. )?Phone\nFax\nCell\ne-mail"),
136
+ GHISLAINE_MAXWELL: re.compile(r"FACEBOOK\nTWITTER\nG\+\nPINTEREST\nINSTAGRAM\nPLEDGE\nTHE DAILY CATCH"),
116
137
  JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
117
138
  JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*(\nTel:.*)?(\nEmail:.*)?", re.IGNORECASE),
118
139
  KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
119
140
  LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
120
141
  LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
121
- MARTIN_WEINBERG: re.compile(r"(Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*)?(\n.*([cC]ell|Office))*\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
122
- STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
142
+ LEON_BLACK: re.compile(r"This email and any files transmitted with it are confidential and intended solely.*\n(they|whom).*\ndissemination.*\nother.*\nand delete.*"),
143
+ LISA_NEW: re.compile(r"Elisa New\nPowell M. Cabot.*\n(Director.*\n)?Harvard.*\n148.*\n([1I] )?12.*\nCambridge.*\n([1I] )?02138"),
144
+ MARTIN_WEINBERG: re.compile(r"(Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*?)?(\n.*?([cC]ell|Office))*\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
145
+ MICHAEL_MILLER: re.compile(r"Michael C. Miller\nPartner\nwww.steptoe.com/mmiller\nSteptoe\n(Privileged.*\n)?(\+1\s+)?direct.*\n(\+1\s+)?(\+1\s+)?fax.*\n(\+1.*)?cell.*\n(www.steptoe.com\n)?This message and any.*\nyou are not.*\nnotify the sender.*"),
146
+ NICHOLAS_RIBIS: re.compile(r"60 Morris Turnpike 2FL\nSummit,? NJ.*\n0:\nF:\n\*{20,}\nCONFIDENTIALITY NOTICE.*\nattachments.*\ncopying.*\nIf you have.*\nthe copy.*\nThank.*\n\*{20,}"),
123
147
  PETER_MANDELSON: re.compile(r'Disclaimer This email and any attachments to it may be.*?with[ \n]+number(.*?EC4V[ \n]+6BJ)?', re.DOTALL | re.IGNORECASE),
124
148
  PAUL_BARRETT: re.compile(r"Paul Barrett[\n\s]+Alpha Group Capital LLC[\n\s]+(142 W 57th Street, 11th Floor, New York, NY 10019?[\n\s]+)?(al?[\n\s]*)?ALPHA GROUP[\n\s]+CAPITAL"),
125
- RICHARD_KAHN: re.compile(r'Richard Kahn[\n\s]+HBRK Associates Inc.?[\n\s]+((301 East 66th Street, Suite 1OF|575 Lexington Avenue,? 4th Floor,?)[\n\s]+)?New York, (NY|New York) 100(22|65)([\n\s]+(Tel?|Phone)( I)?[\n\s]+Fa[x"]?[\n\s]+[Ce]el?l?)?', re.IGNORECASE),
149
+ PETER_ATTIA: re.compile(r"The information contained in this transmission may contain.*\n(laws|patient).*\n(distribution|named).*\n(distribution.*\nplease.*|copies.*)"),
150
+ RICHARD_KAHN: re.compile(fr'Richard Kahn[\n\s]+HBRK Associates Inc.?[\n\s]+((301 East 66th Street, Suite 1OF|575 Lexington Avenue,? 4th Floor,?)[\n\s]+)?New York, (NY|New York) 100(22|65)(\s+(Tel?|Phone)( I|{REDACTED})?\s+Fa[x",]?(_|{REDACTED})*\s+[Ce]el?l?)?', re.IGNORECASE),
151
+ ROSS_GOW: re.compile(r"Ross Gow\nManaging Partner\nACUITY Reputation Limited\n23 Berkeley Square\nLondon.*\nMobile.*\nTel"),
152
+ STEPHEN_HANSON: re.compile(r"(> )?Confidentiality Notice: This e-mail transmission.*\n(which it is addressed )?and may contain.*\n(applicable law. If you are not the intended )?recipient you are hereby.*\n(information contained in or attached to this transmission is )?STRICTLY PROHIBITED.*"),
153
+ STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
126
154
  'Susan Edelman': re.compile(r'Susan Edel.*\nReporter\n1211.*\n917.*\nsedelman.*', re.IGNORECASE),
127
155
  TERRY_KAFKA: re.compile(r"((>|I) )?Terry B.? Kafka.*\n(> )?Impact Outdoor.*\n(> )?5454.*\n(> )?Dallas.*\n((> )?c?ell.*\n)?(> )?Impactoutdoor.*(\n(> )?cell.*)?", re.IGNORECASE),
156
+ TOM_PRITZKER: re.compile(r"The contents of this email message.*\ncontain confidential.*\n(not )?the intended.*\n(error|please).*\n(you )?(are )?not the.*\n(this )?message.*"),
128
157
  TONJA_HADDAD_COLEMAN: re.compile(fr"Tonja Haddad Coleman.*\nTonja Haddad.*\nAdvocate Building\n315 SE 7th.*(\nSuite.*)?\nFort Lauderdale.*(\n({REDACTED} )?facsimile)?(\nwww.tonjahaddad.com?)?(\nPlease add this efiling.*\nThe information.*\nyou are not.*\nyou are not.*)?", re.IGNORECASE),
129
158
  UNKNOWN: re.compile(r"(This message is directed to and is for the use of the above-noted addressee only.*\nhereon\.)", re.DOTALL),
130
159
  }
@@ -136,118 +165,81 @@ MAILING_LISTS = [
136
165
  JP_MORGAN_USGIO,
137
166
  ]
138
167
 
139
- BBC_LISTS = JUNK_EMAILERS + MAILING_LISTS
168
+ BCC_LISTS = JUNK_EMAILERS + MAILING_LISTS
169
+
170
+ TRUNCATE_EMAILS_FROM_OR_TO = [
171
+ AMANDA_ENS,
172
+ ANTHONY_BARRETT,
173
+ DANIEL_SABBA,
174
+ DIANE_ZIMAN,
175
+ JOSCHA_BACH,
176
+ KATHERINE_KEATING,
177
+ LAWRANCE_VISOSKI,
178
+ LAWRENCE_KRAUSS,
179
+ LISA_NEW,
180
+ MOSHE_HOFFMAN,
181
+ NILI_PRIELL_BARAK,
182
+ PAUL_KRASSNER,
183
+ PAUL_PROSPERI,
184
+ 'Susan Edelman',
185
+ TERRY_KAFKA,
186
+ ]
140
187
 
141
- TRUNCATE_ALL_EMAILS_FROM = BBC_LISTS + [
188
+ TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
142
189
  'Alan S Halperin',
190
+ 'Alain Forget',
191
+ ARIANE_DE_ROTHSCHILD,
192
+ AZIZA_ALAHMADI,
193
+ BILL_SIEGEL,
194
+ DAVID_HAIG,
195
+ EDWARD_ROD_LARSEN,
196
+ JOHNNY_EL_HACHEM,
197
+ 'Mark Green',
198
+ MELANIE_WALKER,
143
199
  'Mitchell Bard',
200
+ PEGGY_SIEGAL,
201
+ ROBERT_LAWRENCE_KUHN,
202
+ ROBERT_TRIVERS,
144
203
  'Skip Rimer',
204
+ 'Steven Elkman',
205
+ STEVEN_PFEIFFER,
145
206
  'Steven Victor MD',
207
+ TERRY_KAFKA,
146
208
  ]
147
209
 
148
- TRUNCATION_LENGTHS = {
149
- '023627': 16_800, # Micheal Wolff article with brock pierce
150
- '030245': None, # Epstein rationalizes his behavior in an open letter to the world
151
- '030781': None, # Bannon email about crypto coin issues
152
- '032906': None, # David Blaine email
153
- '026036': 6000, # Gino Yu blockchain mention
154
- '023208': None, # Long discussion about leon black's finances
155
- '029609': None, # Joi Ito
156
- '025233': None, # Reputation.com discussion
157
- }
158
-
159
210
  # These are long forwarded articles so we force a trim to 1,333 chars if these strings exist
160
211
  TRUNCATE_TERMS = [
161
- 'The rebuilding of Indonesia',
162
- 'Dominique Strauss-Kahn',
163
- 'THOMAS L. FRIEDMAN',
164
- 'a sleek, briskly paced film whose title suggests a heist movie',
165
- 'quote from The Colbert Report distinguishes',
166
- 'co-inventor of the GTX Smart Shoe',
167
- 'my latest Washington Post column',
168
- 'supported my humanities work at Harvard',
212
+ 'The rebuilding of Indonesia', # Vicky Ward article
213
+ 'a sleek, briskly paced film whose title suggests a heist movie', # Inside Job
169
214
  'Calendar of Major Events, Openings, and Fundraisers',
170
- 'Nuclear Operator Raises Alarm on Crisis',
171
- 'as responsible for the democratisation of computing and',
172
- 'AROUND 1,000 operational satellites are circling the Earth',
215
+ 'sent over from Marshall Heyman at the WSJ',
173
216
  "In recent months, China's BAT collapse",
174
217
  'President Obama introduces Jim Yong Kim as his nominee',
175
218
  'Trump appears with mobster-affiliated felon at New',
176
- 'Lead Code Enforcement Walton presented the facts',
177
- "Is UNRWA vital for the Palestinians' future",
178
- 'The New York company, led by Stephen Ross',
179
- 'I spent some time mulling additional aspects of a third choice presidential',
180
- 'you are referring to duplication of a gene',
181
- 'i am writing you both because i am attaching a still not-quite-complete response',
182
- 'Learn to meditate and discover what truly nourishes your entire being',
183
219
  'Congratulations to the 2019 Hillman Prize recipients',
184
- 'This much we know - the Fall elections are shaping up',
185
220
  "Special counsel Robert Mueller's investigation may face a serious legal obstacle",
186
221
  "nearly leak-proof since its inception more than a year ago",
187
- "I appreciate the opportunity to respond to your email",
188
- "Hello Peter. I am currently on a plane. I sent you earlier",
189
- "I appreciate the opportunity to respond to your email",
190
- 'I just wanted to follow up on a couple of notes. I have been coordinating with Richard Kahn',
191
- 'So, Peggy, if you could just let me know what info to include on the donation',
192
- 'Consult a lawyer beforehand, if possible, but be cooperative/nice at this stage',
193
- # Amanda Ens
194
- 'We remain positive on banks that can make acceptable returns',
195
- 'David Woo (BAML head of FX, Rates and EM Strategy, very highly regarded',
196
- "Please let me know if you're interested in joining a small group meeting",
197
- 'Erika Najarian, BAML financials research analyst, just returned',
198
- 'We can also discuss single stock and Topix banks',
199
- 'We are recording unprecedented divergences in falling equity vol',
200
- 'As previously discussed between you and Ariane',
201
- 'no evidence you got the latest so i have sent you just the key message',
202
- # Joscha Bach
203
- 'Cells seem to be mostly indistinguishable (except',
204
- 'gender differenece. unlikely motivational, every cell is different',
205
- 'Some thoughts I meant to send back for a long time',
206
- # Krassner
207
- 'My friend Michael Simmons, who has been the editor of National Lampoon',
208
- "In the premiere episode of 'The Last Laugh' podcast, Sarah Silverman",
209
- 'Thanks so much for sharing both your note to Steven and your latest Manson essay',
210
- # Edward Larson
211
- 'Coming from an international background, and having lived in Oslo, Tel Aviv',
212
- # Katherine Keating
213
- 'Paul Keating is aware that many people see him as a puzzle and contradiction',
214
- 'his panoramic view of world affairs sharper than ever, Paul Keating blames',
215
- # melanie
216
- 'Some years ago when I worked at the libertarian Cato Institute'
217
- # rich kahn
218
- 'House and Senate Republicans on their respective tax overhaul',
219
- 'The Tax Act contains changes to the treatment of "carried interests"',
220
- 'General Election: Trump vs. Clinton LA Times/USC Tracking',
221
- 'Location: Quicken Loans Arena in Cleveland, OH',
222
- 'A friendly discussion about Syria with a former US State Department',
223
- # Robert Kuhn
224
- 'The US trade war against China: The view from Beijing',
225
- # Tom / Paul Krassner
226
- 'I forgot to post my cartoon from week before last, about Howard Schultz',
222
+ # Nikolic
223
+ 'Nuclear Operator Raises Alarm on Crisis',
224
+ 'as responsible for the democratisation of computing and',
225
+ 'AROUND 1,000 operational satellites are circling the Earth',
226
+ # Sultan Sulayem
227
+ 'co-inventor of the GTX Smart Shoe',
228
+ 'my latest Washington Post column',
227
229
  # Bannon
230
+ 'As Steve Bannon continues his tour of Europe',
228
231
  "Bannon the European: He's opening the populist fort in Brussels",
229
232
  "Steve Bannon doesn't do subtle.",
230
233
  'The Department of Justice lost its latest battle with Congress',
231
- "Donald Trump's newly named chief strategist and senior counselor",
232
- # Diane Ziman
233
- 'I was so proud to see him speak at the Women',
234
- # Krauss
235
- 'On confronting dogma, I of course agree',
236
- 'I did neck with that woman, but never forced myself on her',
237
- 'It is hard to know how to respond to a list of false',
238
- 'The Women in the World Summit opens April 12',
239
- 'lecture in Heidelberg Oct 14 but they had to cancel',
240
- # Nikolic
241
- 'people from LifeBall',
242
- # Epstein
243
- 'David Ben Gurion was asked why he, after 2000',
244
- # Lisa New
245
- 'The raw materials for that period include interviews',
246
- 'Whether you donated to Poetry in America through',
247
- # Random
248
- 'Little Hodiaki',
249
- "It began with deep worries regarding China's growth path",
250
- 'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
234
+ 'pedophile Jeffrey Epstein bought his way out',
235
+ # lawyers
236
+ 'recuses itself from Jeffrey Epstein case',
237
+ # Misc
238
+ 'people from LifeBall', # Nikolic
239
+ "It began with deep worries regarding China's growth path", # Paul Morris
240
+ 'A friendly discussion about Syria with a former US State Department', # Fabrice Aidan
241
+ 'The US trade war against China: The view from Beijing', # Robert Kuhn / Groff
242
+ 'This much we know - the Fall elections are shaping up', # Juleanna Glover / Bannon
251
243
  ]
252
244
 
253
245
  METADATA_FIELDS = [
@@ -258,56 +250,100 @@ METADATA_FIELDS = [
258
250
  'subject',
259
251
  ]
260
252
 
261
- # Note the line repair happens *after* 'Importance: High' is removed
253
+ # Arguments to _merge_lines(). Note the line repair happens *after* 'Importance: High' is removed
262
254
  LINE_REPAIR_MERGES = {
263
- '017523': 4,
264
- '019407': [2, 4],
265
- '021729': 2,
266
- '022673': 9,
267
- '022684': 9,
268
- '022695': 4,
269
- '029773': [2, 5],
270
- '023067': 3,
271
- '025790': 2,
272
- '029841': 3,
273
- '026345': 3,
274
- '026609': 4,
275
- '033299': 3,
276
- '026829': 3,
277
- '026924': [2, 4],
278
- '028931': [3, 6],
279
- '029154': [2, 5],
280
- '029163': [2, 5],
281
- '029282': 2,
282
- '029402': 5,
283
- '029498': 2,
284
- '029501': 2,
285
- '029835': [2, 4],
286
- '029889': 2,
287
- '029545': [3, 5],
288
- '029976': 3,
289
- '030299': [7, 10],
290
- '030381': [2, 4],
291
- '030384': [2, 4],
292
- '030626': 2,
293
- '030999': [2, 4],
294
- '031384': 2,
295
- '031428': 2,
296
- '031442': 0,
297
- '031980': [2, 4],
298
- '032063': [3, 5],
299
- '032272': 3,
300
- '032405': 4,
301
- '033097': 2,
302
- '033144': [2, 4],
303
- '033217': 3,
304
- '033228': [3, 5],
305
- '033357': [2, 4],
306
- '033486': [7, 9],
307
- '033512': 2,
308
- '033575': [2, 4],
309
- '033576': 3,
310
- '033583': 2,
255
+ '013405': [[4]] * 2,
256
+ '013415': [[4]] * 2,
257
+ '014397': [[4]] * 2,
258
+ '014860': [[3], [4], [4]],
259
+ '017523': [[4]],
260
+ '030367': [[1, 4], [2, 4]],
261
+ '019105': [[5]] * 4,
262
+ '019407': [[2, 4]],
263
+ '022187': [[1, 8], [2, 8], [3, 8], [4, 8]],
264
+ '021729': [[2]],
265
+ '032896': [[2]],
266
+ '033050': [[0, 6], [1, 6], [2, 6], [3, 6], [4, 6]],
267
+ '022949': [[0, 4], [1, 4]],
268
+ '022197': [[0, 5], [1, 5], [3, 5]],
269
+ '021814': [[1, 6], [2, 6], [3, 6], [4, 6]],
270
+ '022190': [[1, 7], [0, 6], [3, 6], [4, 6]],
271
+ '029582': [[0, 5], [1, 5], [3, 5], [3, 5]],
272
+ '022673': [[9]],
273
+ '022684': [[9]],
274
+ '026625': [[0, 7], [1, 7], [2, 7], [3, 7], [4, 7], [5, 7]],
275
+ '026659': [[0, 5], [1, 5]],
276
+ '026764': [[0, 6], [1, 6]],
277
+ '022695': [[4]],
278
+ '022977': [[9]] * 10,
279
+ '023001': [[5]] * 3,
280
+ '023067': [[3]],
281
+ '025233': [[4]] * 2,
282
+ '025329': [[2]] * 9,
283
+ '025790': [[2]],
284
+ '025812': [[3]] * 2,
285
+ '025589': [[3]] * 12,
286
+ '026345': [[3]],
287
+ '026609': [[4]],
288
+ '028921': [[5, 4], [4, 5]],
289
+ '026620': ([[20]] * 4) + [[3, 2]] + ([[2]] * 15) + [[2, 4]],
290
+ '026829': [[3]],
291
+ '026924': [[2, 4]],
292
+ '028728': [[3]],
293
+ '026451': [[3, 5]] * 2,
294
+ '028931': [[3, 6]],
295
+ '029154': [[2, 5]],
296
+ '029163': [[2, 5]],
297
+ '029282': [[2]],
298
+ '029402': [[5]],
299
+ '029433': [[3]],
300
+ '029458': [[4]] * 3,
301
+ '029498': [[2], [2, 4]],
302
+ '029501': [[2]],
303
+ '029545': [[3, 5]],
304
+ '029773': [[2, 5]],
305
+ '029831': [[3, 6]],
306
+ '029835': [[2, 4]],
307
+ '029841': [[3]],
308
+ '029889': [[2], [2, 5]],
309
+ '029976': [[3]],
310
+ '029977': ([[2]] * 4) + [[4], [2, 4]],
311
+ '030299': [[7, 10]],
312
+ '030315': [[3, 5]],
313
+ '030318': [[3, 5]],
314
+ '030381': [[2, 4]],
315
+ '030384': [[2, 4]],
316
+ '030626': [[2], [4]],
317
+ '030861': [[3, 8]],
318
+ '030999': [[2, 4]],
319
+ '031384': [[2]],
320
+ '031428': [[2], [2, 4]],
321
+ '031442': [[0]],
322
+ '031489': [[2, 4], [3, 4], [3, 4], [10]],
323
+ '031619': [[7], [17], [17]],
324
+ '031748': [[3]] * 2,
325
+ '031764': [[3], [8]], # 8 is just for style fix internally, not header
326
+ '031980': [[2, 4]],
327
+ '032063': [[3, 5]],
328
+ '032272': [[2, 10], [3]],
329
+ '032405': [[4]],
330
+ '032637': [[9]] * 3,
331
+ '033097': [[2]],
332
+ '033144': [[2, 4]],
333
+ '033217': [[3]],
334
+ '033228': [[3, 5]],
335
+ '033252': [[9]] * 2,
336
+ '033271': [[3]],
337
+ '033299': [[3]],
338
+ '033357': [[2, 4]],
339
+ '033486': [[7, 9]],
340
+ '033512': [[2]],
341
+ '026024': [[1, 3], [2, 3]],
342
+ '024923': [[0, 5], [2]],
343
+ '033568': [[5]] * 5,
344
+ '033575': [[2, 4]],
345
+ '033576': [[3]],
346
+ '033583': [[2]],
311
347
  }
312
348
 
313
349
 
@@ -322,12 +358,15 @@ class Email(Communication):
322
358
  sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
323
359
  signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
324
360
  """
361
+ attached_docs: list[OtherFile] = field(default_factory=list)
325
362
  actual_text: str = field(init=False)
326
363
  config: EmailCfg | None = None
327
364
  header: EmailHeader = field(init=False)
328
365
  recipients: list[Name] = field(default_factory=list)
329
366
  sent_from_device: str | None = None
330
367
  signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
368
+ _is_first_for_user: bool = False # Only set when printing
369
+ _line_merge_arguments: list[tuple[int] | tuple[int, int]] = field(default_factory=list)
331
370
 
332
371
  # For logging how many headers we prettified while printing, kind of janky
333
372
  rewritten_header_ids: ClassVar[set[str]] = set([])
@@ -353,7 +392,7 @@ class Email(Communication):
353
392
  self.recipients.extend(self._extract_emailer_names(recipient))
354
393
 
355
394
  # Assume mailing list emails are to Epstein
356
- if self.author in BBC_LISTS and (self.is_note_to_self() or not self.recipients):
395
+ if self.author in BCC_LISTS and (self.is_note_to_self() or not self.recipients):
357
396
  self.recipients = [JEFFREY_EPSTEIN]
358
397
 
359
398
  # Remove self CCs but preserve self emails
@@ -366,6 +405,7 @@ class Email(Communication):
366
405
  self.sent_from_device = self._sent_from_device()
367
406
 
368
407
  def attachments(self) -> list[str]:
408
+ """Return the header's attachments string split on ';' (yields [''] when the header has none)."""
369
409
  return (self.header.attachments or '').split(';')
370
410
 
371
411
  def info_txt(self) -> Text:
@@ -379,7 +419,12 @@ class Email(Communication):
379
419
  return txt.append(highlighter(f" probably sent at {self.timestamp}"))
380
420
 
381
421
  def is_fwded_article(self) -> bool:
382
- return bool(self.config and self.config.is_fwded_article)
422
+ if self.config is None:
423
+ return False
424
+ elif self.config.fwded_text_after:
425
+ return self.config.is_fwded_article is not False
426
+ else:
427
+ return bool(self.config.is_fwded_article)
383
428
 
384
429
  def is_junk_mail(self) -> bool:
385
430
  return self.author in JUNK_EMAILERS
@@ -390,6 +435,15 @@ class Email(Communication):
390
435
  def is_note_to_self(self) -> bool:
391
436
  return self.recipients == [self.author]
392
437
 
438
+ def is_from_or_to(self, name: str) -> bool:
439
+ return name in [self.author] + self.recipients
440
+
441
+ def is_word_count_worthy(self) -> bool:
442
+ if self.is_fwded_article():
443
+ return bool(self.config.fwded_text_after) or len(self.actual_text) < 150
444
+ else:
445
+ return not self.is_mailing_list()
446
+
393
447
  def metadata(self) -> Metadata:
394
448
  local_metadata = asdict(self)
395
449
  local_metadata['is_junk_mail'] = self.is_junk_mail()
@@ -436,8 +490,9 @@ class Email(Communication):
436
490
  elif self.header.num_header_rows == 0:
437
491
  return self.text
438
492
 
493
+ # import pdb;pdb.set_trace()
439
494
  self.log_top_lines(20, "Raw text:", logging.DEBUG)
440
- self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
495
+ self.log(f"With {self.header.num_header_rows} header lines removed:\n{text[0:500]}\n\n", logging.DEBUG)
441
496
  reply_text_match = REPLY_TEXT_REGEX.search(text)
442
497
 
443
498
  if reply_text_match:
@@ -516,8 +571,8 @@ class Email(Communication):
516
571
  logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
517
572
 
518
573
  def _extract_timestamp(self) -> datetime:
519
- if self.config and self.config.timestamp:
520
- return self.config.timestamp
574
+ if self.config and self.config.timestamp():
575
+ return self.config.timestamp()
521
576
  elif self.header.sent_at:
522
577
  timestamp = _parse_timestamp(self.header.sent_at)
523
578
 
@@ -546,31 +601,41 @@ class Email(Communication):
546
601
  logger.debug(f"Fell back to timestamp {timestamp} in line '{line}'...")
547
602
  return timestamp
548
603
 
549
- raise RuntimeError(f"No timestamp found in '{self.file_path.name}' top lines:\n{searchable_text}")
604
+ no_timestamp_msg = f"No timestamp found in '{self.file_path.name}'"
550
605
 
551
- def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
552
- """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
553
- if text is None:
554
- header_offset = len(self.header.header_chars)
555
- text = self.text[header_offset:]
606
+ if self.is_duplicate():
607
+ logger.warning(f"{no_timestamp_msg} but timestamp should be copied from {self.duplicate_of_id()}")
556
608
  else:
557
- header_offset = 0
609
+ raise RuntimeError(f"{no_timestamp_msg}, top lines:\n{searchable_text}")
610
+
611
+ def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES) -> int | None:
612
+ """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
613
+ header_offset = len(self.header.header_chars)
614
+ text = self.text[header_offset:]
558
615
 
559
616
  for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text)):
560
617
  if i >= n:
561
618
  return match.end() + header_offset - 1
562
619
 
563
- def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
620
+ def _merge_lines(self, idx1: int, idx2: int | None = None) -> None:
564
621
  """Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
565
- idx2 = idx2 if idx2 is not None else (idx + 1)
566
- lines = self.lines[0:idx]
622
+ if idx2 is None:
623
+ self._line_merge_arguments.append((idx1,))
624
+ idx2 = idx1 + 1
625
+ else:
626
+ self._line_merge_arguments.append((idx1, idx2))
567
627
 
568
- if idx2 <= idx:
569
- raise RuntimeError(f"idx2 ({idx2}) must be greater than idx ({idx})")
570
- elif idx2 == (idx + 1):
571
- lines += [self.lines[idx] + ' ' + self.lines[idx + 1]] + self.lines[idx + 2:]
628
+ if idx2 < idx1:
629
+ lines = self.lines[0:idx2] + self.lines[idx2 + 1:idx1] + [self.lines[idx1] + ' ' + self.lines[idx2]] + self.lines[idx1 + 1:]
630
+ elif idx2 == idx1:
631
+ raise RuntimeError(f"idx2 ({idx2}) must be greater or less than idx ({idx1})")
572
632
  else:
573
- lines += [self.lines[idx] + ' ' + self.lines[idx2]] + self.lines[idx + 1:idx2] + self.lines[idx2 + 1:]
633
+ lines = self.lines[0:idx1]
634
+
635
+ if idx2 == (idx1 + 1):
636
+ lines += [self.lines[idx1] + ' ' + self.lines[idx1 + 1]] + self.lines[idx1 + 2:]
637
+ else:
638
+ lines += [self.lines[idx1] + ' ' + self.lines[idx2]] + self.lines[idx1 + 1:idx2] + self.lines[idx2 + 1:]
574
639
 
575
640
  self._set_computed_fields(lines=lines)
576
641
 
@@ -586,6 +651,10 @@ class Email(Communication):
586
651
  self.signature_substitution_counts[name] = self.signature_substitution_counts.get(name, 0)
587
652
  self.signature_substitution_counts[name] += num_replaced
588
653
 
654
+ # Share / Tweet lines
655
+ if self.author == KATHRYN_RUEMMLER:
656
+ text = '\n'.join([l for l in text.split('\n') if l not in ['Share', 'Tweet', 'Bookmark it']])
657
+
589
658
  return collapse_newlines(text).strip()
590
659
 
591
660
  def _remove_line(self, idx: int) -> None:
@@ -605,68 +674,15 @@ class Email(Communication):
605
674
  old_text = self.text
606
675
 
607
676
  if self.file_id in LINE_REPAIR_MERGES:
608
- merge = LINE_REPAIR_MERGES[self.file_id]
609
- merge_args = merge if isinstance(merge, list) else [merge]
610
- self._merge_lines(*merge_args)
611
-
612
- # These already had 2nd line merged
613
- if self.file_id in ['030626']: # Merge 6th and 7th (now 5th and 6th) rows
614
- self._merge_lines(4)
615
- elif self.file_id == '029889':
616
- self._merge_lines(2, 5)
617
- elif self.file_id in ['029498', '031428']:
618
- self._merge_lines(2, 4)
619
-
620
- # Multiline
621
- if self.file_id == '013415':
622
- for _i in range(2):
623
- self._merge_lines(4)
624
- elif self.file_id == '013405':
625
- for _i in range(2):
626
- self._merge_lines(4)
627
- elif self.file_id == '029458':
628
- for _i in range(3):
629
- self._merge_lines(4)
630
- elif self.file_id in ['025233']:
631
- for _i in range(2):
632
- self._merge_lines(4)
677
+ for merge_args in LINE_REPAIR_MERGES[self.file_id]:
678
+ self._merge_lines(*merge_args)
633
679
 
680
+ if self.file_id in ['025233']:
634
681
  self.lines[4] = f"Attachments: {self.lines[4]}"
635
682
  self._set_computed_fields(lines=self.lines)
636
- elif self.file_id in ['023001']:
637
- for _i in range(3):
638
- self._merge_lines(5)
639
- elif self.file_id in ['019105']:
640
- for _i in range(4):
641
- self._merge_lines(5)
642
- elif self.file_id in ['033568']:
643
- for _i in range(5):
644
- self._merge_lines(5)
645
- elif self.file_id in ['025329']:
646
- for _i in range(9):
647
- self._merge_lines(2)
648
- elif self.file_id in ['025812']:
649
- for _i in range(2):
650
- self._merge_lines(3)
651
- elif self.file_id == '014860':
652
- self._merge_lines(3)
653
- self._merge_lines(4)
654
- self._merge_lines(4)
655
683
  elif self.file_id == '029977':
656
684
  self._set_computed_fields(text=self.text.replace('Sent 9/28/2012 2:41:02 PM', 'Sent: 9/28/2012 2:41:02 PM'))
657
685
 
658
- for _i in range(4):
659
- self._merge_lines(2)
660
-
661
- self._merge_lines(4)
662
- self._merge_lines(2, 4)
663
- elif self.file_id in ['033252']:
664
- for _i in range(2):
665
- self._merge_lines(9)
666
- elif self.file_id in ['032637']:
667
- for _i in range(3):
668
- self._merge_lines(9)
669
-
670
686
  # Bad line removal
671
687
  if self.file_id == '025041':
672
688
  self._remove_line(4)
@@ -679,22 +695,40 @@ class Email(Communication):
679
695
  self.log_top_lines(12, 'Result of modifications')
680
696
 
681
697
  lines = self.repair_ocr_text(OCR_REPAIRS, self.text).split('\n')
698
+ subject_line = next((line for line in lines if line.startswith('Subject:')), None) or ''
699
+ subject = subject_line.split(':')[1].strip() if subject_line else ''
682
700
  new_lines = []
683
701
  i = 0
684
702
 
685
- # Fix links (remove spaces, merge multiline links to a single line)
703
+ # Fix links and quoted subjects (remove spaces, merge multiline links to a single line)
686
704
  while i < len(lines):
687
705
  line = lines[i]
688
706
 
689
707
  if LINK_LINE_REGEX.search(line):
690
- if 'htm' not in line \
691
- and i < (len(lines) - 1) \
692
- and (lines[i + 1].endswith('/') or any(s in lines[i + 1] for s in URL_SIGNIFIERS)):
708
+ while i < (len(lines) - 1) \
709
+ and not lines[i + 1].startswith('htt') \
710
+ and (lines[i + 1].endswith('/') \
711
+ or any(s in lines[i + 1] for s in URL_SIGNIFIERS) \
712
+ or LINK_LINE2_REGEX.match(lines[i + 1])):
693
713
  logger.debug(f"{self.filename}: Joining link lines\n 1. {line}\n 2. {lines[i + 1]}\n")
694
714
  line += lines[i + 1]
695
715
  i += 1
696
716
 
697
717
  line = line.replace(' ', '')
718
+ elif ' http' in line and line.endswith('html'):
719
+ pre_link, post_link = line.split(' http', 1)
720
+ line = f"{pre_link} http{post_link.replace(' ', '')}"
721
+ elif line.startswith('Subject:') and i < (len(lines) - 2) and len(line) >= 40:
722
+ next_line = lines[i + 1]
723
+ next_next = lines[i + 2]
724
+
725
+ if len(next_line) <= 1 or any([cont in next_line for cont in BAD_SUBJECT_CONTINUATIONS]):
726
+ pass
727
+ elif (subject.endswith(next_line) and next_line != subject) \
728
+ or (FIELDS_COLON_REGEX.search(next_next) and not FIELDS_COLON_REGEX.search(next_line)):
729
+ self.warn(f"Fixing broken subject line\n line: '{line}'\n next: '{next_line}'\n next: '{next_next}'\nsubject='{subject}'\n")
730
+ line += f" {next_line}"
731
+ i += 1
698
732
 
699
733
  new_lines.append(line)
700
734
 
@@ -718,7 +752,7 @@ class Email(Communication):
718
752
  """Copy info from original config for file this document was extracted from."""
719
753
  if self.file_id in ALL_FILE_CONFIGS:
720
754
  self.config = cast(EmailCfg, deepcopy(ALL_FILE_CONFIGS[self.file_id]))
721
- self.warn(f"Merging existing cfg for '{self.file_id}' with cfg for extracted document...")
755
+ self.log(f"Merging existing cfg for '{self.file_id}' with cfg for extracted document...")
722
756
  else:
723
757
  self.config = EmailCfg(id=self.file_id)
724
758
 
@@ -740,33 +774,58 @@ class Email(Communication):
740
774
 
741
775
  def _truncate_to_length(self) -> int:
742
776
  """When printing truncate this email to this length."""
743
- quote_cutoff = self._idx_of_nth_quoted_reply(text=self.text) # Trim if there's many quoted replies
777
+ quote_cutoff = self._idx_of_nth_quoted_reply() # Trim if there's many quoted replies
744
778
  includes_truncate_term = next((term for term in TRUNCATE_TERMS if term in self.text), None)
745
779
 
746
780
  if args.whole_file:
747
781
  num_chars = len(self.text)
748
- elif self.file_id in TRUNCATION_LENGTHS:
749
- num_chars = TRUNCATION_LENGTHS[self.file_id] or self.file_size()
750
- elif self.author in TRUNCATE_ALL_EMAILS_FROM or includes_truncate_term:
751
- num_chars = int(MAX_CHARS_TO_PRINT / 3)
752
- elif quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
753
- num_chars = quote_cutoff
782
+ elif args.truncate:
783
+ num_chars = args.truncate
784
+ elif self.config and self.config.truncate_to is not None:
785
+ num_chars = len(self.text) if self.config.truncate_to == NO_TRUNCATE else self.config.truncate_to
786
+ elif self.is_interesting():
787
+ num_chars = len(self.text)
788
+ elif self.author in TRUNCATE_EMAILS_FROM \
789
+ or any([self.is_from_or_to(n) for n in TRUNCATE_EMAILS_FROM_OR_TO]) \
790
+ or self.is_fwded_article() \
791
+ or includes_truncate_term:
792
+ num_chars = min(quote_cutoff or MAX_CHARS_TO_PRINT, TRUNCATED_CHARS)
754
793
  else:
755
- num_chars = MAX_CHARS_TO_PRINT
756
-
757
- if num_chars != MAX_CHARS_TO_PRINT and not self.is_duplicate():
758
- log_args = {
759
- 'num_chars': num_chars,
760
- 'author_truncate': self.author in TRUNCATE_ALL_EMAILS_FROM,
761
- 'is_fwded_article': self.is_fwded_article(),
762
- 'is_quote_cutoff': quote_cutoff == num_chars,
763
- 'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
764
- 'quote_cutoff': quote_cutoff,
765
- }
766
-
767
- if quote_cutoff != num_chars:
768
- logger.debug(f'{self.summary()} truncating: ' + ', '.join([f"{k}={v}" for k, v in log_args.items() if v]) + '\n')
769
-
794
+ if quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
795
+ trimmed_words = self.text[quote_cutoff:].split()
796
+
797
+ if '<...snipped' in trimmed_words[:NUM_WORDS_IN_LAST_QUOTE]:
798
+ num_trailing_words = 0
799
+ elif trimmed_words and trimmed_words[0] in ['From:', 'Sent:']:
800
+ num_trailing_words = NUM_WORDS_IN_LAST_QUOTE
801
+ else:
802
+ num_trailing_words = NUM_WORDS_IN_LAST_QUOTE
803
+
804
+ if trimmed_words:
805
+ last_quoted_text = ' '.join(trimmed_words[:num_trailing_words])
806
+ num_chars = quote_cutoff + len(last_quoted_text) + 1 # Give a hint of the next line
807
+ else:
808
+ num_chars = quote_cutoff
809
+ else:
810
+ num_chars = min(self.file_size(), MAX_CHARS_TO_PRINT)
811
+
812
+ # Always print whole email for 1st email for user
813
+ if self._is_first_for_user and num_chars < self.file_size() and not self.is_duplicate():
814
+ logger.info(f"{self} Overriding cutoff {num_chars} for first email")
815
+ num_chars = self.file_size()
816
+
817
+ log_args = {
818
+ 'num_chars': num_chars,
819
+ '_is_first_for_user': self._is_first_for_user,
820
+ 'author_truncate': self.author in TRUNCATE_EMAILS_FROM,
821
+ 'is_fwded_article': self.is_fwded_article(),
822
+ 'is_quote_cutoff': quote_cutoff == num_chars,
823
+ 'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
824
+ 'quote_cutoff': quote_cutoff,
825
+ }
826
+
827
+ log_args_str = ', '.join([f"{k}={v}" for k, v in log_args.items() if v])
828
+ logger.debug(f"Truncate determination: {log_args_str}")
770
829
  return num_chars
771
830
 
772
831
  def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
@@ -780,7 +839,7 @@ class Email(Communication):
780
839
  if len(text) > num_chars:
781
840
  text = text[0:num_chars]
782
841
  doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style())
783
- trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
842
+ trim_note = f"<...trimmed to {num_chars:,} characters of {self.length():,}, read the rest at {doc_link_markup}...>"
784
843
  trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
785
844
 
786
845
  # Rewrite broken headers where the values are on separate lines from the field names
@@ -799,8 +858,15 @@ class Email(Communication):
799
858
  text = _add_line_breaks(text) # This was skipped when _prettify_text() w/a broken header so we do it now
800
859
  self.rewritten_header_ids.add(self.file_id)
801
860
 
861
+ lines = [
862
+ Text.from_markup(f"[link={line}]{line}[/link]") if line.startswith('http') else Text(line)
863
+ for line in text.split('\n')
864
+ ]
865
+
866
+ text = join_texts(lines, '\n')
867
+
802
868
  email_txt_panel = Panel(
803
- highlighter(text).append('\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
869
+ highlighter(text).append('...\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
804
870
  border_style=self._border_style(),
805
871
  expand=False,
806
872
  subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
@@ -809,6 +875,11 @@ class Email(Communication):
809
875
  yield self.file_info_panel()
810
876
  yield Padding(email_txt_panel, (0, 0, 1, INFO_INDENT))
811
877
 
878
+ if self.attached_docs:
879
+ attachments_table_title = f" {self.url_slug} Email Attachments:"
880
+ attachments_table = OtherFile.files_preview_table(self.attached_docs, title=attachments_table_title)
881
+ yield Padding(attachments_table, (0, 0, 1, 12))
882
+
812
883
  if should_rewrite_header:
813
884
  self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
814
885