epstein-files 1.2.5__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +32 -13
- epstein_files/documents/document.py +8 -1
- epstein_files/documents/email.py +179 -97
- epstein_files/documents/emails/email_header.py +17 -8
- epstein_files/documents/other_file.py +8 -6
- epstein_files/epstein_files.py +16 -1
- epstein_files/person.py +40 -15
- epstein_files/util/constant/names.py +10 -6
- epstein_files/util/constant/strings.py +2 -1
- epstein_files/util/constants.py +463 -225
- epstein_files/util/doc_cfg.py +33 -27
- epstein_files/util/env.py +10 -3
- epstein_files/util/file_helper.py +2 -0
- epstein_files/util/highlighted_group.py +66 -23
- epstein_files/util/output.py +17 -31
- epstein_files/util/rich.py +2 -1
- epstein_files/util/word_count.py +1 -1
- {epstein_files-1.2.5.dist-info → epstein_files-1.4.1.dist-info}/METADATA +3 -3
- epstein_files-1.4.1.dist-info/RECORD +34 -0
- {epstein_files-1.2.5.dist-info → epstein_files-1.4.1.dist-info}/entry_points.txt +1 -1
- epstein_files-1.2.5.dist-info/RECORD +0 -34
- {epstein_files-1.2.5.dist-info → epstein_files-1.4.1.dist-info}/LICENSE +0 -0
- {epstein_files-1.2.5.dist-info → epstein_files-1.4.1.dist-info}/WHEEL +0 -0
epstein_files/documents/email.py
CHANGED
|
@@ -17,12 +17,12 @@ from rich.text import Text
|
|
|
17
17
|
from epstein_files.documents.communication import Communication
|
|
18
18
|
from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, INFO_INDENT
|
|
19
19
|
from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAIL_SIMPLE_HEADER_REGEX,
|
|
20
|
-
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, TIME_REGEX, EmailHeader)
|
|
20
|
+
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, FIELDS_COLON_PATTERN, TIME_REGEX, EmailHeader)
|
|
21
|
+
from epstein_files.documents.other_file import OtherFile
|
|
21
22
|
from epstein_files.util.constant.names import *
|
|
22
23
|
from epstein_files.util.constant.strings import REDACTED
|
|
23
24
|
from epstein_files.util.constants import *
|
|
24
|
-
from epstein_files.util.data import
|
|
25
|
-
flatten, listify, remove_timezone, uniquify)
|
|
25
|
+
from epstein_files.util.data import TIMEZONE_INFO, collapse_newlines, escape_single_quotes, remove_timezone
|
|
26
26
|
from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
27
27
|
from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
|
|
28
28
|
from epstein_files.util.highlighted_group import JUNK_EMAILERS, get_style_for_name
|
|
@@ -30,9 +30,11 @@ from epstein_files.util.logging import logger
|
|
|
30
30
|
from epstein_files.util.rich import *
|
|
31
31
|
|
|
32
32
|
BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
|
|
33
|
-
BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|
|
|
33
|
+
BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Hide caption|Importance:?\s*High|[iI,•]|[1i] (_ )?[il]|, [-,]|L\._|_filtered|.*(yiv0232|font-family:|margin-bottom:).*)$')
|
|
34
|
+
BAD_SUBJECT_CONTINUATIONS = ['orwarded', 'Hi ', 'Sent ', 'AmLaw', 'Original Message', 'Privileged', 'Sorry', '---']
|
|
34
35
|
DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
|
|
35
|
-
|
|
36
|
+
FIELDS_COLON_REGEX = re.compile(FIELDS_COLON_PATTERN)
|
|
37
|
+
LINK_LINE_REGEX = re.compile(f"^[>• ]*htt")
|
|
36
38
|
LINK_LINE2_REGEX = re.compile(r"^[-\w.%&=/]{5,}$")
|
|
37
39
|
QUOTED_REPLY_LINE_REGEX = re.compile(r'(\nFrom:(.*)|wrote:)\n', re.IGNORECASE)
|
|
38
40
|
REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
|
|
@@ -44,13 +46,12 @@ LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
|
|
|
44
46
|
|
|
45
47
|
SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
|
|
46
48
|
REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
|
|
47
|
-
URL_SIGNIFIERS = ['amp?', 'cd=', 'click', 'ft=', 'gclid', 'htm', 'keywords=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'usg=', 'utm']
|
|
49
|
+
URL_SIGNIFIERS = ['?amp', 'amp?', 'cd=', 'click', 'CMP=', 'contentId', 'ft=', 'gclid', 'htm', 'mp=', 'keywords=', 'Id=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'sp=', 'usg=', 'utm']
|
|
48
50
|
APPEARS_IN = 'appears in'
|
|
49
51
|
|
|
50
52
|
MAX_NUM_HEADER_LINES = 14
|
|
51
|
-
MAX_QUOTED_REPLIES =
|
|
52
|
-
|
|
53
|
-
TRUNCATED_CHARS = int(MAX_CHARS_TO_PRINT / 3)
|
|
53
|
+
MAX_QUOTED_REPLIES = 1
|
|
54
|
+
NUM_WORDS_IN_LAST_QUOTE = 6
|
|
54
55
|
|
|
55
56
|
REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
|
|
56
57
|
'********************************',
|
|
@@ -88,7 +89,13 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
|
88
89
|
re.compile(r'^INW$', re.MULTILINE): REDACTED,
|
|
89
90
|
# links
|
|
90
91
|
'Imps ://': 'https://',
|
|
92
|
+
'on-accusers-rose-\nmcgowan/ ': 'on-accusers-rose-\nmcgowan/\n',
|
|
93
|
+
'the-truth-\nabout-the-bitcoin-foundation/ )': 'the-truth-about-the-bitcoin-foundation/ )\n',
|
|
94
|
+
'woody-allen-jeffrey-epsteins-\nsociety-friends-close-ranks/ ---': 'woody-allen-jeffrey-epsteins-society-friends-close_ranks/\n',
|
|
95
|
+
' https://www.theguardian.com/world/2017/may/29/close-friend-trump-thomas-barrack-\nalleged-tax-evasion-italy-sardinia?CMP=share btn fb': '\nhttps://www.theguardian.com/world/2017/may/29/close-friend-trump-thomas-barrack-alleged-tax-evasion-italy-sardinia?CMP=share_btn_fb',
|
|
91
96
|
re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
|
|
97
|
+
re.compile(r" http ?://www. ?dailymail. ?co ?.uk/news/article-\d+/Troub ?led-woman-history-drug-\n?us ?e-\n?.*html"): '\nhttp://www.dailymail.co.uk/news/article-3914012/Troubled-woman-history-drug-use-claimed-assaulted-Donald-Trump-Jeffrey-Epstein-sex-party-age-13-FABRICATED-story.html',
|
|
98
|
+
re.compile(r"http.*steve-bannon-trump-tower-\n?interview-\n?trumps-\n?strategist-plots-\n?new-political-movement-948747"): "\nhttp://www.hollywoodreporter.com/news/steve-bannon-trump-tower-interview-trumps-strategist-plots-new-political-movement-948747",
|
|
92
99
|
# Subject lines
|
|
93
100
|
"Arrested in\nInauguration Day Riot": "Arrested in Inauguration Day Riot",
|
|
94
101
|
"as Putin Mayhem Tests President's Grip\non GOP": "as Putin Mayhem Tests President's Grip on GOP",
|
|
@@ -99,6 +106,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
|
99
106
|
"COVER UP SEX ABUSE CRIMES\nBY THE WHITE HOUSE": "COVER UP SEX ABUSE CRIMES BY THE WHITE HOUSE",
|
|
100
107
|
'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
|
|
101
108
|
"War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
|
|
109
|
+
"Subject; RE": "Subject: RE",
|
|
102
110
|
re.compile(r"deadline re Mr Bradley Edwards vs Mr\s*Jeffrey Epstein", re.I): "deadline re Mr Bradley Edwards vs Mr Jeffrey Epstein",
|
|
103
111
|
re.compile(r"Following Plea That Implicated Trump -\s*https://www.npr.org/676040070", re.I): "Following Plea That Implicated Trump - https://www.npr.org/676040070",
|
|
104
112
|
re.compile(r"for Attorney General -\s+Wikisource, the"): r"for Attorney General - Wikisource, the",
|
|
@@ -109,6 +117,8 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
|
109
117
|
re.compile(r"Subject:\s*Fwd: Trending Now: Friends for three decades"): "Subject: Fwd: Trending Now: Friends for three decades",
|
|
110
118
|
# Misc
|
|
111
119
|
'AVG°': 'AVGO',
|
|
120
|
+
'Saw Matt C with DTF at golf': 'Saw Matt C with DJT at golf',
|
|
121
|
+
re.compile(r"[i. ]*Privileged[- ]*Redacted[i. ]*"): '<PRIVILEGED - REDACTED>',
|
|
112
122
|
}
|
|
113
123
|
|
|
114
124
|
EMAIL_SIGNATURE_REGEXES = {
|
|
@@ -118,20 +128,28 @@ EMAIL_SIGNATURE_REGEXES = {
|
|
|
118
128
|
DANIEL_SIAD: re.compile(r"Confidentiality Notice: The information contained in this electronic message is PRIVILEGED and confidential information intended only for the use of the individual entity or entities named as recipient or recipients. If the reader is not the intended recipient, be hereby notified that any dissemination, distribution or copy of this communication is strictly prohibited. If you have received this communication in error, please notify me immediately by electronic mail or by telephone and permanently delete this message from your computer system. Thank you.".replace(' ', r'\s*'), re.IGNORECASE),
|
|
119
129
|
DANNY_FROST: re.compile(r"Danny Frost\nDirector.*\nManhattan District.*\n212.*", re.IGNORECASE),
|
|
120
130
|
DARREN_INDYKE: re.compile(r"DARREN K. INDYKE.*?\**\nThe information contained in this communication.*?Darren K.[\n\s]+?[Il]ndyke(, PLLC)? — All rights reserved\.? ?\n\*{50,120}(\n\**)?", re.DOTALL),
|
|
131
|
+
DAVID_FISZEL: re.compile(r"This e-mail and any file.*\nmail and/or any file.*\nmail or any.*\nreceived.*\nmisdirected.*"),
|
|
121
132
|
DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
|
|
122
133
|
DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
|
|
123
|
-
EDUARDO_ROBLES: re.compile(
|
|
134
|
+
EDUARDO_ROBLES: re.compile(r"(• )?email:.*\n(• )?email:\n(• )?website: www.creativekingdom.com\n(• )?address: 5th Floor Office No:504 Aspect Tower,\nBusiness Bay, Dubai United Arab Emirates."),
|
|
135
|
+
ERIC_ROTH: re.compile(r"2221 Smithtown Avenue\nLong Island.*\nRonkonkoma.*\n(.1. )?Phone\nFax\nCell\ne-mail"),
|
|
136
|
+
GHISLAINE_MAXWELL: re.compile(r"FACEBOOK\nTWITTER\nG\+\nPINTEREST\nINSTAGRAM\nPLEDGE\nTHE DAILY CATCH"),
|
|
124
137
|
JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
|
|
125
138
|
JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*(\nTel:.*)?(\nEmail:.*)?", re.IGNORECASE),
|
|
126
139
|
KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
|
|
127
140
|
LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
|
|
128
141
|
LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
|
|
142
|
+
LEON_BLACK: re.compile(r"This email and any files transmitted with it are confidential and intended solely.*\n(they|whom).*\ndissemination.*\nother.*\nand delete.*"),
|
|
143
|
+
LISA_NEW: re.compile(r"Elisa New\nPowell M. Cabot.*\n(Director.*\n)?Harvard.*\n148.*\n([1I] )?12.*\nCambridge.*\n([1I] )?02138"),
|
|
129
144
|
MARTIN_WEINBERG: re.compile(r"(Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*?)?(\n.*?([cC]ell|Office))*\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
|
|
145
|
+
MICHAEL_MILLER: re.compile(r"Michael C. Miller\nPartner\nwww.steptoe.com/mmiller\nSteptoe\n(Privileged.*\n)?(\+1\s+)?direct.*\n(\+1\s+)?(\+1\s+)?fax.*\n(\+1.*)?cell.*\n(www.steptoe.com\n)?This message and any.*\nyou are not.*\nnotify the sender.*"),
|
|
130
146
|
NICHOLAS_RIBIS: re.compile(r"60 Morris Turnpike 2FL\nSummit,? NJ.*\n0:\nF:\n\*{20,}\nCONFIDENTIALITY NOTICE.*\nattachments.*\ncopying.*\nIf you have.*\nthe copy.*\nThank.*\n\*{20,}"),
|
|
131
147
|
PETER_MANDELSON: re.compile(r'Disclaimer This email and any attachments to it may be.*?with[ \n]+number(.*?EC4V[ \n]+6BJ)?', re.DOTALL | re.IGNORECASE),
|
|
132
148
|
PAUL_BARRETT: re.compile(r"Paul Barrett[\n\s]+Alpha Group Capital LLC[\n\s]+(142 W 57th Street, 11th Floor, New York, NY 10019?[\n\s]+)?(al?[\n\s]*)?ALPHA GROUP[\n\s]+CAPITAL"),
|
|
149
|
+
PETER_ATTIA: re.compile(r"The information contained in this transmission may contain.*\n(laws|patient).*\n(distribution|named).*\n(distribution.*\nplease.*|copies.*)"),
|
|
133
150
|
RICHARD_KAHN: re.compile(fr'Richard Kahn[\n\s]+HBRK Associates Inc.?[\n\s]+((301 East 66th Street, Suite 1OF|575 Lexington Avenue,? 4th Floor,?)[\n\s]+)?New York, (NY|New York) 100(22|65)(\s+(Tel?|Phone)( I|{REDACTED})?\s+Fa[x",]?(_|{REDACTED})*\s+[Ce]el?l?)?', re.IGNORECASE),
|
|
134
151
|
ROSS_GOW: re.compile(r"Ross Gow\nManaging Partner\nACUITY Reputation Limited\n23 Berkeley Square\nLondon.*\nMobile.*\nTel"),
|
|
152
|
+
STEPHEN_HANSON: re.compile(r"(> )?Confidentiality Notice: This e-mail transmission.*\n(which it is addressed )?and may contain.*\n(applicable law. If you are not the intended )?recipient you are hereby.*\n(information contained in or attached to this transmission is )?STRICTLY PROHIBITED.*"),
|
|
135
153
|
STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
|
|
136
154
|
'Susan Edelman': re.compile(r'Susan Edel.*\nReporter\n1211.*\n917.*\nsedelman.*', re.IGNORECASE),
|
|
137
155
|
TERRY_KAFKA: re.compile(r"((>|I) )?Terry B.? Kafka.*\n(> )?Impact Outdoor.*\n(> )?5454.*\n(> )?Dallas.*\n((> )?c?ell.*\n)?(> )?Impactoutdoor.*(\n(> )?cell.*)?", re.IGNORECASE),
|
|
@@ -152,13 +170,19 @@ BCC_LISTS = JUNK_EMAILERS + MAILING_LISTS
|
|
|
152
170
|
TRUNCATE_EMAILS_FROM_OR_TO = [
|
|
153
171
|
AMANDA_ENS,
|
|
154
172
|
ANTHONY_BARRETT,
|
|
173
|
+
DANIEL_SABBA,
|
|
155
174
|
DIANE_ZIMAN,
|
|
156
175
|
JOSCHA_BACH,
|
|
157
176
|
KATHERINE_KEATING,
|
|
177
|
+
LAWRANCE_VISOSKI,
|
|
158
178
|
LAWRENCE_KRAUSS,
|
|
159
179
|
LISA_NEW,
|
|
180
|
+
MOSHE_HOFFMAN,
|
|
160
181
|
NILI_PRIELL_BARAK,
|
|
161
182
|
PAUL_KRASSNER,
|
|
183
|
+
PAUL_PROSPERI,
|
|
184
|
+
'Susan Edelman',
|
|
185
|
+
TERRY_KAFKA,
|
|
162
186
|
]
|
|
163
187
|
|
|
164
188
|
TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
|
|
@@ -170,6 +194,7 @@ TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
|
|
|
170
194
|
DAVID_HAIG,
|
|
171
195
|
EDWARD_ROD_LARSEN,
|
|
172
196
|
JOHNNY_EL_HACHEM,
|
|
197
|
+
'Mark Green',
|
|
173
198
|
MELANIE_WALKER,
|
|
174
199
|
'Mitchell Bard',
|
|
175
200
|
PEGGY_SIEGAL,
|
|
@@ -182,47 +207,12 @@ TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
|
|
|
182
207
|
TERRY_KAFKA,
|
|
183
208
|
]
|
|
184
209
|
|
|
185
|
-
# These IDs will be appended to INTERESTING_EMAIL_IDS
|
|
186
|
-
INTERESTING_TRUNCATION_LENGTHS = {
|
|
187
|
-
'023627': 16_800, # Micheal Wolff article with brock pierce
|
|
188
|
-
'030245': None, # Epstein rationalizes his behavior in an open letter to the world
|
|
189
|
-
'030781': None, # Bannon email about crypto coin issues
|
|
190
|
-
'032906': None, # David Blaine email
|
|
191
|
-
'026036': 6000, # Gino Yu blockchain mention
|
|
192
|
-
'029609': None, # Joi Ito
|
|
193
|
-
'025233': None, # Reputation.com discussion
|
|
194
|
-
'017827': None, # Bannon / Peggy Siegal email about netflix doc on Epstein
|
|
195
|
-
'030222': None, # Ross Gow / Ghislaine correspondence
|
|
196
|
-
'026028': None, # Larry Summers / Karim Wade intro
|
|
197
|
-
'029545': None, # Tyler Shears reputation
|
|
198
|
-
'025812': None, # Tyler Shears reputation
|
|
199
|
-
'029914': 4500, # Lord Mandelson russian investments
|
|
200
|
-
'033453': None, # "Just heard you were telling people that you heard I asked Trump for a million dollars"
|
|
201
|
-
'031320': None, # Epstein Gratitude foundation
|
|
202
|
-
'031036': None, # Barbro Ehnbom talking about Swedish girl
|
|
203
|
-
'023454': 1878, # Email invitation sent to tech CEOs + Epstein
|
|
204
|
-
'029342': 2000, # Hakeem Jeffries
|
|
205
|
-
}
|
|
206
|
-
|
|
207
|
-
TRUNCATION_LENGTHS = {
|
|
208
|
-
**INTERESTING_TRUNCATION_LENGTHS,
|
|
209
|
-
'031791': None, # First email in Jessica Cadwell chain about service of legal documents
|
|
210
|
-
'023208': None, # Long discussion about leon black's finances
|
|
211
|
-
'028589': None, # Long thread with Reid Weingarten
|
|
212
|
-
'029433': TRUNCATED_CHARS, # Kahn taxes
|
|
213
|
-
'026778': TRUNCATED_CHARS, # Kahn taxes
|
|
214
|
-
'033311': TRUNCATED_CHARS, # Kahn taxes
|
|
215
|
-
'024251': TRUNCATED_CHARS, # Kahn taxes
|
|
216
|
-
'026755': TRUNCATED_CHARS, # Epstein self fwd
|
|
217
|
-
}
|
|
218
|
-
|
|
219
210
|
# These are long forwarded articles so we force a trim to 1,333 chars if these strings exist
|
|
220
211
|
TRUNCATE_TERMS = [
|
|
221
212
|
'The rebuilding of Indonesia', # Vikcy ward article
|
|
222
|
-
'Dominique Strauss-Kahn',
|
|
223
|
-
'THOMAS L. FRIEDMAN',
|
|
224
213
|
'a sleek, briskly paced film whose title suggests a heist movie', # Inside Job
|
|
225
214
|
'Calendar of Major Events, Openings, and Fundraisers',
|
|
215
|
+
'sent over from Marshall Heyman at the WSJ',
|
|
226
216
|
"In recent months, China's BAT collapse",
|
|
227
217
|
'President Obama introduces Jim Yong Kim as his nominee',
|
|
228
218
|
'Trump appears with mobster-affiliated felon at New',
|
|
@@ -237,9 +227,11 @@ TRUNCATE_TERMS = [
|
|
|
237
227
|
'co-inventor of the GTX Smart Shoe',
|
|
238
228
|
'my latest Washington Post column',
|
|
239
229
|
# Bannon
|
|
230
|
+
'As Steve Bannon continues his tour of Europe',
|
|
240
231
|
"Bannon the European: He's opening the populist fort in Brussels",
|
|
241
232
|
"Steve Bannon doesn't do subtle.",
|
|
242
233
|
'The Department of Justice lost its latest battle with Congress',
|
|
234
|
+
'pedophile Jeffrey Epstein bought his way out',
|
|
243
235
|
# lawyers
|
|
244
236
|
'recuses itself from Jeffrey Epstein case',
|
|
245
237
|
# Misc
|
|
@@ -265,11 +257,23 @@ LINE_REPAIR_MERGES = {
|
|
|
265
257
|
'014397': [[4]] * 2,
|
|
266
258
|
'014860': [[3], [4], [4]],
|
|
267
259
|
'017523': [[4]],
|
|
260
|
+
'030367': [[1, 4], [2, 4]],
|
|
268
261
|
'019105': [[5]] * 4,
|
|
269
262
|
'019407': [[2, 4]],
|
|
263
|
+
'022187': [[1, 8], [2, 8], [3, 8], [4, 8]],
|
|
270
264
|
'021729': [[2]],
|
|
265
|
+
'032896': [[2]],
|
|
266
|
+
'033050': [[0, 6], [1, 6], [2, 6], [3, 6], [4, 6]],
|
|
267
|
+
'022949': [[0, 4], [1, 4]],
|
|
268
|
+
'022197': [[0, 5], [1, 5], [3, 5]],
|
|
269
|
+
'021814': [[1, 6], [2, 6], [3, 6], [4, 6]],
|
|
270
|
+
'022190': [[1, 7], [0, 6], [3, 6], [4, 6]],
|
|
271
|
+
'029582': [[0, 5], [1, 5], [3, 5], [3, 5]],
|
|
271
272
|
'022673': [[9]],
|
|
272
273
|
'022684': [[9]],
|
|
274
|
+
'026625': [[0, 7], [1, 7], [2, 7], [3, 7], [4, 7], [5, 7]],
|
|
275
|
+
'026659': [[0, 5], [1, 5]],
|
|
276
|
+
'026764': [[0, 6], [1, 6]],
|
|
273
277
|
'022695': [[4]],
|
|
274
278
|
'022977': [[9]] * 10,
|
|
275
279
|
'023001': [[5]] * 3,
|
|
@@ -278,11 +282,15 @@ LINE_REPAIR_MERGES = {
|
|
|
278
282
|
'025329': [[2]] * 9,
|
|
279
283
|
'025790': [[2]],
|
|
280
284
|
'025812': [[3]] * 2,
|
|
285
|
+
'025589': [[3]] * 12,
|
|
281
286
|
'026345': [[3]],
|
|
282
287
|
'026609': [[4]],
|
|
288
|
+
'028921': [[5, 4], [4, 5]],
|
|
289
|
+
'026620': ([[20]] * 4) + [[3, 2]] + ([[2]] * 15) + [[2, 4]],
|
|
283
290
|
'026829': [[3]],
|
|
284
291
|
'026924': [[2, 4]],
|
|
285
292
|
'028728': [[3]],
|
|
293
|
+
'026451': [[3, 5]] * 2,
|
|
286
294
|
'028931': [[3, 6]],
|
|
287
295
|
'029154': [[2, 5]],
|
|
288
296
|
'029163': [[2, 5]],
|
|
@@ -302,18 +310,22 @@ LINE_REPAIR_MERGES = {
|
|
|
302
310
|
'029977': ([[2]] * 4) + [[4], [2, 4]],
|
|
303
311
|
'030299': [[7, 10]],
|
|
304
312
|
'030315': [[3, 5]],
|
|
313
|
+
'030318': [[3, 5]],
|
|
305
314
|
'030381': [[2, 4]],
|
|
306
315
|
'030384': [[2, 4]],
|
|
307
316
|
'030626': [[2], [4]],
|
|
317
|
+
'030861': [[3, 8]],
|
|
308
318
|
'030999': [[2, 4]],
|
|
309
319
|
'031384': [[2]],
|
|
310
320
|
'031428': [[2], [2, 4]],
|
|
311
321
|
'031442': [[0]],
|
|
322
|
+
'031489': [[2, 4], [3, 4], [3, 4], [10]],
|
|
323
|
+
'031619': [[7], [17], [17]],
|
|
312
324
|
'031748': [[3]] * 2,
|
|
313
|
-
'031764': [[3]],
|
|
325
|
+
'031764': [[3], [8]], # 8 is just for style fix internally, not header
|
|
314
326
|
'031980': [[2, 4]],
|
|
315
327
|
'032063': [[3, 5]],
|
|
316
|
-
'032272': [[3]],
|
|
328
|
+
'032272': [[2, 10], [3]],
|
|
317
329
|
'032405': [[4]],
|
|
318
330
|
'032637': [[9]] * 3,
|
|
319
331
|
'033097': [[2]],
|
|
@@ -326,6 +338,8 @@ LINE_REPAIR_MERGES = {
|
|
|
326
338
|
'033357': [[2, 4]],
|
|
327
339
|
'033486': [[7, 9]],
|
|
328
340
|
'033512': [[2]],
|
|
341
|
+
'026024': [[1, 3], [2, 3]],
|
|
342
|
+
'024923': [[0, 5], [2]],
|
|
329
343
|
'033568': [[5]] * 5,
|
|
330
344
|
'033575': [[2, 4]],
|
|
331
345
|
'033576': [[3]],
|
|
@@ -344,12 +358,14 @@ class Email(Communication):
|
|
|
344
358
|
sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
|
|
345
359
|
signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
|
|
346
360
|
"""
|
|
361
|
+
attached_docs: list[OtherFile] = field(default_factory=list)
|
|
347
362
|
actual_text: str = field(init=False)
|
|
348
363
|
config: EmailCfg | None = None
|
|
349
364
|
header: EmailHeader = field(init=False)
|
|
350
365
|
recipients: list[Name] = field(default_factory=list)
|
|
351
366
|
sent_from_device: str | None = None
|
|
352
367
|
signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
|
|
368
|
+
_is_first_for_user: bool = False # Only set when printing
|
|
353
369
|
_line_merge_arguments: list[tuple[int] | tuple[int, int]] = field(default_factory=list)
|
|
354
370
|
|
|
355
371
|
# For logging how many headers we prettified while printing, kind of janky
|
|
@@ -389,6 +405,7 @@ class Email(Communication):
|
|
|
389
405
|
self.sent_from_device = self._sent_from_device()
|
|
390
406
|
|
|
391
407
|
def attachments(self) -> list[str]:
|
|
408
|
+
"""Returns the string in the header."""
|
|
392
409
|
return (self.header.attachments or '').split(';')
|
|
393
410
|
|
|
394
411
|
def info_txt(self) -> Text:
|
|
@@ -402,7 +419,12 @@ class Email(Communication):
|
|
|
402
419
|
return txt.append(highlighter(f" probably sent at {self.timestamp}"))
|
|
403
420
|
|
|
404
421
|
def is_fwded_article(self) -> bool:
|
|
405
|
-
|
|
422
|
+
if self.config is None:
|
|
423
|
+
return False
|
|
424
|
+
elif self.config.fwded_text_after:
|
|
425
|
+
return self.config.is_fwded_article is not False
|
|
426
|
+
else:
|
|
427
|
+
return bool(self.config.is_fwded_article)
|
|
406
428
|
|
|
407
429
|
def is_junk_mail(self) -> bool:
|
|
408
430
|
return self.author in JUNK_EMAILERS
|
|
@@ -413,9 +435,15 @@ class Email(Communication):
|
|
|
413
435
|
def is_note_to_self(self) -> bool:
|
|
414
436
|
return self.recipients == [self.author]
|
|
415
437
|
|
|
416
|
-
def
|
|
438
|
+
def is_from_or_to(self, name: str) -> bool:
|
|
417
439
|
return name in [self.author] + self.recipients
|
|
418
440
|
|
|
441
|
+
def is_word_count_worthy(self) -> bool:
|
|
442
|
+
if self.is_fwded_article():
|
|
443
|
+
return bool(self.config.fwded_text_after) or len(self.actual_text) < 150
|
|
444
|
+
else:
|
|
445
|
+
return not self.is_mailing_list()
|
|
446
|
+
|
|
419
447
|
def metadata(self) -> Metadata:
|
|
420
448
|
local_metadata = asdict(self)
|
|
421
449
|
local_metadata['is_junk_mail'] = self.is_junk_mail()
|
|
@@ -462,8 +490,9 @@ class Email(Communication):
|
|
|
462
490
|
elif self.header.num_header_rows == 0:
|
|
463
491
|
return self.text
|
|
464
492
|
|
|
493
|
+
# import pdb;pdb.set_trace()
|
|
465
494
|
self.log_top_lines(20, "Raw text:", logging.DEBUG)
|
|
466
|
-
self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
|
|
495
|
+
self.log(f"With {self.header.num_header_rows} header lines removed:\n{text[0:500]}\n\n", logging.DEBUG)
|
|
467
496
|
reply_text_match = REPLY_TEXT_REGEX.search(text)
|
|
468
497
|
|
|
469
498
|
if reply_text_match:
|
|
@@ -542,8 +571,8 @@ class Email(Communication):
|
|
|
542
571
|
logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
|
|
543
572
|
|
|
544
573
|
def _extract_timestamp(self) -> datetime:
|
|
545
|
-
if self.config and self.config.timestamp:
|
|
546
|
-
return self.config.timestamp
|
|
574
|
+
if self.config and self.config.timestamp():
|
|
575
|
+
return self.config.timestamp()
|
|
547
576
|
elif self.header.sent_at:
|
|
548
577
|
timestamp = _parse_timestamp(self.header.sent_at)
|
|
549
578
|
|
|
@@ -572,36 +601,41 @@ class Email(Communication):
|
|
|
572
601
|
logger.debug(f"Fell back to timestamp {timestamp} in line '{line}'...")
|
|
573
602
|
return timestamp
|
|
574
603
|
|
|
575
|
-
|
|
604
|
+
no_timestamp_msg = f"No timestamp found in '{self.file_path.name}'"
|
|
576
605
|
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
if text is None:
|
|
580
|
-
header_offset = len(self.header.header_chars)
|
|
581
|
-
text = self.text[header_offset:]
|
|
606
|
+
if self.is_duplicate():
|
|
607
|
+
logger.warning(f"{no_timestamp_msg} but timestamp should be copied from {self.duplicate_of_id()}")
|
|
582
608
|
else:
|
|
583
|
-
|
|
609
|
+
raise RuntimeError(f"{no_timestamp_msg}, top lines:\n{searchable_text}")
|
|
610
|
+
|
|
611
|
+
def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES) -> int | None:
|
|
612
|
+
"""Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
|
|
613
|
+
header_offset = len(self.header.header_chars)
|
|
614
|
+
text = self.text[header_offset:]
|
|
584
615
|
|
|
585
616
|
for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text)):
|
|
586
617
|
if i >= n:
|
|
587
618
|
return match.end() + header_offset - 1
|
|
588
619
|
|
|
589
|
-
def _merge_lines(self,
|
|
620
|
+
def _merge_lines(self, idx1: int, idx2: int | None = None) -> None:
|
|
590
621
|
"""Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
|
|
591
622
|
if idx2 is None:
|
|
592
|
-
self._line_merge_arguments.append((
|
|
593
|
-
idx2 =
|
|
623
|
+
self._line_merge_arguments.append((idx1,))
|
|
624
|
+
idx2 = idx1 + 1
|
|
594
625
|
else:
|
|
595
|
-
self._line_merge_arguments.append((
|
|
626
|
+
self._line_merge_arguments.append((idx1, idx2))
|
|
596
627
|
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
raise RuntimeError(f"idx2 ({idx2}) must be greater than idx ({
|
|
601
|
-
elif idx2 == (idx + 1):
|
|
602
|
-
lines += [self.lines[idx] + ' ' + self.lines[idx + 1]] + self.lines[idx + 2:]
|
|
628
|
+
if idx2 < idx1:
|
|
629
|
+
lines = self.lines[0:idx2] + self.lines[idx2 + 1:idx1] + [self.lines[idx1] + ' ' + self.lines[idx2]] + self.lines[idx1 + 1:]
|
|
630
|
+
elif idx2 == idx1:
|
|
631
|
+
raise RuntimeError(f"idx2 ({idx2}) must be greater or less than idx ({idx1})")
|
|
603
632
|
else:
|
|
604
|
-
lines
|
|
633
|
+
lines = self.lines[0:idx1]
|
|
634
|
+
|
|
635
|
+
if idx2 == (idx1 + 1):
|
|
636
|
+
lines += [self.lines[idx1] + ' ' + self.lines[idx1 + 1]] + self.lines[idx1 + 2:]
|
|
637
|
+
else:
|
|
638
|
+
lines += [self.lines[idx1] + ' ' + self.lines[idx2]] + self.lines[idx1 + 1:idx2] + self.lines[idx2 + 1:]
|
|
605
639
|
|
|
606
640
|
self._set_computed_fields(lines=lines)
|
|
607
641
|
|
|
@@ -617,6 +651,10 @@ class Email(Communication):
|
|
|
617
651
|
self.signature_substitution_counts[name] = self.signature_substitution_counts.get(name, 0)
|
|
618
652
|
self.signature_substitution_counts[name] += num_replaced
|
|
619
653
|
|
|
654
|
+
# Share / Tweet lines
|
|
655
|
+
if self.author == KATHRYN_RUEMMLER:
|
|
656
|
+
text = '\n'.join([l for l in text.split('\n') if l not in ['Share', 'Tweet', 'Bookmark it']])
|
|
657
|
+
|
|
620
658
|
return collapse_newlines(text).strip()
|
|
621
659
|
|
|
622
660
|
def _remove_line(self, idx: int) -> None:
|
|
@@ -657,17 +695,21 @@ class Email(Communication):
|
|
|
657
695
|
self.log_top_lines(12, 'Result of modifications')
|
|
658
696
|
|
|
659
697
|
lines = self.repair_ocr_text(OCR_REPAIRS, self.text).split('\n')
|
|
698
|
+
subject_line = next((line for line in lines if line.startswith('Subject:')), None) or ''
|
|
699
|
+
subject = subject_line.split(':')[1].strip() if subject_line else ''
|
|
660
700
|
new_lines = []
|
|
661
701
|
i = 0
|
|
662
702
|
|
|
663
|
-
# Fix links (remove spaces, merge multiline links to a single line)
|
|
703
|
+
# Fix links and quoted subjects (remove spaces, merge multiline links to a single line)
|
|
664
704
|
while i < len(lines):
|
|
665
705
|
line = lines[i]
|
|
666
706
|
|
|
667
707
|
if LINK_LINE_REGEX.search(line):
|
|
668
708
|
while i < (len(lines) - 1) \
|
|
669
|
-
and
|
|
670
|
-
and (lines[i + 1].endswith('/')
|
|
709
|
+
and not lines[i + 1].startswith('htt') \
|
|
710
|
+
and (lines[i + 1].endswith('/') \
|
|
711
|
+
or any(s in lines[i + 1] for s in URL_SIGNIFIERS) \
|
|
712
|
+
or LINK_LINE2_REGEX.match(lines[i + 1])):
|
|
671
713
|
logger.debug(f"{self.filename}: Joining link lines\n 1. {line}\n 2. {lines[i + 1]}\n")
|
|
672
714
|
line += lines[i + 1]
|
|
673
715
|
i += 1
|
|
@@ -676,6 +718,17 @@ class Email(Communication):
|
|
|
676
718
|
elif ' http' in line and line.endswith('html'):
|
|
677
719
|
pre_link, post_link = line.split(' http', 1)
|
|
678
720
|
line = f"{pre_link} http{post_link.replace(' ', '')}"
|
|
721
|
+
elif line.startswith('Subject:') and i < (len(lines) - 2) and len(line) >= 40:
|
|
722
|
+
next_line = lines[i + 1]
|
|
723
|
+
next_next = lines[i + 2]
|
|
724
|
+
|
|
725
|
+
if len(next_line) <= 1 or any([cont in next_line for cont in BAD_SUBJECT_CONTINUATIONS]):
|
|
726
|
+
pass
|
|
727
|
+
elif (subject.endswith(next_line) and next_line != subject) \
|
|
728
|
+
or (FIELDS_COLON_REGEX.search(next_next) and not FIELDS_COLON_REGEX.search(next_line)):
|
|
729
|
+
self.warn(f"Fixing broken subject line\n line: '{line}'\n next: '{next_line}'\n next: '{next_next}'\nsubject='{subject}'\n")
|
|
730
|
+
line += f" {next_line}"
|
|
731
|
+
i += 1
|
|
679
732
|
|
|
680
733
|
new_lines.append(line)
|
|
681
734
|
|
|
@@ -699,7 +752,7 @@ class Email(Communication):
|
|
|
699
752
|
"""Copy info from original config for file this document was extracted from."""
|
|
700
753
|
if self.file_id in ALL_FILE_CONFIGS:
|
|
701
754
|
self.config = cast(EmailCfg, deepcopy(ALL_FILE_CONFIGS[self.file_id]))
|
|
702
|
-
self.
|
|
755
|
+
self.log(f"Merging existing cfg for '{self.file_id}' with cfg for extracted document...")
|
|
703
756
|
else:
|
|
704
757
|
self.config = EmailCfg(id=self.file_id)
|
|
705
758
|
|
|
@@ -721,34 +774,58 @@ class Email(Communication):
|
|
|
721
774
|
|
|
722
775
|
def _truncate_to_length(self) -> int:
|
|
723
776
|
"""When printing truncate this email to this length."""
|
|
724
|
-
quote_cutoff = self._idx_of_nth_quoted_reply(
|
|
777
|
+
quote_cutoff = self._idx_of_nth_quoted_reply() # Trim if there's many quoted replies
|
|
725
778
|
includes_truncate_term = next((term for term in TRUNCATE_TERMS if term in self.text), None)
|
|
726
779
|
|
|
727
780
|
if args.whole_file:
|
|
728
781
|
num_chars = len(self.text)
|
|
729
782
|
elif args.truncate:
|
|
730
783
|
num_chars = args.truncate
|
|
731
|
-
elif self.
|
|
732
|
-
num_chars =
|
|
733
|
-
elif self.
|
|
784
|
+
elif self.config and self.config.truncate_to is not None:
|
|
785
|
+
num_chars = len(self.text) if self.config.truncate_to == NO_TRUNCATE else self.config.truncate_to
|
|
786
|
+
elif self.is_interesting():
|
|
787
|
+
num_chars = len(self.text)
|
|
788
|
+
elif self.author in TRUNCATE_EMAILS_FROM \
|
|
789
|
+
or any([self.is_from_or_to(n) for n in TRUNCATE_EMAILS_FROM_OR_TO]) \
|
|
790
|
+
or self.is_fwded_article() \
|
|
791
|
+
or includes_truncate_term:
|
|
734
792
|
num_chars = min(quote_cutoff or MAX_CHARS_TO_PRINT, TRUNCATED_CHARS)
|
|
735
|
-
elif quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
|
|
736
|
-
num_chars = quote_cutoff
|
|
737
793
|
else:
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
794
|
+
if quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
|
|
795
|
+
trimmed_words = self.text[quote_cutoff:].split()
|
|
796
|
+
|
|
797
|
+
if '<...snipped' in trimmed_words[:NUM_WORDS_IN_LAST_QUOTE]:
|
|
798
|
+
num_trailing_words = 0
|
|
799
|
+
elif trimmed_words and trimmed_words[0] in ['From:', 'Sent:']:
|
|
800
|
+
num_trailing_words = NUM_WORDS_IN_LAST_QUOTE
|
|
801
|
+
else:
|
|
802
|
+
num_trailing_words = NUM_WORDS_IN_LAST_QUOTE
|
|
803
|
+
|
|
804
|
+
if trimmed_words:
|
|
805
|
+
last_quoted_text = ' '.join(trimmed_words[:num_trailing_words])
|
|
806
|
+
num_chars = quote_cutoff + len(last_quoted_text) + 1 # Give a hint of the next line
|
|
807
|
+
else:
|
|
808
|
+
num_chars = quote_cutoff
|
|
809
|
+
else:
|
|
810
|
+
num_chars = min(self.file_size(), MAX_CHARS_TO_PRINT)
|
|
811
|
+
|
|
812
|
+
# Always print whole email for 1st email for user
|
|
813
|
+
if self._is_first_for_user and num_chars < self.file_size() and not self.is_duplicate():
|
|
814
|
+
logger.info(f"{self} Overriding cutoff {num_chars} for first email")
|
|
815
|
+
num_chars = self.file_size()
|
|
816
|
+
|
|
817
|
+
log_args = {
|
|
818
|
+
'num_chars': num_chars,
|
|
819
|
+
'_is_first_for_user': self._is_first_for_user,
|
|
820
|
+
'author_truncate': self.author in TRUNCATE_EMAILS_FROM,
|
|
821
|
+
'is_fwded_article': self.is_fwded_article(),
|
|
822
|
+
'is_quote_cutoff': quote_cutoff == num_chars,
|
|
823
|
+
'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
|
|
824
|
+
'quote_cutoff': quote_cutoff,
|
|
825
|
+
}
|
|
826
|
+
|
|
827
|
+
log_args_str = ', '.join([f"{k}={v}" for k, v in log_args.items() if v])
|
|
828
|
+
logger.debug(f"Truncate determination: {log_args_str}")
|
|
752
829
|
return num_chars
|
|
753
830
|
|
|
754
831
|
def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
|
|
@@ -762,7 +839,7 @@ class Email(Communication):
|
|
|
762
839
|
if len(text) > num_chars:
|
|
763
840
|
text = text[0:num_chars]
|
|
764
841
|
doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style())
|
|
765
|
-
trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
|
|
842
|
+
trim_note = f"<...trimmed to {num_chars:,} characters of {self.length():,}, read the rest at {doc_link_markup}...>"
|
|
766
843
|
trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
|
|
767
844
|
|
|
768
845
|
# Rewrite broken headers where the values are on separate lines from the field names
|
|
@@ -789,7 +866,7 @@ class Email(Communication):
|
|
|
789
866
|
text = join_texts(lines, '\n')
|
|
790
867
|
|
|
791
868
|
email_txt_panel = Panel(
|
|
792
|
-
highlighter(text).append('
|
|
869
|
+
highlighter(text).append('...\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
|
|
793
870
|
border_style=self._border_style(),
|
|
794
871
|
expand=False,
|
|
795
872
|
subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
|
|
@@ -798,6 +875,11 @@ class Email(Communication):
|
|
|
798
875
|
yield self.file_info_panel()
|
|
799
876
|
yield Padding(email_txt_panel, (0, 0, 1, INFO_INDENT))
|
|
800
877
|
|
|
878
|
+
if self.attached_docs:
|
|
879
|
+
attachments_table_title = f" {self.url_slug} Email Attachments:"
|
|
880
|
+
attachments_table = OtherFile.files_preview_table(self.attached_docs, title=attachments_table_title)
|
|
881
|
+
yield Padding(attachments_table, (0, 0, 1, 12))
|
|
882
|
+
|
|
801
883
|
if should_rewrite_header:
|
|
802
884
|
self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
|
|
803
885
|
|
|
@@ -2,7 +2,7 @@ import json
|
|
|
2
2
|
import re
|
|
3
3
|
from dataclasses import asdict, dataclass, field
|
|
4
4
|
|
|
5
|
-
from epstein_files.util.constant.strings import AUTHOR, REDACTED
|
|
5
|
+
from epstein_files.util.constant.strings import AUTHOR, REDACTED, indented
|
|
6
6
|
from epstein_files.util.constants import ALL_CONFIGS
|
|
7
7
|
from epstein_files.util.doc_cfg import EmailCfg
|
|
8
8
|
from epstein_files.util.logging import logger
|
|
@@ -13,7 +13,10 @@ ON_BEHALF_OF = 'on behalf of'
|
|
|
13
13
|
TO_FIELDS = ['bcc', 'cc', 'to']
|
|
14
14
|
EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
|
|
15
15
|
|
|
16
|
-
|
|
16
|
+
FIELD_PATTERNS = ['Date', 'From', 'Sent', 'To', r"C[cC]", r"B[cC][cC]", 'Importance', 'Subject', 'Attachments', 'Classification', 'Flag', 'Reply-To']
|
|
17
|
+
FIELDS_PATTERN = '|'.join(FIELD_PATTERNS)
|
|
18
|
+
FIELDS_COLON_PATTERN = fr"^({FIELDS_PATTERN}):"
|
|
19
|
+
HEADER_REGEX_STR = fr"(((?:(?:{FIELDS_PATTERN}|Bee):|on behalf of ?)(?! +(by |from my|via )).*\n){{3,}})"
|
|
17
20
|
EMAIL_SIMPLE_HEADER_REGEX = re.compile(rf'^{HEADER_REGEX_STR}')
|
|
18
21
|
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX = re.compile(HEADER_REGEX_STR)
|
|
19
22
|
EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTALL) # Match up to the next email header section
|
|
@@ -53,6 +56,7 @@ class EmailHeader:
|
|
|
53
56
|
importance: str | None = None
|
|
54
57
|
attachments: str | None = None
|
|
55
58
|
to: list[str] | None = None
|
|
59
|
+
reply_to: str | None = None
|
|
56
60
|
|
|
57
61
|
def __post_init__(self):
|
|
58
62
|
self.num_header_rows = len(self.field_names)
|
|
@@ -95,13 +99,10 @@ class EmailHeader:
|
|
|
95
99
|
logger.info(f"{log_prefix}, trying next line...")
|
|
96
100
|
num_headers += 1
|
|
97
101
|
value = email_lines[i + num_headers]
|
|
98
|
-
elif BAD_EMAILER_REGEX.match(value):
|
|
102
|
+
elif BAD_EMAILER_REGEX.match(value) or value.startswith('http'):
|
|
99
103
|
logger.info(f"{log_prefix}, decrementing num_headers and skipping...")
|
|
100
104
|
num_headers -= 1
|
|
101
105
|
continue
|
|
102
|
-
elif value.startswith('http'):
|
|
103
|
-
logger.info(f"{log_prefix}, using empty string instead...")
|
|
104
|
-
value = ''
|
|
105
106
|
|
|
106
107
|
value = [v.strip() for v in value.split(';') if len(v.strip()) > 0]
|
|
107
108
|
|
|
@@ -110,7 +111,12 @@ class EmailHeader:
|
|
|
110
111
|
self.num_header_rows = len(self.field_names) + num_headers
|
|
111
112
|
self.header_chars = '\n'.join(email_lines[0:self.num_header_rows])
|
|
112
113
|
log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
|
|
113
|
-
|
|
114
|
+
|
|
115
|
+
logger.warning(
|
|
116
|
+
f"{log_msg}{self}\n\n[top lines]:\n\n%s\n\n[body_lines]:\n\n%s\n\n",
|
|
117
|
+
indented('\n'.join(email_lines[0:(num_headers + 1) * 2]), prefix='> '),
|
|
118
|
+
indented('\n'.join(email_lines[self.num_header_rows:self.num_header_rows + 5]), prefix='> '),
|
|
119
|
+
)
|
|
114
120
|
|
|
115
121
|
def rewrite_header(self) -> str:
|
|
116
122
|
header_fields = {}
|
|
@@ -151,7 +157,7 @@ class EmailHeader:
|
|
|
151
157
|
#logger.debug(f"extracting header line: '{line}'")
|
|
152
158
|
key, value = [element.strip() for element in line.split(':', 1)]
|
|
153
159
|
value = value.rstrip('_')
|
|
154
|
-
key = AUTHOR if key == 'From' else ('sent_at' if key in ['Date', 'Sent'] else key.lower())
|
|
160
|
+
key = AUTHOR if key == 'From' else ('sent_at' if key in ['Date', 'Sent'] else key.lower().replace('-', '_'))
|
|
155
161
|
key = 'bcc' if key == 'bee' else key
|
|
156
162
|
|
|
157
163
|
if kw_args.get(key):
|
|
@@ -161,6 +167,9 @@ class EmailHeader:
|
|
|
161
167
|
|
|
162
168
|
field_names.append(key)
|
|
163
169
|
|
|
170
|
+
if key == 'reply_to':
|
|
171
|
+
logger.warning(f"Found value for Reply-To field: '{value}'")
|
|
172
|
+
|
|
164
173
|
if key in TO_FIELDS:
|
|
165
174
|
recipients = [element.strip() for element in value.split(';')]
|
|
166
175
|
recipients = [r for r in recipients if len(r) > 0]
|