epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +55 -23
- epstein_files/documents/communication.py +9 -5
- epstein_files/documents/document.py +231 -135
- epstein_files/documents/doj_file.py +242 -0
- epstein_files/documents/doj_files/full_text.py +166 -0
- epstein_files/documents/email.py +289 -232
- epstein_files/documents/emails/email_header.py +35 -16
- epstein_files/documents/emails/emailers.py +223 -0
- epstein_files/documents/imessage/text_message.py +2 -3
- epstein_files/documents/json_file.py +18 -14
- epstein_files/documents/messenger_log.py +23 -39
- epstein_files/documents/other_file.py +54 -48
- epstein_files/epstein_files.py +65 -29
- epstein_files/person.py +151 -94
- epstein_files/util/constant/names.py +37 -10
- epstein_files/util/constant/output_files.py +2 -0
- epstein_files/util/constant/strings.py +14 -7
- epstein_files/util/constant/urls.py +17 -0
- epstein_files/util/constants.py +556 -391
- epstein_files/util/data.py +2 -0
- epstein_files/util/doc_cfg.py +44 -33
- epstein_files/util/env.py +34 -19
- epstein_files/util/file_helper.py +30 -6
- epstein_files/util/helpers/debugging_helper.py +13 -0
- epstein_files/util/helpers/env_helpers.py +21 -0
- epstein_files/util/highlighted_group.py +121 -37
- epstein_files/util/layout/left_bar_panel.py +26 -0
- epstein_files/util/logging.py +28 -13
- epstein_files/util/output.py +49 -40
- epstein_files/util/rich.py +30 -3
- epstein_files/util/word_count.py +7 -7
- {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/METADATA +16 -3
- epstein_files-1.5.0.dist-info/RECORD +40 -0
- {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +1 -1
- epstein_files-1.2.5.dist-info/RECORD +0 -34
- {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
- {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
epstein_files/documents/email.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
3
|
import re
|
|
4
|
-
from collections import defaultdict
|
|
5
4
|
from copy import deepcopy
|
|
6
5
|
from dataclasses import asdict, dataclass, field
|
|
7
6
|
from datetime import datetime
|
|
@@ -16,13 +15,14 @@ from rich.text import Text
|
|
|
16
15
|
|
|
17
16
|
from epstein_files.documents.communication import Communication
|
|
18
17
|
from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, INFO_INDENT
|
|
19
|
-
from epstein_files.documents.emails.email_header import (
|
|
20
|
-
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES,
|
|
18
|
+
from epstein_files.documents.emails.email_header import (EMAIL_SIMPLE_HEADER_REGEX,
|
|
19
|
+
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, FIELDS_COLON_PATTERN, EmailHeader)
|
|
20
|
+
from epstein_files.documents.emails.emailers import extract_emailer_names
|
|
21
|
+
from epstein_files.documents.other_file import OtherFile
|
|
21
22
|
from epstein_files.util.constant.names import *
|
|
22
23
|
from epstein_files.util.constant.strings import REDACTED
|
|
23
24
|
from epstein_files.util.constants import *
|
|
24
|
-
from epstein_files.util.data import
|
|
25
|
-
flatten, listify, remove_timezone, uniquify)
|
|
25
|
+
from epstein_files.util.data import AMERICAN_TIME_REGEX, TIMEZONE_INFO, collapse_newlines, remove_timezone
|
|
26
26
|
from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
27
27
|
from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
|
|
28
28
|
from epstein_files.util.highlighted_group import JUNK_EMAILERS, get_style_for_name
|
|
@@ -30,9 +30,10 @@ from epstein_files.util.logging import logger
|
|
|
30
30
|
from epstein_files.util.rich import *
|
|
31
31
|
|
|
32
32
|
BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
|
|
33
|
-
BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|
|
|
34
|
-
|
|
35
|
-
|
|
33
|
+
BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Hide caption|Importance:?\s*High|[iI,•]|[1i] (_ )?[il]|, [-,]|L\._|_filtered|.*(yiv0232|font-family:|margin-bottom:).*)$')
|
|
34
|
+
BAD_SUBJECT_CONTINUATIONS = ['orwarded', 'Hi ', 'Sent ', 'AmLaw', 'Original Message', 'Privileged', 'Sorry', '---']
|
|
35
|
+
FIELDS_COLON_REGEX = re.compile(FIELDS_COLON_PATTERN)
|
|
36
|
+
LINK_LINE_REGEX = re.compile(f"^[>• ]*htt")
|
|
36
37
|
LINK_LINE2_REGEX = re.compile(r"^[-\w.%&=/]{5,}$")
|
|
37
38
|
QUOTED_REPLY_LINE_REGEX = re.compile(r'(\nFrom:(.*)|wrote:)\n', re.IGNORECASE)
|
|
38
39
|
REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
|
|
@@ -42,15 +43,13 @@ DATE_HEADER_REGEX = re.compile(r'(?:Date|Sent):? +(?!by|from|to|via)([^\n]{6,})\
|
|
|
42
43
|
TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")
|
|
43
44
|
LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
|
|
44
45
|
|
|
45
|
-
SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
|
|
46
46
|
REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
|
|
47
|
-
URL_SIGNIFIERS = ['amp?', 'cd=', 'click', 'ft=', 'gclid', 'htm', 'keywords=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'usg=', 'utm']
|
|
47
|
+
URL_SIGNIFIERS = ['?amp', 'amp?', 'cd=', 'click', 'CMP=', 'contentId', 'ft=', 'gclid', 'htm', 'mp=', 'keywords=', 'Id=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'sp=', 'usg=', 'utm']
|
|
48
48
|
APPEARS_IN = 'appears in'
|
|
49
49
|
|
|
50
50
|
MAX_NUM_HEADER_LINES = 14
|
|
51
|
-
MAX_QUOTED_REPLIES =
|
|
52
|
-
|
|
53
|
-
TRUNCATED_CHARS = int(MAX_CHARS_TO_PRINT / 3)
|
|
51
|
+
MAX_QUOTED_REPLIES = 1
|
|
52
|
+
NUM_WORDS_IN_LAST_QUOTE = 6
|
|
54
53
|
|
|
55
54
|
REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
|
|
56
55
|
'********************************',
|
|
@@ -88,7 +87,13 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
|
88
87
|
re.compile(r'^INW$', re.MULTILINE): REDACTED,
|
|
89
88
|
# links
|
|
90
89
|
'Imps ://': 'https://',
|
|
90
|
+
'on-accusers-rose-\nmcgowan/ ': 'on-accusers-rose-\nmcgowan/\n',
|
|
91
|
+
'the-truth-\nabout-the-bitcoin-foundation/ )': 'the-truth-about-the-bitcoin-foundation/ )\n',
|
|
92
|
+
'woody-allen-jeffrey-epsteins-\nsociety-friends-close-ranks/ ---': 'woody-allen-jeffrey-epsteins-society-friends-close_ranks/\n',
|
|
93
|
+
' https://www.theguardian.com/world/2017/may/29/close-friend-trump-thomas-barrack-\nalleged-tax-evasion-italy-sardinia?CMP=share btn fb': '\nhttps://www.theguardian.com/world/2017/may/29/close-friend-trump-thomas-barrack-alleged-tax-evasion-italy-sardinia?CMP=share_btn_fb',
|
|
91
94
|
re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
|
|
95
|
+
re.compile(r" http ?://www. ?dailymail. ?co ?.uk/news/article-\d+/Troub ?led-woman-history-drug-\n?us ?e-\n?.*html"): '\nhttp://www.dailymail.co.uk/news/article-3914012/Troubled-woman-history-drug-use-claimed-assaulted-Donald-Trump-Jeffrey-Epstein-sex-party-age-13-FABRICATED-story.html',
|
|
96
|
+
re.compile(r"http.*steve-bannon-trump-tower-\n?interview-\n?trumps-\n?strategist-plots-\n?new-political-movement-948747"): "\nhttp://www.hollywoodreporter.com/news/steve-bannon-trump-tower-interview-trumps-strategist-plots-new-political-movement-948747",
|
|
92
97
|
# Subject lines
|
|
93
98
|
"Arrested in\nInauguration Day Riot": "Arrested in Inauguration Day Riot",
|
|
94
99
|
"as Putin Mayhem Tests President's Grip\non GOP": "as Putin Mayhem Tests President's Grip on GOP",
|
|
@@ -99,6 +104,8 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
|
99
104
|
"COVER UP SEX ABUSE CRIMES\nBY THE WHITE HOUSE": "COVER UP SEX ABUSE CRIMES BY THE WHITE HOUSE",
|
|
100
105
|
'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
|
|
101
106
|
"War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
|
|
107
|
+
"Subject; RE": "Subject: RE",
|
|
108
|
+
"straining relations between UK and\nAmerica": "straining relations between UK and America",
|
|
102
109
|
re.compile(r"deadline re Mr Bradley Edwards vs Mr\s*Jeffrey Epstein", re.I): "deadline re Mr Bradley Edwards vs Mr Jeffrey Epstein",
|
|
103
110
|
re.compile(r"Following Plea That Implicated Trump -\s*https://www.npr.org/676040070", re.I): "Following Plea That Implicated Trump - https://www.npr.org/676040070",
|
|
104
111
|
re.compile(r"for Attorney General -\s+Wikisource, the"): r"for Attorney General - Wikisource, the",
|
|
@@ -109,6 +116,8 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
|
109
116
|
re.compile(r"Subject:\s*Fwd: Trending Now: Friends for three decades"): "Subject: Fwd: Trending Now: Friends for three decades",
|
|
110
117
|
# Misc
|
|
111
118
|
'AVG°': 'AVGO',
|
|
119
|
+
'Saw Matt C with DTF at golf': 'Saw Matt C with DJT at golf',
|
|
120
|
+
re.compile(r"[i. ]*Privileged[- ]*Redacted[i. ]*"): '<PRIVILEGED - REDACTED>',
|
|
112
121
|
}
|
|
113
122
|
|
|
114
123
|
EMAIL_SIGNATURE_REGEXES = {
|
|
@@ -118,20 +127,28 @@ EMAIL_SIGNATURE_REGEXES = {
|
|
|
118
127
|
DANIEL_SIAD: re.compile(r"Confidentiality Notice: The information contained in this electronic message is PRIVILEGED and confidential information intended only for the use of the individual entity or entities named as recipient or recipients. If the reader is not the intended recipient, be hereby notified that any dissemination, distribution or copy of this communication is strictly prohibited. If you have received this communication in error, please notify me immediately by electronic mail or by telephone and permanently delete this message from your computer system. Thank you.".replace(' ', r'\s*'), re.IGNORECASE),
|
|
119
128
|
DANNY_FROST: re.compile(r"Danny Frost\nDirector.*\nManhattan District.*\n212.*", re.IGNORECASE),
|
|
120
129
|
DARREN_INDYKE: re.compile(r"DARREN K. INDYKE.*?\**\nThe information contained in this communication.*?Darren K.[\n\s]+?[Il]ndyke(, PLLC)? — All rights reserved\.? ?\n\*{50,120}(\n\**)?", re.DOTALL),
|
|
130
|
+
DAVID_FISZEL: re.compile(r"This e-mail and any file.*\nmail and/or any file.*\nmail or any.*\nreceived.*\nmisdirected.*"),
|
|
121
131
|
DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
|
|
122
132
|
DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
|
|
123
|
-
EDUARDO_ROBLES: re.compile(
|
|
133
|
+
EDUARDO_ROBLES: re.compile(r"(• )?email:.*\n(• )?email:\n(• )?website: www.creativekingdom.com\n(• )?address: 5th Floor Office No:504 Aspect Tower,\nBusiness Bay, Dubai United Arab Emirates."),
|
|
134
|
+
ERIC_ROTH: re.compile(r"2221 Smithtown Avenue\nLong Island.*\nRonkonkoma.*\n(.1. )?Phone\nFax\nCell\ne-mail"),
|
|
135
|
+
GHISLAINE_MAXWELL: re.compile(r"FACEBOOK\nTWITTER\nG\+\nPINTEREST\nINSTAGRAM\nPLEDGE\nTHE DAILY CATCH"),
|
|
124
136
|
JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
|
|
125
137
|
JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*(\nTel:.*)?(\nEmail:.*)?", re.IGNORECASE),
|
|
126
138
|
KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
|
|
127
139
|
LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
|
|
128
140
|
LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
|
|
141
|
+
LEON_BLACK: re.compile(r"This email and any files transmitted with it are confidential and intended solely.*\n(they|whom).*\ndissemination.*\nother.*\nand delete.*"),
|
|
142
|
+
LISA_NEW: re.compile(r"Elisa New\nPowell M. Cabot.*\n(Director.*\n)?Harvard.*\n148.*\n([1I] )?12.*\nCambridge.*\n([1I] )?02138"),
|
|
129
143
|
MARTIN_WEINBERG: re.compile(r"(Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*?)?(\n.*?([cC]ell|Office))*\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
|
|
144
|
+
MICHAEL_MILLER: re.compile(r"Michael C. Miller\nPartner\nwww.steptoe.com/mmiller\nSteptoe\n(Privileged.*\n)?(\+1\s+)?direct.*\n(\+1\s+)?(\+1\s+)?fax.*\n(\+1.*)?cell.*\n(www.steptoe.com\n)?This message and any.*\nyou are not.*\nnotify the sender.*"),
|
|
130
145
|
NICHOLAS_RIBIS: re.compile(r"60 Morris Turnpike 2FL\nSummit,? NJ.*\n0:\nF:\n\*{20,}\nCONFIDENTIALITY NOTICE.*\nattachments.*\ncopying.*\nIf you have.*\nthe copy.*\nThank.*\n\*{20,}"),
|
|
131
146
|
PETER_MANDELSON: re.compile(r'Disclaimer This email and any attachments to it may be.*?with[ \n]+number(.*?EC4V[ \n]+6BJ)?', re.DOTALL | re.IGNORECASE),
|
|
132
147
|
PAUL_BARRETT: re.compile(r"Paul Barrett[\n\s]+Alpha Group Capital LLC[\n\s]+(142 W 57th Street, 11th Floor, New York, NY 10019?[\n\s]+)?(al?[\n\s]*)?ALPHA GROUP[\n\s]+CAPITAL"),
|
|
148
|
+
PETER_ATTIA: re.compile(r"The information contained in this transmission may contain.*\n(laws|patient).*\n(distribution|named).*\n(distribution.*\nplease.*|copies.*)"),
|
|
133
149
|
RICHARD_KAHN: re.compile(fr'Richard Kahn[\n\s]+HBRK Associates Inc.?[\n\s]+((301 East 66th Street, Suite 1OF|575 Lexington Avenue,? 4th Floor,?)[\n\s]+)?New York, (NY|New York) 100(22|65)(\s+(Tel?|Phone)( I|{REDACTED})?\s+Fa[x",]?(_|{REDACTED})*\s+[Ce]el?l?)?', re.IGNORECASE),
|
|
134
150
|
ROSS_GOW: re.compile(r"Ross Gow\nManaging Partner\nACUITY Reputation Limited\n23 Berkeley Square\nLondon.*\nMobile.*\nTel"),
|
|
151
|
+
STEPHEN_HANSON: re.compile(r"(> )?Confidentiality Notice: This e-mail transmission.*\n(which it is addressed )?and may contain.*\n(applicable law. If you are not the intended )?recipient you are hereby.*\n(information contained in or attached to this transmission is )?STRICTLY PROHIBITED.*"),
|
|
135
152
|
STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
|
|
136
153
|
'Susan Edelman': re.compile(r'Susan Edel.*\nReporter\n1211.*\n917.*\nsedelman.*', re.IGNORECASE),
|
|
137
154
|
TERRY_KAFKA: re.compile(r"((>|I) )?Terry B.? Kafka.*\n(> )?Impact Outdoor.*\n(> )?5454.*\n(> )?Dallas.*\n((> )?c?ell.*\n)?(> )?Impactoutdoor.*(\n(> )?cell.*)?", re.IGNORECASE),
|
|
@@ -152,13 +169,19 @@ BCC_LISTS = JUNK_EMAILERS + MAILING_LISTS
|
|
|
152
169
|
TRUNCATE_EMAILS_FROM_OR_TO = [
|
|
153
170
|
AMANDA_ENS,
|
|
154
171
|
ANTHONY_BARRETT,
|
|
172
|
+
DANIEL_SABBA,
|
|
155
173
|
DIANE_ZIMAN,
|
|
156
174
|
JOSCHA_BACH,
|
|
157
175
|
KATHERINE_KEATING,
|
|
176
|
+
LAWRANCE_VISOSKI,
|
|
158
177
|
LAWRENCE_KRAUSS,
|
|
159
178
|
LISA_NEW,
|
|
179
|
+
MOSHE_HOFFMAN,
|
|
160
180
|
NILI_PRIELL_BARAK,
|
|
161
181
|
PAUL_KRASSNER,
|
|
182
|
+
PAUL_PROSPERI,
|
|
183
|
+
'Susan Edelman',
|
|
184
|
+
TERRY_KAFKA,
|
|
162
185
|
]
|
|
163
186
|
|
|
164
187
|
TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
|
|
@@ -170,6 +193,7 @@ TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
|
|
|
170
193
|
DAVID_HAIG,
|
|
171
194
|
EDWARD_ROD_LARSEN,
|
|
172
195
|
JOHNNY_EL_HACHEM,
|
|
196
|
+
'Mark Green',
|
|
173
197
|
MELANIE_WALKER,
|
|
174
198
|
'Mitchell Bard',
|
|
175
199
|
PEGGY_SIEGAL,
|
|
@@ -182,47 +206,12 @@ TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
|
|
|
182
206
|
TERRY_KAFKA,
|
|
183
207
|
]
|
|
184
208
|
|
|
185
|
-
# These IDs will be appended to INTERESTING_EMAIL_IDS
|
|
186
|
-
INTERESTING_TRUNCATION_LENGTHS = {
|
|
187
|
-
'023627': 16_800, # Micheal Wolff article with brock pierce
|
|
188
|
-
'030245': None, # Epstein rationalizes his behavior in an open letter to the world
|
|
189
|
-
'030781': None, # Bannon email about crypto coin issues
|
|
190
|
-
'032906': None, # David Blaine email
|
|
191
|
-
'026036': 6000, # Gino Yu blockchain mention
|
|
192
|
-
'029609': None, # Joi Ito
|
|
193
|
-
'025233': None, # Reputation.com discussion
|
|
194
|
-
'017827': None, # Bannon / Peggy Siegal email about netflix doc on Epstein
|
|
195
|
-
'030222': None, # Ross Gow / Ghislaine correspondence
|
|
196
|
-
'026028': None, # Larry Summers / Karim Wade intro
|
|
197
|
-
'029545': None, # Tyler Shears reputation
|
|
198
|
-
'025812': None, # Tyler Shears reputation
|
|
199
|
-
'029914': 4500, # Lord Mandelson russian investments
|
|
200
|
-
'033453': None, # "Just heard you were telling people that you heard I asked Trump for a million dollars"
|
|
201
|
-
'031320': None, # Epstein Gratitude foundation
|
|
202
|
-
'031036': None, # Barbro Ehnbom talking about Swedish girl
|
|
203
|
-
'023454': 1878, # Email invitation sent to tech CEOs + Epstein
|
|
204
|
-
'029342': 2000, # Hakeem Jeffries
|
|
205
|
-
}
|
|
206
|
-
|
|
207
|
-
TRUNCATION_LENGTHS = {
|
|
208
|
-
**INTERESTING_TRUNCATION_LENGTHS,
|
|
209
|
-
'031791': None, # First email in Jessica Cadwell chain about service of legal documents
|
|
210
|
-
'023208': None, # Long discussion about leon black's finances
|
|
211
|
-
'028589': None, # Long thread with Reid Weingarten
|
|
212
|
-
'029433': TRUNCATED_CHARS, # Kahn taxes
|
|
213
|
-
'026778': TRUNCATED_CHARS, # Kahn taxes
|
|
214
|
-
'033311': TRUNCATED_CHARS, # Kahn taxes
|
|
215
|
-
'024251': TRUNCATED_CHARS, # Kahn taxes
|
|
216
|
-
'026755': TRUNCATED_CHARS, # Epstein self fwd
|
|
217
|
-
}
|
|
218
|
-
|
|
219
209
|
# These are long forwarded articles so we force a trim to 1,333 chars if these strings exist
|
|
220
210
|
TRUNCATE_TERMS = [
|
|
221
211
|
'The rebuilding of Indonesia', # Vikcy ward article
|
|
222
|
-
'Dominique Strauss-Kahn',
|
|
223
|
-
'THOMAS L. FRIEDMAN',
|
|
224
212
|
'a sleek, briskly paced film whose title suggests a heist movie', # Inside Job
|
|
225
213
|
'Calendar of Major Events, Openings, and Fundraisers',
|
|
214
|
+
'sent over from Marshall Heyman at the WSJ',
|
|
226
215
|
"In recent months, China's BAT collapse",
|
|
227
216
|
'President Obama introduces Jim Yong Kim as his nominee',
|
|
228
217
|
'Trump appears with mobster-affiliated felon at New',
|
|
@@ -237,9 +226,11 @@ TRUNCATE_TERMS = [
|
|
|
237
226
|
'co-inventor of the GTX Smart Shoe',
|
|
238
227
|
'my latest Washington Post column',
|
|
239
228
|
# Bannon
|
|
229
|
+
'As Steve Bannon continues his tour of Europe',
|
|
240
230
|
"Bannon the European: He's opening the populist fort in Brussels",
|
|
241
231
|
"Steve Bannon doesn't do subtle.",
|
|
242
232
|
'The Department of Justice lost its latest battle with Congress',
|
|
233
|
+
'pedophile Jeffrey Epstein bought his way out',
|
|
243
234
|
# lawyers
|
|
244
235
|
'recuses itself from Jeffrey Epstein case',
|
|
245
236
|
# Misc
|
|
@@ -265,11 +256,23 @@ LINE_REPAIR_MERGES = {
|
|
|
265
256
|
'014397': [[4]] * 2,
|
|
266
257
|
'014860': [[3], [4], [4]],
|
|
267
258
|
'017523': [[4]],
|
|
259
|
+
'030367': [[1, 4], [2, 4]],
|
|
268
260
|
'019105': [[5]] * 4,
|
|
269
261
|
'019407': [[2, 4]],
|
|
262
|
+
'022187': [[1, 8], [2, 8], [3, 8], [4, 8]],
|
|
270
263
|
'021729': [[2]],
|
|
264
|
+
'032896': [[2]],
|
|
265
|
+
'033050': [[0, 6], [1, 6], [2, 6], [3, 6], [4, 6]],
|
|
266
|
+
'022949': [[0, 4], [1, 4]],
|
|
267
|
+
'022197': [[0, 5], [1, 5], [3, 5]],
|
|
268
|
+
'021814': [[1, 6], [2, 6], [3, 6], [4, 6]],
|
|
269
|
+
'022190': [[1, 7], [0, 6], [3, 6], [4, 6]],
|
|
270
|
+
'029582': [[0, 5], [1, 5], [3, 5], [3, 5]],
|
|
271
271
|
'022673': [[9]],
|
|
272
272
|
'022684': [[9]],
|
|
273
|
+
'026625': [[0, 7], [1, 7], [2, 7], [3, 7], [4, 7], [5, 7]],
|
|
274
|
+
'026659': [[0, 5], [1, 5]],
|
|
275
|
+
'026764': [[0, 6], [1, 6]],
|
|
273
276
|
'022695': [[4]],
|
|
274
277
|
'022977': [[9]] * 10,
|
|
275
278
|
'023001': [[5]] * 3,
|
|
@@ -278,11 +281,15 @@ LINE_REPAIR_MERGES = {
|
|
|
278
281
|
'025329': [[2]] * 9,
|
|
279
282
|
'025790': [[2]],
|
|
280
283
|
'025812': [[3]] * 2,
|
|
284
|
+
'025589': [[3]] * 12,
|
|
281
285
|
'026345': [[3]],
|
|
282
286
|
'026609': [[4]],
|
|
287
|
+
'028921': [[5, 4], [4, 5]],
|
|
288
|
+
'026620': ([[20]] * 4) + [[3, 2]] + ([[2]] * 15) + [[2, 4]],
|
|
283
289
|
'026829': [[3]],
|
|
284
290
|
'026924': [[2, 4]],
|
|
285
291
|
'028728': [[3]],
|
|
292
|
+
'026451': [[3, 5]] * 2,
|
|
286
293
|
'028931': [[3, 6]],
|
|
287
294
|
'029154': [[2, 5]],
|
|
288
295
|
'029163': [[2, 5]],
|
|
@@ -302,18 +309,22 @@ LINE_REPAIR_MERGES = {
|
|
|
302
309
|
'029977': ([[2]] * 4) + [[4], [2, 4]],
|
|
303
310
|
'030299': [[7, 10]],
|
|
304
311
|
'030315': [[3, 5]],
|
|
312
|
+
'030318': [[3, 5]],
|
|
305
313
|
'030381': [[2, 4]],
|
|
306
314
|
'030384': [[2, 4]],
|
|
307
315
|
'030626': [[2], [4]],
|
|
316
|
+
'030861': [[3, 8]],
|
|
308
317
|
'030999': [[2, 4]],
|
|
309
318
|
'031384': [[2]],
|
|
310
319
|
'031428': [[2], [2, 4]],
|
|
311
320
|
'031442': [[0]],
|
|
321
|
+
'031489': [[2, 4], [3, 4], [3, 4], [10]],
|
|
322
|
+
'031619': [[7], [17], [17]],
|
|
312
323
|
'031748': [[3]] * 2,
|
|
313
|
-
'031764': [[3]],
|
|
324
|
+
'031764': [[3], [8]], # 8 is just for style fix internally, not header
|
|
314
325
|
'031980': [[2, 4]],
|
|
315
326
|
'032063': [[3, 5]],
|
|
316
|
-
'032272': [[3]],
|
|
327
|
+
'032272': [[2, 10], [3]],
|
|
317
328
|
'032405': [[4]],
|
|
318
329
|
'032637': [[9]] * 3,
|
|
319
330
|
'033097': [[2]],
|
|
@@ -326,10 +337,16 @@ LINE_REPAIR_MERGES = {
|
|
|
326
337
|
'033357': [[2, 4]],
|
|
327
338
|
'033486': [[7, 9]],
|
|
328
339
|
'033512': [[2]],
|
|
340
|
+
'026024': [[1, 3], [2, 3]],
|
|
341
|
+
'024923': [[0, 5], [2]],
|
|
329
342
|
'033568': [[5]] * 5,
|
|
330
343
|
'033575': [[2, 4]],
|
|
331
344
|
'033576': [[3]],
|
|
332
345
|
'033583': [[2]],
|
|
346
|
+
|
|
347
|
+
# Note DOJ file line adjustments happen *after* DojFile._repair() is called
|
|
348
|
+
'EFTA00039689': [[4]],
|
|
349
|
+
'EFTA00040118': [[2], [2], [2], [2], [2], [2], [6], [6]],
|
|
333
350
|
}
|
|
334
351
|
|
|
335
352
|
|
|
@@ -337,30 +354,104 @@ LINE_REPAIR_MERGES = {
|
|
|
337
354
|
class Email(Communication):
|
|
338
355
|
"""
|
|
339
356
|
Attributes:
|
|
340
|
-
actual_text (str) -
|
|
341
|
-
config (EmailCfg
|
|
342
|
-
header (EmailHeader) -
|
|
343
|
-
recipients (list[Name]) -
|
|
344
|
-
sent_from_device (str
|
|
345
|
-
signature_substitution_counts (dict[str, int]) -
|
|
357
|
+
actual_text (str) - Best effort at the text actually sent in this email, excluding quoted replies and forwards.
|
|
358
|
+
config (EmailCfg, optional) - Manual config for this email (if it exists).
|
|
359
|
+
header (EmailHeader) - Header data extracted from the text (from/to/sent/subject etc).
|
|
360
|
+
recipients (list[Name]) - People to whom this email was sent.
|
|
361
|
+
sent_from_device (str, optional) - "Sent from my iPhone" style signature (if it exists).
|
|
362
|
+
signature_substitution_counts (dict[str, int]) - Number of times a signature was replaced with
|
|
363
|
+
<...snipped...> for each participant
|
|
346
364
|
"""
|
|
365
|
+
attached_docs: list[OtherFile] = field(default_factory=list)
|
|
347
366
|
actual_text: str = field(init=False)
|
|
348
367
|
config: EmailCfg | None = None
|
|
349
368
|
header: EmailHeader = field(init=False)
|
|
350
369
|
recipients: list[Name] = field(default_factory=list)
|
|
351
370
|
sent_from_device: str | None = None
|
|
352
371
|
signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
|
|
372
|
+
_is_first_for_user: bool = False # Only set when printing
|
|
353
373
|
_line_merge_arguments: list[tuple[int] | tuple[int, int]] = field(default_factory=list)
|
|
354
374
|
|
|
355
375
|
# For logging how many headers we prettified while printing, kind of janky
|
|
356
376
|
rewritten_header_ids: ClassVar[set[str]] = set([])
|
|
357
377
|
|
|
378
|
+
@property
|
|
379
|
+
def attachments(self) -> list[str]:
|
|
380
|
+
"""Returns the string in the header."""
|
|
381
|
+
return (self.header.attachments or '').split(';')
|
|
382
|
+
|
|
383
|
+
@property
|
|
384
|
+
def border_style(self) -> str:
|
|
385
|
+
"""Color emails from epstein to others with the color for the first recipient."""
|
|
386
|
+
if self.author == JEFFREY_EPSTEIN and len(self.recipients) > 0:
|
|
387
|
+
style = get_style_for_name(self.recipients[0])
|
|
388
|
+
else:
|
|
389
|
+
style = self.author_style
|
|
390
|
+
|
|
391
|
+
return style.replace('bold', '').strip()
|
|
392
|
+
|
|
393
|
+
@property
|
|
394
|
+
def info_txt(self) -> Text:
|
|
395
|
+
email_type = 'fwded article' if self.is_fwded_article else 'email'
|
|
396
|
+
txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt)
|
|
397
|
+
|
|
398
|
+
if self.config and self.config.is_attribution_uncertain:
|
|
399
|
+
txt.append(f" {QUESTION_MARKS}", style=self.author_style)
|
|
400
|
+
|
|
401
|
+
txt.append(' to ').append(self.recipients_txt())
|
|
402
|
+
return txt.append(highlighter(f" probably sent at {self.timestamp}"))
|
|
403
|
+
|
|
404
|
+
@property
|
|
405
|
+
def is_fwded_article(self) -> bool:
|
|
406
|
+
if self.config is None:
|
|
407
|
+
return False
|
|
408
|
+
elif self.config.fwded_text_after:
|
|
409
|
+
return self.config.is_fwded_article is not False
|
|
410
|
+
else:
|
|
411
|
+
return bool(self.config.is_fwded_article)
|
|
412
|
+
|
|
413
|
+
@property
|
|
414
|
+
def is_junk_mail(self) -> bool:
|
|
415
|
+
return self.author in JUNK_EMAILERS
|
|
416
|
+
|
|
417
|
+
@property
|
|
418
|
+
def is_mailing_list(self) -> bool:
|
|
419
|
+
return self.author in MAILING_LISTS or self.is_junk_mail
|
|
420
|
+
|
|
421
|
+
@property
|
|
422
|
+
def is_note_to_self(self) -> bool:
|
|
423
|
+
return self.recipients == [self.author]
|
|
424
|
+
|
|
425
|
+
@property
|
|
426
|
+
def is_word_count_worthy(self) -> bool:
|
|
427
|
+
if self.is_fwded_article:
|
|
428
|
+
return bool(self.config.fwded_text_after) or len(self.actual_text) < 150
|
|
429
|
+
else:
|
|
430
|
+
return not self.is_mailing_list
|
|
431
|
+
|
|
432
|
+
@property
|
|
433
|
+
def metadata(self) -> Metadata:
|
|
434
|
+
local_metadata = asdict(self)
|
|
435
|
+
local_metadata['is_junk_mail'] = self.is_junk_mail
|
|
436
|
+
local_metadata['is_mailing_list'] = self.is_junk_mail
|
|
437
|
+
local_metadata['subject'] = self.subject or None
|
|
438
|
+
metadata = super().metadata
|
|
439
|
+
metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
|
|
440
|
+
return metadata
|
|
441
|
+
|
|
442
|
+
@property
|
|
443
|
+
def subject(self) -> str:
|
|
444
|
+
if self.config and self.config.subject:
|
|
445
|
+
return self.config.subject
|
|
446
|
+
else:
|
|
447
|
+
return self.header.subject or ''
|
|
448
|
+
|
|
358
449
|
def __post_init__(self):
|
|
359
450
|
self.filename = self.file_path.name
|
|
360
451
|
self.file_id = extract_file_id(self.filename)
|
|
361
452
|
|
|
362
453
|
# Special handling for copying properties out of the config for the document this one was extracted from
|
|
363
|
-
if self.is_local_extract_file
|
|
454
|
+
if self.is_local_extract_file:
|
|
364
455
|
self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
|
|
365
456
|
extracted_from_doc_id = self.url_slug.split('_')[-1]
|
|
366
457
|
|
|
@@ -373,58 +464,24 @@ class Email(Communication):
|
|
|
373
464
|
self.recipients = self.config.recipients
|
|
374
465
|
else:
|
|
375
466
|
for recipient in self.header.recipients():
|
|
376
|
-
self.recipients.extend(
|
|
467
|
+
self.recipients.extend(extract_emailer_names(recipient))
|
|
377
468
|
|
|
378
469
|
# Assume mailing list emails are to Epstein
|
|
379
|
-
if self.author in BCC_LISTS and (self.is_note_to_self
|
|
470
|
+
if self.author in BCC_LISTS and (self.is_note_to_self or not self.recipients):
|
|
380
471
|
self.recipients = [JEFFREY_EPSTEIN]
|
|
381
472
|
|
|
382
473
|
# Remove self CCs but preserve self emails
|
|
383
|
-
if not self.is_note_to_self
|
|
474
|
+
if not self.is_note_to_self:
|
|
384
475
|
self.recipients = [r for r in self.recipients if r != self.author]
|
|
385
476
|
|
|
386
477
|
self.recipients = sorted(list(set(self.recipients)), key=lambda r: r or UNKNOWN)
|
|
387
478
|
self.text = self._prettify_text()
|
|
388
|
-
self.actual_text = self.
|
|
479
|
+
self.actual_text = self._extract_actual_text()
|
|
389
480
|
self.sent_from_device = self._sent_from_device()
|
|
390
481
|
|
|
391
|
-
def
|
|
392
|
-
return (self.header.attachments or '').split(';')
|
|
393
|
-
|
|
394
|
-
def info_txt(self) -> Text:
|
|
395
|
-
email_type = 'fwded article' if self.is_fwded_article() else 'email'
|
|
396
|
-
txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt())
|
|
397
|
-
|
|
398
|
-
if self.config and self.config.is_attribution_uncertain:
|
|
399
|
-
txt.append(f" {QUESTION_MARKS}", style=self.author_style())
|
|
400
|
-
|
|
401
|
-
txt.append(' to ').append(self.recipients_txt())
|
|
402
|
-
return txt.append(highlighter(f" probably sent at {self.timestamp}"))
|
|
403
|
-
|
|
404
|
-
def is_fwded_article(self) -> bool:
|
|
405
|
-
return bool(self.config and self.config.is_fwded_article)
|
|
406
|
-
|
|
407
|
-
def is_junk_mail(self) -> bool:
|
|
408
|
-
return self.author in JUNK_EMAILERS
|
|
409
|
-
|
|
410
|
-
def is_mailing_list(self) -> bool:
|
|
411
|
-
return self.author in MAILING_LISTS or self.is_junk_mail()
|
|
412
|
-
|
|
413
|
-
def is_note_to_self(self) -> bool:
|
|
414
|
-
return self.recipients == [self.author]
|
|
415
|
-
|
|
416
|
-
def is_with(self, name: str) -> bool:
|
|
482
|
+
def is_from_or_to(self, name: str) -> bool:
|
|
417
483
|
return name in [self.author] + self.recipients
|
|
418
484
|
|
|
419
|
-
def metadata(self) -> Metadata:
|
|
420
|
-
local_metadata = asdict(self)
|
|
421
|
-
local_metadata['is_junk_mail'] = self.is_junk_mail()
|
|
422
|
-
local_metadata['is_mailing_list'] = self.is_junk_mail()
|
|
423
|
-
local_metadata['subject'] = self.subject() or None
|
|
424
|
-
metadata = super().metadata()
|
|
425
|
-
metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
|
|
426
|
-
return metadata
|
|
427
|
-
|
|
428
485
|
def recipients_txt(self, max_full_names: int = 2) -> Text:
|
|
429
486
|
"""Text object with comma separated colored versions of all recipients."""
|
|
430
487
|
recipients = [r or UNKNOWN for r in self.recipients] if len(self.recipients) > 0 else [UNKNOWN]
|
|
@@ -435,12 +492,6 @@ class Email(Communication):
|
|
|
435
492
|
for r in recipients
|
|
436
493
|
], join=', ')
|
|
437
494
|
|
|
438
|
-
def subject(self) -> str:
|
|
439
|
-
if self.config and self.config.subject:
|
|
440
|
-
return self.config.subject
|
|
441
|
-
else:
|
|
442
|
-
return self.header.subject or ''
|
|
443
|
-
|
|
444
495
|
def summary(self) -> Text:
|
|
445
496
|
"""One line summary mostly for logging."""
|
|
446
497
|
txt = self._summary()
|
|
@@ -450,7 +501,7 @@ class Email(Communication):
|
|
|
450
501
|
|
|
451
502
|
return txt.append(CLOSE_PROPERTIES_CHAR)
|
|
452
503
|
|
|
453
|
-
def
|
|
504
|
+
def _extract_actual_text(self) -> str:
|
|
454
505
|
"""The text that comes before likely quoted replies and forwards etc."""
|
|
455
506
|
if self.config and self.config.actual_text is not None:
|
|
456
507
|
return self.config.actual_text
|
|
@@ -463,7 +514,7 @@ class Email(Communication):
|
|
|
463
514
|
return self.text
|
|
464
515
|
|
|
465
516
|
self.log_top_lines(20, "Raw text:", logging.DEBUG)
|
|
466
|
-
self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
|
|
517
|
+
self.log(f"With {self.header.num_header_rows} header lines removed:\n{text[0:500]}\n\n", logging.DEBUG)
|
|
467
518
|
reply_text_match = REPLY_TEXT_REGEX.search(text)
|
|
468
519
|
|
|
469
520
|
if reply_text_match:
|
|
@@ -488,51 +539,24 @@ class Email(Communication):
|
|
|
488
539
|
|
|
489
540
|
return text.strip()
|
|
490
541
|
|
|
491
|
-
def _border_style(self) -> str:
|
|
492
|
-
"""Color emails from epstein to others with the color for the first recipient."""
|
|
493
|
-
if self.author == JEFFREY_EPSTEIN and len(self.recipients) > 0:
|
|
494
|
-
style = get_style_for_name(self.recipients[0])
|
|
495
|
-
else:
|
|
496
|
-
style = self.author_style()
|
|
497
|
-
|
|
498
|
-
return style.replace('bold', '').strip()
|
|
499
|
-
|
|
500
542
|
def _extract_author(self) -> None:
|
|
543
|
+
"""Overloads superclass method, called at instantiation time."""
|
|
501
544
|
self._extract_header()
|
|
502
545
|
super()._extract_author()
|
|
503
546
|
|
|
504
547
|
if not self.author and self.header.author:
|
|
505
|
-
authors =
|
|
548
|
+
authors = extract_emailer_names(self.header.author)
|
|
506
549
|
self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
|
|
507
550
|
|
|
508
|
-
def _extract_emailer_names(self, emailer_str: str) -> list[str]:
|
|
509
|
-
"""Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
|
|
510
|
-
emailer_str = EmailHeader.cleanup_str(emailer_str)
|
|
511
|
-
|
|
512
|
-
if len(emailer_str) == 0:
|
|
513
|
-
return []
|
|
514
|
-
|
|
515
|
-
names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
|
|
516
|
-
|
|
517
|
-
if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
|
|
518
|
-
if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
|
|
519
|
-
logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
|
|
520
|
-
else:
|
|
521
|
-
logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
|
|
522
|
-
|
|
523
|
-
return names_found
|
|
524
|
-
|
|
525
|
-
names_found = names_found or [emailer_str]
|
|
526
|
-
return [_reverse_first_and_last_names(name) for name in names_found]
|
|
527
|
-
|
|
528
551
|
def _extract_header(self) -> None:
|
|
529
|
-
"""Extract an EmailHeader
|
|
552
|
+
"""Extract an `EmailHeader` from the OCR text."""
|
|
530
553
|
header_match = EMAIL_SIMPLE_HEADER_REGEX.search(self.text)
|
|
531
554
|
|
|
532
555
|
if header_match:
|
|
533
556
|
self.header = EmailHeader.from_header_lines(header_match.group(0))
|
|
534
557
|
|
|
535
|
-
|
|
558
|
+
# DOJ file OCR text is broken in a less consistent way than the HOUSE_OVERSIGHT files
|
|
559
|
+
if self.header.is_empty() and not self.is_doj_file:
|
|
536
560
|
self.header.repair_empty_header(self.lines)
|
|
537
561
|
else:
|
|
538
562
|
log_level = logging.INFO if self.config else logging.WARNING
|
|
@@ -542,22 +566,15 @@ class Email(Communication):
|
|
|
542
566
|
logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
|
|
543
567
|
|
|
544
568
|
def _extract_timestamp(self) -> datetime:
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
timestamp = _parse_timestamp(self.header.sent_at)
|
|
549
|
-
|
|
550
|
-
if timestamp:
|
|
551
|
-
return timestamp
|
|
569
|
+
"""Find the time this email was sent."""
|
|
570
|
+
if self.header.sent_at and (timestamp := _parse_timestamp(self.header.sent_at)):
|
|
571
|
+
return timestamp
|
|
552
572
|
|
|
553
573
|
searchable_lines = self.lines[0:MAX_NUM_HEADER_LINES]
|
|
554
574
|
searchable_text = '\n'.join(searchable_lines)
|
|
555
|
-
date_match = DATE_HEADER_REGEX.search(searchable_text)
|
|
556
575
|
|
|
557
|
-
if date_match:
|
|
558
|
-
timestamp
|
|
559
|
-
|
|
560
|
-
if timestamp:
|
|
576
|
+
if (date_match := DATE_HEADER_REGEX.search(searchable_text)):
|
|
577
|
+
if (timestamp := _parse_timestamp(date_match.group(1))):
|
|
561
578
|
return timestamp
|
|
562
579
|
|
|
563
580
|
logger.debug(f"Failed to find timestamp, falling back to parsing {MAX_NUM_HEADER_LINES} lines...")
|
|
@@ -566,42 +583,45 @@ class Email(Communication):
|
|
|
566
583
|
if not TIMESTAMP_LINE_REGEX.search(line):
|
|
567
584
|
continue
|
|
568
585
|
|
|
569
|
-
timestamp
|
|
570
|
-
|
|
571
|
-
if timestamp:
|
|
586
|
+
if (timestamp := _parse_timestamp(line)):
|
|
572
587
|
logger.debug(f"Fell back to timestamp {timestamp} in line '{line}'...")
|
|
573
588
|
return timestamp
|
|
574
589
|
|
|
575
|
-
|
|
590
|
+
no_timestamp_msg = f"No timestamp found in '{self.file_path.name}'"
|
|
576
591
|
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
if text is None:
|
|
580
|
-
header_offset = len(self.header.header_chars)
|
|
581
|
-
text = self.text[header_offset:]
|
|
592
|
+
if self.is_duplicate:
|
|
593
|
+
logger.warning(f"{no_timestamp_msg} but timestamp should be copied from {self.duplicate_of_id}")
|
|
582
594
|
else:
|
|
583
|
-
|
|
595
|
+
raise RuntimeError(f"{no_timestamp_msg}, top lines:\n" + '\n'.join(self.lines[0:MAX_NUM_HEADER_LINES + 10]))
|
|
596
|
+
|
|
597
|
+
def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES) -> int | None:
|
|
598
|
+
"""Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
|
|
599
|
+
header_offset = len(self.header.header_chars)
|
|
600
|
+
text = self.text[header_offset:]
|
|
584
601
|
|
|
585
602
|
for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text)):
|
|
586
603
|
if i >= n:
|
|
587
604
|
return match.end() + header_offset - 1
|
|
588
605
|
|
|
589
|
-
def _merge_lines(self,
|
|
606
|
+
def _merge_lines(self, idx1: int, idx2: int | None = None) -> None:
|
|
590
607
|
"""Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
|
|
591
608
|
if idx2 is None:
|
|
592
|
-
self._line_merge_arguments.append((
|
|
593
|
-
idx2 =
|
|
609
|
+
self._line_merge_arguments.append((idx1,))
|
|
610
|
+
idx2 = idx1 + 1
|
|
594
611
|
else:
|
|
595
|
-
self._line_merge_arguments.append((
|
|
596
|
-
|
|
597
|
-
lines = self.lines[0:idx]
|
|
612
|
+
self._line_merge_arguments.append((idx1, idx2))
|
|
598
613
|
|
|
599
|
-
if idx2
|
|
600
|
-
|
|
601
|
-
elif idx2 ==
|
|
602
|
-
|
|
614
|
+
if idx2 < idx1:
|
|
615
|
+
lines = self.lines[0:idx2] + self.lines[idx2 + 1:idx1] + [self.lines[idx1] + ' ' + self.lines[idx2]] + self.lines[idx1 + 1:]
|
|
616
|
+
elif idx2 == idx1:
|
|
617
|
+
raise RuntimeError(f"idx2 ({idx2}) must be greater or less than idx ({idx1})")
|
|
603
618
|
else:
|
|
604
|
-
lines
|
|
619
|
+
lines = self.lines[0:idx1]
|
|
620
|
+
|
|
621
|
+
if idx2 == (idx1 + 1):
|
|
622
|
+
lines += [self.lines[idx1] + ' ' + self.lines[idx1 + 1]] + self.lines[idx1 + 2:]
|
|
623
|
+
else:
|
|
624
|
+
lines += [self.lines[idx1] + ' ' + self.lines[idx2]] + self.lines[idx1 + 1:idx2] + self.lines[idx2 + 1:]
|
|
605
625
|
|
|
606
626
|
self._set_computed_fields(lines=lines)
|
|
607
627
|
|
|
@@ -617,6 +637,10 @@ class Email(Communication):
|
|
|
617
637
|
self.signature_substitution_counts[name] = self.signature_substitution_counts.get(name, 0)
|
|
618
638
|
self.signature_substitution_counts[name] += num_replaced
|
|
619
639
|
|
|
640
|
+
# Share / Tweet lines
|
|
641
|
+
if self.author == KATHRYN_RUEMMLER:
|
|
642
|
+
text = '\n'.join([line for line in text.split('\n') if line not in ['Share', 'Tweet', 'Bookmark it']])
|
|
643
|
+
|
|
620
644
|
return collapse_newlines(text).strip()
|
|
621
645
|
|
|
622
646
|
def _remove_line(self, idx: int) -> None:
|
|
@@ -628,7 +652,7 @@ class Email(Communication):
|
|
|
628
652
|
self.log_top_lines(num_lines, msg=f'after removal of line {idx}')
|
|
629
653
|
|
|
630
654
|
def _repair(self) -> None:
|
|
631
|
-
"""Repair particularly janky files."""
|
|
655
|
+
"""Repair particularly janky files. Note that OCR_REPAIRS are applied *after* other line adjustments."""
|
|
632
656
|
if BAD_FIRST_LINE_REGEX.match(self.lines[0]):
|
|
633
657
|
self._set_computed_fields(lines=self.lines[1:])
|
|
634
658
|
|
|
@@ -656,18 +680,26 @@ class Email(Communication):
|
|
|
656
680
|
self.log(f"Modified text, old:\n\n" + '\n'.join(old_text.split('\n')[0:12]) + '\n')
|
|
657
681
|
self.log_top_lines(12, 'Result of modifications')
|
|
658
682
|
|
|
659
|
-
|
|
683
|
+
repaired_text = self._repair_links_and_quoted_subjects(self.repair_ocr_text(OCR_REPAIRS, self.text))
|
|
684
|
+
self._set_computed_fields(text=repaired_text)
|
|
685
|
+
|
|
686
|
+
def _repair_links_and_quoted_subjects(self, text: str) -> str:
|
|
687
|
+
"""Repair links that the OCR has broken into multiple lines as well as 'Subject:' lines."""
|
|
688
|
+
lines = text.split('\n')
|
|
689
|
+
subject_line = next((line for line in lines if line.startswith('Subject:')), None) or ''
|
|
690
|
+
subject = subject_line.split(':')[1].strip() if subject_line else ''
|
|
660
691
|
new_lines = []
|
|
661
692
|
i = 0
|
|
662
693
|
|
|
663
|
-
# Fix links (remove spaces, merge multiline links to a single line)
|
|
664
694
|
while i < len(lines):
|
|
665
695
|
line = lines[i]
|
|
666
696
|
|
|
667
697
|
if LINK_LINE_REGEX.search(line):
|
|
668
698
|
while i < (len(lines) - 1) \
|
|
669
|
-
and
|
|
670
|
-
and (lines[i + 1].endswith('/')
|
|
699
|
+
and not lines[i + 1].startswith('htt') \
|
|
700
|
+
and (lines[i + 1].endswith('/') \
|
|
701
|
+
or any(s in lines[i + 1] for s in URL_SIGNIFIERS) \
|
|
702
|
+
or LINK_LINE2_REGEX.match(lines[i + 1])):
|
|
671
703
|
logger.debug(f"{self.filename}: Joining link lines\n 1. {line}\n 2. {lines[i + 1]}\n")
|
|
672
704
|
line += lines[i + 1]
|
|
673
705
|
i += 1
|
|
@@ -676,22 +708,27 @@ class Email(Communication):
|
|
|
676
708
|
elif ' http' in line and line.endswith('html'):
|
|
677
709
|
pre_link, post_link = line.split(' http', 1)
|
|
678
710
|
line = f"{pre_link} http{post_link.replace(' ', '')}"
|
|
711
|
+
elif line.startswith('Subject:') and i < (len(lines) - 2) and len(line) >= 40:
|
|
712
|
+
next_line = lines[i + 1]
|
|
713
|
+
next_next = lines[i + 2]
|
|
714
|
+
|
|
715
|
+
if len(next_line) <= 1 or any([cont in next_line for cont in BAD_SUBJECT_CONTINUATIONS]):
|
|
716
|
+
pass
|
|
717
|
+
elif (subject.endswith(next_line) and next_line != subject) \
|
|
718
|
+
or (FIELDS_COLON_REGEX.search(next_next) and not FIELDS_COLON_REGEX.search(next_line)):
|
|
719
|
+
self.log(f"Fixing broken subject line\n line: '{line}'\n next: '{next_line}'\n next: '{next_next}'\nsubject='{subject}'\n")
|
|
720
|
+
line += f" {next_line}"
|
|
721
|
+
i += 1
|
|
679
722
|
|
|
680
723
|
new_lines.append(line)
|
|
681
|
-
|
|
682
|
-
# TODO: hacky workaround to get a working link for HOUSE_OVERSIGHT_032564
|
|
683
|
-
if self.file_id == '032564' and line == 'http://m.huffpost.com/us/entry/us_599f532ae4b0dOef9f1c129d':
|
|
684
|
-
new_lines.append('(ed. note: an archived version of the above link is here: https://archive.is/hJxT3 )')
|
|
685
|
-
|
|
686
724
|
i += 1
|
|
687
725
|
|
|
688
|
-
|
|
726
|
+
logger.debug(f"----after line repair---\n" + '\n'.join(new_lines[0:20]) + "\n---")
|
|
727
|
+
return '\n'.join(lines)
|
|
689
728
|
|
|
690
729
|
def _sent_from_device(self) -> str | None:
|
|
691
730
|
"""Find any 'Sent from my iPhone' style signature line if it exist in the 'actual_text'."""
|
|
692
|
-
sent_from_match
|
|
693
|
-
|
|
694
|
-
if sent_from_match:
|
|
731
|
+
if (sent_from_match := SENT_FROM_REGEX.search(self.actual_text)):
|
|
695
732
|
sent_from = sent_from_match.group(0)
|
|
696
733
|
return 'S' + sent_from[1:] if sent_from.startswith('sent') else sent_from
|
|
697
734
|
|
|
@@ -699,13 +736,11 @@ class Email(Communication):
|
|
|
699
736
|
"""Copy info from original config for file this document was extracted from."""
|
|
700
737
|
if self.file_id in ALL_FILE_CONFIGS:
|
|
701
738
|
self.config = cast(EmailCfg, deepcopy(ALL_FILE_CONFIGS[self.file_id]))
|
|
702
|
-
self.
|
|
739
|
+
self.log(f"Merging existing cfg for '{self.file_id}' with cfg for extracted document...")
|
|
703
740
|
else:
|
|
704
741
|
self.config = EmailCfg(id=self.file_id)
|
|
705
742
|
|
|
706
|
-
extracted_from_description
|
|
707
|
-
|
|
708
|
-
if extracted_from_description:
|
|
743
|
+
if (extracted_from_description := extracted_from_doc_cfg.complete_description):
|
|
709
744
|
extracted_description = f"{APPEARS_IN} {extracted_from_description}"
|
|
710
745
|
|
|
711
746
|
if isinstance(extracted_from_doc_cfg, EmailCfg):
|
|
@@ -721,34 +756,58 @@ class Email(Communication):
|
|
|
721
756
|
|
|
722
757
|
def _truncate_to_length(self) -> int:
|
|
723
758
|
"""When printing truncate this email to this length."""
|
|
724
|
-
quote_cutoff = self._idx_of_nth_quoted_reply(
|
|
759
|
+
quote_cutoff = self._idx_of_nth_quoted_reply() # Trim if there's many quoted replies
|
|
725
760
|
includes_truncate_term = next((term for term in TRUNCATE_TERMS if term in self.text), None)
|
|
726
761
|
|
|
727
762
|
if args.whole_file:
|
|
728
763
|
num_chars = len(self.text)
|
|
729
764
|
elif args.truncate:
|
|
730
765
|
num_chars = args.truncate
|
|
731
|
-
elif self.
|
|
732
|
-
num_chars =
|
|
733
|
-
elif self.
|
|
766
|
+
elif self.config and self.config.truncate_to is not None:
|
|
767
|
+
num_chars = len(self.text) if self.config.truncate_to == NO_TRUNCATE else self.config.truncate_to
|
|
768
|
+
elif self.is_interesting:
|
|
769
|
+
num_chars = len(self.text)
|
|
770
|
+
elif self.author in TRUNCATE_EMAILS_FROM \
|
|
771
|
+
or any([self.is_from_or_to(n) for n in TRUNCATE_EMAILS_FROM_OR_TO]) \
|
|
772
|
+
or self.is_fwded_article \
|
|
773
|
+
or includes_truncate_term:
|
|
734
774
|
num_chars = min(quote_cutoff or MAX_CHARS_TO_PRINT, TRUNCATED_CHARS)
|
|
735
|
-
elif quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
|
|
736
|
-
num_chars = quote_cutoff
|
|
737
775
|
else:
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
776
|
+
if quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
|
|
777
|
+
trimmed_words = self.text[quote_cutoff:].split()
|
|
778
|
+
|
|
779
|
+
if '<...snipped' in trimmed_words[:NUM_WORDS_IN_LAST_QUOTE]:
|
|
780
|
+
num_trailing_words = 0
|
|
781
|
+
elif trimmed_words and trimmed_words[0] in ['From:', 'Sent:']:
|
|
782
|
+
num_trailing_words = NUM_WORDS_IN_LAST_QUOTE
|
|
783
|
+
else:
|
|
784
|
+
num_trailing_words = NUM_WORDS_IN_LAST_QUOTE
|
|
785
|
+
|
|
786
|
+
if trimmed_words:
|
|
787
|
+
last_quoted_text = ' '.join(trimmed_words[:num_trailing_words])
|
|
788
|
+
num_chars = quote_cutoff + len(last_quoted_text) + 1 # Give a hint of the next line
|
|
789
|
+
else:
|
|
790
|
+
num_chars = quote_cutoff
|
|
791
|
+
else:
|
|
792
|
+
num_chars = min(self.file_size, MAX_CHARS_TO_PRINT)
|
|
793
|
+
|
|
794
|
+
# Always print whole email for 1st email for user
|
|
795
|
+
if self._is_first_for_user and num_chars < self.file_size and not self.is_duplicate:
|
|
796
|
+
logger.info(f"{self} Overriding cutoff {num_chars} for first email")
|
|
797
|
+
num_chars = self.file_size
|
|
798
|
+
|
|
799
|
+
log_args = {
|
|
800
|
+
'num_chars': num_chars,
|
|
801
|
+
'_is_first_for_user': self._is_first_for_user,
|
|
802
|
+
'author_truncate': self.author in TRUNCATE_EMAILS_FROM,
|
|
803
|
+
'is_fwded_article': self.is_fwded_article,
|
|
804
|
+
'is_quote_cutoff': quote_cutoff == num_chars,
|
|
805
|
+
'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
|
|
806
|
+
'quote_cutoff': quote_cutoff,
|
|
807
|
+
}
|
|
808
|
+
|
|
809
|
+
log_args_str = ', '.join([f"{k}={v}" for k, v in log_args.items() if v])
|
|
810
|
+
logger.debug(f"Truncate determination: {log_args_str}")
|
|
752
811
|
return num_chars
|
|
753
812
|
|
|
754
813
|
def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
|
|
@@ -761,8 +820,8 @@ class Email(Communication):
|
|
|
761
820
|
# Truncate long emails but leave a note explaining what happened w/link to source document
|
|
762
821
|
if len(text) > num_chars:
|
|
763
822
|
text = text[0:num_chars]
|
|
764
|
-
doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style
|
|
765
|
-
trim_note = f"<...trimmed to {num_chars} characters of {self.length
|
|
823
|
+
doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
|
|
824
|
+
trim_note = f"<...trimmed to {num_chars:,} characters of {self.length:,}, read the rest at {doc_link_markup}...>"
|
|
766
825
|
trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
|
|
767
826
|
|
|
768
827
|
# Rewrite broken headers where the values are on separate lines from the field names
|
|
@@ -778,7 +837,7 @@ class Email(Communication):
|
|
|
778
837
|
|
|
779
838
|
lines += text.split('\n')[num_lines_to_skip:]
|
|
780
839
|
text = self.header.rewrite_header() + '\n' + '\n'.join(lines)
|
|
781
|
-
text = _add_line_breaks(text)
|
|
840
|
+
text = _add_line_breaks(text)
|
|
782
841
|
self.rewritten_header_ids.add(self.file_id)
|
|
783
842
|
|
|
784
843
|
lines = [
|
|
@@ -789,8 +848,8 @@ class Email(Communication):
|
|
|
789
848
|
text = join_texts(lines, '\n')
|
|
790
849
|
|
|
791
850
|
email_txt_panel = Panel(
|
|
792
|
-
highlighter(text).append('
|
|
793
|
-
border_style=self.
|
|
851
|
+
highlighter(text).append('...\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
|
|
852
|
+
border_style=self.border_style,
|
|
794
853
|
expand=False,
|
|
795
854
|
subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
|
|
796
855
|
)
|
|
@@ -798,6 +857,11 @@ class Email(Communication):
|
|
|
798
857
|
yield self.file_info_panel()
|
|
799
858
|
yield Padding(email_txt_panel, (0, 0, 1, INFO_INDENT))
|
|
800
859
|
|
|
860
|
+
if self.attached_docs:
|
|
861
|
+
attachments_table_title = f" {self.url_slug} Email Attachments:"
|
|
862
|
+
attachments_table = OtherFile.files_preview_table(self.attached_docs, title=attachments_table_title)
|
|
863
|
+
yield Padding(attachments_table, (0, 0, 1, 12))
|
|
864
|
+
|
|
801
865
|
if should_rewrite_header:
|
|
802
866
|
self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
|
|
803
867
|
|
|
@@ -832,11 +896,11 @@ class Email(Communication):
|
|
|
832
896
|
|
|
833
897
|
for email in emails:
|
|
834
898
|
fields = [
|
|
835
|
-
email.epstein_media_link(link_txt=email.timestamp_without_seconds
|
|
836
|
-
email.author_txt
|
|
899
|
+
email.epstein_media_link(link_txt=email.timestamp_without_seconds, style=link_style),
|
|
900
|
+
email.author_txt,
|
|
837
901
|
email.recipients_txt(max_full_names=1),
|
|
838
|
-
f"{email.length
|
|
839
|
-
email.subject
|
|
902
|
+
f"{email.length}",
|
|
903
|
+
email.subject,
|
|
840
904
|
]
|
|
841
905
|
|
|
842
906
|
if not show_length:
|
|
@@ -853,21 +917,14 @@ def _add_line_breaks(email_text: str) -> str:
|
|
|
853
917
|
|
|
854
918
|
def _parse_timestamp(timestamp_str: str) -> None | datetime:
|
|
855
919
|
try:
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
920
|
+
if (american_date_match := AMERICAN_TIME_REGEX.search(timestamp_str)):
|
|
921
|
+
timestamp_str = american_date_match.group(1)
|
|
922
|
+
else:
|
|
923
|
+
timestamp_str = timestamp_str.replace('(GMT-05:00)', 'EST')
|
|
924
|
+
timestamp_str = BAD_TIMEZONE_REGEX.sub(' ', timestamp_str).strip()
|
|
925
|
+
|
|
926
|
+
timestamp = parse(timestamp_str, fuzzy=True, tzinfos=TIMEZONE_INFO)
|
|
859
927
|
logger.debug(f'Parsed timestamp "%s" from string "%s"', timestamp, timestamp_str)
|
|
860
928
|
return remove_timezone(timestamp)
|
|
861
929
|
except Exception as e:
|
|
862
930
|
logger.debug(f'Failed to parse "{timestamp_str}" to timestamp!')
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
def _reverse_first_and_last_names(name: str) -> str:
|
|
866
|
-
if '@' in name:
|
|
867
|
-
return name.lower()
|
|
868
|
-
|
|
869
|
-
if ', ' in name:
|
|
870
|
-
names = name.split(', ')
|
|
871
|
-
return f"{names[1]} {names[0]}"
|
|
872
|
-
else:
|
|
873
|
-
return name
|