epstein-files 1.2.1__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +55 -11
- epstein_files/documents/document.py +13 -2
- epstein_files/documents/email.py +329 -258
- epstein_files/documents/emails/email_header.py +17 -8
- epstein_files/documents/other_file.py +8 -6
- epstein_files/epstein_files.py +18 -4
- epstein_files/person.py +65 -20
- epstein_files/util/constant/names.py +18 -12
- epstein_files/util/constant/output_files.py +8 -5
- epstein_files/util/constant/strings.py +4 -2
- epstein_files/util/constant/urls.py +13 -2
- epstein_files/util/constants.py +486 -224
- epstein_files/util/data.py +1 -0
- epstein_files/util/doc_cfg.py +33 -27
- epstein_files/util/env.py +18 -8
- epstein_files/util/file_helper.py +2 -0
- epstein_files/util/highlighted_group.py +321 -132
- epstein_files/util/output.py +19 -24
- epstein_files/util/rich.py +9 -3
- epstein_files/util/word_count.py +2 -2
- {epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/METADATA +3 -3
- epstein_files-1.4.1.dist-info/RECORD +34 -0
- {epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/entry_points.txt +1 -1
- epstein_files-1.2.1.dist-info/RECORD +0 -34
- {epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/LICENSE +0 -0
- {epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/WHEEL +0 -0
epstein_files/documents/email.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
3
|
import re
|
|
4
|
+
from collections import defaultdict
|
|
4
5
|
from copy import deepcopy
|
|
5
6
|
from dataclasses import asdict, dataclass, field
|
|
6
7
|
from datetime import datetime
|
|
@@ -16,12 +17,12 @@ from rich.text import Text
|
|
|
16
17
|
from epstein_files.documents.communication import Communication
|
|
17
18
|
from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, INFO_INDENT
|
|
18
19
|
from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAIL_SIMPLE_HEADER_REGEX,
|
|
19
|
-
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, TIME_REGEX, EmailHeader)
|
|
20
|
+
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, FIELDS_COLON_PATTERN, TIME_REGEX, EmailHeader)
|
|
21
|
+
from epstein_files.documents.other_file import OtherFile
|
|
20
22
|
from epstein_files.util.constant.names import *
|
|
21
23
|
from epstein_files.util.constant.strings import REDACTED
|
|
22
24
|
from epstein_files.util.constants import *
|
|
23
|
-
from epstein_files.util.data import
|
|
24
|
-
flatten, listify, remove_timezone, uniquify)
|
|
25
|
+
from epstein_files.util.data import TIMEZONE_INFO, collapse_newlines, escape_single_quotes, remove_timezone
|
|
25
26
|
from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
26
27
|
from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
|
|
27
28
|
from epstein_files.util.highlighted_group import JUNK_EMAILERS, get_style_for_name
|
|
@@ -29,9 +30,12 @@ from epstein_files.util.logging import logger
|
|
|
29
30
|
from epstein_files.util.rich import *
|
|
30
31
|
|
|
31
32
|
BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
|
|
32
|
-
BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|
|
|
33
|
+
BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Hide caption|Importance:?\s*High|[iI,•]|[1i] (_ )?[il]|, [-,]|L\._|_filtered|.*(yiv0232|font-family:|margin-bottom:).*)$')
|
|
34
|
+
BAD_SUBJECT_CONTINUATIONS = ['orwarded', 'Hi ', 'Sent ', 'AmLaw', 'Original Message', 'Privileged', 'Sorry', '---']
|
|
33
35
|
DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
|
|
34
|
-
|
|
36
|
+
FIELDS_COLON_REGEX = re.compile(FIELDS_COLON_PATTERN)
|
|
37
|
+
LINK_LINE_REGEX = re.compile(f"^[>• ]*htt")
|
|
38
|
+
LINK_LINE2_REGEX = re.compile(r"^[-\w.%&=/]{5,}$")
|
|
35
39
|
QUOTED_REPLY_LINE_REGEX = re.compile(r'(\nFrom:(.*)|wrote:)\n', re.IGNORECASE)
|
|
36
40
|
REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
|
|
37
41
|
|
|
@@ -42,11 +46,12 @@ LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
|
|
|
42
46
|
|
|
43
47
|
SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
|
|
44
48
|
REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
|
|
45
|
-
URL_SIGNIFIERS = ['gclid', 'htm', 'ref=', 'utm']
|
|
49
|
+
URL_SIGNIFIERS = ['?amp', 'amp?', 'cd=', 'click', 'CMP=', 'contentId', 'ft=', 'gclid', 'htm', 'mp=', 'keywords=', 'Id=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'sp=', 'usg=', 'utm']
|
|
46
50
|
APPEARS_IN = 'appears in'
|
|
47
|
-
|
|
51
|
+
|
|
48
52
|
MAX_NUM_HEADER_LINES = 14
|
|
49
|
-
MAX_QUOTED_REPLIES =
|
|
53
|
+
MAX_QUOTED_REPLIES = 1
|
|
54
|
+
NUM_WORDS_IN_LAST_QUOTE = 6
|
|
50
55
|
|
|
51
56
|
REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
|
|
52
57
|
'********************************',
|
|
@@ -72,18 +77,25 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
|
72
77
|
# Signatures
|
|
73
78
|
'BlackBerry by AT &T': 'BlackBerry by AT&T',
|
|
74
79
|
'BlackBerry from T- Mobile': 'BlackBerry from T-Mobile',
|
|
75
|
-
'Envoy& de
|
|
80
|
+
'Envoy& de': 'Envoyé de',
|
|
76
81
|
"from my 'Phone": 'from my iPhone',
|
|
77
82
|
'from Samsung Mob.le': 'from Samsung Mobile',
|
|
78
83
|
'gJeremyRubin': '@JeremyRubin',
|
|
79
84
|
'Sent from Mabfl': 'Sent from Mobile', # NADIA_MARCINKO signature bad OCR
|
|
80
85
|
'twitter glhsummers': 'twitter @lhsummers',
|
|
86
|
+
re.compile(r"[cC]o-authored with i ?Phone auto-correct"): "Co-authored with iPhone auto-correct",
|
|
81
87
|
re.compile(r"twitter\.com[i/][lI]krauss[1lt]"): "twitter.com/lkrauss1",
|
|
82
88
|
re.compile(r'from my BlackBerry[0°] wireless device'): 'from my BlackBerry® wireless device',
|
|
83
89
|
re.compile(r'^INW$', re.MULTILINE): REDACTED,
|
|
84
90
|
# links
|
|
85
91
|
'Imps ://': 'https://',
|
|
92
|
+
'on-accusers-rose-\nmcgowan/ ': 'on-accusers-rose-\nmcgowan/\n',
|
|
93
|
+
'the-truth-\nabout-the-bitcoin-foundation/ )': 'the-truth-about-the-bitcoin-foundation/ )\n',
|
|
94
|
+
'woody-allen-jeffrey-epsteins-\nsociety-friends-close-ranks/ ---': 'woody-allen-jeffrey-epsteins-society-friends-close_ranks/\n',
|
|
95
|
+
' https://www.theguardian.com/world/2017/may/29/close-friend-trump-thomas-barrack-\nalleged-tax-evasion-italy-sardinia?CMP=share btn fb': '\nhttps://www.theguardian.com/world/2017/may/29/close-friend-trump-thomas-barrack-alleged-tax-evasion-italy-sardinia?CMP=share_btn_fb',
|
|
86
96
|
re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
|
|
97
|
+
re.compile(r" http ?://www. ?dailymail. ?co ?.uk/news/article-\d+/Troub ?led-woman-history-drug-\n?us ?e-\n?.*html"): '\nhttp://www.dailymail.co.uk/news/article-3914012/Troubled-woman-history-drug-use-claimed-assaulted-Donald-Trump-Jeffrey-Epstein-sex-party-age-13-FABRICATED-story.html',
|
|
98
|
+
re.compile(r"http.*steve-bannon-trump-tower-\n?interview-\n?trumps-\n?strategist-plots-\n?new-political-movement-948747"): "\nhttp://www.hollywoodreporter.com/news/steve-bannon-trump-tower-interview-trumps-strategist-plots-new-political-movement-948747",
|
|
87
99
|
# Subject lines
|
|
88
100
|
"Arrested in\nInauguration Day Riot": "Arrested in Inauguration Day Riot",
|
|
89
101
|
"as Putin Mayhem Tests President's Grip\non GOP": "as Putin Mayhem Tests President's Grip on GOP",
|
|
@@ -94,6 +106,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
|
94
106
|
"COVER UP SEX ABUSE CRIMES\nBY THE WHITE HOUSE": "COVER UP SEX ABUSE CRIMES BY THE WHITE HOUSE",
|
|
95
107
|
'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
|
|
96
108
|
"War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
|
|
109
|
+
"Subject; RE": "Subject: RE",
|
|
97
110
|
re.compile(r"deadline re Mr Bradley Edwards vs Mr\s*Jeffrey Epstein", re.I): "deadline re Mr Bradley Edwards vs Mr Jeffrey Epstein",
|
|
98
111
|
re.compile(r"Following Plea That Implicated Trump -\s*https://www.npr.org/676040070", re.I): "Following Plea That Implicated Trump - https://www.npr.org/676040070",
|
|
99
112
|
re.compile(r"for Attorney General -\s+Wikisource, the"): r"for Attorney General - Wikisource, the",
|
|
@@ -104,27 +117,43 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
|
104
117
|
re.compile(r"Subject:\s*Fwd: Trending Now: Friends for three decades"): "Subject: Fwd: Trending Now: Friends for three decades",
|
|
105
118
|
# Misc
|
|
106
119
|
'AVG°': 'AVGO',
|
|
120
|
+
'Saw Matt C with DTF at golf': 'Saw Matt C with DJT at golf',
|
|
121
|
+
re.compile(r"[i. ]*Privileged[- ]*Redacted[i. ]*"): '<PRIVILEGED - REDACTED>',
|
|
107
122
|
}
|
|
108
123
|
|
|
109
124
|
EMAIL_SIGNATURE_REGEXES = {
|
|
110
125
|
ARIANE_DE_ROTHSCHILD: re.compile(r"Ensemble.*\nCe.*\ndestinataires.*\nremercions.*\nautorisee.*\nd.*\nLe.*\ncontenues.*\nEdmond.*\nRoth.*\nlo.*\nRoth.*\ninfo.*\nFranc.*\n.2.*", re.I),
|
|
111
126
|
BARBRO_C_EHNBOM: re.compile(r"Barbro C.? Ehn.*\nChairman, Swedish-American.*\n((Office|Cell|Sweden):.*\n)*(360.*\nNew York.*)?"),
|
|
127
|
+
BRAD_KARP: re.compile(r"This message is intended only for the use of the Addressee and may contain information.*\nnot the intended recipient, you are hereby notified.*\nreceived this communication in error.*"),
|
|
128
|
+
DANIEL_SIAD: re.compile(r"Confidentiality Notice: The information contained in this electronic message is PRIVILEGED and confidential information intended only for the use of the individual entity or entities named as recipient or recipients. If the reader is not the intended recipient, be hereby notified that any dissemination, distribution or copy of this communication is strictly prohibited. If you have received this communication in error, please notify me immediately by electronic mail or by telephone and permanently delete this message from your computer system. Thank you.".replace(' ', r'\s*'), re.IGNORECASE),
|
|
112
129
|
DANNY_FROST: re.compile(r"Danny Frost\nDirector.*\nManhattan District.*\n212.*", re.IGNORECASE),
|
|
113
130
|
DARREN_INDYKE: re.compile(r"DARREN K. INDYKE.*?\**\nThe information contained in this communication.*?Darren K.[\n\s]+?[Il]ndyke(, PLLC)? — All rights reserved\.? ?\n\*{50,120}(\n\**)?", re.DOTALL),
|
|
131
|
+
DAVID_FISZEL: re.compile(r"This e-mail and any file.*\nmail and/or any file.*\nmail or any.*\nreceived.*\nmisdirected.*"),
|
|
114
132
|
DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
|
|
115
133
|
DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
|
|
134
|
+
EDUARDO_ROBLES: re.compile(r"(• )?email:.*\n(• )?email:\n(• )?website: www.creativekingdom.com\n(• )?address: 5th Floor Office No:504 Aspect Tower,\nBusiness Bay, Dubai United Arab Emirates."),
|
|
135
|
+
ERIC_ROTH: re.compile(r"2221 Smithtown Avenue\nLong Island.*\nRonkonkoma.*\n(.1. )?Phone\nFax\nCell\ne-mail"),
|
|
136
|
+
GHISLAINE_MAXWELL: re.compile(r"FACEBOOK\nTWITTER\nG\+\nPINTEREST\nINSTAGRAM\nPLEDGE\nTHE DAILY CATCH"),
|
|
116
137
|
JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
|
|
117
138
|
JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*(\nTel:.*)?(\nEmail:.*)?", re.IGNORECASE),
|
|
118
139
|
KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
|
|
119
140
|
LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
|
|
120
141
|
LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
|
|
121
|
-
|
|
122
|
-
|
|
142
|
+
LEON_BLACK: re.compile(r"This email and any files transmitted with it are confidential and intended solely.*\n(they|whom).*\ndissemination.*\nother.*\nand delete.*"),
|
|
143
|
+
LISA_NEW: re.compile(r"Elisa New\nPowell M. Cabot.*\n(Director.*\n)?Harvard.*\n148.*\n([1I] )?12.*\nCambridge.*\n([1I] )?02138"),
|
|
144
|
+
MARTIN_WEINBERG: re.compile(r"(Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*?)?(\n.*?([cC]ell|Office))*\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
|
|
145
|
+
MICHAEL_MILLER: re.compile(r"Michael C. Miller\nPartner\nwww.steptoe.com/mmiller\nSteptoe\n(Privileged.*\n)?(\+1\s+)?direct.*\n(\+1\s+)?(\+1\s+)?fax.*\n(\+1.*)?cell.*\n(www.steptoe.com\n)?This message and any.*\nyou are not.*\nnotify the sender.*"),
|
|
146
|
+
NICHOLAS_RIBIS: re.compile(r"60 Morris Turnpike 2FL\nSummit,? NJ.*\n0:\nF:\n\*{20,}\nCONFIDENTIALITY NOTICE.*\nattachments.*\ncopying.*\nIf you have.*\nthe copy.*\nThank.*\n\*{20,}"),
|
|
123
147
|
PETER_MANDELSON: re.compile(r'Disclaimer This email and any attachments to it may be.*?with[ \n]+number(.*?EC4V[ \n]+6BJ)?', re.DOTALL | re.IGNORECASE),
|
|
124
148
|
PAUL_BARRETT: re.compile(r"Paul Barrett[\n\s]+Alpha Group Capital LLC[\n\s]+(142 W 57th Street, 11th Floor, New York, NY 10019?[\n\s]+)?(al?[\n\s]*)?ALPHA GROUP[\n\s]+CAPITAL"),
|
|
125
|
-
|
|
149
|
+
PETER_ATTIA: re.compile(r"The information contained in this transmission may contain.*\n(laws|patient).*\n(distribution|named).*\n(distribution.*\nplease.*|copies.*)"),
|
|
150
|
+
RICHARD_KAHN: re.compile(fr'Richard Kahn[\n\s]+HBRK Associates Inc.?[\n\s]+((301 East 66th Street, Suite 1OF|575 Lexington Avenue,? 4th Floor,?)[\n\s]+)?New York, (NY|New York) 100(22|65)(\s+(Tel?|Phone)( I|{REDACTED})?\s+Fa[x",]?(_|{REDACTED})*\s+[Ce]el?l?)?', re.IGNORECASE),
|
|
151
|
+
ROSS_GOW: re.compile(r"Ross Gow\nManaging Partner\nACUITY Reputation Limited\n23 Berkeley Square\nLondon.*\nMobile.*\nTel"),
|
|
152
|
+
STEPHEN_HANSON: re.compile(r"(> )?Confidentiality Notice: This e-mail transmission.*\n(which it is addressed )?and may contain.*\n(applicable law. If you are not the intended )?recipient you are hereby.*\n(information contained in or attached to this transmission is )?STRICTLY PROHIBITED.*"),
|
|
153
|
+
STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
|
|
126
154
|
'Susan Edelman': re.compile(r'Susan Edel.*\nReporter\n1211.*\n917.*\nsedelman.*', re.IGNORECASE),
|
|
127
155
|
TERRY_KAFKA: re.compile(r"((>|I) )?Terry B.? Kafka.*\n(> )?Impact Outdoor.*\n(> )?5454.*\n(> )?Dallas.*\n((> )?c?ell.*\n)?(> )?Impactoutdoor.*(\n(> )?cell.*)?", re.IGNORECASE),
|
|
156
|
+
TOM_PRITZKER: re.compile(r"The contents of this email message.*\ncontain confidential.*\n(not )?the intended.*\n(error|please).*\n(you )?(are )?not the.*\n(this )?message.*"),
|
|
128
157
|
TONJA_HADDAD_COLEMAN: re.compile(fr"Tonja Haddad Coleman.*\nTonja Haddad.*\nAdvocate Building\n315 SE 7th.*(\nSuite.*)?\nFort Lauderdale.*(\n({REDACTED} )?facsimile)?(\nwww.tonjahaddad.com?)?(\nPlease add this efiling.*\nThe information.*\nyou are not.*\nyou are not.*)?", re.IGNORECASE),
|
|
129
158
|
UNKNOWN: re.compile(r"(This message is directed to and is for the use of the above-noted addressee only.*\nhereon\.)", re.DOTALL),
|
|
130
159
|
}
|
|
@@ -136,118 +165,81 @@ MAILING_LISTS = [
|
|
|
136
165
|
JP_MORGAN_USGIO,
|
|
137
166
|
]
|
|
138
167
|
|
|
139
|
-
|
|
168
|
+
BCC_LISTS = JUNK_EMAILERS + MAILING_LISTS
|
|
169
|
+
|
|
170
|
+
TRUNCATE_EMAILS_FROM_OR_TO = [
|
|
171
|
+
AMANDA_ENS,
|
|
172
|
+
ANTHONY_BARRETT,
|
|
173
|
+
DANIEL_SABBA,
|
|
174
|
+
DIANE_ZIMAN,
|
|
175
|
+
JOSCHA_BACH,
|
|
176
|
+
KATHERINE_KEATING,
|
|
177
|
+
LAWRANCE_VISOSKI,
|
|
178
|
+
LAWRENCE_KRAUSS,
|
|
179
|
+
LISA_NEW,
|
|
180
|
+
MOSHE_HOFFMAN,
|
|
181
|
+
NILI_PRIELL_BARAK,
|
|
182
|
+
PAUL_KRASSNER,
|
|
183
|
+
PAUL_PROSPERI,
|
|
184
|
+
'Susan Edelman',
|
|
185
|
+
TERRY_KAFKA,
|
|
186
|
+
]
|
|
140
187
|
|
|
141
|
-
|
|
188
|
+
TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
|
|
142
189
|
'Alan S Halperin',
|
|
190
|
+
'Alain Forget',
|
|
191
|
+
ARIANE_DE_ROTHSCHILD,
|
|
192
|
+
AZIZA_ALAHMADI,
|
|
193
|
+
BILL_SIEGEL,
|
|
194
|
+
DAVID_HAIG,
|
|
195
|
+
EDWARD_ROD_LARSEN,
|
|
196
|
+
JOHNNY_EL_HACHEM,
|
|
197
|
+
'Mark Green',
|
|
198
|
+
MELANIE_WALKER,
|
|
143
199
|
'Mitchell Bard',
|
|
200
|
+
PEGGY_SIEGAL,
|
|
201
|
+
ROBERT_LAWRENCE_KUHN,
|
|
202
|
+
ROBERT_TRIVERS,
|
|
144
203
|
'Skip Rimer',
|
|
204
|
+
'Steven Elkman',
|
|
205
|
+
STEVEN_PFEIFFER,
|
|
145
206
|
'Steven Victor MD',
|
|
207
|
+
TERRY_KAFKA,
|
|
146
208
|
]
|
|
147
209
|
|
|
148
|
-
TRUNCATION_LENGTHS = {
|
|
149
|
-
'023627': 16_800, # Micheal Wolff article with brock pierce
|
|
150
|
-
'030245': None, # Epstein rationalizes his behavior in an open letter to the world
|
|
151
|
-
'030781': None, # Bannon email about crypto coin issues
|
|
152
|
-
'032906': None, # David Blaine email
|
|
153
|
-
'026036': 6000, # Gino Yu blockchain mention
|
|
154
|
-
'023208': None, # Long discussion about leon black's finances
|
|
155
|
-
'029609': None, # Joi Ito
|
|
156
|
-
'025233': None, # Reputation.com discussion
|
|
157
|
-
}
|
|
158
|
-
|
|
159
210
|
# These are long forwarded articles so we force a trim to 1,333 chars if these strings exist
|
|
160
211
|
TRUNCATE_TERMS = [
|
|
161
|
-
'The rebuilding of Indonesia',
|
|
162
|
-
'
|
|
163
|
-
'THOMAS L. FRIEDMAN',
|
|
164
|
-
'a sleek, briskly paced film whose title suggests a heist movie',
|
|
165
|
-
'quote from The Colbert Report distinguishes',
|
|
166
|
-
'co-inventor of the GTX Smart Shoe',
|
|
167
|
-
'my latest Washington Post column',
|
|
168
|
-
'supported my humanities work at Harvard',
|
|
212
|
+
'The rebuilding of Indonesia', # Vikcy ward article
|
|
213
|
+
'a sleek, briskly paced film whose title suggests a heist movie', # Inside Job
|
|
169
214
|
'Calendar of Major Events, Openings, and Fundraisers',
|
|
170
|
-
'
|
|
171
|
-
'as responsible for the democratisation of computing and',
|
|
172
|
-
'AROUND 1,000 operational satellites are circling the Earth',
|
|
215
|
+
'sent over from Marshall Heyman at the WSJ',
|
|
173
216
|
"In recent months, China's BAT collapse",
|
|
174
217
|
'President Obama introduces Jim Yong Kim as his nominee',
|
|
175
218
|
'Trump appears with mobster-affiliated felon at New',
|
|
176
|
-
'Lead Code Enforcement Walton presented the facts',
|
|
177
|
-
"Is UNRWA vital for the Palestinians' future",
|
|
178
|
-
'The New York company, led by Stephen Ross',
|
|
179
|
-
'I spent some time mulling additional aspects of a third choice presidential',
|
|
180
|
-
'you are referring to duplication of a gene',
|
|
181
|
-
'i am writing you both because i am attaching a still not-quite-complete response',
|
|
182
|
-
'Learn to meditate and discover what truly nourishes your entire being',
|
|
183
219
|
'Congratulations to the 2019 Hillman Prize recipients',
|
|
184
|
-
'This much we know - the Fall elections are shaping up',
|
|
185
220
|
"Special counsel Robert Mueller's investigation may face a serious legal obstacle",
|
|
186
221
|
"nearly leak-proof since its inception more than a year ago",
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
'
|
|
191
|
-
|
|
192
|
-
'
|
|
193
|
-
|
|
194
|
-
'We remain positive on banks that can make acceptable returns',
|
|
195
|
-
'David Woo (BAML head of FX, Rates and EM Strategy, very highly regarded',
|
|
196
|
-
"Please let me know if you're interested in joining a small group meeting",
|
|
197
|
-
'Erika Najarian, BAML financials research analyst, just returned',
|
|
198
|
-
'We can also discuss single stock and Topix banks',
|
|
199
|
-
'We are recording unprecedented divergences in falling equity vol',
|
|
200
|
-
'As previously discussed between you and Ariane',
|
|
201
|
-
'no evidence you got the latest so i have sent you just the key message',
|
|
202
|
-
# Joscha Bach
|
|
203
|
-
'Cells seem to be mostly indistinguishable (except',
|
|
204
|
-
'gender differenece. unlikely motivational, every cell is different',
|
|
205
|
-
'Some thoughts I meant to send back for a long time',
|
|
206
|
-
# Krassner
|
|
207
|
-
'My friend Michael Simmons, who has been the editor of National Lampoon',
|
|
208
|
-
"In the premiere episode of 'The Last Laugh' podcast, Sarah Silverman",
|
|
209
|
-
'Thanks so much for sharing both your note to Steven and your latest Manson essay',
|
|
210
|
-
# Edward Larson
|
|
211
|
-
'Coming from an international background, and having lived in Oslo, Tel Aviv',
|
|
212
|
-
# Katherine Keating
|
|
213
|
-
'Paul Keating is aware that many people see him as a puzzle and contradiction',
|
|
214
|
-
'his panoramic view of world affairs sharper than ever, Paul Keating blames',
|
|
215
|
-
# melanie
|
|
216
|
-
'Some years ago when I worked at the libertarian Cato Institute'
|
|
217
|
-
# rich kahn
|
|
218
|
-
'House and Senate Republicans on their respective tax overhaul',
|
|
219
|
-
'The Tax Act contains changes to the treatment of "carried interests"',
|
|
220
|
-
'General Election: Trump vs. Clinton LA Times/USC Tracking',
|
|
221
|
-
'Location: Quicken Loans Arena in Cleveland, OH',
|
|
222
|
-
'A friendly discussion about Syria with a former US State Department',
|
|
223
|
-
# Robert Kuhn
|
|
224
|
-
'The US trade war against China: The view from Beijing',
|
|
225
|
-
# Tom / Paul Krassner
|
|
226
|
-
'I forgot to post my cartoon from week before last, about Howard Schultz',
|
|
222
|
+
# Nikolic
|
|
223
|
+
'Nuclear Operator Raises Alarm on Crisis',
|
|
224
|
+
'as responsible for the democratisation of computing and',
|
|
225
|
+
'AROUND 1,000 operational satellites are circling the Earth',
|
|
226
|
+
# Sultan Sulayem
|
|
227
|
+
'co-inventor of the GTX Smart Shoe',
|
|
228
|
+
'my latest Washington Post column',
|
|
227
229
|
# Bannon
|
|
230
|
+
'As Steve Bannon continues his tour of Europe',
|
|
228
231
|
"Bannon the European: He's opening the populist fort in Brussels",
|
|
229
232
|
"Steve Bannon doesn't do subtle.",
|
|
230
233
|
'The Department of Justice lost its latest battle with Congress',
|
|
231
|
-
|
|
232
|
-
#
|
|
233
|
-
'
|
|
234
|
-
#
|
|
235
|
-
'
|
|
236
|
-
|
|
237
|
-
'
|
|
238
|
-
'The
|
|
239
|
-
'
|
|
240
|
-
# Nikolic
|
|
241
|
-
'people from LifeBall',
|
|
242
|
-
# Epstein
|
|
243
|
-
'David Ben Gurion was asked why he, after 2000',
|
|
244
|
-
# Lisa New
|
|
245
|
-
'The raw materials for that period include interviews',
|
|
246
|
-
'Whether you donated to Poetry in America through',
|
|
247
|
-
# Random
|
|
248
|
-
'Little Hodiaki',
|
|
249
|
-
"It began with deep worries regarding China's growth path",
|
|
250
|
-
'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
|
|
234
|
+
'pedophile Jeffrey Epstein bought his way out',
|
|
235
|
+
# lawyers
|
|
236
|
+
'recuses itself from Jeffrey Epstein case',
|
|
237
|
+
# Misc
|
|
238
|
+
'people from LifeBall', # Nikolic
|
|
239
|
+
"It began with deep worries regarding China's growth path", # Paul Morris
|
|
240
|
+
'A friendly discussion about Syria with a former US State Department', # Fabrice Aidan
|
|
241
|
+
'The US trade war against China: The view from Beijing', # Robert Kuhn / Groff
|
|
242
|
+
'This much we know - the Fall elections are shaping up', # Juleanna Glover / Bannon
|
|
251
243
|
]
|
|
252
244
|
|
|
253
245
|
METADATA_FIELDS = [
|
|
@@ -258,56 +250,100 @@ METADATA_FIELDS = [
|
|
|
258
250
|
'subject',
|
|
259
251
|
]
|
|
260
252
|
|
|
261
|
-
# Note the line repair happens *after* 'Importance: High' is removed
|
|
253
|
+
# Arguments to _merge_lines(). Note the line repair happens *after* 'Importance: High' is removed
|
|
262
254
|
LINE_REPAIR_MERGES = {
|
|
263
|
-
'
|
|
264
|
-
'
|
|
265
|
-
'
|
|
266
|
-
'
|
|
267
|
-
'
|
|
268
|
-
'
|
|
269
|
-
'
|
|
270
|
-
'
|
|
271
|
-
'
|
|
272
|
-
'
|
|
273
|
-
'
|
|
274
|
-
'
|
|
275
|
-
'
|
|
276
|
-
'
|
|
277
|
-
'
|
|
278
|
-
'
|
|
279
|
-
'
|
|
280
|
-
'
|
|
281
|
-
'
|
|
282
|
-
'
|
|
283
|
-
'
|
|
284
|
-
'
|
|
285
|
-
'
|
|
286
|
-
'
|
|
287
|
-
'
|
|
288
|
-
'
|
|
289
|
-
'
|
|
290
|
-
'
|
|
291
|
-
'
|
|
292
|
-
'
|
|
293
|
-
'
|
|
294
|
-
'
|
|
295
|
-
'
|
|
296
|
-
'
|
|
297
|
-
'
|
|
298
|
-
'
|
|
299
|
-
'
|
|
300
|
-
'
|
|
301
|
-
'
|
|
302
|
-
'
|
|
303
|
-
'
|
|
304
|
-
'
|
|
305
|
-
'
|
|
306
|
-
'
|
|
307
|
-
'
|
|
308
|
-
'
|
|
309
|
-
'
|
|
310
|
-
'
|
|
255
|
+
'013405': [[4]] * 2,
|
|
256
|
+
'013415': [[4]] * 2,
|
|
257
|
+
'014397': [[4]] * 2,
|
|
258
|
+
'014860': [[3], [4], [4]],
|
|
259
|
+
'017523': [[4]],
|
|
260
|
+
'030367': [[1, 4], [2, 4]],
|
|
261
|
+
'019105': [[5]] * 4,
|
|
262
|
+
'019407': [[2, 4]],
|
|
263
|
+
'022187': [[1, 8], [2, 8], [3, 8], [4, 8]],
|
|
264
|
+
'021729': [[2]],
|
|
265
|
+
'032896': [[2]],
|
|
266
|
+
'033050': [[0, 6], [1, 6], [2, 6], [3, 6], [4, 6]],
|
|
267
|
+
'022949': [[0, 4], [1, 4]],
|
|
268
|
+
'022197': [[0, 5], [1, 5], [3, 5]],
|
|
269
|
+
'021814': [[1, 6], [2, 6], [3, 6], [4, 6]],
|
|
270
|
+
'022190': [[1, 7], [0, 6], [3, 6], [4, 6]],
|
|
271
|
+
'029582': [[0, 5], [1, 5], [3, 5], [3, 5]],
|
|
272
|
+
'022673': [[9]],
|
|
273
|
+
'022684': [[9]],
|
|
274
|
+
'026625': [[0, 7], [1, 7], [2, 7], [3, 7], [4, 7], [5, 7]],
|
|
275
|
+
'026659': [[0, 5], [1, 5]],
|
|
276
|
+
'026764': [[0, 6], [1, 6]],
|
|
277
|
+
'022695': [[4]],
|
|
278
|
+
'022977': [[9]] * 10,
|
|
279
|
+
'023001': [[5]] * 3,
|
|
280
|
+
'023067': [[3]],
|
|
281
|
+
'025233': [[4]] * 2,
|
|
282
|
+
'025329': [[2]] * 9,
|
|
283
|
+
'025790': [[2]],
|
|
284
|
+
'025812': [[3]] * 2,
|
|
285
|
+
'025589': [[3]] * 12,
|
|
286
|
+
'026345': [[3]],
|
|
287
|
+
'026609': [[4]],
|
|
288
|
+
'028921': [[5, 4], [4, 5]],
|
|
289
|
+
'026620': ([[20]] * 4) + [[3, 2]] + ([[2]] * 15) + [[2, 4]],
|
|
290
|
+
'026829': [[3]],
|
|
291
|
+
'026924': [[2, 4]],
|
|
292
|
+
'028728': [[3]],
|
|
293
|
+
'026451': [[3, 5]] * 2,
|
|
294
|
+
'028931': [[3, 6]],
|
|
295
|
+
'029154': [[2, 5]],
|
|
296
|
+
'029163': [[2, 5]],
|
|
297
|
+
'029282': [[2]],
|
|
298
|
+
'029402': [[5]],
|
|
299
|
+
'029433': [[3]],
|
|
300
|
+
'029458': [[4]] * 3,
|
|
301
|
+
'029498': [[2], [2, 4]],
|
|
302
|
+
'029501': [[2]],
|
|
303
|
+
'029545': [[3, 5]],
|
|
304
|
+
'029773': [[2, 5]],
|
|
305
|
+
'029831': [[3, 6]],
|
|
306
|
+
'029835': [[2, 4]],
|
|
307
|
+
'029841': [[3]],
|
|
308
|
+
'029889': [[2], [2, 5]],
|
|
309
|
+
'029976': [[3]],
|
|
310
|
+
'029977': ([[2]] * 4) + [[4], [2, 4]],
|
|
311
|
+
'030299': [[7, 10]],
|
|
312
|
+
'030315': [[3, 5]],
|
|
313
|
+
'030318': [[3, 5]],
|
|
314
|
+
'030381': [[2, 4]],
|
|
315
|
+
'030384': [[2, 4]],
|
|
316
|
+
'030626': [[2], [4]],
|
|
317
|
+
'030861': [[3, 8]],
|
|
318
|
+
'030999': [[2, 4]],
|
|
319
|
+
'031384': [[2]],
|
|
320
|
+
'031428': [[2], [2, 4]],
|
|
321
|
+
'031442': [[0]],
|
|
322
|
+
'031489': [[2, 4], [3, 4], [3, 4], [10]],
|
|
323
|
+
'031619': [[7], [17], [17]],
|
|
324
|
+
'031748': [[3]] * 2,
|
|
325
|
+
'031764': [[3], [8]], # 8 is just for style fix internally, not header
|
|
326
|
+
'031980': [[2, 4]],
|
|
327
|
+
'032063': [[3, 5]],
|
|
328
|
+
'032272': [[2, 10], [3]],
|
|
329
|
+
'032405': [[4]],
|
|
330
|
+
'032637': [[9]] * 3,
|
|
331
|
+
'033097': [[2]],
|
|
332
|
+
'033144': [[2, 4]],
|
|
333
|
+
'033217': [[3]],
|
|
334
|
+
'033228': [[3, 5]],
|
|
335
|
+
'033252': [[9]] * 2,
|
|
336
|
+
'033271': [[3]],
|
|
337
|
+
'033299': [[3]],
|
|
338
|
+
'033357': [[2, 4]],
|
|
339
|
+
'033486': [[7, 9]],
|
|
340
|
+
'033512': [[2]],
|
|
341
|
+
'026024': [[1, 3], [2, 3]],
|
|
342
|
+
'024923': [[0, 5], [2]],
|
|
343
|
+
'033568': [[5]] * 5,
|
|
344
|
+
'033575': [[2, 4]],
|
|
345
|
+
'033576': [[3]],
|
|
346
|
+
'033583': [[2]],
|
|
311
347
|
}
|
|
312
348
|
|
|
313
349
|
|
|
@@ -322,12 +358,15 @@ class Email(Communication):
|
|
|
322
358
|
sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
|
|
323
359
|
signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
|
|
324
360
|
"""
|
|
361
|
+
attached_docs: list[OtherFile] = field(default_factory=list)
|
|
325
362
|
actual_text: str = field(init=False)
|
|
326
363
|
config: EmailCfg | None = None
|
|
327
364
|
header: EmailHeader = field(init=False)
|
|
328
365
|
recipients: list[Name] = field(default_factory=list)
|
|
329
366
|
sent_from_device: str | None = None
|
|
330
367
|
signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
|
|
368
|
+
_is_first_for_user: bool = False # Only set when printing
|
|
369
|
+
_line_merge_arguments: list[tuple[int] | tuple[int, int]] = field(default_factory=list)
|
|
331
370
|
|
|
332
371
|
# For logging how many headers we prettified while printing, kind of janky
|
|
333
372
|
rewritten_header_ids: ClassVar[set[str]] = set([])
|
|
@@ -353,7 +392,7 @@ class Email(Communication):
|
|
|
353
392
|
self.recipients.extend(self._extract_emailer_names(recipient))
|
|
354
393
|
|
|
355
394
|
# Assume mailing list emails are to Epstein
|
|
356
|
-
if self.author in
|
|
395
|
+
if self.author in BCC_LISTS and (self.is_note_to_self() or not self.recipients):
|
|
357
396
|
self.recipients = [JEFFREY_EPSTEIN]
|
|
358
397
|
|
|
359
398
|
# Remove self CCs but preserve self emails
|
|
@@ -366,6 +405,7 @@ class Email(Communication):
|
|
|
366
405
|
self.sent_from_device = self._sent_from_device()
|
|
367
406
|
|
|
368
407
|
def attachments(self) -> list[str]:
|
|
408
|
+
"""Returns the string in the header."""
|
|
369
409
|
return (self.header.attachments or '').split(';')
|
|
370
410
|
|
|
371
411
|
def info_txt(self) -> Text:
|
|
@@ -379,7 +419,12 @@ class Email(Communication):
|
|
|
379
419
|
return txt.append(highlighter(f" probably sent at {self.timestamp}"))
|
|
380
420
|
|
|
381
421
|
def is_fwded_article(self) -> bool:
|
|
382
|
-
|
|
422
|
+
if self.config is None:
|
|
423
|
+
return False
|
|
424
|
+
elif self.config.fwded_text_after:
|
|
425
|
+
return self.config.is_fwded_article is not False
|
|
426
|
+
else:
|
|
427
|
+
return bool(self.config.is_fwded_article)
|
|
383
428
|
|
|
384
429
|
def is_junk_mail(self) -> bool:
|
|
385
430
|
return self.author in JUNK_EMAILERS
|
|
@@ -390,6 +435,15 @@ class Email(Communication):
|
|
|
390
435
|
def is_note_to_self(self) -> bool:
|
|
391
436
|
return self.recipients == [self.author]
|
|
392
437
|
|
|
438
|
+
def is_from_or_to(self, name: str) -> bool:
|
|
439
|
+
return name in [self.author] + self.recipients
|
|
440
|
+
|
|
441
|
+
def is_word_count_worthy(self) -> bool:
|
|
442
|
+
if self.is_fwded_article():
|
|
443
|
+
return bool(self.config.fwded_text_after) or len(self.actual_text) < 150
|
|
444
|
+
else:
|
|
445
|
+
return not self.is_mailing_list()
|
|
446
|
+
|
|
393
447
|
def metadata(self) -> Metadata:
|
|
394
448
|
local_metadata = asdict(self)
|
|
395
449
|
local_metadata['is_junk_mail'] = self.is_junk_mail()
|
|
@@ -436,8 +490,9 @@ class Email(Communication):
|
|
|
436
490
|
elif self.header.num_header_rows == 0:
|
|
437
491
|
return self.text
|
|
438
492
|
|
|
493
|
+
# import pdb;pdb.set_trace()
|
|
439
494
|
self.log_top_lines(20, "Raw text:", logging.DEBUG)
|
|
440
|
-
self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
|
|
495
|
+
self.log(f"With {self.header.num_header_rows} header lines removed:\n{text[0:500]}\n\n", logging.DEBUG)
|
|
441
496
|
reply_text_match = REPLY_TEXT_REGEX.search(text)
|
|
442
497
|
|
|
443
498
|
if reply_text_match:
|
|
@@ -516,8 +571,8 @@ class Email(Communication):
|
|
|
516
571
|
logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
|
|
517
572
|
|
|
518
573
|
def _extract_timestamp(self) -> datetime:
|
|
519
|
-
if self.config and self.config.timestamp:
|
|
520
|
-
return self.config.timestamp
|
|
574
|
+
if self.config and self.config.timestamp():
|
|
575
|
+
return self.config.timestamp()
|
|
521
576
|
elif self.header.sent_at:
|
|
522
577
|
timestamp = _parse_timestamp(self.header.sent_at)
|
|
523
578
|
|
|
@@ -546,31 +601,41 @@ class Email(Communication):
|
|
|
546
601
|
logger.debug(f"Fell back to timestamp {timestamp} in line '{line}'...")
|
|
547
602
|
return timestamp
|
|
548
603
|
|
|
549
|
-
|
|
604
|
+
no_timestamp_msg = f"No timestamp found in '{self.file_path.name}'"
|
|
550
605
|
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
if text is None:
|
|
554
|
-
header_offset = len(self.header.header_chars)
|
|
555
|
-
text = self.text[header_offset:]
|
|
606
|
+
if self.is_duplicate():
|
|
607
|
+
logger.warning(f"{no_timestamp_msg} but timestamp should be copied from {self.duplicate_of_id()}")
|
|
556
608
|
else:
|
|
557
|
-
|
|
609
|
+
raise RuntimeError(f"{no_timestamp_msg}, top lines:\n{searchable_text}")
|
|
610
|
+
|
|
611
|
+
def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES) -> int | None:
|
|
612
|
+
"""Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
|
|
613
|
+
header_offset = len(self.header.header_chars)
|
|
614
|
+
text = self.text[header_offset:]
|
|
558
615
|
|
|
559
616
|
for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text)):
|
|
560
617
|
if i >= n:
|
|
561
618
|
return match.end() + header_offset - 1
|
|
562
619
|
|
|
563
|
-
def _merge_lines(self,
|
|
620
|
+
def _merge_lines(self, idx1: int, idx2: int | None = None) -> None:
|
|
564
621
|
"""Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
|
|
565
|
-
|
|
566
|
-
|
|
622
|
+
if idx2 is None:
|
|
623
|
+
self._line_merge_arguments.append((idx1,))
|
|
624
|
+
idx2 = idx1 + 1
|
|
625
|
+
else:
|
|
626
|
+
self._line_merge_arguments.append((idx1, idx2))
|
|
567
627
|
|
|
568
|
-
if idx2
|
|
569
|
-
|
|
570
|
-
elif idx2 ==
|
|
571
|
-
|
|
628
|
+
if idx2 < idx1:
|
|
629
|
+
lines = self.lines[0:idx2] + self.lines[idx2 + 1:idx1] + [self.lines[idx1] + ' ' + self.lines[idx2]] + self.lines[idx1 + 1:]
|
|
630
|
+
elif idx2 == idx1:
|
|
631
|
+
raise RuntimeError(f"idx2 ({idx2}) must be greater or less than idx ({idx1})")
|
|
572
632
|
else:
|
|
573
|
-
lines
|
|
633
|
+
lines = self.lines[0:idx1]
|
|
634
|
+
|
|
635
|
+
if idx2 == (idx1 + 1):
|
|
636
|
+
lines += [self.lines[idx1] + ' ' + self.lines[idx1 + 1]] + self.lines[idx1 + 2:]
|
|
637
|
+
else:
|
|
638
|
+
lines += [self.lines[idx1] + ' ' + self.lines[idx2]] + self.lines[idx1 + 1:idx2] + self.lines[idx2 + 1:]
|
|
574
639
|
|
|
575
640
|
self._set_computed_fields(lines=lines)
|
|
576
641
|
|
|
@@ -586,6 +651,10 @@ class Email(Communication):
|
|
|
586
651
|
self.signature_substitution_counts[name] = self.signature_substitution_counts.get(name, 0)
|
|
587
652
|
self.signature_substitution_counts[name] += num_replaced
|
|
588
653
|
|
|
654
|
+
# Share / Tweet lines
|
|
655
|
+
if self.author == KATHRYN_RUEMMLER:
|
|
656
|
+
text = '\n'.join([l for l in text.split('\n') if l not in ['Share', 'Tweet', 'Bookmark it']])
|
|
657
|
+
|
|
589
658
|
return collapse_newlines(text).strip()
|
|
590
659
|
|
|
591
660
|
def _remove_line(self, idx: int) -> None:
|
|
@@ -605,68 +674,15 @@ class Email(Communication):
|
|
|
605
674
|
old_text = self.text
|
|
606
675
|
|
|
607
676
|
if self.file_id in LINE_REPAIR_MERGES:
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
self._merge_lines(*merge_args)
|
|
611
|
-
|
|
612
|
-
# These already had 2nd line merged
|
|
613
|
-
if self.file_id in ['030626']: # Merge 6th and 7th (now 5th and 6th) rows
|
|
614
|
-
self._merge_lines(4)
|
|
615
|
-
elif self.file_id == '029889':
|
|
616
|
-
self._merge_lines(2, 5)
|
|
617
|
-
elif self.file_id in ['029498', '031428']:
|
|
618
|
-
self._merge_lines(2, 4)
|
|
619
|
-
|
|
620
|
-
# Multiline
|
|
621
|
-
if self.file_id == '013415':
|
|
622
|
-
for _i in range(2):
|
|
623
|
-
self._merge_lines(4)
|
|
624
|
-
elif self.file_id == '013405':
|
|
625
|
-
for _i in range(2):
|
|
626
|
-
self._merge_lines(4)
|
|
627
|
-
elif self.file_id == '029458':
|
|
628
|
-
for _i in range(3):
|
|
629
|
-
self._merge_lines(4)
|
|
630
|
-
elif self.file_id in ['025233']:
|
|
631
|
-
for _i in range(2):
|
|
632
|
-
self._merge_lines(4)
|
|
677
|
+
for merge_args in LINE_REPAIR_MERGES[self.file_id]:
|
|
678
|
+
self._merge_lines(*merge_args)
|
|
633
679
|
|
|
680
|
+
if self.file_id in ['025233']:
|
|
634
681
|
self.lines[4] = f"Attachments: {self.lines[4]}"
|
|
635
682
|
self._set_computed_fields(lines=self.lines)
|
|
636
|
-
elif self.file_id in ['023001']:
|
|
637
|
-
for _i in range(3):
|
|
638
|
-
self._merge_lines(5)
|
|
639
|
-
elif self.file_id in ['019105']:
|
|
640
|
-
for _i in range(4):
|
|
641
|
-
self._merge_lines(5)
|
|
642
|
-
elif self.file_id in ['033568']:
|
|
643
|
-
for _i in range(5):
|
|
644
|
-
self._merge_lines(5)
|
|
645
|
-
elif self.file_id in ['025329']:
|
|
646
|
-
for _i in range(9):
|
|
647
|
-
self._merge_lines(2)
|
|
648
|
-
elif self.file_id in ['025812']:
|
|
649
|
-
for _i in range(2):
|
|
650
|
-
self._merge_lines(3)
|
|
651
|
-
elif self.file_id == '014860':
|
|
652
|
-
self._merge_lines(3)
|
|
653
|
-
self._merge_lines(4)
|
|
654
|
-
self._merge_lines(4)
|
|
655
683
|
elif self.file_id == '029977':
|
|
656
684
|
self._set_computed_fields(text=self.text.replace('Sent 9/28/2012 2:41:02 PM', 'Sent: 9/28/2012 2:41:02 PM'))
|
|
657
685
|
|
|
658
|
-
for _i in range(4):
|
|
659
|
-
self._merge_lines(2)
|
|
660
|
-
|
|
661
|
-
self._merge_lines(4)
|
|
662
|
-
self._merge_lines(2, 4)
|
|
663
|
-
elif self.file_id in ['033252']:
|
|
664
|
-
for _i in range(2):
|
|
665
|
-
self._merge_lines(9)
|
|
666
|
-
elif self.file_id in ['032637']:
|
|
667
|
-
for _i in range(3):
|
|
668
|
-
self._merge_lines(9)
|
|
669
|
-
|
|
670
686
|
# Bad line removal
|
|
671
687
|
if self.file_id == '025041':
|
|
672
688
|
self._remove_line(4)
|
|
@@ -679,22 +695,40 @@ class Email(Communication):
|
|
|
679
695
|
self.log_top_lines(12, 'Result of modifications')
|
|
680
696
|
|
|
681
697
|
lines = self.repair_ocr_text(OCR_REPAIRS, self.text).split('\n')
|
|
698
|
+
subject_line = next((line for line in lines if line.startswith('Subject:')), None) or ''
|
|
699
|
+
subject = subject_line.split(':')[1].strip() if subject_line else ''
|
|
682
700
|
new_lines = []
|
|
683
701
|
i = 0
|
|
684
702
|
|
|
685
|
-
# Fix links (remove spaces, merge multiline links to a single line)
|
|
703
|
+
# Fix links and quoted subjects (remove spaces, merge multiline links to a single line)
|
|
686
704
|
while i < len(lines):
|
|
687
705
|
line = lines[i]
|
|
688
706
|
|
|
689
707
|
if LINK_LINE_REGEX.search(line):
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
708
|
+
while i < (len(lines) - 1) \
|
|
709
|
+
and not lines[i + 1].startswith('htt') \
|
|
710
|
+
and (lines[i + 1].endswith('/') \
|
|
711
|
+
or any(s in lines[i + 1] for s in URL_SIGNIFIERS) \
|
|
712
|
+
or LINK_LINE2_REGEX.match(lines[i + 1])):
|
|
693
713
|
logger.debug(f"{self.filename}: Joining link lines\n 1. {line}\n 2. {lines[i + 1]}\n")
|
|
694
714
|
line += lines[i + 1]
|
|
695
715
|
i += 1
|
|
696
716
|
|
|
697
717
|
line = line.replace(' ', '')
|
|
718
|
+
elif ' http' in line and line.endswith('html'):
|
|
719
|
+
pre_link, post_link = line.split(' http', 1)
|
|
720
|
+
line = f"{pre_link} http{post_link.replace(' ', '')}"
|
|
721
|
+
elif line.startswith('Subject:') and i < (len(lines) - 2) and len(line) >= 40:
|
|
722
|
+
next_line = lines[i + 1]
|
|
723
|
+
next_next = lines[i + 2]
|
|
724
|
+
|
|
725
|
+
if len(next_line) <= 1 or any([cont in next_line for cont in BAD_SUBJECT_CONTINUATIONS]):
|
|
726
|
+
pass
|
|
727
|
+
elif (subject.endswith(next_line) and next_line != subject) \
|
|
728
|
+
or (FIELDS_COLON_REGEX.search(next_next) and not FIELDS_COLON_REGEX.search(next_line)):
|
|
729
|
+
self.warn(f"Fixing broken subject line\n line: '{line}'\n next: '{next_line}'\n next: '{next_next}'\nsubject='{subject}'\n")
|
|
730
|
+
line += f" {next_line}"
|
|
731
|
+
i += 1
|
|
698
732
|
|
|
699
733
|
new_lines.append(line)
|
|
700
734
|
|
|
@@ -718,7 +752,7 @@ class Email(Communication):
|
|
|
718
752
|
"""Copy info from original config for file this document was extracted from."""
|
|
719
753
|
if self.file_id in ALL_FILE_CONFIGS:
|
|
720
754
|
self.config = cast(EmailCfg, deepcopy(ALL_FILE_CONFIGS[self.file_id]))
|
|
721
|
-
self.
|
|
755
|
+
self.log(f"Merging existing cfg for '{self.file_id}' with cfg for extracted document...")
|
|
722
756
|
else:
|
|
723
757
|
self.config = EmailCfg(id=self.file_id)
|
|
724
758
|
|
|
@@ -740,33 +774,58 @@ class Email(Communication):
|
|
|
740
774
|
|
|
741
775
|
def _truncate_to_length(self) -> int:
|
|
742
776
|
"""When printing truncate this email to this length."""
|
|
743
|
-
quote_cutoff = self._idx_of_nth_quoted_reply(
|
|
777
|
+
quote_cutoff = self._idx_of_nth_quoted_reply() # Trim if there's many quoted replies
|
|
744
778
|
includes_truncate_term = next((term for term in TRUNCATE_TERMS if term in self.text), None)
|
|
745
779
|
|
|
746
780
|
if args.whole_file:
|
|
747
781
|
num_chars = len(self.text)
|
|
748
|
-
elif
|
|
749
|
-
num_chars =
|
|
750
|
-
elif self.
|
|
751
|
-
num_chars =
|
|
752
|
-
elif
|
|
753
|
-
num_chars =
|
|
782
|
+
elif args.truncate:
|
|
783
|
+
num_chars = args.truncate
|
|
784
|
+
elif self.config and self.config.truncate_to is not None:
|
|
785
|
+
num_chars = len(self.text) if self.config.truncate_to == NO_TRUNCATE else self.config.truncate_to
|
|
786
|
+
elif self.is_interesting():
|
|
787
|
+
num_chars = len(self.text)
|
|
788
|
+
elif self.author in TRUNCATE_EMAILS_FROM \
|
|
789
|
+
or any([self.is_from_or_to(n) for n in TRUNCATE_EMAILS_FROM_OR_TO]) \
|
|
790
|
+
or self.is_fwded_article() \
|
|
791
|
+
or includes_truncate_term:
|
|
792
|
+
num_chars = min(quote_cutoff or MAX_CHARS_TO_PRINT, TRUNCATED_CHARS)
|
|
754
793
|
else:
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
794
|
+
if quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
|
|
795
|
+
trimmed_words = self.text[quote_cutoff:].split()
|
|
796
|
+
|
|
797
|
+
if '<...snipped' in trimmed_words[:NUM_WORDS_IN_LAST_QUOTE]:
|
|
798
|
+
num_trailing_words = 0
|
|
799
|
+
elif trimmed_words and trimmed_words[0] in ['From:', 'Sent:']:
|
|
800
|
+
num_trailing_words = NUM_WORDS_IN_LAST_QUOTE
|
|
801
|
+
else:
|
|
802
|
+
num_trailing_words = NUM_WORDS_IN_LAST_QUOTE
|
|
803
|
+
|
|
804
|
+
if trimmed_words:
|
|
805
|
+
last_quoted_text = ' '.join(trimmed_words[:num_trailing_words])
|
|
806
|
+
num_chars = quote_cutoff + len(last_quoted_text) + 1 # Give a hint of the next line
|
|
807
|
+
else:
|
|
808
|
+
num_chars = quote_cutoff
|
|
809
|
+
else:
|
|
810
|
+
num_chars = min(self.file_size(), MAX_CHARS_TO_PRINT)
|
|
811
|
+
|
|
812
|
+
# Always print whole email for 1st email for user
|
|
813
|
+
if self._is_first_for_user and num_chars < self.file_size() and not self.is_duplicate():
|
|
814
|
+
logger.info(f"{self} Overriding cutoff {num_chars} for first email")
|
|
815
|
+
num_chars = self.file_size()
|
|
816
|
+
|
|
817
|
+
log_args = {
|
|
818
|
+
'num_chars': num_chars,
|
|
819
|
+
'_is_first_for_user': self._is_first_for_user,
|
|
820
|
+
'author_truncate': self.author in TRUNCATE_EMAILS_FROM,
|
|
821
|
+
'is_fwded_article': self.is_fwded_article(),
|
|
822
|
+
'is_quote_cutoff': quote_cutoff == num_chars,
|
|
823
|
+
'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
|
|
824
|
+
'quote_cutoff': quote_cutoff,
|
|
825
|
+
}
|
|
826
|
+
|
|
827
|
+
log_args_str = ', '.join([f"{k}={v}" for k, v in log_args.items() if v])
|
|
828
|
+
logger.debug(f"Truncate determination: {log_args_str}")
|
|
770
829
|
return num_chars
|
|
771
830
|
|
|
772
831
|
def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
|
|
@@ -780,7 +839,7 @@ class Email(Communication):
|
|
|
780
839
|
if len(text) > num_chars:
|
|
781
840
|
text = text[0:num_chars]
|
|
782
841
|
doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style())
|
|
783
|
-
trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
|
|
842
|
+
trim_note = f"<...trimmed to {num_chars:,} characters of {self.length():,}, read the rest at {doc_link_markup}...>"
|
|
784
843
|
trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
|
|
785
844
|
|
|
786
845
|
# Rewrite broken headers where the values are on separate lines from the field names
|
|
@@ -799,8 +858,15 @@ class Email(Communication):
|
|
|
799
858
|
text = _add_line_breaks(text) # This was skipped when _prettify_text() w/a broken header so we do it now
|
|
800
859
|
self.rewritten_header_ids.add(self.file_id)
|
|
801
860
|
|
|
861
|
+
lines = [
|
|
862
|
+
Text.from_markup(f"[link={line}]{line}[/link]") if line.startswith('http') else Text(line)
|
|
863
|
+
for line in text.split('\n')
|
|
864
|
+
]
|
|
865
|
+
|
|
866
|
+
text = join_texts(lines, '\n')
|
|
867
|
+
|
|
802
868
|
email_txt_panel = Panel(
|
|
803
|
-
highlighter(text).append('
|
|
869
|
+
highlighter(text).append('...\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
|
|
804
870
|
border_style=self._border_style(),
|
|
805
871
|
expand=False,
|
|
806
872
|
subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
|
|
@@ -809,6 +875,11 @@ class Email(Communication):
|
|
|
809
875
|
yield self.file_info_panel()
|
|
810
876
|
yield Padding(email_txt_panel, (0, 0, 1, INFO_INDENT))
|
|
811
877
|
|
|
878
|
+
if self.attached_docs:
|
|
879
|
+
attachments_table_title = f" {self.url_slug} Email Attachments:"
|
|
880
|
+
attachments_table = OtherFile.files_preview_table(self.attached_docs, title=attachments_table_title)
|
|
881
|
+
yield Padding(attachments_table, (0, 0, 1, 12))
|
|
882
|
+
|
|
812
883
|
if should_rewrite_header:
|
|
813
884
|
self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
|
|
814
885
|
|