epstein-files 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +194 -0
- epstein_files/documents/communication.py +53 -0
- epstein_files/documents/document.py +357 -0
- epstein_files/documents/email.py +655 -0
- epstein_files/documents/emails/email_header.py +167 -0
- epstein_files/documents/imessage/text_message.py +93 -0
- epstein_files/documents/json_file.py +23 -0
- epstein_files/documents/messenger_log.py +73 -0
- epstein_files/documents/other_file.py +117 -0
- epstein_files/epstein_files.py +437 -0
- epstein_files/util/constant/common_words.py +94 -0
- epstein_files/util/constant/html.py +57 -0
- epstein_files/util/constant/names.py +261 -0
- epstein_files/util/constant/strings.py +47 -0
- epstein_files/util/constant/urls.py +103 -0
- epstein_files/util/constants.py +1552 -0
- epstein_files/util/data.py +131 -0
- epstein_files/util/env.py +80 -0
- epstein_files/util/file_cfg.py +172 -0
- epstein_files/util/file_helper.py +81 -0
- epstein_files/util/highlighted_group.py +620 -0
- epstein_files/util/rich.py +324 -0
- epstein_files/util/search_result.py +15 -0
- epstein_files/util/word_count.py +191 -0
- epstein_files-1.0.0.dist-info/LICENSE +674 -0
- epstein_files-1.0.0.dist-info/METADATA +60 -0
- epstein_files-1.0.0.dist-info/RECORD +28 -0
- epstein_files-1.0.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,655 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import ClassVar, cast
|
|
7
|
+
|
|
8
|
+
from dateutil.parser import parse
|
|
9
|
+
from rich.console import Console, ConsoleOptions, RenderResult
|
|
10
|
+
from rich.padding import Padding
|
|
11
|
+
from rich.panel import Panel
|
|
12
|
+
from rich.text import Text
|
|
13
|
+
|
|
14
|
+
from epstein_files.documents.communication import Communication
|
|
15
|
+
from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, INFO_INDENT
|
|
16
|
+
from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAIL_SIMPLE_HEADER_REGEX,
|
|
17
|
+
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, TIME_REGEX, EmailHeader)
|
|
18
|
+
from epstein_files.util.constant.names import *
|
|
19
|
+
from epstein_files.util.constant.strings import REDACTED, URL_SIGNIFIERS
|
|
20
|
+
from epstein_files.util.constants import *
|
|
21
|
+
from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
|
|
22
|
+
remove_timezone, uniquify)
|
|
23
|
+
from epstein_files.util.env import logger
|
|
24
|
+
from epstein_files.util.highlighted_group import get_style_for_name
|
|
25
|
+
from epstein_files.util.rich import *
|
|
26
|
+
|
|
27
|
+
# Lines of OCR junk that should be stripped from the start of an email's text.
BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|L\._|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
# Whole lines of OCR noise to drop anywhere in the text (stray digits, bullets, etc.).
BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,])$')
# A 'From:' header within the first 3 lines means the document is an email.
DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
# Lines that start a hyperlink, optionally inside a '> ' quoted reply.
# (Was an f-string with no placeholders; a plain raw literal is what was intended.)
LINK_LINE_REGEX = re.compile(r"^(> )?htt")
# Marks 'On June 12th, 1985 [SOMEONE] wrote:' style quoted-reply boundaries.
QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
|
|
32
|
+
# Header-ish strings ('From:', 'Sent:', ...) used as a last-resort split point between
# an email body and quoted/forwarded content in _actual_text().
# NOTE(review): 'field' here shadows dataclasses.field inside the comprehension only.
REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + ['********************************']
# Captures everything before the first quoted-reply marker line as group 1.
REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)

# Timezone fragments that confuse date parsing and should be stripped first.
BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
# Captures the value of a 'Date:'/'Sent:' header line (at least 6 chars, and not
# a 'Sent by/from/to/via ...' sentence).
DATE_HEADER_REGEX = re.compile(r'(?:Date|Sent):? +(?!by|from|to|via)([^\n]{6,})\n')
# Cheap hh:mm detector used to pick candidate lines for timestamp fallback parsing.
TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")

# Author strings that are expected to fail name extraction; don't warn about them.
SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
MAX_CHARS_TO_PRINT = 4000    # Hard cap on how much of an email body is printed
MAX_QUOTED_REPLIES = 2       # Default number of quoted replies to keep when trimming
VALID_HEADER_LINES = 14      # Only the first N lines are searched for header fields/timestamps
|
|
44
|
+
|
|
45
|
+
# OCR mistake -> correction, applied to raw email text (see repair_ocr_text() call in
# _repair()). Keys may be plain substrings or compiled regexes; regex replacements may
# use backreferences.
# NOTE(review): some plain-string keys are raw literals containing a literal '\n'
# two-char sequence (see "Subject lines" below) — presumably repair_ocr_text matches
# them against escaped text; confirm against its implementation.
OCR_REPAIRS: dict[str | re.Pattern, str] = {
    re.compile(r'grnail\.com'): 'gmail.com',
    re.compile(r"^(From|To)(: )?[_1.]{5,}", re.MULTILINE): rf"\1: {REDACTED}",  # Redacted email addresses
    # These 3 must come in this order!
    re.compile(r'([/vkT]|Ai|li|(I|7)v)rote:'): 'wrote:',
    re.compile(r"([<>.=_HIM][<>.=_HIM14]{5,}[<>.=_HIM]|MOMMINNEMUMMIN) *(wrote:?)?"): rf"{REDACTED} \2",
    re.compile(r"([,<>_]|AM|PM)\n(>)? ?wrote:?"): r'\1\2 wrote:',
    # Names / email addresses
    'Alireza lttihadieh': ALIREZA_ITTIHADIEH,
    'Miroslav Laj6ak': MIROSLAV_LAJCAK,
    'Ross G°w': ROSS_GOW,
    'Torn Pritzker': TOM_PRITZKER,
    re.compile(r' Banno(r]?|\b)'): ' Bannon',
    re.compile(r'gmax ?[1l] ?[@g]ellmax.c ?om'): 'gmax1@ellmax.com',
    re.compile(r"[ijlp']ee[vy]acation[©@a(&,P ]{1,3}g?mail.com"): 'jeevacation@gmail.com',
    # Signatures
    'BlackBerry by AT &T': 'BlackBerry by AT&T',
    'BlackBerry from T- Mobile': 'BlackBerry from T-Mobile',
    "from my 'Phone": 'from my iPhone',
    'from Samsung Mob.le': 'from Samsung Mobile',
    'gJeremyRubin': '@JeremyRubin',
    'Sent from Mabfl': 'Sent from Mobile',  # NADIA_MARCINKO signature bad OCR
    'twitter glhsummers': 'twitter @lhsummers',
    re.compile(r"twitter\.com[i/][lI]krauss[1lt]"): "twitter.com/lkrauss1",
    re.compile(r'from my BlackBerry[0°] wireless device'): 'from my BlackBerry® wireless device',
    # links
    'Imps ://': 'https://',
    re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
    # Subject lines
    r"as Putin Mayhem Tests President's Grip\non GOP": "as Putin Mayhem Tests President's Grip on GOP",
    r"avoids testimony from alleged\nvictims": "avoids testimony from alleged victims",
    r"but\nwatchdogs say probe is tainted": "watchdogs say probe is tainted",
    r"COVER UP SEX ABUSE CRIMES\nBY THE WHITE HOUSE": "COVER UP SEX ABUSE CRIMES BY THE WHITE HOUSE",
    r'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
    r"War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
    re.compile(r"deadline re Mr Bradley Edwards vs Mr\s*Jeffrey Epstein", re.I): "deadline re Mr Bradley Edwards vs Mr Jeffrey Epstein",
    re.compile(r"Following Plea That Implicated Trump -\s*https://www.npr.org/676040070", re.I): "Following Plea That Implicated Trump - https://www.npr.org/676040070",
    re.compile(r"for Attorney General -\s+Wikisource, the"): r"for Attorney General - Wikisource, the",
    re.compile(r"JUDGE SWEET\s+ALLOWING\s+STEVEN\s+HOFFENBERG\s+TO\s+TALK\s+WITH\s+THE\s+TOWERS\s+VICTIMS\s+TO\s+EXPLAIN\s+THE\s+VICTIMS\s+SUI\n?T\s+FILING\s+AGAINST\s+JEFF\s+EPSTEIN"): "JUDGE SWEET ALLOWING STEVEN HOFFENBERG TO TALK WITH THE TOWERS VICTIMS TO EXPLAIN THE VICTIMS SUIT FILING AGAINST JEFF EPSTEIN",
    re.compile(r"Lawyer for Susan Rice: Obama administration '?justifiably concerned' about sharing Intel with\s*Trump team -\s*POLITICO", re.I): "Lawyer for Susan Rice: Obama administration 'justifiably concerned' about sharing Intel with Trump team - POLITICO",
    re.compile(r"PATTERSON NEW\s+BOOK\s+TELLING\s+FEDS\s+COVER\s+UP\s+OF\s+BILLIONAIRE\s+JEFF\s+EPSTEIN\s+CHILD\s+RAPES\s+RELEASE\s+DATE\s+OCT\s+10\s+2016\s+STEVEN\s+HOFFENBERG\s+IS\s+ON\s+THE\s+BOOK\s+WRITING\s+TEAM\s*!!!!"): "PATTERSON NEW BOOK TELLING FEDS COVER UP OF BILLIONAIRE JEFF EPSTEIN CHILD RAPES RELEASE DATE OCT 10 2016 STEVEN HOFFENBERG IS ON THE BOOK WRITING TEAM !!!!",
    re.compile(r"PROCEEDINGS FOR THE ST THOMAS ATTACHMENT OF\s*ALL JEFF EPSTEIN ASSETS"): "PROCEEDINGS FOR THE ST THOMAS ATTACHMENT OF ALL JEFF EPSTEIN ASSETS",
    re.compile(r"Subject:\s*Fwd: Trending Now: Friends for three decades"): "Subject: Fwd: Trending Now: Friends for three decades",
    # Misc
    'AVG°': 'AVGO',
}
|
|
91
|
+
|
|
92
|
+
# Martin Weinberg's postal-address signature block (optional phone/office lines).
MARTIN_WEINBERG_SIGNATURE_PATTERN = r"Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*)?(\n.*([cC]ell|Office))*"

# Person name -> regex matching their signature / legal boilerplate block.
# Matches are replaced with a short '<...snipped ... legal signature...>' marker
# by _cleaned_up_text(), which also counts substitutions per person.
EMAIL_SIGNATURES = {
    ARIANE_DE_ROTHSCHILD: re.compile(r"Ensemble.*\nCe.*\ndestinataires.*\nremercions.*\nautorisee.*\nd.*\nLe.*\ncontenues.*\nEdmond.*\nRoth.*\nlo.*\nRoth.*\ninfo.*\nFranc.*\n.2.*", re.I),
    BARBRO_C_EHNBOM: re.compile(r"Barbro C.? Ehn.*\nChairman, Swedish-American.*\n((Office|Cell|Sweden):.*\n)*(360.*\nNew York.*)?"),
    DANNY_FROST: re.compile(r"Danny Frost\nDirector.*\nManhattan District.*\n212.*", re.IGNORECASE),
    DARREN_INDYKE: re.compile(r"DARREN K. INDYKE.*?\**\nThe information contained in this communication.*?Darren K.[\n\s]+?[Il]ndyke(, PLLC)? — All rights reserved\.? ?\n\*{50,120}(\n\**)?", re.DOTALL),
    DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
    DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
    JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
    JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*", re.IGNORECASE),
    KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
    LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
    LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
    MARTIN_WEINBERG: re.compile(fr"({MARTIN_WEINBERG_SIGNATURE_PATTERN}\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
    STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
    PETER_MANDELSON: re.compile(r'Disclaimer This email and any attachments to it may be.*?with[ \n]+number(.*?EC4V[ \n]+6BJ)?', re.DOTALL | re.IGNORECASE),
    PAUL_BARRETT: re.compile(r"Paul Barrett[\n\s]+Alpha Group Capital LLC[\n\s]+(142 W 57th Street, 11th Floor, New York, NY 10019?[\n\s]+)?(al?[\n\s]*)?ALPHA GROUP[\n\s]+CAPITAL"),
    RICHARD_KAHN: re.compile(r'Richard Kahn[\n\s]+HBRK Associates Inc.?[\n\s]+((301 East 66th Street, Suite 1OF|575 Lexington Avenue,? 4th Floor,?)[\n\s]+)?New York, (NY|New York) 100(22|65)([\n\s]+(Tel?|Phone)( I)?[\n\s]+Fa[x"]?[\n\s]+[Ce]el?l?)?', re.IGNORECASE),
    'Susan Edelman': re.compile(r'Susan Edel.*\nReporter\n1211.*\n917.*\nsedelman.*', re.IGNORECASE),
    TERRY_KAFKA: re.compile(r"((>|I) )?Terry B.? Kafka.*\n(> )?Impact Outdoor.*\n(> )?5454.*\n(> )?Dallas.*\n((> )?c?ell.*\n)?(> )?Impactoutdoor.*(\n(> )?cell.*)?", re.IGNORECASE),
    TONJA_HADDAD_COLEMAN: re.compile(fr"Tonja Haddad Coleman.*\nTonja Haddad.*\nAdvocate Building\n315 SE 7th.*(\nSuite.*)?\nFort Lauderdale.*(\n({REDACTED} )?facsimile)?(\nwww.tonjahaddad.com?)?(\nPlease add this efiling.*\nThe information.*\nyou are not.*\nyou are not.*)?", re.IGNORECASE),
    UNKNOWN: re.compile(r"(This message is directed to and is for the use of the above-noted addressee only.*\nhereon\.)", re.DOTALL),
}
|
|
116
|
+
|
|
117
|
+
# Invalid for links to EpsteinWeb
# Senders whose mail is newsletter/spam noise; sets is_junk_mail in __post_init__().
JUNK_EMAILERS = [
    'asmallworld@travel.asmallworld.net',
    'editorialstaff@flipboard.com',
    'How To Academy',
    'Jokeland',
    JP_MORGAN_USGIO,
    'Saved by Internet Explorer 11',
]

# Senders whose emails are always truncated when printed (superset of JUNK_EMAILERS).
TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + [
    'Alan S Halperin',
    'middle.east.update@hotmail.com',
    'Mitchell Bard',
    'Skip Rimer',
]
|
|
133
|
+
|
|
134
|
+
# Per-file override of how many characters to print before truncating (file ID -> chars).
TRUNCATION_LENGTHS = {
    '023627': 16_800,  # Micheal Wolff article with brock pierce
    '030245': 7_500,   # Epstein rationalizes his behavior in an open letter to the world
    '030781': 1_700,   # Bannon email about crypto coin issues
    '032906': 750,     # David Blaine email
}
|
|
140
|
+
|
|
141
|
+
# These are long forwarded articles so we force a trim to 1,333 chars if these strings exist
TRUNCATE_TERMS = [
    'The rebuilding of Indonesia',
    'Dominique Strauss-Kahn',
    'THOMAS L. FRIEDMAN',
    'a sleek, briskly paced film whose title suggests a heist movie',
    'quote from The Colbert Report distinguishes',
    'co-inventor of the GTX Smart Shoe',
    'my latest Washington Post column',
    'Whether you donated to Poetry in America through',
    'supported my humanities work at Harvard',
    'Calendar of Major Events, Openings, and Fundraisers',
    'Nuclear Operator Raises Alarm on Crisis',
    'as responsible for the democratisation of computing and',
    'AROUND 1,000 operational satellites are circling the Earth',
    "In recent months, China's BAT collapse",
    'President Obama introduces Jim Yong Kim as his nominee',
    'Trump appears with mobster-affiliated felon at New',
    'Lead Code Enforcement Walton presented the facts',
    "Is UNRWA vital for the Palestinians' future",
    'The New York company, led by Stephen Ross',
    'I spent some time mulling additional aspects of a third choice presidential',
    'you are referring to duplication of a gene',
    'i am writing you both because i am attaching a still not-quite-complete response',
    'Learn to meditate and discover what truly nourishes your entire being',
    'Congratulations to the 2019 Hillman Prize recipients',
    'This much we know - the Fall elections are shaping up',
    "Special counsel Robert Mueller's investigation may face a serious legal obstacle",
    "nearly leak-proof since its inception more than a year ago",
    "I appreciate the opportunity to respond to your email",
    "Hello Peter. I am currently on a plane. I sent you earlier",
    # (duplicate "I appreciate the opportunity to respond to your email" entry removed)
    'I just wanted to follow up on a couple of notes. I have been coordinating with Richard Kahn',
    'So, Peggy, if you could just let me know what info to include on the donation',
    'Consult a lawyer beforehand, if possible, but be cooperative/nice at this stage',
    # Amanda Ens
    'We remain positive on banks that can make acceptable returns',
    'David Woo (BAML head of FX, Rates and EM Strategy, very highly regarded',
    "Please let me know if you're interested in joining a small group meeting",
    'Erika Najarian, BAML financials research analyst, just returned',
    'We can also discuss single stock and Topix banks',
    'We are recording unprecedented divergences in falling equity vol',
    'As previously discussed between you and Ariane',
    'The US trade war against China: The view from Beijing',
    'no evidence you got the latest so i have sent you just the key message',
    # Joscha Bach
    'Cells seem to be mostly indistinguishable (except',
    'gender differenece. unlikely motivational, every cell is different',
    'Some thoughts I meant to send back for a long time',
    # Krassner
    'My friend Michael Simmons, who has been the editor of National Lampoon',
    "In the premiere episode of 'The Last Laugh' podcast, Sarah Silverman",
    'Thanks so much for sharing both your note to Steven and your latest Manson essay',
    # Edward Larson
    'Coming from an international background, and having lived in Oslo, Tel Aviv',
    # Katherine Keating
    'Paul Keating is aware that many people see him as a puzzle and contradiction',
    'his panoramic view of world affairs sharper than ever, Paul Keating blames',
    # melanie
    # BUG FIX: the entry below was missing its trailing comma, so Python silently
    # concatenated it with the next string literal (across the '# rich kahn' comment),
    # producing one merged term that could never match either article.
    'Some years ago when I worked at the libertarian Cato Institute',
    # rich kahn
    'House and Senate Republicans on their respective tax overhaul',
    'The Tax Act contains changes to the treatment of "carried interests"',
    'General Election: Trump vs. Clinton LA Times/USC Tracking',
    'Location: Quicken Loans Arena in Cleveland, OH',
    'A friendly discussion about Syria with a former US State Department',
    # Tom / Paul Krassner
    'I forgot to post my cartoon from week before last, about Howard Schultz',
    # Bannon
    "Bannon the European: He's opening the populist fort in Brussels",
    "Steve Bannon doesn't do subtle.",
    'The Department of Justice lost its latest battle with Congress',
    "Donald Trump's newly named chief strategist and senior counselor",
    # Diane Ziman
    'I was so proud to see him speak at the Women',
    # Krauss
    'On confronting dogma, I of course agree',
    'I did neck with that woman, but never forced myself on her',
    'It is hard to know how to respond to a list of false',
    'The Women in the World Summit opens April 12',
    'lecture in Heidelberg Oct 14 but they had to cancel',
    # Nikolic
    'people from LifeBall',
    # Random
    'Little Hodiaki',
    "It began with deep worries regarding China's growth path",
    'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
    # Epstein
    'David Ben Gurion was asked why he, after 2000',
    # Lisa New
    'The raw materials for that period include interviews',
]
|
|
233
|
+
|
|
234
|
+
# All known recipients of Paul Krassner's mass emails, deduplicated.
KRASSNER_RECIPIENTS = uniquify(KRASSNER_MANSON_RECIPIENTS + KRASSNER_024923_RECIPIENTS + KRASSNER_033568_RECIPIENTS)

# No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
USELESS_EMAILERS = IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS + \
    KRASSNER_RECIPIENTS + \
    FLIGHT_IN_2012_PEOPLE + [
        'Alan Rogers',          # Random CC
        'BS Stern',             # A random fwd of email we have
        'Cheryl Kleen',         # Single email from Anne Boyles, displayed under Anne Boyles
        'Connie Zaguirre',      # Random CC
        'Dan Fleuette',         # CC from sean bannon
        'Danny Goldberg',       # Random Paul Krassner emails
        GERALD_LEFCOURT,        # Single CC
        GORDON_GETTY,           # Random CC
        JEFF_FULLER,            # Random Jean Luc Brunel CC
        'Jojo Fontanilla',      # Random CC
        'Joseph Vinciguerra',   # Random CC
        'Larry Cohen',          # Random Bill Gates CC
        'Lyn Fontanilla',       # Random CC
        'Mark Albert',          # Random CC
        'Matthew Schafer',      # Random CC
        'Michael Simmons',      # Random CC
        'Nancy Portland',       # Lawrence Krauss CC
        'Oliver Goodenough',    # Robert Trivers CC
        'Owen Blicksilver',     # Landon Thomas CC
        'Peter Aldhous',        # Lawrence Krauss CC
        'Sam Harris',           # Lawrence Krauss CC
        SAMUEL_LEFF,            # Random CC
        "Saved by Internet Explorer 11",  # Also in JUNK_EMAILERS
        'Sean T Lehane',        # Random CC
        'Stephen Rubin',        # Random CC
        'Tim Kane',             # Random CC
        'Travis Pangburn',      # Random CC
        'Vahe Stepanian',       # Random CC
    ]

# Emails sent by epstein to himself that are just notes
NOTES_TO_SELF = [
    '026677',
    '029752',
    '030238',
    # '033274', # TODO: Epstein's note to self doesn't get printed if we don't set the recipients to [None]
]
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
@dataclass
class Email(Communication):
    """An email reconstructed from OCR'd text (header parsing, cleanup, recipients)."""

    actual_text: str = field(init=False)              # Body text before quoted replies/forwards; set in __post_init__()
    header: EmailHeader = field(init=False)           # Parsed header; set by _extract_header()
    is_junk_mail: bool = False                        # True when author is in JUNK_EMAILERS; set in __post_init__()
    recipients: list[str | None] = field(default_factory=list)  # Resolved recipient names (None = unknown)
    sent_from_device: str | None = None               # Result of _sent_from_device(); set in __post_init__()
    signature_substitution_counts: dict[str, int] = field(default_factory=lambda: defaultdict(int))  # name -> # of snipped signatures

    # Just for logging how many headers we rewrote
    rewritten_header_ids: ClassVar[set[str]] = set([])
|
|
290
|
+
|
|
291
|
+
def __post_init__(self):
    """Finish construction: flag junk mail, resolve recipients, clean text, derive actual_text."""
    super().__post_init__()
    self.is_junk_mail = self.author in JUNK_EMAILERS

    if self.config and self.config.recipients:
        # Manual per-file config overrides whatever the header parse found.
        self.recipients = cast(list[str | None], self.config.recipients)
    else:
        # Resolve each raw header recipient string to zero or more known names.
        for recipient in self.header.recipients():
            self.recipients.extend(self._get_names(recipient))

    recipients = [r for r in self.recipients if r != self.author or self.file_id in NOTES_TO_SELF]  # Remove self CCs
    # NOTE(review): set() dedupes but loses recipient order — confirm ordering is irrelevant downstream.
    self.recipients = list(set(recipients))
    self.text = self._cleaned_up_text()
    self.actual_text = self._actual_text()
    self.sent_from_device = self._sent_from_device()
    logger.debug(f"Constructed {self.description()}")
|
|
307
|
+
|
|
308
|
+
def description(self) -> Text:
    """One line summary mostly for logging."""
    summary = self._description()

    if self.recipients:
        summary.append(', ')
        summary.append(key_value_txt('recipients', self._recipients_txt()))

    summary.append(CLOSE_PROPERTIES_CHAR)
    return summary
|
|
316
|
+
|
|
317
|
+
def idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
    """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text.

    Scans 'text' (defaults to self.text) for QUOTED_REPLY_LINE_REGEX matches and
    returns the index of the newline ending the first match at index >= n
    (the pattern ends with '\\n', so match.end() - 1 is that newline).

    Returns:
        The character index, or None when there are not enough quoted replies.
    """
    for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
        if i >= n:
            return match.end() - 1

    return None  # Was an implicit None; made explicit for the 'int | None' contract
|
|
322
|
+
|
|
323
|
+
def info_txt(self) -> Text:
    """Summary line: sender, recipients, and the estimated send time."""
    info = Text("OCR text of email from ", style='grey46')
    info.append(self.author_txt)
    info.append(' to ')
    info.append(self._recipients_txt())
    info.append(highlighter(f" probably sent at {self.timestamp}"))
    return info
|
|
326
|
+
|
|
327
|
+
def subject(self) -> str:
    """The email's subject line, or '' when the header has none."""
    if self.header.subject:
        return self.header.subject

    return ''
|
|
329
|
+
|
|
330
|
+
def _actual_text(self) -> str:
    """The text that comes before likely quoted replies and forwards etc.

    Precedence: explicit config override > whole text when no header rows were
    found > text with header rows stripped, cut at the first quoted-reply match,
    then further cut at the first 'From:'/'Subject:' style splitter line.
    """
    if self.config and self.config.actual_text is not None:
        return self.config.actual_text
    elif self.header.num_header_rows == 0:
        # No header rows identified, so the whole text is the body.
        return self.text

    # Drop the header rows; work on what's left.
    text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
    reply_text_match = REPLY_TEXT_REGEX.search(text)
    # logger.info(f"Raw text:\n" + self.top_lines(20) + '\n\n')
    # logger.info(f"With header removed:\n" + text[0:500] + '\n\n')

    if self.file_id in ['024624']:
        # Special case: keep this file's entire post-header text unsplit.
        return text

    if reply_text_match:
        # Group 1 is everything before the first quoted-reply marker.
        actual_num_chars = len(reply_text_match.group(1))
        actual_text_pct = f"{(100 * float(actual_num_chars) / len(text)):.1f}%"
        logger.debug(f"'{self.url_slug}': actual_text() reply_text_match is {actual_num_chars:,} chars ({actual_text_pct} of {len(text):,})")
        text = reply_text_match.group(1)

    # If all else fails look for lines like 'From: blah', 'Subject: blah', and split on that.
    for field_name in REPLY_SPLITTERS:
        field_string = f'\n{field_name}'

        if field_string not in text:
            continue

        logger.debug(f"'{self.url_slug}': Splitting based on '{field_string.strip()}'")
        pre_from_text = text.split(field_string)[0]
        actual_num_chars = len(pre_from_text)
        actual_text_pct = f"{(100 * float(actual_num_chars) / len(text)):.1f}%"
        logger.debug(f"'{self.url_slug}': actual_text() fwd_text_match is {actual_num_chars:,} chars ({actual_text_pct} of {len(text):,})")
        text = pre_from_text
        break  # Only split on the first matching splitter

    return text.strip()
|
|
367
|
+
|
|
368
|
+
def _border_style(self) -> str:
    """Color emails from epstein to others with the color for the first recipient."""
    style = self.author_style

    # Epstein's outgoing mail borrows the first recipient's color (when one exists).
    if self.author == JEFFREY_EPSTEIN and self.recipients and self.recipients != [None]:
        style = get_style_for_name(self.recipients[0])

    # Borders are never bold.
    return style.replace('bold', '').strip()
|
|
379
|
+
|
|
380
|
+
def _cleaned_up_text(self) -> str:
    """Add newline after headers in text if actual header wasn't empty, remove bad lines, etc.

    Also snips known signature/legal-boilerplate blocks (EMAIL_SIGNATURES),
    tallying substitutions per person, and collapses runs of newlines.
    """
    # Insert line breaks now unless header is broken, in which case we'll do it later after fixing header
    text = self.text if self.header.was_initially_empty else _add_line_breaks(self.text)
    text = REPLY_REGEX.sub(r'\n\1', text)  # Newlines between quoted replies

    for name, signature_regex in EMAIL_SIGNATURES.items():
        signature_replacement = f'<...snipped {name.lower()} legal signature...>'
        text, num_replaced = signature_regex.subn(signature_replacement, text)
        self.signature_substitution_counts[name] += num_replaced

    return collapse_newlines(text).strip()
|
|
392
|
+
|
|
393
|
+
def _debug_info(self) -> str:
    """Multi-line identification string (file ID, URL slug, path) for debug output."""
    info = [
        f"id={self.file_id}",
        f"url_slug={self.url_slug}",
        f"file_path='{self.file_path}'",
        f"is_local_extract_file={self.is_local_extract_file()}",
    ]

    # Indent every line by one space (was a pointless placeholder-free f-string).
    return " " + "\n ".join(info)
|
|
402
|
+
|
|
403
|
+
def _extract_author(self) -> None:
    """Parse the header, then fall back to its From: field if the superclass found no author."""
    self._extract_header()
    super()._extract_author()

    # Keep any author the superclass found; need a header author to do better.
    if self.author or not self.header.author:
        return

    names = self._get_names(self.header.author)

    if names and names[0]:
        self.author = names[0]
    else:
        self.author = None
|
|
410
|
+
|
|
411
|
+
def _extract_header(self) -> None:
    """Extract an EmailHeader object from the OCR text.

    Falls back to an empty EmailHeader when no header-shaped block matches.
    """
    header_match = EMAIL_SIMPLE_HEADER_REGEX.search(self.text)

    if header_match:
        self.header = EmailHeader.from_header_lines(header_match.group(0))

        if self.header.is_empty():
            # Matched a header block but got no usable fields; rebuild from raw lines.
            self.header.repair_empty_header(self.lines)
    else:
        msg = f"No header match found in '{self.filename}'! Top lines:\n\n{self.top_lines()}"
        # A config entry means the file is known to be janky, so log at INFO instead of WARNING.
        log_fxn = logger.info if self.config else logger.warning
        log_fxn(msg)
        self.header = EmailHeader(field_names=[])
|
|
425
|
+
|
|
426
|
+
def _extract_timestamp(self) -> datetime:
    """Best-effort extraction of the email's send time.

    Precedence: explicit config timestamp > header's Sent: value >
    'Date:'/'Sent:' line within the first VALID_HEADER_LINES lines >
    any early line containing an hh:mm-looking token.

    Raises:
        RuntimeError: when no parseable timestamp is found anywhere.
    """
    if self.config and self.config.timestamp:
        return self.config.timestamp
    elif self.header.sent_at:
        timestamp = _parse_timestamp(self.header.sent_at)

        if timestamp:
            return timestamp

    # Only the top of the file can plausibly contain header timestamps.
    searchable_lines = self.lines[0:VALID_HEADER_LINES]
    searchable_text = '\n'.join(searchable_lines)
    date_match = DATE_HEADER_REGEX.search(searchable_text)

    if date_match:
        timestamp = _parse_timestamp(date_match.group(1))

        if timestamp:
            return timestamp

    logger.debug(f"Failed to find timestamp, falling back to parsing {VALID_HEADER_LINES} lines...")

    for line in searchable_lines:
        # Skip lines without an hh:mm-looking token before attempting a full parse.
        if not TIMESTAMP_LINE_REGEX.search(line):
            continue

        timestamp = _parse_timestamp(line)

        if timestamp:
            logger.debug(f"Fell back to timestamp {timestamp} in line '{line}'...")
            return timestamp

    raise RuntimeError(f"No timestamp found in '{self.file_path.name}' top lines:\n{searchable_text}")
|
|
458
|
+
|
|
459
|
+
def _get_names(self, emailer_str: str) -> list[str]:
    """Return a list of people's names found in 'emailer_str' (email author or recipients field).

    Matches against the known-person patterns in EMAILER_REGEXES. When the string
    itself looks invalid (BAD_EMAILER_REGEX / TIME_REGEX) only regex hits are
    returned; otherwise the raw string is used as a fallback name and each result
    is normalized to first-last order.
    """
    emailer_str = EmailHeader.cleanup_str(emailer_str)

    if len(emailer_str) == 0:
        return []

    names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]

    if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
        if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
            logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
        else:
            logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")

        # Invalid-looking strings never fall back to the raw string as a name.
        return names_found

    names_found = names_found or [emailer_str]
    return [_reverse_first_and_last_names(name) for name in names_found]
|
|
478
|
+
|
|
479
|
+
def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
    """Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1).

    The merged line replaces line idx; any lines strictly between idx and idx2 keep
    their relative order after it, and line idx2 is removed.

    Raises:
        RuntimeError: if idx2 <= idx.
    """
    idx2 = (idx + 1) if idx2 is None else idx2

    # Validate before building anything (previously the prefix slice was taken first).
    if idx2 <= idx:
        raise RuntimeError(f"idx2 ({idx2}) must be greater than idx ({idx})")

    # When idx2 == idx + 1 the middle slice is empty, so one expression covers
    # both the adjacent and the non-adjacent case (the original had two branches).
    lines = (
        self.lines[0:idx]
        + [self.lines[idx] + ' ' + self.lines[idx2]]
        + self.lines[idx + 1:idx2]
        + self.lines[idx2 + 1:]
    )
    self._set_computed_fields(lines=lines)
|
|
492
|
+
|
|
493
|
+
def _recipients_txt(self) -> Text:
    """Text object with comma separated colored versions of all recipients."""
    if self.recipients:
        recipients = [r or UNKNOWN for r in self.recipients]
    else:
        recipients = [UNKNOWN]

    # Use just the last name for each recipient if there's 3 or more recipients
    use_full_name = len(recipients) < 3
    colored = []

    for recipient in recipients:
        label = recipient if use_full_name else extract_last_name(recipient)
        colored.append(Text(label, style=get_style_for_name(recipient)))

    return join_texts(colored, join=', ')
|
|
502
|
+
|
|
503
|
+
def _repair(self) -> None:
    """Repair particularly janky files.

    Three cleanup passes:
      1. Drop a known-bad first line and filter out lines matching BAD_LINE_REGEX.
      2. Apply hand-tuned per-file line merges for specific file IDs whose header
         rows were split across multiple lines (indexes passed to _merge_lines
         are 0-based; the comments count rows 1-based).
      3. Apply OCR text repairs, then fix links (strip embedded spaces, join
         links that were broken across two lines).
    """
    # Pass 1: drop a bad leading line, then filter out all known-bad lines
    if BAD_FIRST_LINE_REGEX.match(self.lines[0]):
        self._set_computed_fields(lines=self.lines[1:])

    self._set_computed_fields(lines=[line for line in self.lines if not BAD_LINE_REGEX.match(line)])
    old_text = self.text  # Snapshot so the before/after can be logged below

    # Pass 2: hand-tuned per-file line merges
    if self.file_id in ['031442']:
        self._merge_lines(0)  # Merge 1st and 2nd rows
    elif self.file_id in '021729 029501 029282 030626 031384 033512'.split():
        self._merge_lines(2)  # Merge 3rd and 4th rows

    if self.file_id in ['030626']:  # Merge 6th and 7th (now 5th and 6th) rows
        self._merge_lines(4)
    elif self.file_id in ['029976']:
        self._merge_lines(3)  # Merge 4th and 5th rows
    elif self.file_id in '026609 029402 032405'.split():
        self._merge_lines(4)  # Merge 5th and 6th rows
    elif self.file_id in ['033568']:
        # Repeatedly fold the following row into the 6th row, 5 times
        for _i in range(5):
            self._merge_lines(5)
    elif self.file_id in ['025329']:
        # Repeatedly fold the following row into the 3rd row, 9 times
        for _i in range(9):
            self._merge_lines(2)
    elif self.file_id == '029977':
        # Restore the missing colon in the 'Sent:' header field before merging
        self._set_computed_fields(text=self.text.replace('Sent 9/28/2012 2:41:02 PM', 'Sent: 9/28/2012 2:41:02 PM'))

        for _i in range(4):
            self._merge_lines(2)

        self._merge_lines(4)
        self._merge_lines(2, 4)

    # Log a short before/after excerpt whenever the passes above changed anything
    if old_text != self.text:
        self.log(f"Modified text, old:\n\n" + '\n'.join(old_text.split('\n')[0:12]) + '\n', logging.INFO)
        self.log_top_lines(12, 'Result of modifications', logging.INFO)
        self.log('', logging.INFO)

    # Pass 3: generic OCR repairs, then per-line link fixes
    lines = self.repair_ocr_text(OCR_REPAIRS, self.text).split('\n')
    new_lines = []
    i = 0

    # Fix links (remove spaces, merge multiline links to a single line)
    while i < len(lines):
        line = lines[i]

        if LINK_LINE_REGEX.search(line):
            # Join with the next line when this looks like a URL continued there;
            # 'htm' in the line suggests the URL already ended on this line
            if 'htm' not in line \
                and i < (len(lines) - 1) \
                and (lines[i + 1].endswith('/') or any(s in lines[i + 1] for s in URL_SIGNIFIERS)):
                logger.debug(f"{self.filename}: Joining link lines\n 1. {line}\n 2. {lines[i + 1]}\n")
                line += lines[i + 1]
                i += 1  # Skip the continuation line we just consumed

            line = line.replace(' ', '')  # OCR tends to inject spaces into URLs

        new_lines.append(line)

        # TODO: hacky workaround to get a working link for HOUSE_OVERSIGHT_032564
        if self.file_id == '032564' and line == 'http://m.huffpost.com/us/entry/us_599f532ae4b0dOef9f1c129d':
            new_lines.append('(ed. note: an archived version of the above link is here: https://archive.is/hJxT3 )')

        i += 1

    self._set_computed_fields(lines=new_lines)
def _sent_from_device(self) -> str | None:
    """Find any 'Sent from my iPhone' style lines if they exist."""
    match = SENT_FROM_REGEX.search(self.actual_text)

    if match is None:
        return None

    phrase = match.group(0)

    # Normalize a lowercase leading 'sent' to 'Sent'
    if phrase.startswith('sent'):
        return 'S' + phrase[1:]

    return phrase
def __rich_console__(self, _console: Console, _options: ConsoleOptions) -> RenderResult:
    """Render the email for Rich: an info panel followed by the body in a bordered panel.

    Long bodies are truncated with a dim footer note linking to the source
    document; headers that initially parsed as empty but have header rows are
    rewritten before rendering.
    """
    logger.debug(f"Printing '{self.filename}'...")
    yield self.file_info_panel()
    text = self.text
    # Header needs rewriting when it initially parsed as empty but header rows exist
    should_rewrite_header = self.header.was_initially_empty and self.header.num_header_rows > 0
    quote_cutoff = self.idx_of_nth_quoted_reply(text=text)  # Trim if there's many quoted replies
    num_chars = MAX_CHARS_TO_PRINT  # Character budget for the printed body
    trim_footer_txt = None  # Set only if the body actually gets truncated

    # Choose the truncation length: per-file override takes precedence, then
    # aggressive (1/3) truncation for configured authors/terms, then the
    # quoted-reply cutoff when it falls inside the default budget
    if self.file_id in TRUNCATION_LENGTHS:
        num_chars = TRUNCATION_LENGTHS[self.file_id]
    elif self.author in TRUNCATE_ALL_EMAILS_FROM or any((term in self.text) for term in TRUNCATE_TERMS):
        num_chars = int(MAX_CHARS_TO_PRINT / 3)
    elif quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
        num_chars = quote_cutoff

    # Truncate long emails but leave a note explaining what happened w/link to source document
    if len(text) > num_chars:
        text = text[0:num_chars]
        doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
        trim_note = f"<...trimmed to {num_chars} characters of {self.length}, read the rest at {doc_link_markup}...>"
        trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))

    # Rewrite broken headers where the values are on separate lines from the field names
    if should_rewrite_header:
        configured_actual_text = self.config.actual_text if self.config and self.config.actual_text else None
        num_lines_to_skip = self.header.num_header_rows
        lines = []

        # Emails w/configured 'actual_text' are particularly broken; need to shuffle some lines
        if configured_actual_text is not None:
            num_lines_to_skip += 1
            lines += [cast(str, configured_actual_text), '\n']

        lines += text.split('\n')[num_lines_to_skip:]
        text = self.header.rewrite_header() + '\n' + '\n'.join(lines)
        text = _add_line_breaks(text)  # This was skipped when _cleaned_up_text() w/a broken header so we do it now
        self.rewritten_header_ids.add(self.file_id)  # Track rewrites (class-level set)

    panel_txt = highlighter(text)

    email_txt_panel = Panel(
        # Append the truncation note (if any) after a blank line
        panel_txt.append('\n\n').append(trim_footer_txt) if trim_footer_txt else panel_txt,
        border_style=self._border_style(),
        expand=False,
        subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
    )

    yield Padding(email_txt_panel, (0, 0, 1, INFO_INDENT))

    # Log the original (pre-rewrite) header rows for debugging
    if should_rewrite_header:
        self.log_top_lines(self.header.num_header_rows + 4, f'Original header:', logging.INFO)
def _add_line_breaks(email_text: str) -> str:
    """Surround simple header lines with blank lines, then strip outer whitespace."""
    with_breaks = EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX.sub(r'\n\1\n', email_text)
    return with_breaks.strip()
def _parse_timestamp(timestamp_str: str) -> None | datetime:
    """Parse 'timestamp_str' to a timezone-naive datetime, or None if it can't be parsed.

    Replaces the '(GMT-05:00)' style suffix with 'EST' and strips other bad
    timezone fragments before handing off to dateutil's parser.
    """
    try:
        timestamp_str = timestamp_str.replace('(GMT-05:00)', 'EST')
        timestamp_str = BAD_TIMEZONE_REGEX.sub(' ', timestamp_str).strip()
        timestamp = parse(timestamp_str, tzinfos=TIMEZONE_INFO)
        # Lazy %-style args (the original had a stray 'f' prefix on a %-style
        # message, which only worked because the f-string contained no braces)
        logger.debug('Parsed timestamp "%s" from string "%s"', timestamp, timestamp_str)
        return remove_timezone(timestamp)
    except Exception:
        logger.debug('Failed to parse "%s" to timestamp!', timestamp_str)
        return None
def _reverse_first_and_last_names(name: str) -> str:
|
|
648
|
+
if '@' in name:
|
|
649
|
+
return name.lower()
|
|
650
|
+
|
|
651
|
+
if ', ' in name:
|
|
652
|
+
names = name.split(', ')
|
|
653
|
+
return f"{names[1]} {names[0]}"
|
|
654
|
+
else:
|
|
655
|
+
return name
|