epstein-files 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +75 -135
- epstein_files/documents/communication.py +9 -9
- epstein_files/documents/document.py +115 -87
- epstein_files/documents/email.py +154 -85
- epstein_files/documents/emails/email_header.py +7 -6
- epstein_files/documents/imessage/text_message.py +3 -2
- epstein_files/documents/json_file.py +17 -0
- epstein_files/documents/messenger_log.py +62 -3
- epstein_files/documents/other_file.py +165 -17
- epstein_files/epstein_files.py +128 -169
- epstein_files/util/constant/names.py +8 -1
- epstein_files/util/constant/output_files.py +29 -0
- epstein_files/util/constant/strings.py +27 -0
- epstein_files/util/constant/urls.py +25 -9
- epstein_files/util/constants.py +1018 -1045
- epstein_files/util/data.py +20 -55
- epstein_files/util/{file_cfg.py → doc_cfg.py} +121 -43
- epstein_files/util/env.py +19 -20
- epstein_files/util/file_helper.py +38 -21
- epstein_files/util/highlighted_group.py +229 -177
- epstein_files/util/logging.py +63 -0
- epstein_files/util/output.py +180 -0
- epstein_files/util/rich.py +29 -17
- epstein_files/util/search_result.py +14 -6
- epstein_files/util/timer.py +24 -0
- epstein_files/util/word_count.py +2 -1
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/METADATA +20 -4
- epstein_files-1.0.2.dist-info/RECORD +33 -0
- epstein_files-1.0.2.dist-info/entry_points.txt +7 -0
- epstein_files-1.0.0.dist-info/RECORD +0 -28
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/WHEEL +0 -0
epstein_files/documents/email.py
CHANGED
@@ -1,7 +1,6 @@
 import logging
 import re
-from
-from dataclasses import dataclass, field
+from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from typing import ClassVar, cast
 
@@ -9,6 +8,7 @@ from dateutil.parser import parse
 from rich.console import Console, ConsoleOptions, RenderResult
 from rich.padding import Padding
 from rich.panel import Panel
+from rich.table import Table
 from rich.text import Text
 
 from epstein_files.documents.communication import Communication
@@ -19,13 +19,14 @@ from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import REDACTED, URL_SIGNIFIERS
 from epstein_files.util.constants import *
 from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
-                                     remove_timezone, uniquify)
-from epstein_files.util.
+                                     flatten, remove_timezone, uniquify)
+from epstein_files.util.doc_cfg import EmailCfg, Metadata
 from epstein_files.util.highlighted_group import get_style_for_name
+from epstein_files.util.logging import logger
 from epstein_files.util.rich import *
 
-BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|
-BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,])$')
+BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
+BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
 DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
 LINK_LINE_REGEX = re.compile(f"^(> )?htt")
 QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
@@ -39,8 +40,8 @@ TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")
 SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
 REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
 MAX_CHARS_TO_PRINT = 4000
+MAX_NUM_HEADER_LINES = 14
 MAX_QUOTED_REPLIES = 2
-VALID_HEADER_LINES = 14
 
 OCR_REPAIRS: dict[str | re.Pattern, str] = {
     re.compile(r'grnail\.com'): 'gmail.com',
@@ -71,12 +72,15 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
     'Imps ://': 'https://',
     re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
     # Subject lines
-
-
-
-
-
-
+    "Arrested in\nInauguration Day Riot": "Arrested in Inauguration Day Riot",
+    "as Putin Mayhem Tests President's Grip\non GOP": "as Putin Mayhem Tests President's Grip on GOP",
+    "avoids testimony from alleged\nvictims": "avoids testimony from alleged victims",
+    "but\nwatchdogs say probe is tainted": "watchdogs say probe is tainted",
+    "Christmas comes\nearly for most of macro": "Christmas comes early for most of macro", # 023717
+    "but majority still made good\nmoney because": "but majority still made good money because", # 023717
+    "COVER UP SEX ABUSE CRIMES\nBY THE WHITE HOUSE": "COVER UP SEX ABUSE CRIMES BY THE WHITE HOUSE",
+    'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
+    "War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
     re.compile(r"deadline re Mr Bradley Edwards vs Mr\s*Jeffrey Epstein", re.I): "deadline re Mr Bradley Edwards vs Mr Jeffrey Epstein",
     re.compile(r"Following Plea That Implicated Trump -\s*https://www.npr.org/676040070", re.I): "Following Plea That Implicated Trump - https://www.npr.org/676040070",
     re.compile(r"for Attorney General -\s+Wikisource, the"): r"for Attorney General - Wikisource, the",
@@ -89,9 +93,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
     'AVG°': 'AVGO',
 }
 
-
-
-EMAIL_SIGNATURES = {
+EMAIL_SIGNATURE_REGEXES = {
     ARIANE_DE_ROTHSCHILD: re.compile(r"Ensemble.*\nCe.*\ndestinataires.*\nremercions.*\nautorisee.*\nd.*\nLe.*\ncontenues.*\nEdmond.*\nRoth.*\nlo.*\nRoth.*\ninfo.*\nFranc.*\n.2.*", re.I),
     BARBRO_C_EHNBOM: re.compile(r"Barbro C.? Ehn.*\nChairman, Swedish-American.*\n((Office|Cell|Sweden):.*\n)*(360.*\nNew York.*)?"),
     DANNY_FROST: re.compile(r"Danny Frost\nDirector.*\nManhattan District.*\n212.*", re.IGNORECASE),
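The OCR_REPAIRS mapping above mixes plain-string keys with compiled re.Pattern keys. A minimal sketch (not from the package; apply_ocr_repairs is a hypothetical name) of how such a mapping might be applied to a document's text:

```python
# Hedged sketch: applying a str-or-Pattern -> str repairs mapping like OCR_REPAIRS.
# The package's real application helper isn't shown in this diff.
import re

def apply_ocr_repairs(text: str, repairs: dict[str | re.Pattern, str]) -> str:
    for target, replacement in repairs.items():
        if isinstance(target, re.Pattern):
            text = target.sub(replacement, text)       # regex keys absorb OCR noise like stray newlines
        else:
            text = text.replace(target, replacement)   # plain keys are literal substring fixes
    return text

print(apply_ocr_repairs("mail me at foo@grnail.com", {re.compile(r'grnail\.com'): 'gmail.com'}))
# -> mail me at foo@gmail.com
```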
@@ -103,7 +105,7 @@ EMAIL_SIGNATURES = {
     KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
     LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
     LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
-    MARTIN_WEINBERG: re.compile(
+    MARTIN_WEINBERG: re.compile(r"(Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*)?(\n.*([cC]ell|Office))*\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
     STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
     PETER_MANDELSON: re.compile(r'Disclaimer This email and any attachments to it may be.*?with[ \n]+number(.*?EC4V[ \n]+6BJ)?', re.DOTALL | re.IGNORECASE),
     PAUL_BARRETT: re.compile(r"Paul Barrett[\n\s]+Alpha Group Capital LLC[\n\s]+(142 W 57th Street, 11th Floor, New York, NY 10019?[\n\s]+)?(al?[\n\s]*)?ALPHA GROUP[\n\s]+CAPITAL"),
@@ -147,7 +149,6 @@ TRUNCATE_TERMS = [
     'quote from The Colbert Report distinguishes',
     'co-inventor of the GTX Smart Shoe',
     'my latest Washington Post column',
-    'Whether you donated to Poetry in America through',
     'supported my humanities work at Harvard',
     'Calendar of Major Events, Openings, and Fundraisers',
     'Nuclear Operator Raises Alarm on Crisis',
@@ -181,7 +182,6 @@ TRUNCATE_TERMS = [
     'We can also discuss single stock and Topix banks',
     'We are recording unprecedented divergences in falling equity vol',
     'As previously discussed between you and Ariane',
-    'The US trade war against China: The view from Beijing',
     'no evidence you got the latest so i have sent you just the key message',
     # Joscha Bach
     'Cells seem to be mostly indistinguishable (except',
@@ -204,6 +204,8 @@ TRUNCATE_TERMS = [
     'General Election: Trump vs. Clinton LA Times/USC Tracking',
     'Location: Quicken Loans Arena in Cleveland, OH',
     'A friendly discussion about Syria with a former US State Department',
+    # Robert Kuhn
+    'The US trade war against China: The view from Beijing',
     # Tom / Paul Krassner
     'I forgot to post my cartoon from week before last, about Howard Schultz',
     # Bannon
@@ -221,23 +223,26 @@ TRUNCATE_TERMS = [
     'lecture in Heidelberg Oct 14 but they had to cancel',
     # Nikolic
     'people from LifeBall',
-    # Random
-    'Little Hodiaki',
-    "It began with deep worries regarding China's growth path",
-    'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
     # Epstein
     'David Ben Gurion was asked why he, after 2000',
     # Lisa New
     'The raw materials for that period include interviews',
+    'Whether you donated to Poetry in America through',
+    # Random
+    'Little Hodiaki',
+    "It began with deep worries regarding China's growth path",
+    'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
 ]
 
-
+# Some Paul Krassner emails have a ton of CCed parties we don't care about
+KRASSNER_RECIPIENTS = uniquify(flatten(ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']))
 
 # No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
 USELESS_EMAILERS = IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS + \
     KRASSNER_RECIPIENTS + \
     FLIGHT_IN_2012_PEOPLE + [
         'Alan Rogers', # Random CC
+        'Andrew Friendly', # Presumably some relation of Kelly Friendly
         'BS Stern', # A random fwd of email we have
         'Cheryl Kleen', # Single email from Anne Boyles, displayed under Anne Boyles
         'Connie Zaguirre', # Random CC
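The new KRASSNER_RECIPIENTS constant builds a de-duplicated flat list from several per-file recipient lists via uniquify(flatten(...)). The package's real helpers live in epstein_files.util.data and aren't shown in this diff; a hedged sketch of what such helpers typically look like:

```python
# Hypothetical implementations of flatten()/uniquify(); the actual ones in
# epstein_files.util.data may differ.
from typing import Iterable, TypeVar

T = TypeVar('T')

def flatten(list_of_lists: Iterable[Iterable[T]]) -> list[T]:
    """Collapse an iterable of iterables into one flat list."""
    return [item for sublist in list_of_lists for item in sublist]

def uniquify(items: Iterable[T]) -> list[T]:
    """Drop duplicates while preserving first-seen order."""
    return list(dict.fromkeys(items))

recipients_per_email = [['Paul Krassner', 'Nancy Cain'], ['Nancy Cain', 'Harry Shearer']]
print(uniquify(flatten(recipients_per_email)))
# -> ['Paul Krassner', 'Nancy Cain', 'Harry Shearer']
```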
@@ -268,24 +273,41 @@ USELESS_EMAILERS = IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS + \
 ]
 
 # Emails sent by epstein to himself that are just notes
-
+SELF_EMAILS_FILE_IDS = [
     '026677',
-    '029752',
+    '029752', # TODO: jokeland...
     '030238',
     # '033274', # TODO: Epstein's note to self doesn't get printed if we don't set the recipients to [None]
 ]
 
+METADATA_FIELDS = [
+    'is_junk_mail',
+    'recipients',
+    'sent_from_device',
+]
+
 
 @dataclass
 class Email(Communication):
+    """
+    Attributes:
+        actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
+        config (EmailCfg | None) - manual config for this email (if it exists)
+        header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
+        is_junk_mail (bool) - True if this is junk mail
+        recipients (list[str | None]) - who this email was sent to
+        sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
+        signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
+    """
     actual_text: str = field(init=False)
+    config: EmailCfg | None = None
     header: EmailHeader = field(init=False)
     is_junk_mail: bool = False
     recipients: list[str | None] = field(default_factory=list)
     sent_from_device: str | None = None
-    signature_substitution_counts: dict[str, int] = field(default_factory=
+    signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
 
-    #
+    # For logging how many headers we prettified while printing, kind of janky
     rewritten_header_ids: ClassVar[set[str]] = set([])
 
     def __post_init__(self):
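The "# defaultdict breaks asdict :(" comment refers to dataclasses.asdict() rebuilding dict-valued fields via their own type, which fails for defaultdict on many Python versions because defaultdict's first positional argument must be a callable. A minimal standalone illustration (not package code):

```python
# Why the diff switches to a plain dict default_factory: asdict() reconstructs
# dict fields as type(value)(items), and defaultdict(<generator>) raises
# TypeError on many Python versions.
from collections import defaultdict
from dataclasses import asdict, dataclass, field

@dataclass
class WithDefaultDict:
    counts: defaultdict = field(default_factory=lambda: defaultdict(int))

@dataclass
class WithPlainDict:
    counts: dict = field(default_factory=dict)

print(asdict(WithPlainDict()))       # {'counts': {}} - works fine
try:
    print(asdict(WithDefaultDict()))
except TypeError as e:
    print(f"asdict failed: {e}")     # e.g. "first argument must be callable or None"
```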
@@ -298,35 +320,34 @@ class Email(Communication):
         for recipient in self.header.recipients():
             self.recipients.extend(self._get_names(recipient))
 
-
+        # Remove self CCs
+        recipients = [r for r in self.recipients if r != self.author or self.file_id in SELF_EMAILS_FILE_IDS]
         self.recipients = list(set(recipients))
-        self.text = self.
+        self.text = self._prettify_text()
         self.actual_text = self._actual_text()
         self.sent_from_device = self._sent_from_device()
-        logger.debug(f"Constructed {self.description()}")
-
-    def description(self) -> Text:
-        """One line summary mostly for logging."""
-        txt = self._description()
-
-        if len(self.recipients) > 0:
-            txt.append(', ').append(key_value_txt('recipients', self._recipients_txt()))
-
-        return txt.append(CLOSE_PROPERTIES_CHAR)
-
-    def idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
-        """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
-        for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
-            if i >= n:
-                return match.end() - 1
 
     def info_txt(self) -> Text:
         txt = Text("OCR text of email from ", style='grey46').append(self.author_txt).append(' to ')
         return txt.append(self._recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))
 
+    def metadata(self) -> Metadata:
+        metadata = super().metadata()
+        metadata.update({k: v for k, v in asdict(self).items() if v and k in METADATA_FIELDS})
+        return metadata
+
     def subject(self) -> str:
         return self.header.subject or ''
 
+    def summary(self) -> Text:
+        """One line summary mostly for logging."""
+        txt = self._summary()
+
+        if len(self.recipients) > 0:
+            txt.append(', ').append(key_value_txt('recipients', self._recipients_txt()))
+
+        return txt.append(CLOSE_PROPERTIES_CHAR)
+
     def _actual_text(self) -> str:
         """The text that comes before likely quoted replies and forwards etc."""
         if self.config and self.config.actual_text is not None:
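The new Email.metadata() override merges a whitelist of truthy dataclass fields into the parent's metadata. A hedged standalone sketch of the same filtering pattern (the Communication base class and Metadata type aren't shown in this diff, so a plain dict stands in):

```python
# Whitelist-and-filter pattern behind the new metadata(): take asdict() of the
# instance and keep only whitelisted, truthy fields. Names below are stand-ins.
from dataclasses import asdict, dataclass, field

METADATA_FIELDS = ['is_junk_mail', 'recipients', 'sent_from_device']

@dataclass
class EmailLike:
    is_junk_mail: bool = False
    recipients: list = field(default_factory=list)
    sent_from_device: str | None = None
    text: str = ''  # not whitelisted, never exported

    def metadata(self) -> dict:
        base = {'file_id': '012345'}  # stand-in for super().metadata()
        base.update({k: v for k, v in asdict(self).items() if v and k in METADATA_FIELDS})
        return base

print(EmailLike(recipients=['Jane Doe'], text='hi').metadata())
# -> {'file_id': '012345', 'recipients': ['Jane Doe']}   (falsy/unlisted fields dropped)
```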
@@ -339,8 +360,8 @@ class Email(Communication):
         # logger.info(f"Raw text:\n" + self.top_lines(20) + '\n\n')
         # logger.info(f"With header removed:\n" + text[0:500] + '\n\n')
 
-        if self.file_id in ['024624']:
-            return text
+        if self.file_id in ['024624']: # This email starts with "On September 14th"
+            return text.split('On Tue, May 14')[0].strip()
 
         if reply_text_match:
             actual_num_chars = len(reply_text_match.group(1))
@@ -355,7 +376,6 @@ class Email(Communication):
             if field_string not in text:
                 continue
 
-            logger.debug(f"'{self.url_slug}': Splitting based on '{field_string.strip()}'")
             pre_from_text = text.split(field_string)[0]
             actual_num_chars = len(pre_from_text)
             actual_text_pct = f"{(100 * float(actual_num_chars) / len(text)):.1f}%"
@@ -377,29 +397,6 @@ class Email(Communication):
 
         return style.replace('bold', '').strip()
 
-    def _cleaned_up_text(self) -> str:
-        """Add newline after headers in text if actual header wasn't empty, remove bad lines, etc."""
-        # Insert line breaks now unless header is broken, in which case we'll do it later after fixing header
-        text = self.text if self.header.was_initially_empty else _add_line_breaks(self.text)
-        text = REPLY_REGEX.sub(r'\n\1', text) # Newlines between quoted replies
-
-        for name, signature_regex in EMAIL_SIGNATURES.items():
-            signature_replacement = f'<...snipped {name.lower()} legal signature...>'
-            text, num_replaced = signature_regex.subn(signature_replacement, text)
-            self.signature_substitution_counts[name] += num_replaced
-
-        return collapse_newlines(text).strip()
-
-    def _debug_info(self) -> str:
-        info = [
-            f"id={self.file_id}",
-            f"url_slug={self.url_slug}",
-            f"file_path='{self.file_path}'",
-            f"is_local_extract_file={self.is_local_extract_file()}",
-        ]
-
-        return f" " + "\n ".join(info)
-
     def _extract_author(self) -> None:
         self._extract_header()
         super()._extract_author()
@@ -418,9 +415,8 @@ class Email(Communication):
         if self.header.is_empty():
             self.header.repair_empty_header(self.lines)
         else:
-
-
-            log_fxn(msg)
+            log_level = logging.INFO if self.config else logging.WARNING
+            self.log_top_lines(msg='No email header match found!', level=log_level)
             self.header = EmailHeader(field_names=[])
 
     def _extract_timestamp(self) -> datetime:
@@ -432,7 +428,7 @@ class Email(Communication):
         if timestamp:
             return timestamp
 
-        searchable_lines = self.lines[0:
+        searchable_lines = self.lines[0:MAX_NUM_HEADER_LINES]
         searchable_text = '\n'.join(searchable_lines)
         date_match = DATE_HEADER_REGEX.search(searchable_text)
 
@@ -442,7 +438,7 @@ class Email(Communication):
         if timestamp:
             return timestamp
 
-        logger.debug(f"Failed to find timestamp, falling back to parsing {
+        logger.debug(f"Failed to find timestamp, falling back to parsing {MAX_NUM_HEADER_LINES} lines...")
 
         for line in searchable_lines:
             if not TIMESTAMP_LINE_REGEX.search(line):
@@ -476,6 +472,12 @@ class Email(Communication):
         names_found = names_found or [emailer_str]
         return [_reverse_first_and_last_names(name) for name in names_found]
 
+    def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
+        """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
+        for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
+            if i >= n:
+                return match.end() - 1
+
     def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
         """Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
         idx2 = idx2 if idx2 is not None else (idx + 1)
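The helper now added as _idx_of_nth_quoted_reply() finds the character offset of the nth "... wrote:" line with re.finditer. A self-contained sketch of the same nth-match technique, using example text rather than the package's classes:

```python
# Iterate finditer() matches and return the end offset of the (n+1)th one,
# or None if there are fewer quoted replies than that.
import re

QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)

def idx_of_nth_quoted_reply(text: str, n: int = 2) -> int | None:
    for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text)):
        if i >= n:
            return match.end() - 1  # offset just before the trailing newline
    return None

email_text = "Sounds good.\nOn Mon, Bob wrote:\nOk.\nOn Sun, Alice wrote:\nHi.\nOn Sat, Bob wrote:\nHello.\n"
cutoff = idx_of_nth_quoted_reply(email_text)
print(email_text[:cutoff])  # keeps everything up through the 3rd 'wrote:' line
```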
@@ -490,6 +492,20 @@ class Email(Communication):
 
         self._set_computed_fields(lines=lines)
 
+    def _prettify_text(self) -> str:
+        """Add newlines before quoted replies and snip signatures."""
+        # Insert line breaks now unless header is broken, in which case we'll do it later after fixing header
+        text = self.text if self.header.was_initially_empty else _add_line_breaks(self.text)
+        text = REPLY_REGEX.sub(r'\n\1', text) # Newlines between quoted replies
+
+        for name, signature_regex in EMAIL_SIGNATURE_REGEXES.items():
+            signature_replacement = f'<...snipped {name.lower()} legal signature...>'
+            text, num_replaced = signature_regex.subn(signature_replacement, text)
+            self.signature_substitution_counts[name] = self.signature_substitution_counts.get(name, 0)
+            self.signature_substitution_counts[name] += num_replaced
+
+        return collapse_newlines(text).strip()
+
     def _recipients_txt(self) -> Text:
         """Text object with comma separated colored versions of all recipients."""
         recipients = [r or UNKNOWN for r in self.recipients] if len(self.recipients) > 0 else [UNKNOWN]
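_prettify_text() tallies how many signatures were snipped per person via Pattern.subn(), which returns both the new string and the number of replacements. A minimal example of that counting pattern with hypothetical signer data:

```python
# Pattern.subn() returns (new_text, number_of_replacements), which feeds a
# per-signer tally like signature_substitution_counts. "Jane Doe" is made up.
import re

signature_regexes = {
    'Jane Doe': re.compile(r"Jane Doe\nExample Corp\n555-0100", re.IGNORECASE),
}
signature_substitution_counts: dict[str, int] = {}

text = "See attached.\nJane Doe\nExample Corp\n555-0100\n\nThanks!\nJane Doe\nExample Corp\n555-0100\n"

for name, regex in signature_regexes.items():
    text, num_replaced = regex.subn(f'<...snipped {name.lower()} legal signature...>', text)
    signature_substitution_counts[name] = signature_substitution_counts.get(name, 0) + num_replaced

print(signature_substitution_counts)  # {'Jane Doe': 2}
print(text)
```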
@@ -500,6 +516,14 @@ class Email(Communication):
             for r in recipients
         ], join=', ')
 
+    def _remove_line(self, idx: int) -> None:
+        """Remove a line from self.lines."""
+        num_lines = idx * 2
+        self.log_top_lines(num_lines, msg=f'before removal of line {idx}')
+        del self.lines[idx]
+        self._set_computed_fields(lines=self.lines)
+        self.log_top_lines(num_lines, msg=f'after removal of line {idx}')
+
     def _repair(self) -> None:
         """Repair particularly janky files."""
         if BAD_FIRST_LINE_REGEX.match(self.lines[0]):
@@ -510,21 +534,37 @@ class Email(Communication):
 
         if self.file_id in ['031442']:
             self._merge_lines(0) # Merge 1st and 2nd rows
-        elif self.file_id in '021729
+        elif self.file_id in '021729 025790 029282 029501 029889 030626 031384 031428 033097 033512 033583 029498 033583'.split():
             self._merge_lines(2) # Merge 3rd and 4th rows
 
             if self.file_id in ['030626']: # Merge 6th and 7th (now 5th and 6th) rows
                 self._merge_lines(4)
-
+        elif self.file_id == '029889':
+            self._merge_lines(2, 5)
+        elif self.file_id in ['029498', '031428']:
+            self._merge_lines(2, 4)
+        elif self.file_id in ['029976', '023067']:
             self._merge_lines(3) # Merge 4th and 5th rows
-        elif self.file_id in '026609 029402 032405'.split():
+        elif self.file_id in '026609 029402 032405 022695'.split():
             self._merge_lines(4) # Merge 5th and 6th rows
+        elif self.file_id in ['019407', '031980', '030384', '033144', '030999', '033575', '029835', '030381']:
+            self._merge_lines(2, 4)
+        elif self.file_id in ['029154', '029163']:
+            self._merge_lines(2, 5)
+        elif self.file_id in ['033228', '032063']:
+            self._merge_lines(3, 5)
+        elif self.file_id == '028931':
+            self._merge_lines(3, 6)
         elif self.file_id in ['033568']:
             for _i in range(5):
                 self._merge_lines(5)
         elif self.file_id in ['025329']:
             for _i in range(9):
                 self._merge_lines(2)
+        elif self.file_id == '033486':
+            self._merge_lines(7, 9)
+        elif self.file_id == '030299':
+            self._merge_lines(7, 10)
         elif self.file_id == '029977':
             self._set_computed_fields(text=self.text.replace('Sent 9/28/2012 2:41:02 PM', 'Sent: 9/28/2012 2:41:02 PM'))
 
@@ -533,6 +573,11 @@ class Email(Communication):
 
             self._merge_lines(4)
             self._merge_lines(2, 4)
+        elif self.file_id == '025041':
+            self._remove_line(4)
+            self._remove_line(4)
+        elif self.file_id == '029692':
+            self._remove_line(3)
 
         if old_text != self.text:
             self.log(f"Modified text, old:\n\n" + '\n'.join(old_text.split('\n')[0:12]) + '\n', logging.INFO)
@@ -568,21 +613,21 @@ class Email(Communication):
         self._set_computed_fields(lines=new_lines)
 
     def _sent_from_device(self) -> str | None:
-        """Find any 'Sent from my iPhone' style
+        """Find any 'Sent from my iPhone' style signature line if it exist in the 'actual_text'."""
         sent_from_match = SENT_FROM_REGEX.search(self.actual_text)
 
         if sent_from_match:
             sent_from = sent_from_match.group(0)
             return 'S' + sent_from[1:] if sent_from.startswith('sent') else sent_from
 
-    def __rich_console__(self,
+    def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
         logger.debug(f"Printing '{self.filename}'...")
         yield self.file_info_panel()
-        text = self.text
         should_rewrite_header = self.header.was_initially_empty and self.header.num_header_rows > 0
-        quote_cutoff = self.
+        quote_cutoff = self._idx_of_nth_quoted_reply(text=self.text) # Trim if there's many quoted replies
         num_chars = MAX_CHARS_TO_PRINT
         trim_footer_txt = None
+        text = self.text
 
         if self.file_id in TRUNCATION_LENGTHS:
             num_chars = TRUNCATION_LENGTHS[self.file_id]
@@ -611,7 +656,7 @@ class Email(Communication):
 
             lines += text.split('\n')[num_lines_to_skip:]
             text = self.header.rewrite_header() + '\n' + '\n'.join(lines)
-            text = _add_line_breaks(text) # This was skipped when
+            text = _add_line_breaks(text) # This was skipped when _prettify_text() w/a broken header so we do it now
             self.rewritten_header_ids.add(self.file_id)
 
         panel_txt = highlighter(text)
@@ -628,6 +673,30 @@ class Email(Communication):
         if should_rewrite_header:
             self.log_top_lines(self.header.num_header_rows + 4, f'Original header:', logging.INFO)
 
+    @staticmethod
+    def build_table(emails: list['Email'], _author: str | None) -> Table:
+        """Turn a set of Email objects into a Table."""
+        author = _author or UNKNOWN
+
+        table = Table(
+            title=f"Emails to/from {author} starting {emails[0].timestamp.date()}",
+            border_style=get_style_for_name(author, allow_bold=False),
+            header_style="bold"
+        )
+
+        table.add_column('From', justify='left')
+        table.add_column('Timestamp', justify='center')
+        table.add_column('Subject', justify='left', style='honeydew2', min_width=60)
+
+        for email in emails:
+            table.add_row(
+                email.author_txt,
+                email.epstein_media_link(link_txt=email.timestamp_without_seconds()),
+                highlighter(email.subject())
+            )
+
+        return table
+
 
 def _add_line_breaks(email_text: str) -> str:
     return EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX.sub(r'\n\1\n', email_text).strip()
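The new Email.build_table() returns a rich Table. A minimal sketch of how such a table is typically constructed and rendered, built standalone rather than from the package's own Email objects:

```python
# Standalone rich Table demo mirroring the columns used by build_table();
# the row values here are invented placeholders.
from rich.console import Console
from rich.table import Table

table = Table(title="Emails to/from Jane Doe starting 2015-01-01", header_style="bold")
table.add_column('From', justify='left')
table.add_column('Timestamp', justify='center')
table.add_column('Subject', justify='left', min_width=60)
table.add_row('Jane Doe', '2015-01-01 09:30', 'Re: schedule')

Console().print(table)  # renders a bordered table with a bold header row
```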
epstein_files/documents/emails/email_header.py
CHANGED
@@ -4,8 +4,8 @@ from dataclasses import asdict, dataclass, field
 
 from epstein_files.util.constant.strings import AUTHOR, REDACTED
 from epstein_files.util.constants import ALL_CONFIGS
-from epstein_files.util.
-from epstein_files.util.
+from epstein_files.util.doc_cfg import EmailCfg
+from epstein_files.util.logging import logger
 from epstein_files.util.rich import UNKNOWN
 
 FIELD_NAMES = ['From', 'Date', 'Sent', 'Subject']
@@ -21,11 +21,11 @@ EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTA
 TIME_REGEX = re.compile(r'^(\d{1,2}/\d{1,2}/\d{2,4}|Thursday|Monday|Tuesday|Wednesday|Friday|Saturday|Sunday).*')
 
 BAD_NAME_CHARS_REGEX = re.compile(r"[\"'\[\]*><•]")
-BAD_EMAILER_REGEX = re.compile(r'^(>|11111111)|agreed|ok|sexy|
+BAD_EMAILER_REGEX = re.compile(r'^(>|11111111)|agreed|ok|sexy|re:|fwd:|Multiple Senders|((sent|attachments|subject|importance).*|.*(january|201\d|hysterical|i have|image0|so that people|article 1.?|momminnemummin|These conspiracy theories|your state|undisclosed|www\.theguardian|talk in|it was a|what do|cc:|call (back|me)).*)$', re.IGNORECASE)
 
 CONFIGURED_ACTUAL_TEXTS = [
     cfg.actual_text for cfg in ALL_CONFIGS
-    if isinstance(cfg,
+    if isinstance(cfg, EmailCfg) and cfg.actual_text is not None
 ]
 
 
@@ -70,7 +70,7 @@ class EmailHeader:
             raise RuntimeError(f"Ran out of header rows to check for '{field_name}'")
 
         value = email_lines[row_number_to_check]
-        log_prefix = f"Looks like '{value}' is a mismatch for '{field_name}'
+        log_prefix = f"Looks like '{value}' is a mismatch for '{field_name}'"
 
         if field_name == AUTHOR:
             if value in CONFIGURED_ACTUAL_TEXTS:
@@ -99,7 +99,8 @@ class EmailHeader:
             setattr(self, field_name, value)
 
         self.num_header_rows = len(self.field_names) + num_headers
-
+        log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
+        logger.debug(f"{log_msg}{self}\n\nTop lines:\n\n%s", '\n'.join(email_lines[0:(num_headers + 1) * 2]))
 
     def rewrite_header(self) -> str:
         header_fields = {}
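The logger.debug call added to EmailHeader (apparently in repair_empty_header, given the "Corrected empty header" message) passes the joined top lines as a separate %s argument. With logging's %-style parameters the interpolation into the final message is deferred until a handler actually emits the record, though the argument expression itself still runs at call time. A small illustration with made-up data:

```python
# Deferred %-style interpolation in the logging module; names and data here
# are placeholders, not the package's own.
import logging

logging.basicConfig(level=logging.DEBUG, format="%(levelname)s %(message)s")
logger = logging.getLogger("email_header_demo")

email_lines = ["From: Jane Doe", "Sent: 1/1/15", "Subject: hello", "body..."]
num_headers = 3
logger.debug("Corrected empty header using %d lines to:\n%s",
             num_headers, '\n'.join(email_lines[0:num_headers + 1]))
```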
epstein_files/documents/imessage/text_message.py
CHANGED
@@ -7,7 +7,8 @@ from rich.text import Text
 from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, STEVE_BANNON, UNKNOWN
 from epstein_files.util.data import extract_last_name
 from epstein_files.util.highlighted_group import get_style_for_name
-from epstein_files.util.
+from epstein_files.util.logging import logger
+from epstein_files.util.rich import TEXT_LINK, highlighter
 
 MSG_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
 PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
@@ -55,7 +56,7 @@ class TextMessage:
         else:
             self.author_str = self.author
 
-        if not self.id_confirmed and self.author is not None:
+        if not self.id_confirmed and self.author is not None and self.author != JEFFREY_EPSTEIN:
             self.author_str = self.author + ' (?)'
 
     def timestamp(self) -> datetime:
epstein_files/documents/json_file.py
CHANGED
@@ -1,14 +1,19 @@
+import json
+import logging
 from dataclasses import dataclass
 from pathlib import Path
+from typing import ClassVar
 
 from rich.text import Text
 
 from epstein_files.documents.other_file import OtherFile
+from epstein_files.util.constant.strings import JSON
 
 
 @dataclass
 class JsonFile(OtherFile):
     """File containing JSON data."""
+    strip_whitespace: ClassVar[bool] = False
 
     def __post_init__(self):
         super().__post_init__()
@@ -16,8 +21,20 @@ class JsonFile(OtherFile):
         if self.url_slug.endswith('.txt') or self.url_slug.endswith('.json'):
             self.url_slug = Path(self.url_slug).stem
 
+        self._set_computed_fields(text=self.formatted_json())
+
+    def category(self) -> str:
+        return JSON
+
+    def formatted_json(self) -> str:
+        return json.dumps(self.json_data(), indent=4)
+
     def info_txt(self) -> Text | None:
         return Text(f"JSON file, possibly iMessage or similar app metadata", style='white dim italic')
 
     def is_interesting(self):
         return False
+
+    def json_data(self) -> object:
+        with open(self.file_path, encoding='utf-8-sig') as f:
+            return json.load(f)
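The new JsonFile.json_data() opens files with encoding='utf-8-sig', which transparently strips a UTF-8 byte-order mark that would otherwise make json.load() fail. A minimal standalone demonstration with a temporary file (not package code):

```python
# A UTF-8 BOM at the start of a file is decoded away by 'utf-8-sig' but left
# in place by plain 'utf-8', where it breaks JSON parsing.
import json
import tempfile

with tempfile.NamedTemporaryFile('wb', suffix='.json', delete=False) as f:
    f.write(b'\xef\xbb\xbf{"app": "iMessage", "version": 1}')  # BOM + JSON
    path = f.name

with open(path, encoding='utf-8-sig') as f:
    print(json.load(f))  # {'app': 'iMessage', 'version': 1}

try:
    with open(path, encoding='utf-8') as f:
        json.load(f)
except json.JSONDecodeError as e:
    print(f"plain utf-8 fails: {e}")
```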