epstein-files 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +31 -18
- epstein_files/documents/communication.py +9 -5
- epstein_files/documents/document.py +225 -136
- epstein_files/documents/doj_file.py +242 -0
- epstein_files/documents/doj_files/full_text.py +166 -0
- epstein_files/documents/email.py +138 -163
- epstein_files/documents/emails/email_header.py +21 -11
- epstein_files/documents/emails/emailers.py +223 -0
- epstein_files/documents/imessage/text_message.py +2 -3
- epstein_files/documents/json_file.py +18 -14
- epstein_files/documents/messenger_log.py +23 -39
- epstein_files/documents/other_file.py +48 -44
- epstein_files/epstein_files.py +54 -33
- epstein_files/person.py +142 -110
- epstein_files/util/constant/names.py +29 -6
- epstein_files/util/constant/output_files.py +2 -0
- epstein_files/util/constant/strings.py +12 -6
- epstein_files/util/constant/urls.py +17 -0
- epstein_files/util/constants.py +101 -174
- epstein_files/util/data.py +2 -0
- epstein_files/util/doc_cfg.py +20 -15
- epstein_files/util/env.py +24 -16
- epstein_files/util/file_helper.py +28 -6
- epstein_files/util/helpers/debugging_helper.py +13 -0
- epstein_files/util/helpers/env_helpers.py +21 -0
- epstein_files/util/highlighted_group.py +57 -16
- epstein_files/util/layout/left_bar_panel.py +26 -0
- epstein_files/util/logging.py +28 -13
- epstein_files/util/output.py +33 -10
- epstein_files/util/rich.py +28 -2
- epstein_files/util/word_count.py +7 -7
- {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/METADATA +14 -1
- epstein_files-1.5.0.dist-info/RECORD +40 -0
- epstein_files-1.4.1.dist-info/RECORD +0 -34
- {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
- {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
- {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +0 -0
epstein_files/documents/email.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
3
|
import re
|
|
4
|
-
from collections import defaultdict
|
|
5
4
|
from copy import deepcopy
|
|
6
5
|
from dataclasses import asdict, dataclass, field
|
|
7
6
|
from datetime import datetime
|
|
@@ -16,13 +15,14 @@ from rich.text import Text
|
|
|
16
15
|
|
|
17
16
|
from epstein_files.documents.communication import Communication
|
|
18
17
|
from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, INFO_INDENT
|
|
19
|
-
from epstein_files.documents.emails.email_header import (
|
|
20
|
-
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, FIELDS_COLON_PATTERN,
|
|
18
|
+
from epstein_files.documents.emails.email_header import (EMAIL_SIMPLE_HEADER_REGEX,
|
|
19
|
+
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, FIELDS_COLON_PATTERN, EmailHeader)
|
|
20
|
+
from epstein_files.documents.emails.emailers import extract_emailer_names
|
|
21
21
|
from epstein_files.documents.other_file import OtherFile
|
|
22
22
|
from epstein_files.util.constant.names import *
|
|
23
23
|
from epstein_files.util.constant.strings import REDACTED
|
|
24
24
|
from epstein_files.util.constants import *
|
|
25
|
-
from epstein_files.util.data import TIMEZONE_INFO, collapse_newlines,
|
|
25
|
+
from epstein_files.util.data import AMERICAN_TIME_REGEX, TIMEZONE_INFO, collapse_newlines, remove_timezone
|
|
26
26
|
from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
27
27
|
from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
|
|
28
28
|
from epstein_files.util.highlighted_group import JUNK_EMAILERS, get_style_for_name
|
|
@@ -32,7 +32,6 @@ from epstein_files.util.rich import *
|
|
|
32
32
|
BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
|
|
33
33
|
BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Hide caption|Importance:?\s*High|[iI,•]|[1i] (_ )?[il]|, [-,]|L\._|_filtered|.*(yiv0232|font-family:|margin-bottom:).*)$')
|
|
34
34
|
BAD_SUBJECT_CONTINUATIONS = ['orwarded', 'Hi ', 'Sent ', 'AmLaw', 'Original Message', 'Privileged', 'Sorry', '---']
|
|
35
|
-
DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
|
|
36
35
|
FIELDS_COLON_REGEX = re.compile(FIELDS_COLON_PATTERN)
|
|
37
36
|
LINK_LINE_REGEX = re.compile(f"^[>• ]*htt")
|
|
38
37
|
LINK_LINE2_REGEX = re.compile(r"^[-\w.%&=/]{5,}$")
|
|
@@ -44,7 +43,6 @@ DATE_HEADER_REGEX = re.compile(r'(?:Date|Sent):? +(?!by|from|to|via)([^\n]{6,})\
|
|
|
44
43
|
TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")
|
|
45
44
|
LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
|
|
46
45
|
|
|
47
|
-
SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
|
|
48
46
|
REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
|
|
49
47
|
URL_SIGNIFIERS = ['?amp', 'amp?', 'cd=', 'click', 'CMP=', 'contentId', 'ft=', 'gclid', 'htm', 'mp=', 'keywords=', 'Id=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'sp=', 'usg=', 'utm']
|
|
50
48
|
APPEARS_IN = 'appears in'
|
|
@@ -107,6 +105,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
|
107
105
|
'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
|
|
108
106
|
"War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
|
|
109
107
|
"Subject; RE": "Subject: RE",
|
|
108
|
+
"straining relations between UK and\nAmerica": "straining relations between UK and America",
|
|
110
109
|
re.compile(r"deadline re Mr Bradley Edwards vs Mr\s*Jeffrey Epstein", re.I): "deadline re Mr Bradley Edwards vs Mr Jeffrey Epstein",
|
|
111
110
|
re.compile(r"Following Plea That Implicated Trump -\s*https://www.npr.org/676040070", re.I): "Following Plea That Implicated Trump - https://www.npr.org/676040070",
|
|
112
111
|
re.compile(r"for Attorney General -\s+Wikisource, the"): r"for Attorney General - Wikisource, the",
|
|
@@ -344,6 +343,10 @@ LINE_REPAIR_MERGES = {
|
|
|
344
343
|
'033575': [[2, 4]],
|
|
345
344
|
'033576': [[3]],
|
|
346
345
|
'033583': [[2]],
|
|
346
|
+
|
|
347
|
+
# Note DOJ file line adjustments happen *after* DojFile._repair() is called
|
|
348
|
+
'EFTA00039689': [[4]],
|
|
349
|
+
'EFTA00040118': [[2], [2], [2], [2], [2], [2], [6], [6]],
|
|
347
350
|
}
|
|
348
351
|
|
|
349
352
|
|
|
@@ -351,12 +354,13 @@ LINE_REPAIR_MERGES = {
|
|
|
351
354
|
class Email(Communication):
|
|
352
355
|
"""
|
|
353
356
|
Attributes:
|
|
354
|
-
actual_text (str) -
|
|
355
|
-
config (EmailCfg
|
|
356
|
-
header (EmailHeader) -
|
|
357
|
-
recipients (list[Name]) -
|
|
358
|
-
sent_from_device (str
|
|
359
|
-
signature_substitution_counts (dict[str, int]) -
|
|
357
|
+
actual_text (str) - Best effort at the text actually sent in this email, excluding quoted replies and forwards.
|
|
358
|
+
config (EmailCfg, optional) - Manual config for this email (if it exists).
|
|
359
|
+
header (EmailHeader) - Header data extracted from the text (from/to/sent/subject etc).
|
|
360
|
+
recipients (list[Name]) - People to whom this email was sent.
|
|
361
|
+
sent_from_device (str, optional) - "Sent from my iPhone" style signature (if it exists).
|
|
362
|
+
signature_substitution_counts (dict[str, int]) - Number of times a signature was replaced with
|
|
363
|
+
<...snipped...> for each participant
|
|
360
364
|
"""
|
|
361
365
|
attached_docs: list[OtherFile] = field(default_factory=list)
|
|
362
366
|
actual_text: str = field(init=False)
|
|
@@ -371,53 +375,33 @@ class Email(Communication):
|
|
|
371
375
|
# For logging how many headers we prettified while printing, kind of janky
|
|
372
376
|
rewritten_header_ids: ClassVar[set[str]] = set([])
|
|
373
377
|
|
|
374
|
-
|
|
375
|
-
self.filename = self.file_path.name
|
|
376
|
-
self.file_id = extract_file_id(self.filename)
|
|
377
|
-
|
|
378
|
-
# Special handling for copying properties out of the config for the document this one was extracted from
|
|
379
|
-
if self.is_local_extract_file():
|
|
380
|
-
self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
|
|
381
|
-
extracted_from_doc_id = self.url_slug.split('_')[-1]
|
|
382
|
-
|
|
383
|
-
if extracted_from_doc_id in ALL_FILE_CONFIGS:
|
|
384
|
-
self._set_config_for_extracted_file(ALL_FILE_CONFIGS[extracted_from_doc_id])
|
|
385
|
-
|
|
386
|
-
super().__post_init__()
|
|
387
|
-
|
|
388
|
-
if self.config and self.config.recipients:
|
|
389
|
-
self.recipients = self.config.recipients
|
|
390
|
-
else:
|
|
391
|
-
for recipient in self.header.recipients():
|
|
392
|
-
self.recipients.extend(self._extract_emailer_names(recipient))
|
|
393
|
-
|
|
394
|
-
# Assume mailing list emails are to Epstein
|
|
395
|
-
if self.author in BCC_LISTS and (self.is_note_to_self() or not self.recipients):
|
|
396
|
-
self.recipients = [JEFFREY_EPSTEIN]
|
|
397
|
-
|
|
398
|
-
# Remove self CCs but preserve self emails
|
|
399
|
-
if not self.is_note_to_self():
|
|
400
|
-
self.recipients = [r for r in self.recipients if r != self.author]
|
|
401
|
-
|
|
402
|
-
self.recipients = sorted(list(set(self.recipients)), key=lambda r: r or UNKNOWN)
|
|
403
|
-
self.text = self._prettify_text()
|
|
404
|
-
self.actual_text = self._actual_text()
|
|
405
|
-
self.sent_from_device = self._sent_from_device()
|
|
406
|
-
|
|
378
|
+
@property
|
|
407
379
|
def attachments(self) -> list[str]:
|
|
408
380
|
"""Returns the string in the header."""
|
|
409
381
|
return (self.header.attachments or '').split(';')
|
|
410
382
|
|
|
383
|
+
@property
|
|
384
|
+
def border_style(self) -> str:
|
|
385
|
+
"""Color emails from epstein to others with the color for the first recipient."""
|
|
386
|
+
if self.author == JEFFREY_EPSTEIN and len(self.recipients) > 0:
|
|
387
|
+
style = get_style_for_name(self.recipients[0])
|
|
388
|
+
else:
|
|
389
|
+
style = self.author_style
|
|
390
|
+
|
|
391
|
+
return style.replace('bold', '').strip()
|
|
392
|
+
|
|
393
|
+
@property
|
|
411
394
|
def info_txt(self) -> Text:
|
|
412
|
-
email_type = 'fwded article' if self.is_fwded_article
|
|
413
|
-
txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt
|
|
395
|
+
email_type = 'fwded article' if self.is_fwded_article else 'email'
|
|
396
|
+
txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt)
|
|
414
397
|
|
|
415
398
|
if self.config and self.config.is_attribution_uncertain:
|
|
416
|
-
txt.append(f" {QUESTION_MARKS}", style=self.author_style
|
|
399
|
+
txt.append(f" {QUESTION_MARKS}", style=self.author_style)
|
|
417
400
|
|
|
418
401
|
txt.append(' to ').append(self.recipients_txt())
|
|
419
402
|
return txt.append(highlighter(f" probably sent at {self.timestamp}"))
|
|
420
403
|
|
|
404
|
+
@property
|
|
421
405
|
def is_fwded_article(self) -> bool:
|
|
422
406
|
if self.config is None:
|
|
423
407
|
return False
|
|
@@ -426,33 +410,78 @@ class Email(Communication):
|
|
|
426
410
|
else:
|
|
427
411
|
return bool(self.config.is_fwded_article)
|
|
428
412
|
|
|
413
|
+
@property
|
|
429
414
|
def is_junk_mail(self) -> bool:
|
|
430
415
|
return self.author in JUNK_EMAILERS
|
|
431
416
|
|
|
417
|
+
@property
|
|
432
418
|
def is_mailing_list(self) -> bool:
|
|
433
|
-
return self.author in MAILING_LISTS or self.is_junk_mail
|
|
419
|
+
return self.author in MAILING_LISTS or self.is_junk_mail
|
|
434
420
|
|
|
421
|
+
@property
|
|
435
422
|
def is_note_to_self(self) -> bool:
|
|
436
423
|
return self.recipients == [self.author]
|
|
437
424
|
|
|
438
|
-
|
|
439
|
-
return name in [self.author] + self.recipients
|
|
440
|
-
|
|
425
|
+
@property
|
|
441
426
|
def is_word_count_worthy(self) -> bool:
|
|
442
|
-
if self.is_fwded_article
|
|
427
|
+
if self.is_fwded_article:
|
|
443
428
|
return bool(self.config.fwded_text_after) or len(self.actual_text) < 150
|
|
444
429
|
else:
|
|
445
|
-
return not self.is_mailing_list
|
|
430
|
+
return not self.is_mailing_list
|
|
446
431
|
|
|
432
|
+
@property
|
|
447
433
|
def metadata(self) -> Metadata:
|
|
448
434
|
local_metadata = asdict(self)
|
|
449
|
-
local_metadata['is_junk_mail'] = self.is_junk_mail
|
|
450
|
-
local_metadata['is_mailing_list'] = self.is_junk_mail
|
|
451
|
-
local_metadata['subject'] = self.subject
|
|
452
|
-
metadata = super().metadata
|
|
435
|
+
local_metadata['is_junk_mail'] = self.is_junk_mail
|
|
436
|
+
local_metadata['is_mailing_list'] = self.is_junk_mail
|
|
437
|
+
local_metadata['subject'] = self.subject or None
|
|
438
|
+
metadata = super().metadata
|
|
453
439
|
metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
|
|
454
440
|
return metadata
|
|
455
441
|
|
|
442
|
+
@property
|
|
443
|
+
def subject(self) -> str:
|
|
444
|
+
if self.config and self.config.subject:
|
|
445
|
+
return self.config.subject
|
|
446
|
+
else:
|
|
447
|
+
return self.header.subject or ''
|
|
448
|
+
|
|
449
|
+
def __post_init__(self):
|
|
450
|
+
self.filename = self.file_path.name
|
|
451
|
+
self.file_id = extract_file_id(self.filename)
|
|
452
|
+
|
|
453
|
+
# Special handling for copying properties out of the config for the document this one was extracted from
|
|
454
|
+
if self.is_local_extract_file:
|
|
455
|
+
self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
|
|
456
|
+
extracted_from_doc_id = self.url_slug.split('_')[-1]
|
|
457
|
+
|
|
458
|
+
if extracted_from_doc_id in ALL_FILE_CONFIGS:
|
|
459
|
+
self._set_config_for_extracted_file(ALL_FILE_CONFIGS[extracted_from_doc_id])
|
|
460
|
+
|
|
461
|
+
super().__post_init__()
|
|
462
|
+
|
|
463
|
+
if self.config and self.config.recipients:
|
|
464
|
+
self.recipients = self.config.recipients
|
|
465
|
+
else:
|
|
466
|
+
for recipient in self.header.recipients():
|
|
467
|
+
self.recipients.extend(extract_emailer_names(recipient))
|
|
468
|
+
|
|
469
|
+
# Assume mailing list emails are to Epstein
|
|
470
|
+
if self.author in BCC_LISTS and (self.is_note_to_self or not self.recipients):
|
|
471
|
+
self.recipients = [JEFFREY_EPSTEIN]
|
|
472
|
+
|
|
473
|
+
# Remove self CCs but preserve self emails
|
|
474
|
+
if not self.is_note_to_self:
|
|
475
|
+
self.recipients = [r for r in self.recipients if r != self.author]
|
|
476
|
+
|
|
477
|
+
self.recipients = sorted(list(set(self.recipients)), key=lambda r: r or UNKNOWN)
|
|
478
|
+
self.text = self._prettify_text()
|
|
479
|
+
self.actual_text = self._extract_actual_text()
|
|
480
|
+
self.sent_from_device = self._sent_from_device()
|
|
481
|
+
|
|
482
|
+
def is_from_or_to(self, name: str) -> bool:
|
|
483
|
+
return name in [self.author] + self.recipients
|
|
484
|
+
|
|
456
485
|
def recipients_txt(self, max_full_names: int = 2) -> Text:
|
|
457
486
|
"""Text object with comma separated colored versions of all recipients."""
|
|
458
487
|
recipients = [r or UNKNOWN for r in self.recipients] if len(self.recipients) > 0 else [UNKNOWN]
|
|
@@ -463,12 +492,6 @@ class Email(Communication):
|
|
|
463
492
|
for r in recipients
|
|
464
493
|
], join=', ')
|
|
465
494
|
|
|
466
|
-
def subject(self) -> str:
|
|
467
|
-
if self.config and self.config.subject:
|
|
468
|
-
return self.config.subject
|
|
469
|
-
else:
|
|
470
|
-
return self.header.subject or ''
|
|
471
|
-
|
|
472
495
|
def summary(self) -> Text:
|
|
473
496
|
"""One line summary mostly for logging."""
|
|
474
497
|
txt = self._summary()
|
|
@@ -478,7 +501,7 @@ class Email(Communication):
|
|
|
478
501
|
|
|
479
502
|
return txt.append(CLOSE_PROPERTIES_CHAR)
|
|
480
503
|
|
|
481
|
-
def
|
|
504
|
+
def _extract_actual_text(self) -> str:
|
|
482
505
|
"""The text that comes before likely quoted replies and forwards etc."""
|
|
483
506
|
if self.config and self.config.actual_text is not None:
|
|
484
507
|
return self.config.actual_text
|
|
@@ -490,7 +513,6 @@ class Email(Communication):
|
|
|
490
513
|
elif self.header.num_header_rows == 0:
|
|
491
514
|
return self.text
|
|
492
515
|
|
|
493
|
-
# import pdb;pdb.set_trace()
|
|
494
516
|
self.log_top_lines(20, "Raw text:", logging.DEBUG)
|
|
495
517
|
self.log(f"With {self.header.num_header_rows} header lines removed:\n{text[0:500]}\n\n", logging.DEBUG)
|
|
496
518
|
reply_text_match = REPLY_TEXT_REGEX.search(text)
|
|
@@ -517,51 +539,24 @@ class Email(Communication):
|
|
|
517
539
|
|
|
518
540
|
return text.strip()
|
|
519
541
|
|
|
520
|
-
def _border_style(self) -> str:
|
|
521
|
-
"""Color emails from epstein to others with the color for the first recipient."""
|
|
522
|
-
if self.author == JEFFREY_EPSTEIN and len(self.recipients) > 0:
|
|
523
|
-
style = get_style_for_name(self.recipients[0])
|
|
524
|
-
else:
|
|
525
|
-
style = self.author_style()
|
|
526
|
-
|
|
527
|
-
return style.replace('bold', '').strip()
|
|
528
|
-
|
|
529
542
|
def _extract_author(self) -> None:
|
|
543
|
+
"""Overloads superclass method, called at instantiation time."""
|
|
530
544
|
self._extract_header()
|
|
531
545
|
super()._extract_author()
|
|
532
546
|
|
|
533
547
|
if not self.author and self.header.author:
|
|
534
|
-
authors =
|
|
548
|
+
authors = extract_emailer_names(self.header.author)
|
|
535
549
|
self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
|
|
536
550
|
|
|
537
|
-
def _extract_emailer_names(self, emailer_str: str) -> list[str]:
|
|
538
|
-
"""Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
|
|
539
|
-
emailer_str = EmailHeader.cleanup_str(emailer_str)
|
|
540
|
-
|
|
541
|
-
if len(emailer_str) == 0:
|
|
542
|
-
return []
|
|
543
|
-
|
|
544
|
-
names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
|
|
545
|
-
|
|
546
|
-
if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
|
|
547
|
-
if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
|
|
548
|
-
logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
|
|
549
|
-
else:
|
|
550
|
-
logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
|
|
551
|
-
|
|
552
|
-
return names_found
|
|
553
|
-
|
|
554
|
-
names_found = names_found or [emailer_str]
|
|
555
|
-
return [_reverse_first_and_last_names(name) for name in names_found]
|
|
556
|
-
|
|
557
551
|
def _extract_header(self) -> None:
|
|
558
|
-
"""Extract an EmailHeader
|
|
552
|
+
"""Extract an `EmailHeader` from the OCR text."""
|
|
559
553
|
header_match = EMAIL_SIMPLE_HEADER_REGEX.search(self.text)
|
|
560
554
|
|
|
561
555
|
if header_match:
|
|
562
556
|
self.header = EmailHeader.from_header_lines(header_match.group(0))
|
|
563
557
|
|
|
564
|
-
|
|
558
|
+
# DOJ file OCR text is broken in a less consistent way than the HOUSE_OVERSIGHT files
|
|
559
|
+
if self.header.is_empty() and not self.is_doj_file:
|
|
565
560
|
self.header.repair_empty_header(self.lines)
|
|
566
561
|
else:
|
|
567
562
|
log_level = logging.INFO if self.config else logging.WARNING
|
|
@@ -571,22 +566,15 @@ class Email(Communication):
|
|
|
571
566
|
logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
|
|
572
567
|
|
|
573
568
|
def _extract_timestamp(self) -> datetime:
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
timestamp = _parse_timestamp(self.header.sent_at)
|
|
578
|
-
|
|
579
|
-
if timestamp:
|
|
580
|
-
return timestamp
|
|
569
|
+
"""Find the time this email was sent."""
|
|
570
|
+
if self.header.sent_at and (timestamp := _parse_timestamp(self.header.sent_at)):
|
|
571
|
+
return timestamp
|
|
581
572
|
|
|
582
573
|
searchable_lines = self.lines[0:MAX_NUM_HEADER_LINES]
|
|
583
574
|
searchable_text = '\n'.join(searchable_lines)
|
|
584
|
-
date_match = DATE_HEADER_REGEX.search(searchable_text)
|
|
585
|
-
|
|
586
|
-
if date_match:
|
|
587
|
-
timestamp = _parse_timestamp(date_match.group(1))
|
|
588
575
|
|
|
589
|
-
|
|
576
|
+
if (date_match := DATE_HEADER_REGEX.search(searchable_text)):
|
|
577
|
+
if (timestamp := _parse_timestamp(date_match.group(1))):
|
|
590
578
|
return timestamp
|
|
591
579
|
|
|
592
580
|
logger.debug(f"Failed to find timestamp, falling back to parsing {MAX_NUM_HEADER_LINES} lines...")
|
|
@@ -595,18 +583,16 @@ class Email(Communication):
|
|
|
595
583
|
if not TIMESTAMP_LINE_REGEX.search(line):
|
|
596
584
|
continue
|
|
597
585
|
|
|
598
|
-
timestamp
|
|
599
|
-
|
|
600
|
-
if timestamp:
|
|
586
|
+
if (timestamp := _parse_timestamp(line)):
|
|
601
587
|
logger.debug(f"Fell back to timestamp {timestamp} in line '{line}'...")
|
|
602
588
|
return timestamp
|
|
603
589
|
|
|
604
590
|
no_timestamp_msg = f"No timestamp found in '{self.file_path.name}'"
|
|
605
591
|
|
|
606
|
-
if self.is_duplicate
|
|
607
|
-
logger.warning(f"{no_timestamp_msg} but timestamp should be copied from {self.duplicate_of_id
|
|
592
|
+
if self.is_duplicate:
|
|
593
|
+
logger.warning(f"{no_timestamp_msg} but timestamp should be copied from {self.duplicate_of_id}")
|
|
608
594
|
else:
|
|
609
|
-
raise RuntimeError(f"{no_timestamp_msg}, top lines:\n
|
|
595
|
+
raise RuntimeError(f"{no_timestamp_msg}, top lines:\n" + '\n'.join(self.lines[0:MAX_NUM_HEADER_LINES + 10]))
|
|
610
596
|
|
|
611
597
|
def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES) -> int | None:
|
|
612
598
|
"""Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
|
|
@@ -653,7 +639,7 @@ class Email(Communication):
|
|
|
653
639
|
|
|
654
640
|
# Share / Tweet lines
|
|
655
641
|
if self.author == KATHRYN_RUEMMLER:
|
|
656
|
-
text = '\n'.join([
|
|
642
|
+
text = '\n'.join([line for line in text.split('\n') if line not in ['Share', 'Tweet', 'Bookmark it']])
|
|
657
643
|
|
|
658
644
|
return collapse_newlines(text).strip()
|
|
659
645
|
|
|
@@ -666,7 +652,7 @@ class Email(Communication):
|
|
|
666
652
|
self.log_top_lines(num_lines, msg=f'after removal of line {idx}')
|
|
667
653
|
|
|
668
654
|
def _repair(self) -> None:
|
|
669
|
-
"""Repair particularly janky files."""
|
|
655
|
+
"""Repair particularly janky files. Note that OCR_REPAIRS are applied *after* other line adjustments."""
|
|
670
656
|
if BAD_FIRST_LINE_REGEX.match(self.lines[0]):
|
|
671
657
|
self._set_computed_fields(lines=self.lines[1:])
|
|
672
658
|
|
|
@@ -694,13 +680,17 @@ class Email(Communication):
|
|
|
694
680
|
self.log(f"Modified text, old:\n\n" + '\n'.join(old_text.split('\n')[0:12]) + '\n')
|
|
695
681
|
self.log_top_lines(12, 'Result of modifications')
|
|
696
682
|
|
|
697
|
-
|
|
683
|
+
repaired_text = self._repair_links_and_quoted_subjects(self.repair_ocr_text(OCR_REPAIRS, self.text))
|
|
684
|
+
self._set_computed_fields(text=repaired_text)
|
|
685
|
+
|
|
686
|
+
def _repair_links_and_quoted_subjects(self, text: str) -> str:
|
|
687
|
+
"""Repair links that the OCR has broken into multiple lines as well as 'Subject:' lines."""
|
|
688
|
+
lines = text.split('\n')
|
|
698
689
|
subject_line = next((line for line in lines if line.startswith('Subject:')), None) or ''
|
|
699
690
|
subject = subject_line.split(':')[1].strip() if subject_line else ''
|
|
700
691
|
new_lines = []
|
|
701
692
|
i = 0
|
|
702
693
|
|
|
703
|
-
# Fix links and quoted subjects (remove spaces, merge multiline links to a single line)
|
|
704
694
|
while i < len(lines):
|
|
705
695
|
line = lines[i]
|
|
706
696
|
|
|
@@ -708,8 +698,8 @@ class Email(Communication):
|
|
|
708
698
|
while i < (len(lines) - 1) \
|
|
709
699
|
and not lines[i + 1].startswith('htt') \
|
|
710
700
|
and (lines[i + 1].endswith('/') \
|
|
711
|
-
|
|
712
|
-
|
|
701
|
+
or any(s in lines[i + 1] for s in URL_SIGNIFIERS) \
|
|
702
|
+
or LINK_LINE2_REGEX.match(lines[i + 1])):
|
|
713
703
|
logger.debug(f"{self.filename}: Joining link lines\n 1. {line}\n 2. {lines[i + 1]}\n")
|
|
714
704
|
line += lines[i + 1]
|
|
715
705
|
i += 1
|
|
@@ -726,25 +716,19 @@ class Email(Communication):
|
|
|
726
716
|
pass
|
|
727
717
|
elif (subject.endswith(next_line) and next_line != subject) \
|
|
728
718
|
or (FIELDS_COLON_REGEX.search(next_next) and not FIELDS_COLON_REGEX.search(next_line)):
|
|
729
|
-
self.
|
|
719
|
+
self.log(f"Fixing broken subject line\n line: '{line}'\n next: '{next_line}'\n next: '{next_next}'\nsubject='{subject}'\n")
|
|
730
720
|
line += f" {next_line}"
|
|
731
721
|
i += 1
|
|
732
722
|
|
|
733
723
|
new_lines.append(line)
|
|
734
|
-
|
|
735
|
-
# TODO: hacky workaround to get a working link for HOUSE_OVERSIGHT_032564
|
|
736
|
-
if self.file_id == '032564' and line == 'http://m.huffpost.com/us/entry/us_599f532ae4b0dOef9f1c129d':
|
|
737
|
-
new_lines.append('(ed. note: an archived version of the above link is here: https://archive.is/hJxT3 )')
|
|
738
|
-
|
|
739
724
|
i += 1
|
|
740
725
|
|
|
741
|
-
|
|
726
|
+
logger.debug(f"----after line repair---\n" + '\n'.join(new_lines[0:20]) + "\n---")
|
|
727
|
+
return '\n'.join(lines)
|
|
742
728
|
|
|
743
729
|
def _sent_from_device(self) -> str | None:
|
|
744
730
|
"""Find any 'Sent from my iPhone' style signature line if it exist in the 'actual_text'."""
|
|
745
|
-
sent_from_match
|
|
746
|
-
|
|
747
|
-
if sent_from_match:
|
|
731
|
+
if (sent_from_match := SENT_FROM_REGEX.search(self.actual_text)):
|
|
748
732
|
sent_from = sent_from_match.group(0)
|
|
749
733
|
return 'S' + sent_from[1:] if sent_from.startswith('sent') else sent_from
|
|
750
734
|
|
|
@@ -756,9 +740,7 @@ class Email(Communication):
|
|
|
756
740
|
else:
|
|
757
741
|
self.config = EmailCfg(id=self.file_id)
|
|
758
742
|
|
|
759
|
-
extracted_from_description
|
|
760
|
-
|
|
761
|
-
if extracted_from_description:
|
|
743
|
+
if (extracted_from_description := extracted_from_doc_cfg.complete_description):
|
|
762
744
|
extracted_description = f"{APPEARS_IN} {extracted_from_description}"
|
|
763
745
|
|
|
764
746
|
if isinstance(extracted_from_doc_cfg, EmailCfg):
|
|
@@ -783,11 +765,11 @@ class Email(Communication):
|
|
|
783
765
|
num_chars = args.truncate
|
|
784
766
|
elif self.config and self.config.truncate_to is not None:
|
|
785
767
|
num_chars = len(self.text) if self.config.truncate_to == NO_TRUNCATE else self.config.truncate_to
|
|
786
|
-
elif self.is_interesting
|
|
768
|
+
elif self.is_interesting:
|
|
787
769
|
num_chars = len(self.text)
|
|
788
770
|
elif self.author in TRUNCATE_EMAILS_FROM \
|
|
789
771
|
or any([self.is_from_or_to(n) for n in TRUNCATE_EMAILS_FROM_OR_TO]) \
|
|
790
|
-
or self.is_fwded_article
|
|
772
|
+
or self.is_fwded_article \
|
|
791
773
|
or includes_truncate_term:
|
|
792
774
|
num_chars = min(quote_cutoff or MAX_CHARS_TO_PRINT, TRUNCATED_CHARS)
|
|
793
775
|
else:
|
|
@@ -807,18 +789,18 @@ class Email(Communication):
|
|
|
807
789
|
else:
|
|
808
790
|
num_chars = quote_cutoff
|
|
809
791
|
else:
|
|
810
|
-
num_chars = min(self.file_size
|
|
792
|
+
num_chars = min(self.file_size, MAX_CHARS_TO_PRINT)
|
|
811
793
|
|
|
812
794
|
# Always print whole email for 1st email for user
|
|
813
|
-
if self._is_first_for_user and num_chars < self.file_size
|
|
795
|
+
if self._is_first_for_user and num_chars < self.file_size and not self.is_duplicate:
|
|
814
796
|
logger.info(f"{self} Overriding cutoff {num_chars} for first email")
|
|
815
|
-
num_chars = self.file_size
|
|
797
|
+
num_chars = self.file_size
|
|
816
798
|
|
|
817
799
|
log_args = {
|
|
818
800
|
'num_chars': num_chars,
|
|
819
801
|
'_is_first_for_user': self._is_first_for_user,
|
|
820
802
|
'author_truncate': self.author in TRUNCATE_EMAILS_FROM,
|
|
821
|
-
'is_fwded_article': self.is_fwded_article
|
|
803
|
+
'is_fwded_article': self.is_fwded_article,
|
|
822
804
|
'is_quote_cutoff': quote_cutoff == num_chars,
|
|
823
805
|
'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
|
|
824
806
|
'quote_cutoff': quote_cutoff,
|
|
@@ -838,8 +820,8 @@ class Email(Communication):
|
|
|
838
820
|
# Truncate long emails but leave a note explaining what happened w/link to source document
|
|
839
821
|
if len(text) > num_chars:
|
|
840
822
|
text = text[0:num_chars]
|
|
841
|
-
doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style
|
|
842
|
-
trim_note = f"<...trimmed to {num_chars:,} characters of {self.length
|
|
823
|
+
doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
|
|
824
|
+
trim_note = f"<...trimmed to {num_chars:,} characters of {self.length:,}, read the rest at {doc_link_markup}...>"
|
|
843
825
|
trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
|
|
844
826
|
|
|
845
827
|
# Rewrite broken headers where the values are on separate lines from the field names
|
|
@@ -855,7 +837,7 @@ class Email(Communication):
|
|
|
855
837
|
|
|
856
838
|
lines += text.split('\n')[num_lines_to_skip:]
|
|
857
839
|
text = self.header.rewrite_header() + '\n' + '\n'.join(lines)
|
|
858
|
-
text = _add_line_breaks(text)
|
|
840
|
+
text = _add_line_breaks(text)
|
|
859
841
|
self.rewritten_header_ids.add(self.file_id)
|
|
860
842
|
|
|
861
843
|
lines = [
|
|
@@ -867,7 +849,7 @@ class Email(Communication):
|
|
|
867
849
|
|
|
868
850
|
email_txt_panel = Panel(
|
|
869
851
|
highlighter(text).append('...\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
|
|
870
|
-
border_style=self.
|
|
852
|
+
border_style=self.border_style,
|
|
871
853
|
expand=False,
|
|
872
854
|
subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
|
|
873
855
|
)
|
|
@@ -914,11 +896,11 @@ class Email(Communication):
|
|
|
914
896
|
|
|
915
897
|
for email in emails:
|
|
916
898
|
fields = [
|
|
917
|
-
email.epstein_media_link(link_txt=email.timestamp_without_seconds
|
|
918
|
-
email.author_txt
|
|
899
|
+
email.epstein_media_link(link_txt=email.timestamp_without_seconds, style=link_style),
|
|
900
|
+
email.author_txt,
|
|
919
901
|
email.recipients_txt(max_full_names=1),
|
|
920
|
-
f"{email.length
|
|
921
|
-
email.subject
|
|
902
|
+
f"{email.length}",
|
|
903
|
+
email.subject,
|
|
922
904
|
]
|
|
923
905
|
|
|
924
906
|
if not show_length:
|
|
@@ -935,21 +917,14 @@ def _add_line_breaks(email_text: str) -> str:
|
|
|
935
917
|
|
|
936
918
|
def _parse_timestamp(timestamp_str: str) -> None | datetime:
|
|
937
919
|
try:
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
920
|
+
if (american_date_match := AMERICAN_TIME_REGEX.search(timestamp_str)):
|
|
921
|
+
timestamp_str = american_date_match.group(1)
|
|
922
|
+
else:
|
|
923
|
+
timestamp_str = timestamp_str.replace('(GMT-05:00)', 'EST')
|
|
924
|
+
timestamp_str = BAD_TIMEZONE_REGEX.sub(' ', timestamp_str).strip()
|
|
925
|
+
|
|
926
|
+
timestamp = parse(timestamp_str, fuzzy=True, tzinfos=TIMEZONE_INFO)
|
|
941
927
|
logger.debug(f'Parsed timestamp "%s" from string "%s"', timestamp, timestamp_str)
|
|
942
928
|
return remove_timezone(timestamp)
|
|
943
929
|
except Exception as e:
|
|
944
930
|
logger.debug(f'Failed to parse "{timestamp_str}" to timestamp!')
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
def _reverse_first_and_last_names(name: str) -> str:
|
|
948
|
-
if '@' in name:
|
|
949
|
-
return name.lower()
|
|
950
|
-
|
|
951
|
-
if ', ' in name:
|
|
952
|
-
names = name.split(', ')
|
|
953
|
-
return f"{names[1]} {names[0]}"
|
|
954
|
-
else:
|
|
955
|
-
return name
|
|
@@ -2,7 +2,8 @@ import json
|
|
|
2
2
|
import re
|
|
3
3
|
from dataclasses import asdict, dataclass, field
|
|
4
4
|
|
|
5
|
-
from epstein_files.
|
|
5
|
+
from epstein_files.documents.emails.emailers import BAD_EMAILER_REGEX, TIME_REGEX
|
|
6
|
+
from epstein_files.util.constant.strings import AUTHOR, indented
|
|
6
7
|
from epstein_files.util.constants import ALL_CONFIGS
|
|
7
8
|
from epstein_files.util.doc_cfg import EmailCfg
|
|
8
9
|
from epstein_files.util.logging import logger
|
|
@@ -13,17 +14,29 @@ ON_BEHALF_OF = 'on behalf of'
|
|
|
13
14
|
TO_FIELDS = ['bcc', 'cc', 'to']
|
|
14
15
|
EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
|
|
15
16
|
|
|
16
|
-
FIELD_PATTERNS = [
|
|
17
|
+
FIELD_PATTERNS = [
|
|
18
|
+
'Date',
|
|
19
|
+
'From',
|
|
20
|
+
'Sent',
|
|
21
|
+
'To',
|
|
22
|
+
r"C[cC]",
|
|
23
|
+
r"B[cC][cC]",
|
|
24
|
+
'Importance',
|
|
25
|
+
'Subject',
|
|
26
|
+
'Attachments',
|
|
27
|
+
'Classification',
|
|
28
|
+
'Flag',
|
|
29
|
+
'Reply-To',
|
|
30
|
+
'Inline-Images'
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}(From|Subject):') # IDed 140 emails out of 3777 DOJ files with just 'From:' match
|
|
17
34
|
FIELDS_PATTERN = '|'.join(FIELD_PATTERNS)
|
|
18
35
|
FIELDS_COLON_PATTERN = fr"^({FIELDS_PATTERN}):"
|
|
19
36
|
HEADER_REGEX_STR = fr"(((?:(?:{FIELDS_PATTERN}|Bee):|on behalf of ?)(?! +(by |from my|via )).*\n){{3,}})"
|
|
20
37
|
EMAIL_SIMPLE_HEADER_REGEX = re.compile(rf'^{HEADER_REGEX_STR}')
|
|
21
38
|
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX = re.compile(HEADER_REGEX_STR)
|
|
22
39
|
EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTALL) # Match up to the next email header section
|
|
23
|
-
TIME_REGEX = re.compile(r'^(\d{1,2}/\d{1,2}/\d{2,4}|Thursday|Monday|Tuesday|Wednesday|Friday|Saturday|Sunday).*')
|
|
24
|
-
|
|
25
|
-
BAD_NAME_CHARS_REGEX = re.compile(r"[\"'\[\]*><•]")
|
|
26
|
-
BAD_EMAILER_REGEX = re.compile(r'^(>|11111111)|agreed|ok|sexy|re:|fwd:|Multiple Senders|((sent|attachments|subject|importance).*|.*(january|201\d|hysterical|i have|image0|so that people|article 1.?|momminnemummin|These conspiracy theories|your state|undisclosed|www\.theguardian|talk in|it was a|what do|cc:|call (back|me)).*)$', re.IGNORECASE)
|
|
27
40
|
|
|
28
41
|
CONFIGURED_ACTUAL_TEXTS = [
|
|
29
42
|
cfg.actual_text for cfg in ALL_CONFIGS
|
|
@@ -54,6 +67,7 @@ class EmailHeader:
|
|
|
54
67
|
classification: str | None = None
|
|
55
68
|
flag: str | None = None
|
|
56
69
|
importance: str | None = None
|
|
70
|
+
inline_images: str | None = None
|
|
57
71
|
attachments: str | None = None
|
|
58
72
|
to: list[str] | None = None
|
|
59
73
|
reply_to: str | None = None
|
|
@@ -112,7 +126,7 @@ class EmailHeader:
|
|
|
112
126
|
self.header_chars = '\n'.join(email_lines[0:self.num_header_rows])
|
|
113
127
|
log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
|
|
114
128
|
|
|
115
|
-
logger.
|
|
129
|
+
logger.info(
|
|
116
130
|
f"{log_msg}{self}\n\n[top lines]:\n\n%s\n\n[body_lines]:\n\n%s\n\n",
|
|
117
131
|
indented('\n'.join(email_lines[0:(num_headers + 1) * 2]), prefix='> '),
|
|
118
132
|
indented('\n'.join(email_lines[self.num_header_rows:self.num_header_rows + 5]), prefix='> '),
|
|
@@ -181,7 +195,3 @@ class EmailHeader:
|
|
|
181
195
|
logger.debug(f"Header being parsed was this:\n\n{header}\n")
|
|
182
196
|
|
|
183
197
|
return cls(field_names=field_names, header_chars=header, **kw_args)
|
|
184
|
-
|
|
185
|
-
@staticmethod
|
|
186
|
-
def cleanup_str(_str: str) -> str:
|
|
187
|
-
return BAD_NAME_CHARS_REGEX.sub('', _str.replace(REDACTED, '')).strip().strip('_').strip()
|