epstein-files 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. epstein_files/__init__.py +31 -18
  2. epstein_files/documents/communication.py +9 -5
  3. epstein_files/documents/document.py +225 -136
  4. epstein_files/documents/doj_file.py +242 -0
  5. epstein_files/documents/doj_files/full_text.py +166 -0
  6. epstein_files/documents/email.py +138 -163
  7. epstein_files/documents/emails/email_header.py +21 -11
  8. epstein_files/documents/emails/emailers.py +223 -0
  9. epstein_files/documents/imessage/text_message.py +2 -3
  10. epstein_files/documents/json_file.py +18 -14
  11. epstein_files/documents/messenger_log.py +23 -39
  12. epstein_files/documents/other_file.py +48 -44
  13. epstein_files/epstein_files.py +54 -33
  14. epstein_files/person.py +142 -110
  15. epstein_files/util/constant/names.py +29 -6
  16. epstein_files/util/constant/output_files.py +2 -0
  17. epstein_files/util/constant/strings.py +12 -6
  18. epstein_files/util/constant/urls.py +17 -0
  19. epstein_files/util/constants.py +101 -174
  20. epstein_files/util/data.py +2 -0
  21. epstein_files/util/doc_cfg.py +20 -15
  22. epstein_files/util/env.py +24 -16
  23. epstein_files/util/file_helper.py +28 -6
  24. epstein_files/util/helpers/debugging_helper.py +13 -0
  25. epstein_files/util/helpers/env_helpers.py +21 -0
  26. epstein_files/util/highlighted_group.py +57 -16
  27. epstein_files/util/layout/left_bar_panel.py +26 -0
  28. epstein_files/util/logging.py +28 -13
  29. epstein_files/util/output.py +33 -10
  30. epstein_files/util/rich.py +28 -2
  31. epstein_files/util/word_count.py +7 -7
  32. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/METADATA +14 -1
  33. epstein_files-1.5.0.dist-info/RECORD +40 -0
  34. epstein_files-1.4.1.dist-info/RECORD +0 -34
  35. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
  36. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
  37. {epstein_files-1.4.1.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +0 -0
@@ -1,7 +1,6 @@
1
1
  import json
2
2
  import logging
3
3
  import re
4
- from collections import defaultdict
5
4
  from copy import deepcopy
6
5
  from dataclasses import asdict, dataclass, field
7
6
  from datetime import datetime
@@ -16,13 +15,14 @@ from rich.text import Text
16
15
 
17
16
  from epstein_files.documents.communication import Communication
18
17
  from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, INFO_INDENT
19
- from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAIL_SIMPLE_HEADER_REGEX,
20
- EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, FIELDS_COLON_PATTERN, TIME_REGEX, EmailHeader)
18
+ from epstein_files.documents.emails.email_header import (EMAIL_SIMPLE_HEADER_REGEX,
19
+ EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, FIELDS_COLON_PATTERN, EmailHeader)
20
+ from epstein_files.documents.emails.emailers import extract_emailer_names
21
21
  from epstein_files.documents.other_file import OtherFile
22
22
  from epstein_files.util.constant.names import *
23
23
  from epstein_files.util.constant.strings import REDACTED
24
24
  from epstein_files.util.constants import *
25
- from epstein_files.util.data import TIMEZONE_INFO, collapse_newlines, escape_single_quotes, remove_timezone
25
+ from epstein_files.util.data import AMERICAN_TIME_REGEX, TIMEZONE_INFO, collapse_newlines, remove_timezone
26
26
  from epstein_files.util.doc_cfg import EmailCfg, Metadata
27
27
  from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
28
28
  from epstein_files.util.highlighted_group import JUNK_EMAILERS, get_style_for_name
@@ -32,7 +32,6 @@ from epstein_files.util.rich import *
32
32
  BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
33
33
  BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Hide caption|Importance:?\s*High|[iI,•]|[1i] (_ )?[il]|, [-,]|L\._|_filtered|.*(yiv0232|font-family:|margin-bottom:).*)$')
34
34
  BAD_SUBJECT_CONTINUATIONS = ['orwarded', 'Hi ', 'Sent ', 'AmLaw', 'Original Message', 'Privileged', 'Sorry', '---']
35
- DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
36
35
  FIELDS_COLON_REGEX = re.compile(FIELDS_COLON_PATTERN)
37
36
  LINK_LINE_REGEX = re.compile(f"^[>• ]*htt")
38
37
  LINK_LINE2_REGEX = re.compile(r"^[-\w.%&=/]{5,}$")
@@ -44,7 +43,6 @@ DATE_HEADER_REGEX = re.compile(r'(?:Date|Sent):? +(?!by|from|to|via)([^\n]{6,})\
44
43
  TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")
45
44
  LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
46
45
 
47
- SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
48
46
  REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
49
47
  URL_SIGNIFIERS = ['?amp', 'amp?', 'cd=', 'click', 'CMP=', 'contentId', 'ft=', 'gclid', 'htm', 'mp=', 'keywords=', 'Id=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'sp=', 'usg=', 'utm']
50
48
  APPEARS_IN = 'appears in'
@@ -107,6 +105,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
107
105
  'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
108
106
  "War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
109
107
  "Subject; RE": "Subject: RE",
108
+ "straining relations between UK and\nAmerica": "straining relations between UK and America",
110
109
  re.compile(r"deadline re Mr Bradley Edwards vs Mr\s*Jeffrey Epstein", re.I): "deadline re Mr Bradley Edwards vs Mr Jeffrey Epstein",
111
110
  re.compile(r"Following Plea That Implicated Trump -\s*https://www.npr.org/676040070", re.I): "Following Plea That Implicated Trump - https://www.npr.org/676040070",
112
111
  re.compile(r"for Attorney General -\s+Wikisource, the"): r"for Attorney General - Wikisource, the",
@@ -344,6 +343,10 @@ LINE_REPAIR_MERGES = {
344
343
  '033575': [[2, 4]],
345
344
  '033576': [[3]],
346
345
  '033583': [[2]],
346
+
347
+ # Note DOJ file line adjustments happen *after* DojFile._repair() is called
348
+ 'EFTA00039689': [[4]],
349
+ 'EFTA00040118': [[2], [2], [2], [2], [2], [2], [6], [6]],
347
350
  }
348
351
 
349
352
 
@@ -351,12 +354,13 @@ LINE_REPAIR_MERGES = {
351
354
  class Email(Communication):
352
355
  """
353
356
  Attributes:
354
- actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
355
- config (EmailCfg | None) - manual config for this email (if it exists)
356
- header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
357
- recipients (list[Name]) - who this email was sent to
358
- sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
359
- signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
357
+ actual_text (str) - Best effort at the text actually sent in this email, excluding quoted replies and forwards.
358
+ config (EmailCfg, optional) - Manual config for this email (if it exists).
359
+ header (EmailHeader) - Header data extracted from the text (from/to/sent/subject etc).
360
+ recipients (list[Name]) - People to whom this email was sent.
361
+ sent_from_device (str, optional) - "Sent from my iPhone" style signature (if it exists).
362
+ signature_substitution_counts (dict[str, int]) - Number of times a signature was replaced with
363
+ <...snipped...> for each participant
360
364
  """
361
365
  attached_docs: list[OtherFile] = field(default_factory=list)
362
366
  actual_text: str = field(init=False)
@@ -371,53 +375,33 @@ class Email(Communication):
371
375
  # For logging how many headers we prettified while printing, kind of janky
372
376
  rewritten_header_ids: ClassVar[set[str]] = set([])
373
377
 
374
- def __post_init__(self):
375
- self.filename = self.file_path.name
376
- self.file_id = extract_file_id(self.filename)
377
-
378
- # Special handling for copying properties out of the config for the document this one was extracted from
379
- if self.is_local_extract_file():
380
- self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
381
- extracted_from_doc_id = self.url_slug.split('_')[-1]
382
-
383
- if extracted_from_doc_id in ALL_FILE_CONFIGS:
384
- self._set_config_for_extracted_file(ALL_FILE_CONFIGS[extracted_from_doc_id])
385
-
386
- super().__post_init__()
387
-
388
- if self.config and self.config.recipients:
389
- self.recipients = self.config.recipients
390
- else:
391
- for recipient in self.header.recipients():
392
- self.recipients.extend(self._extract_emailer_names(recipient))
393
-
394
- # Assume mailing list emails are to Epstein
395
- if self.author in BCC_LISTS and (self.is_note_to_self() or not self.recipients):
396
- self.recipients = [JEFFREY_EPSTEIN]
397
-
398
- # Remove self CCs but preserve self emails
399
- if not self.is_note_to_self():
400
- self.recipients = [r for r in self.recipients if r != self.author]
401
-
402
- self.recipients = sorted(list(set(self.recipients)), key=lambda r: r or UNKNOWN)
403
- self.text = self._prettify_text()
404
- self.actual_text = self._actual_text()
405
- self.sent_from_device = self._sent_from_device()
406
-
378
+ @property
407
379
  def attachments(self) -> list[str]:
408
380
  """Returns the string in the header."""
409
381
  return (self.header.attachments or '').split(';')
410
382
 
383
+ @property
384
+ def border_style(self) -> str:
385
+ """Color emails from epstein to others with the color for the first recipient."""
386
+ if self.author == JEFFREY_EPSTEIN and len(self.recipients) > 0:
387
+ style = get_style_for_name(self.recipients[0])
388
+ else:
389
+ style = self.author_style
390
+
391
+ return style.replace('bold', '').strip()
392
+
393
+ @property
411
394
  def info_txt(self) -> Text:
412
- email_type = 'fwded article' if self.is_fwded_article() else 'email'
413
- txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt())
395
+ email_type = 'fwded article' if self.is_fwded_article else 'email'
396
+ txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt)
414
397
 
415
398
  if self.config and self.config.is_attribution_uncertain:
416
- txt.append(f" {QUESTION_MARKS}", style=self.author_style())
399
+ txt.append(f" {QUESTION_MARKS}", style=self.author_style)
417
400
 
418
401
  txt.append(' to ').append(self.recipients_txt())
419
402
  return txt.append(highlighter(f" probably sent at {self.timestamp}"))
420
403
 
404
+ @property
421
405
  def is_fwded_article(self) -> bool:
422
406
  if self.config is None:
423
407
  return False
@@ -426,33 +410,78 @@ class Email(Communication):
426
410
  else:
427
411
  return bool(self.config.is_fwded_article)
428
412
 
413
+ @property
429
414
  def is_junk_mail(self) -> bool:
430
415
  return self.author in JUNK_EMAILERS
431
416
 
417
+ @property
432
418
  def is_mailing_list(self) -> bool:
433
- return self.author in MAILING_LISTS or self.is_junk_mail()
419
+ return self.author in MAILING_LISTS or self.is_junk_mail
434
420
 
421
+ @property
435
422
  def is_note_to_self(self) -> bool:
436
423
  return self.recipients == [self.author]
437
424
 
438
- def is_from_or_to(self, name: str) -> bool:
439
- return name in [self.author] + self.recipients
440
-
425
+ @property
441
426
  def is_word_count_worthy(self) -> bool:
442
- if self.is_fwded_article():
427
+ if self.is_fwded_article:
443
428
  return bool(self.config.fwded_text_after) or len(self.actual_text) < 150
444
429
  else:
445
- return not self.is_mailing_list()
430
+ return not self.is_mailing_list
446
431
 
432
+ @property
447
433
  def metadata(self) -> Metadata:
448
434
  local_metadata = asdict(self)
449
- local_metadata['is_junk_mail'] = self.is_junk_mail()
450
- local_metadata['is_mailing_list'] = self.is_junk_mail()
451
- local_metadata['subject'] = self.subject() or None
452
- metadata = super().metadata()
435
+ local_metadata['is_junk_mail'] = self.is_junk_mail
436
+ local_metadata['is_mailing_list'] = self.is_junk_mail
437
+ local_metadata['subject'] = self.subject or None
438
+ metadata = super().metadata
453
439
  metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
454
440
  return metadata
455
441
 
442
+ @property
443
+ def subject(self) -> str:
444
+ if self.config and self.config.subject:
445
+ return self.config.subject
446
+ else:
447
+ return self.header.subject or ''
448
+
449
+ def __post_init__(self):
450
+ self.filename = self.file_path.name
451
+ self.file_id = extract_file_id(self.filename)
452
+
453
+ # Special handling for copying properties out of the config for the document this one was extracted from
454
+ if self.is_local_extract_file:
455
+ self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
456
+ extracted_from_doc_id = self.url_slug.split('_')[-1]
457
+
458
+ if extracted_from_doc_id in ALL_FILE_CONFIGS:
459
+ self._set_config_for_extracted_file(ALL_FILE_CONFIGS[extracted_from_doc_id])
460
+
461
+ super().__post_init__()
462
+
463
+ if self.config and self.config.recipients:
464
+ self.recipients = self.config.recipients
465
+ else:
466
+ for recipient in self.header.recipients():
467
+ self.recipients.extend(extract_emailer_names(recipient))
468
+
469
+ # Assume mailing list emails are to Epstein
470
+ if self.author in BCC_LISTS and (self.is_note_to_self or not self.recipients):
471
+ self.recipients = [JEFFREY_EPSTEIN]
472
+
473
+ # Remove self CCs but preserve self emails
474
+ if not self.is_note_to_self:
475
+ self.recipients = [r for r in self.recipients if r != self.author]
476
+
477
+ self.recipients = sorted(list(set(self.recipients)), key=lambda r: r or UNKNOWN)
478
+ self.text = self._prettify_text()
479
+ self.actual_text = self._extract_actual_text()
480
+ self.sent_from_device = self._sent_from_device()
481
+
482
+ def is_from_or_to(self, name: str) -> bool:
483
+ return name in [self.author] + self.recipients
484
+
456
485
  def recipients_txt(self, max_full_names: int = 2) -> Text:
457
486
  """Text object with comma separated colored versions of all recipients."""
458
487
  recipients = [r or UNKNOWN for r in self.recipients] if len(self.recipients) > 0 else [UNKNOWN]
@@ -463,12 +492,6 @@ class Email(Communication):
463
492
  for r in recipients
464
493
  ], join=', ')
465
494
 
466
- def subject(self) -> str:
467
- if self.config and self.config.subject:
468
- return self.config.subject
469
- else:
470
- return self.header.subject or ''
471
-
472
495
  def summary(self) -> Text:
473
496
  """One line summary mostly for logging."""
474
497
  txt = self._summary()
@@ -478,7 +501,7 @@ class Email(Communication):
478
501
 
479
502
  return txt.append(CLOSE_PROPERTIES_CHAR)
480
503
 
481
- def _actual_text(self) -> str:
504
+ def _extract_actual_text(self) -> str:
482
505
  """The text that comes before likely quoted replies and forwards etc."""
483
506
  if self.config and self.config.actual_text is not None:
484
507
  return self.config.actual_text
@@ -490,7 +513,6 @@ class Email(Communication):
490
513
  elif self.header.num_header_rows == 0:
491
514
  return self.text
492
515
 
493
- # import pdb;pdb.set_trace()
494
516
  self.log_top_lines(20, "Raw text:", logging.DEBUG)
495
517
  self.log(f"With {self.header.num_header_rows} header lines removed:\n{text[0:500]}\n\n", logging.DEBUG)
496
518
  reply_text_match = REPLY_TEXT_REGEX.search(text)
@@ -517,51 +539,24 @@ class Email(Communication):
517
539
 
518
540
  return text.strip()
519
541
 
520
- def _border_style(self) -> str:
521
- """Color emails from epstein to others with the color for the first recipient."""
522
- if self.author == JEFFREY_EPSTEIN and len(self.recipients) > 0:
523
- style = get_style_for_name(self.recipients[0])
524
- else:
525
- style = self.author_style()
526
-
527
- return style.replace('bold', '').strip()
528
-
529
542
  def _extract_author(self) -> None:
543
+ """Overloads superclass method, called at instantiation time."""
530
544
  self._extract_header()
531
545
  super()._extract_author()
532
546
 
533
547
  if not self.author and self.header.author:
534
- authors = self._extract_emailer_names(self.header.author)
548
+ authors = extract_emailer_names(self.header.author)
535
549
  self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
536
550
 
537
- def _extract_emailer_names(self, emailer_str: str) -> list[str]:
538
- """Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
539
- emailer_str = EmailHeader.cleanup_str(emailer_str)
540
-
541
- if len(emailer_str) == 0:
542
- return []
543
-
544
- names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
545
-
546
- if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
547
- if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
548
- logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
549
- else:
550
- logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
551
-
552
- return names_found
553
-
554
- names_found = names_found or [emailer_str]
555
- return [_reverse_first_and_last_names(name) for name in names_found]
556
-
557
551
  def _extract_header(self) -> None:
558
- """Extract an EmailHeader object from the OCR text."""
552
+ """Extract an `EmailHeader` from the OCR text."""
559
553
  header_match = EMAIL_SIMPLE_HEADER_REGEX.search(self.text)
560
554
 
561
555
  if header_match:
562
556
  self.header = EmailHeader.from_header_lines(header_match.group(0))
563
557
 
564
- if self.header.is_empty():
558
+ # DOJ file OCR text is broken in a less consistent way than the HOUSE_OVERSIGHT files
559
+ if self.header.is_empty() and not self.is_doj_file:
565
560
  self.header.repair_empty_header(self.lines)
566
561
  else:
567
562
  log_level = logging.INFO if self.config else logging.WARNING
@@ -571,22 +566,15 @@ class Email(Communication):
571
566
  logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
572
567
 
573
568
  def _extract_timestamp(self) -> datetime:
574
- if self.config and self.config.timestamp():
575
- return self.config.timestamp()
576
- elif self.header.sent_at:
577
- timestamp = _parse_timestamp(self.header.sent_at)
578
-
579
- if timestamp:
580
- return timestamp
569
+ """Find the time this email was sent."""
570
+ if self.header.sent_at and (timestamp := _parse_timestamp(self.header.sent_at)):
571
+ return timestamp
581
572
 
582
573
  searchable_lines = self.lines[0:MAX_NUM_HEADER_LINES]
583
574
  searchable_text = '\n'.join(searchable_lines)
584
- date_match = DATE_HEADER_REGEX.search(searchable_text)
585
-
586
- if date_match:
587
- timestamp = _parse_timestamp(date_match.group(1))
588
575
 
589
- if timestamp:
576
+ if (date_match := DATE_HEADER_REGEX.search(searchable_text)):
577
+ if (timestamp := _parse_timestamp(date_match.group(1))):
590
578
  return timestamp
591
579
 
592
580
  logger.debug(f"Failed to find timestamp, falling back to parsing {MAX_NUM_HEADER_LINES} lines...")
@@ -595,18 +583,16 @@ class Email(Communication):
595
583
  if not TIMESTAMP_LINE_REGEX.search(line):
596
584
  continue
597
585
 
598
- timestamp = _parse_timestamp(line)
599
-
600
- if timestamp:
586
+ if (timestamp := _parse_timestamp(line)):
601
587
  logger.debug(f"Fell back to timestamp {timestamp} in line '{line}'...")
602
588
  return timestamp
603
589
 
604
590
  no_timestamp_msg = f"No timestamp found in '{self.file_path.name}'"
605
591
 
606
- if self.is_duplicate():
607
- logger.warning(f"{no_timestamp_msg} but timestamp should be copied from {self.duplicate_of_id()}")
592
+ if self.is_duplicate:
593
+ logger.warning(f"{no_timestamp_msg} but timestamp should be copied from {self.duplicate_of_id}")
608
594
  else:
609
- raise RuntimeError(f"{no_timestamp_msg}, top lines:\n{searchable_text}")
595
+ raise RuntimeError(f"{no_timestamp_msg}, top lines:\n" + '\n'.join(self.lines[0:MAX_NUM_HEADER_LINES + 10]))
610
596
 
611
597
  def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES) -> int | None:
612
598
  """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
@@ -653,7 +639,7 @@ class Email(Communication):
653
639
 
654
640
  # Share / Tweet lines
655
641
  if self.author == KATHRYN_RUEMMLER:
656
- text = '\n'.join([l for l in text.split('\n') if l not in ['Share', 'Tweet', 'Bookmark it']])
642
+ text = '\n'.join([line for line in text.split('\n') if line not in ['Share', 'Tweet', 'Bookmark it']])
657
643
 
658
644
  return collapse_newlines(text).strip()
659
645
 
@@ -666,7 +652,7 @@ class Email(Communication):
666
652
  self.log_top_lines(num_lines, msg=f'after removal of line {idx}')
667
653
 
668
654
  def _repair(self) -> None:
669
- """Repair particularly janky files."""
655
+ """Repair particularly janky files. Note that OCR_REPAIRS are applied *after* other line adjustments."""
670
656
  if BAD_FIRST_LINE_REGEX.match(self.lines[0]):
671
657
  self._set_computed_fields(lines=self.lines[1:])
672
658
 
@@ -694,13 +680,17 @@ class Email(Communication):
694
680
  self.log(f"Modified text, old:\n\n" + '\n'.join(old_text.split('\n')[0:12]) + '\n')
695
681
  self.log_top_lines(12, 'Result of modifications')
696
682
 
697
- lines = self.repair_ocr_text(OCR_REPAIRS, self.text).split('\n')
683
+ repaired_text = self._repair_links_and_quoted_subjects(self.repair_ocr_text(OCR_REPAIRS, self.text))
684
+ self._set_computed_fields(text=repaired_text)
685
+
686
+ def _repair_links_and_quoted_subjects(self, text: str) -> str:
687
+ """Repair links that the OCR has broken into multiple lines as well as 'Subject:' lines."""
688
+ lines = text.split('\n')
698
689
  subject_line = next((line for line in lines if line.startswith('Subject:')), None) or ''
699
690
  subject = subject_line.split(':')[1].strip() if subject_line else ''
700
691
  new_lines = []
701
692
  i = 0
702
693
 
703
- # Fix links and quoted subjects (remove spaces, merge multiline links to a single line)
704
694
  while i < len(lines):
705
695
  line = lines[i]
706
696
 
@@ -708,8 +698,8 @@ class Email(Communication):
708
698
  while i < (len(lines) - 1) \
709
699
  and not lines[i + 1].startswith('htt') \
710
700
  and (lines[i + 1].endswith('/') \
711
- or any(s in lines[i + 1] for s in URL_SIGNIFIERS) \
712
- or LINK_LINE2_REGEX.match(lines[i + 1])):
701
+ or any(s in lines[i + 1] for s in URL_SIGNIFIERS) \
702
+ or LINK_LINE2_REGEX.match(lines[i + 1])):
713
703
  logger.debug(f"{self.filename}: Joining link lines\n 1. {line}\n 2. {lines[i + 1]}\n")
714
704
  line += lines[i + 1]
715
705
  i += 1
@@ -726,25 +716,19 @@ class Email(Communication):
726
716
  pass
727
717
  elif (subject.endswith(next_line) and next_line != subject) \
728
718
  or (FIELDS_COLON_REGEX.search(next_next) and not FIELDS_COLON_REGEX.search(next_line)):
729
- self.warn(f"Fixing broken subject line\n line: '{line}'\n next: '{next_line}'\n next: '{next_next}'\nsubject='{subject}'\n")
719
+ self.log(f"Fixing broken subject line\n line: '{line}'\n next: '{next_line}'\n next: '{next_next}'\nsubject='{subject}'\n")
730
720
  line += f" {next_line}"
731
721
  i += 1
732
722
 
733
723
  new_lines.append(line)
734
-
735
- # TODO: hacky workaround to get a working link for HOUSE_OVERSIGHT_032564
736
- if self.file_id == '032564' and line == 'http://m.huffpost.com/us/entry/us_599f532ae4b0dOef9f1c129d':
737
- new_lines.append('(ed. note: an archived version of the above link is here: https://archive.is/hJxT3 )')
738
-
739
724
  i += 1
740
725
 
741
- self._set_computed_fields(lines=new_lines)
726
+ logger.debug(f"----after line repair---\n" + '\n'.join(new_lines[0:20]) + "\n---")
727
+ return '\n'.join(lines)
742
728
 
743
729
  def _sent_from_device(self) -> str | None:
744
730
  """Find any 'Sent from my iPhone' style signature line if it exist in the 'actual_text'."""
745
- sent_from_match = SENT_FROM_REGEX.search(self.actual_text)
746
-
747
- if sent_from_match:
731
+ if (sent_from_match := SENT_FROM_REGEX.search(self.actual_text)):
748
732
  sent_from = sent_from_match.group(0)
749
733
  return 'S' + sent_from[1:] if sent_from.startswith('sent') else sent_from
750
734
 
@@ -756,9 +740,7 @@ class Email(Communication):
756
740
  else:
757
741
  self.config = EmailCfg(id=self.file_id)
758
742
 
759
- extracted_from_description = extracted_from_doc_cfg.complete_description()
760
-
761
- if extracted_from_description:
743
+ if (extracted_from_description := extracted_from_doc_cfg.complete_description):
762
744
  extracted_description = f"{APPEARS_IN} {extracted_from_description}"
763
745
 
764
746
  if isinstance(extracted_from_doc_cfg, EmailCfg):
@@ -783,11 +765,11 @@ class Email(Communication):
783
765
  num_chars = args.truncate
784
766
  elif self.config and self.config.truncate_to is not None:
785
767
  num_chars = len(self.text) if self.config.truncate_to == NO_TRUNCATE else self.config.truncate_to
786
- elif self.is_interesting():
768
+ elif self.is_interesting:
787
769
  num_chars = len(self.text)
788
770
  elif self.author in TRUNCATE_EMAILS_FROM \
789
771
  or any([self.is_from_or_to(n) for n in TRUNCATE_EMAILS_FROM_OR_TO]) \
790
- or self.is_fwded_article() \
772
+ or self.is_fwded_article \
791
773
  or includes_truncate_term:
792
774
  num_chars = min(quote_cutoff or MAX_CHARS_TO_PRINT, TRUNCATED_CHARS)
793
775
  else:
@@ -807,18 +789,18 @@ class Email(Communication):
807
789
  else:
808
790
  num_chars = quote_cutoff
809
791
  else:
810
- num_chars = min(self.file_size(), MAX_CHARS_TO_PRINT)
792
+ num_chars = min(self.file_size, MAX_CHARS_TO_PRINT)
811
793
 
812
794
  # Always print whole email for 1st email for user
813
- if self._is_first_for_user and num_chars < self.file_size() and not self.is_duplicate():
795
+ if self._is_first_for_user and num_chars < self.file_size and not self.is_duplicate:
814
796
  logger.info(f"{self} Overriding cutoff {num_chars} for first email")
815
- num_chars = self.file_size()
797
+ num_chars = self.file_size
816
798
 
817
799
  log_args = {
818
800
  'num_chars': num_chars,
819
801
  '_is_first_for_user': self._is_first_for_user,
820
802
  'author_truncate': self.author in TRUNCATE_EMAILS_FROM,
821
- 'is_fwded_article': self.is_fwded_article(),
803
+ 'is_fwded_article': self.is_fwded_article,
822
804
  'is_quote_cutoff': quote_cutoff == num_chars,
823
805
  'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
824
806
  'quote_cutoff': quote_cutoff,
@@ -838,8 +820,8 @@ class Email(Communication):
838
820
  # Truncate long emails but leave a note explaining what happened w/link to source document
839
821
  if len(text) > num_chars:
840
822
  text = text[0:num_chars]
841
- doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style())
842
- trim_note = f"<...trimmed to {num_chars:,} characters of {self.length():,}, read the rest at {doc_link_markup}...>"
823
+ doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
824
+ trim_note = f"<...trimmed to {num_chars:,} characters of {self.length:,}, read the rest at {doc_link_markup}...>"
843
825
  trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
844
826
 
845
827
  # Rewrite broken headers where the values are on separate lines from the field names
@@ -855,7 +837,7 @@ class Email(Communication):
855
837
 
856
838
  lines += text.split('\n')[num_lines_to_skip:]
857
839
  text = self.header.rewrite_header() + '\n' + '\n'.join(lines)
858
- text = _add_line_breaks(text) # This was skipped when _prettify_text() w/a broken header so we do it now
840
+ text = _add_line_breaks(text)
859
841
  self.rewritten_header_ids.add(self.file_id)
860
842
 
861
843
  lines = [
@@ -867,7 +849,7 @@ class Email(Communication):
867
849
 
868
850
  email_txt_panel = Panel(
869
851
  highlighter(text).append('...\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
870
- border_style=self._border_style(),
852
+ border_style=self.border_style,
871
853
  expand=False,
872
854
  subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
873
855
  )
@@ -914,11 +896,11 @@ class Email(Communication):
914
896
 
915
897
  for email in emails:
916
898
  fields = [
917
- email.epstein_media_link(link_txt=email.timestamp_without_seconds(), style=link_style),
918
- email.author_txt(),
899
+ email.epstein_media_link(link_txt=email.timestamp_without_seconds, style=link_style),
900
+ email.author_txt,
919
901
  email.recipients_txt(max_full_names=1),
920
- f"{email.length()}",
921
- email.subject(),
902
+ f"{email.length}",
903
+ email.subject,
922
904
  ]
923
905
 
924
906
  if not show_length:
@@ -935,21 +917,14 @@ def _add_line_breaks(email_text: str) -> str:
935
917
 
936
918
  def _parse_timestamp(timestamp_str: str) -> None | datetime:
937
919
  try:
938
- timestamp_str = timestamp_str.replace('(GMT-05:00)', 'EST')
939
- timestamp_str = BAD_TIMEZONE_REGEX.sub(' ', timestamp_str).strip()
940
- timestamp = parse(timestamp_str, tzinfos=TIMEZONE_INFO)
920
+ if (american_date_match := AMERICAN_TIME_REGEX.search(timestamp_str)):
921
+ timestamp_str = american_date_match.group(1)
922
+ else:
923
+ timestamp_str = timestamp_str.replace('(GMT-05:00)', 'EST')
924
+ timestamp_str = BAD_TIMEZONE_REGEX.sub(' ', timestamp_str).strip()
925
+
926
+ timestamp = parse(timestamp_str, fuzzy=True, tzinfos=TIMEZONE_INFO)
941
927
  logger.debug(f'Parsed timestamp "%s" from string "%s"', timestamp, timestamp_str)
942
928
  return remove_timezone(timestamp)
943
929
  except Exception as e:
944
930
  logger.debug(f'Failed to parse "{timestamp_str}" to timestamp!')
945
-
946
-
947
- def _reverse_first_and_last_names(name: str) -> str:
948
- if '@' in name:
949
- return name.lower()
950
-
951
- if ', ' in name:
952
- names = name.split(', ')
953
- return f"{names[1]} {names[0]}"
954
- else:
955
- return name
@@ -2,7 +2,8 @@ import json
2
2
  import re
3
3
  from dataclasses import asdict, dataclass, field
4
4
 
5
- from epstein_files.util.constant.strings import AUTHOR, REDACTED, indented
5
+ from epstein_files.documents.emails.emailers import BAD_EMAILER_REGEX, TIME_REGEX
6
+ from epstein_files.util.constant.strings import AUTHOR, indented
6
7
  from epstein_files.util.constants import ALL_CONFIGS
7
8
  from epstein_files.util.doc_cfg import EmailCfg
8
9
  from epstein_files.util.logging import logger
@@ -13,17 +14,29 @@ ON_BEHALF_OF = 'on behalf of'
13
14
  TO_FIELDS = ['bcc', 'cc', 'to']
14
15
  EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
15
16
 
16
- FIELD_PATTERNS = ['Date', 'From', 'Sent', 'To', r"C[cC]", r"B[cC][cC]", 'Importance', 'Subject', 'Attachments', 'Classification', 'Flag', 'Reply-To']
17
+ FIELD_PATTERNS = [
18
+ 'Date',
19
+ 'From',
20
+ 'Sent',
21
+ 'To',
22
+ r"C[cC]",
23
+ r"B[cC][cC]",
24
+ 'Importance',
25
+ 'Subject',
26
+ 'Attachments',
27
+ 'Classification',
28
+ 'Flag',
29
+ 'Reply-To',
30
+ 'Inline-Images'
31
+ ]
32
+
33
+ DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}(From|Subject):') # IDed 140 emails out of 3777 DOJ files with just 'From:' match
17
34
  FIELDS_PATTERN = '|'.join(FIELD_PATTERNS)
18
35
  FIELDS_COLON_PATTERN = fr"^({FIELDS_PATTERN}):"
19
36
  HEADER_REGEX_STR = fr"(((?:(?:{FIELDS_PATTERN}|Bee):|on behalf of ?)(?! +(by |from my|via )).*\n){{3,}})"
20
37
  EMAIL_SIMPLE_HEADER_REGEX = re.compile(rf'^{HEADER_REGEX_STR}')
21
38
  EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX = re.compile(HEADER_REGEX_STR)
22
39
  EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTALL) # Match up to the next email header section
23
- TIME_REGEX = re.compile(r'^(\d{1,2}/\d{1,2}/\d{2,4}|Thursday|Monday|Tuesday|Wednesday|Friday|Saturday|Sunday).*')
24
-
25
- BAD_NAME_CHARS_REGEX = re.compile(r"[\"'\[\]*><•]")
26
- BAD_EMAILER_REGEX = re.compile(r'^(>|11111111)|agreed|ok|sexy|re:|fwd:|Multiple Senders|((sent|attachments|subject|importance).*|.*(january|201\d|hysterical|i have|image0|so that people|article 1.?|momminnemummin|These conspiracy theories|your state|undisclosed|www\.theguardian|talk in|it was a|what do|cc:|call (back|me)).*)$', re.IGNORECASE)
27
40
 
28
41
  CONFIGURED_ACTUAL_TEXTS = [
29
42
  cfg.actual_text for cfg in ALL_CONFIGS
@@ -54,6 +67,7 @@ class EmailHeader:
54
67
  classification: str | None = None
55
68
  flag: str | None = None
56
69
  importance: str | None = None
70
+ inline_images: str | None = None
57
71
  attachments: str | None = None
58
72
  to: list[str] | None = None
59
73
  reply_to: str | None = None
@@ -112,7 +126,7 @@ class EmailHeader:
112
126
  self.header_chars = '\n'.join(email_lines[0:self.num_header_rows])
113
127
  log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
114
128
 
115
- logger.warning(
129
+ logger.info(
116
130
  f"{log_msg}{self}\n\n[top lines]:\n\n%s\n\n[body_lines]:\n\n%s\n\n",
117
131
  indented('\n'.join(email_lines[0:(num_headers + 1) * 2]), prefix='> '),
118
132
  indented('\n'.join(email_lines[self.num_header_rows:self.num_header_rows + 5]), prefix='> '),
@@ -181,7 +195,3 @@ class EmailHeader:
181
195
  logger.debug(f"Header being parsed was this:\n\n{header}\n")
182
196
 
183
197
  return cls(field_names=field_names, header_chars=header, **kw_args)
184
-
185
- @staticmethod
186
- def cleanup_str(_str: str) -> str:
187
- return BAD_NAME_CHARS_REGEX.sub('', _str.replace(REDACTED, '')).strip().strip('_').strip()