epstein-files 1.1.5__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
epstein_files/__init__.py CHANGED
@@ -21,7 +21,8 @@ from epstein_files.util.env import args
21
21
  from epstein_files.util.file_helper import coerce_file_path, extract_file_id
22
22
  from epstein_files.util.logging import exit_with_error, logger
23
23
  from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
24
- print_other_files_section, print_text_messages_section, print_email_timeline, print_json_metadata, write_urls)
24
+ print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info_png,
25
+ print_json_metadata, write_urls)
25
26
  from epstein_files.util.rich import (build_highlighter, console, print_color_key, print_title_page_header,
26
27
  print_title_page_tables, print_subtitle_panel, write_html)
27
28
  from epstein_files.util.timer import Timer
@@ -43,6 +44,9 @@ def generate_html() -> None:
43
44
  elif args.json_files:
44
45
  print_json_files(epstein_files)
45
46
  exit()
47
+ elif args.emailers_info_png:
48
+ print_emailers_info_png(epstein_files)
49
+ exit()
46
50
 
47
51
  print_title_page_header()
48
52
 
@@ -63,7 +63,7 @@ class Document:
63
63
 
64
64
  Attributes:
65
65
  file_path (Path): Local path to file
66
- author (str | None): Who is responsible for the text in the file
66
+ author (Name): Who is responsible for the text in the file
67
67
  config (DocCfg): Information about this file
68
68
  file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
69
69
  filename (str): File's basename
@@ -74,7 +74,7 @@ class Document:
74
74
  """
75
75
  file_path: Path
76
76
  # Optional fields
77
- author: str | None = None
77
+ author: Name = None
78
78
  config: EmailCfg | DocCfg | TextCfg | None = None
79
79
  file_id: str = field(init=False)
80
80
  filename: str = field(init=False)
@@ -121,6 +121,10 @@ class Document:
121
121
  txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
122
122
  return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))
123
123
 
124
+ def duplicate_of_id(self) -> str | None:
125
+ if self.config and self.config.duplicate_of_id:
126
+ return self.config.duplicate_of_id
127
+
124
128
  def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
125
129
  return self.external_link(epsteinify_doc_url, style, link_txt)
126
130
 
@@ -178,7 +182,7 @@ class Document:
178
182
  return None
179
183
 
180
184
  def is_duplicate(self) -> bool:
181
- return bool(self.config and self.config.duplicate_of_id)
185
+ return bool(self.duplicate_of_id())
182
186
 
183
187
  def is_local_extract_file(self) -> bool:
184
188
  """True if extracted from other file (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
@@ -20,7 +20,7 @@ from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAI
20
20
  from epstein_files.util.constant.names import *
21
21
  from epstein_files.util.constant.strings import REDACTED
22
22
  from epstein_files.util.constants import *
23
- from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
23
+ from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes,
24
24
  flatten, listify, remove_timezone, uniquify)
25
25
  from epstein_files.util.doc_cfg import EmailCfg, Metadata
26
26
  from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
@@ -55,6 +55,7 @@ REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
55
55
 
56
56
  OCR_REPAIRS: dict[str | re.Pattern, str] = {
57
57
  re.compile(r'grnail\.com'): 'gmail.com',
58
+ 'Newsmax. corn': 'Newsmax.com',
58
59
  re.compile(r"^(From|To)(: )?[_1.]{5,}", re.MULTILINE): rf"\1: {REDACTED}", # Redacted email addresses
59
60
  # These 3 must come in this order!
60
61
  re.compile(r'([/vkT]|Ai|li|(I|7)v)rote:'): 'wrote:',
@@ -79,6 +80,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
79
80
  'twitter glhsummers': 'twitter @lhsummers',
80
81
  re.compile(r"twitter\.com[i/][lI]krauss[1lt]"): "twitter.com/lkrauss1",
81
82
  re.compile(r'from my BlackBerry[0°] wireless device'): 'from my BlackBerry® wireless device',
83
+ re.compile(r'^INW$', re.MULTILINE): REDACTED,
82
84
  # links
83
85
  'Imps ://': 'https://',
84
86
  re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
@@ -127,14 +129,6 @@ EMAIL_SIGNATURE_REGEXES = {
127
129
  UNKNOWN: re.compile(r"(This message is directed to and is for the use of the above-noted addressee only.*\nhereon\.)", re.DOTALL),
128
130
  }
129
131
 
130
- EMAIL_TABLE_COLS = [
131
- {'name': 'Sent At', 'justify': 'left', 'style': TIMESTAMP_DIM},
132
- {'name': 'From', 'justify': 'left', 'max_width': 20},
133
- {'name': 'To', 'justify': 'left', 'max_width': 22},
134
- {'name': 'Length', 'justify': 'right', 'style': 'wheat4'},
135
- {'name': 'Subject', 'justify': 'left', 'min_width': 35, 'style': 'honeydew2'},
136
- ]
137
-
138
132
  MAILING_LISTS = [
139
133
  CAROLYN_RANGEL,
140
134
  INTELLIGENCE_SQUARED,
@@ -142,10 +136,13 @@ MAILING_LISTS = [
142
136
  JP_MORGAN_USGIO,
143
137
  ]
144
138
 
145
- TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
139
+ BBC_LISTS = JUNK_EMAILERS + MAILING_LISTS
140
+
141
+ TRUNCATE_ALL_EMAILS_FROM = BBC_LISTS + [
146
142
  'Alan S Halperin',
147
143
  'Mitchell Bard',
148
144
  'Skip Rimer',
145
+ 'Steven Victor MD',
149
146
  ]
150
147
 
151
148
  TRUNCATION_LENGTHS = {
@@ -253,58 +250,15 @@ TRUNCATE_TERMS = [
253
250
  'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
254
251
  ]
255
252
 
256
- # Some Paul Krassner emails have a ton of CCed parties we don't care about
257
- KRASSNER_RECIPIENTS = uniquify(flatten([ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']]))
258
-
259
- # No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
260
- USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIPIENTS + [
261
- 'Alan Dlugash', # CCed with Richard Kahn
262
- 'Alan Rogers', # Random CC
263
- 'Andrew Friendly', # Presumably some relation of Kelly Friendly
264
- 'BS Stern', # A random fwd of email we have
265
- 'Cheryl Kleen', # Single email from Anne Boyles, displayed under Anne Boyles
266
- 'Connie Zaguirre', # Random CC
267
- 'Dan Fleuette', # CC from sean bannon
268
- 'Danny Goldberg', # Random Paul Krassner emails
269
- GERALD_LEFCOURT, # Single CC
270
- GORDON_GETTY, # Random CC
271
- JEFF_FULLER, # Random Jean Luc Brunel CC
272
- 'Jojo Fontanilla', # Random CC
273
- 'Joseph Vinciguerra', # Random CC
274
- 'Larry Cohen', # Random Bill Gates CC
275
- 'Lyn Fontanilla', # Random CC
276
- 'Mark Albert', # Random CC
277
- 'Matthew Schafer', # Random CC
278
- MICHAEL_BUCHHOLTZ, # Terry Kafka CC
279
- 'Nancy Dahl', # covered by Lawrence Krauss (her husband)
280
- 'Michael Simmons', # Random CC
281
- 'Nancy Portland', # Lawrence Krauss CC
282
- 'Oliver Goodenough', # Robert Trivers CC
283
- 'Peter Aldhous', # Lawrence Krauss CC
284
- 'Players2', # Hoffenberg CC
285
- 'Sam Harris', # Lawrence Krauss CC
286
- SAMUEL_LEFF, # Random CC
287
- 'Sean T Lehane', # Random CC
288
- 'Stephen Rubin', # Random CC
289
- 'Tim Kane', # Random CC
290
- 'Travis Pangburn', # Random CC
291
- 'Vahe Stepanian', # Random CC
292
- # Ross Gow BCC
293
- 'david.brown@thetimes.co.uk',
294
- 'io-anne.pugh@bbc.co.uk',
295
- 'martin.robinson@mailonline.co.uk',
296
- 'nick.alwav@bbc.co.uk'
297
- 'nick.sommerlad@mirror.co.uk',
298
- 'p.peachev@independent.co.uk',
299
- ]
300
-
301
253
  METADATA_FIELDS = [
302
254
  'is_junk_mail',
255
+ 'is_mailing_list',
303
256
  'recipients',
304
257
  'sent_from_device',
305
258
  'subject',
306
259
  ]
307
260
 
261
+ # Note the line repair happens *after* 'Importance: High' is removed
308
262
  LINE_REPAIR_MERGES = {
309
263
  '017523': 4,
310
264
  '019407': [2, 4],
@@ -312,10 +266,14 @@ LINE_REPAIR_MERGES = {
312
266
  '022673': 9,
313
267
  '022684': 9,
314
268
  '022695': 4,
269
+ '029773': [2, 5],
315
270
  '023067': 3,
316
271
  '025790': 2,
272
+ '029841': 3,
317
273
  '026345': 3,
318
274
  '026609': 4,
275
+ '033299': 3,
276
+ '026829': 3,
319
277
  '026924': [2, 4],
320
278
  '028931': [3, 6],
321
279
  '029154': [2, 5],
@@ -326,6 +284,7 @@ LINE_REPAIR_MERGES = {
326
284
  '029501': 2,
327
285
  '029835': [2, 4],
328
286
  '029889': 2,
287
+ '029545': [3, 5],
329
288
  '029976': 3,
330
289
  '030299': [7, 10],
331
290
  '030381': [2, 4],
@@ -359,14 +318,14 @@ class Email(Communication):
359
318
  actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
360
319
  config (EmailCfg | None) - manual config for this email (if it exists)
361
320
  header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
362
- recipients (list[str | None]) - who this email was sent to
321
+ recipients (list[Name]) - who this email was sent to
363
322
  sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
364
323
  signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
365
324
  """
366
325
  actual_text: str = field(init=False)
367
326
  config: EmailCfg | None = None
368
327
  header: EmailHeader = field(init=False)
369
- recipients: list[str | None] = field(default_factory=list)
328
+ recipients: list[Name] = field(default_factory=list)
370
329
  sent_from_device: str | None = None
371
330
  signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
372
331
 
@@ -394,7 +353,7 @@ class Email(Communication):
394
353
  self.recipients.extend(self._extract_emailer_names(recipient))
395
354
 
396
355
  # Assume mailing list emails are to Epstein
397
- if self.author in MAILING_LISTS and (self.is_note_to_self() or not self.recipients):
356
+ if self.author in BBC_LISTS and (self.is_note_to_self() or not self.recipients):
398
357
  self.recipients = [JEFFREY_EPSTEIN]
399
358
 
400
359
  # Remove self CCs but preserve self emails
@@ -423,7 +382,10 @@ class Email(Communication):
423
382
  return bool(self.config and self.config.is_fwded_article)
424
383
 
425
384
  def is_junk_mail(self) -> bool:
426
- return self.author in JUNK_EMAILERS or self.author in MAILING_LISTS
385
+ return self.author in JUNK_EMAILERS
386
+
387
+ def is_mailing_list(self) -> bool:
388
+ return self.author in MAILING_LISTS or self.is_junk_mail()
427
389
 
428
390
  def is_note_to_self(self) -> bool:
429
391
  return self.recipients == [self.author]
@@ -431,6 +393,7 @@ class Email(Communication):
431
393
  def metadata(self) -> Metadata:
432
394
  local_metadata = asdict(self)
433
395
  local_metadata['is_junk_mail'] = self.is_junk_mail()
396
+ local_metadata['is_mailing_list'] = self.is_junk_mail()
434
397
  local_metadata['subject'] = self.subject() or None
435
398
  metadata = super().metadata()
436
399
  metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
@@ -550,6 +513,8 @@ class Email(Communication):
550
513
  self.log_top_lines(msg='No email header match found!', level=log_level)
551
514
  self.header = EmailHeader(field_names=[])
552
515
 
516
+ logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
517
+
553
518
  def _extract_timestamp(self) -> datetime:
554
519
  if self.config and self.config.timestamp:
555
520
  return self.config.timestamp
@@ -674,6 +639,9 @@ class Email(Communication):
674
639
  elif self.file_id in ['025329']:
675
640
  for _i in range(9):
676
641
  self._merge_lines(2)
642
+ elif self.file_id in ['025812']:
643
+ for _i in range(2):
644
+ self._merge_lines(3)
677
645
  elif self.file_id == '014860':
678
646
  self._merge_lines(3)
679
647
  self._merge_lines(4)
@@ -839,19 +807,29 @@ class Email(Communication):
839
807
  self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
840
808
 
841
809
  @staticmethod
842
- def build_emails_table(emails: list['Email'], author: str | None = '', title: str = '', show_length: bool = False) -> Table:
810
+ def build_emails_table(emails: list['Email'], name: Name = '', title: str = '', show_length: bool = False) -> Table:
843
811
  """Turn a set of Emails into a Table."""
844
- if title and author:
812
+ if title and name:
845
813
  raise ValueError(f"Can't provide both 'author' and 'title' args")
846
- elif author == '' and title == '':
814
+ elif name == '' and title == '':
847
815
  raise ValueError(f"Must provide either 'author' or 'title' arg")
848
816
 
849
- author_style = get_style_for_name(author, allow_bold=False)
850
- link_style = author_style if author else ARCHIVE_LINK_COLOR
817
+ author_style = get_style_for_name(name, allow_bold=False)
818
+ link_style = author_style if name else ARCHIVE_LINK_COLOR
819
+ min_width = len(name or UNKNOWN)
820
+ max_width = max(20, min_width)
821
+
822
+ columns = [
823
+ {'name': 'Sent At', 'justify': 'left', 'style': TIMESTAMP_DIM},
824
+ {'name': 'From', 'justify': 'left', 'min_width': min_width, 'max_width': max_width},
825
+ {'name': 'To', 'justify': 'left', 'min_width': min_width, 'max_width': max_width + 2},
826
+ {'name': 'Length', 'justify': 'right', 'style': 'wheat4'},
827
+ {'name': 'Subject', 'justify': 'left', 'min_width': 35, 'style': 'honeydew2'},
828
+ ]
851
829
 
852
830
  table = build_table(
853
831
  title or None,
854
- cols=[col for col in EMAIL_TABLE_COLS if show_length or col['name'] not in ['Length']],
832
+ cols=[col for col in columns if show_length or col['name'] not in ['Length']],
855
833
  border_style=DEFAULT_TABLE_KWARGS['border_style'] if title else author_style,
856
834
  header_style="bold",
857
835
  highlight=True,
@@ -8,13 +8,13 @@ from epstein_files.util.doc_cfg import EmailCfg
8
8
  from epstein_files.util.logging import logger
9
9
  from epstein_files.util.rich import UNKNOWN
10
10
 
11
- FIELD_NAMES = ['From', 'Date', 'Sent', 'Subject']
11
+ FIELD_NAMES = ['Date', 'From', 'Sent', 'Subject']
12
12
  NON_HEADER_FIELDS = ['field_names', 'num_header_rows', 'was_initially_empty']
13
13
  ON_BEHALF_OF = 'on behalf of'
14
14
  TO_FIELDS = ['bcc', 'cc', 'to']
15
15
  EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
16
16
 
17
- HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
17
+ HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments|Classification|Flag):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
18
18
  EMAIL_SIMPLE_HEADER_REGEX = re.compile(rf'^{HEADER_REGEX_STR}')
19
19
  EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX = re.compile(HEADER_REGEX_STR)
20
20
  EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTALL) # Match up to the next email header section
@@ -41,6 +41,8 @@ class EmailHeader:
41
41
  subject: str | None = None
42
42
  bcc: list[str] | None = None
43
43
  cc: list[str] | None = None
44
+ classification: str | None = None
45
+ flag: str | None = None
44
46
  importance: str | None = None
45
47
  attachments: str | None = None
46
48
  to: list[str] | None = None
@@ -4,9 +4,9 @@ from datetime import datetime
4
4
 
5
5
  from rich.text import Text
6
6
 
7
- from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN
7
+ from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN, Name, extract_last_name
8
8
  from epstein_files.util.constant.strings import TIMESTAMP_DIM
9
- from epstein_files.util.data import extract_last_name, iso_timestamp
9
+ from epstein_files.util.data import iso_timestamp
10
10
  from epstein_files.util.highlighted_group import get_style_for_name
11
11
  from epstein_files.util.logging import logger
12
12
  from epstein_files.util.rich import TEXT_LINK, highlighter
@@ -25,7 +25,7 @@ DISPLAY_LAST_NAME_ONLY = [
25
25
  @dataclass(kw_only=True)
26
26
  class TextMessage:
27
27
  """Class representing a single iMessage text message."""
28
- author: str | None
28
+ author: Name
29
29
  author_str: str = ''
30
30
  is_id_confirmed: bool = False
31
31
  text: str
@@ -10,11 +10,11 @@ from rich.text import Text
10
10
 
11
11
  from epstein_files.documents.communication import Communication
12
12
  from epstein_files.documents.imessage.text_message import TextMessage
13
- from epstein_files.util.constant.names import JEFFREY_EPSTEIN, UNKNOWN
13
+ from epstein_files.util.constant.names import JEFFREY_EPSTEIN, Name
14
14
  from epstein_files.util.constant.strings import AUTHOR, TIMESTAMP_STYLE
15
15
  from epstein_files.util.data import days_between, days_between_str, iso_timestamp, sort_dict
16
16
  from epstein_files.util.doc_cfg import Metadata, TextCfg
17
- from epstein_files.util.highlighted_group import get_style_for_name, styled_name
17
+ from epstein_files.util.highlighted_group import styled_name
18
18
  from epstein_files.util.logging import logger
19
19
  from epstein_files.util.rich import LAST_TIMESTAMP_STYLE, build_table, highlighter
20
20
 
@@ -35,7 +35,7 @@ class MessengerLog(Communication):
35
35
  super().__post_init__()
36
36
  self.messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
37
37
 
38
- def first_message_at(self, name: str | None) -> datetime:
38
+ def first_message_at(self, name: Name) -> datetime:
39
39
  return self.messages_by(name)[0].parse_timestamp()
40
40
 
41
41
  def info_txt(self) -> Text | None:
@@ -54,10 +54,10 @@ class MessengerLog(Communication):
54
54
 
55
55
  return txt.append(')')
56
56
 
57
- def last_message_at(self, name: str | None) -> datetime:
57
+ def last_message_at(self, name: Name) -> datetime:
58
58
  return self.messages_by(name)[-1].parse_timestamp()
59
59
 
60
- def messages_by(self, name: str | None) -> list[TextMessage]:
60
+ def messages_by(self, name: Name) -> list[TextMessage]:
61
61
  """Return all messages by 'name'."""
62
62
  return [m for m in self.messages if m.author == name]
63
63
 
@@ -129,9 +129,9 @@ class MessengerLog(Communication):
129
129
  yield message
130
130
 
131
131
  @classmethod
132
- def count_authors(cls, imessage_logs: list['MessengerLog']) -> dict[str | None, int]:
132
+ def count_authors(cls, imessage_logs: list['MessengerLog']) -> dict[Name, int]:
133
133
  """Count up how many texts were sent by each author."""
134
- sender_counts: dict[str | None, int] = defaultdict(int)
134
+ sender_counts: dict[Name, int] = defaultdict(int)
135
135
 
136
136
  for message_log in imessage_logs:
137
137
  for message in message_log.messages: