epstein-files 1.1.5__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. {epstein_files-1.1.5 → epstein_files-1.2.1}/PKG-INFO +6 -2
  2. {epstein_files-1.1.5 → epstein_files-1.2.1}/README.md +4 -1
  3. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/__init__.py +12 -21
  4. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/communication.py +0 -3
  5. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/document.py +68 -21
  6. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/email.py +54 -70
  7. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/emails/email_header.py +14 -4
  8. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/imessage/text_message.py +5 -4
  9. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/messenger_log.py +7 -7
  10. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/other_file.py +16 -34
  11. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/epstein_files.py +133 -141
  12. epstein_files-1.2.1/epstein_files/person.py +324 -0
  13. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/constant/names.py +46 -15
  14. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/constant/output_files.py +1 -0
  15. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/constant/strings.py +3 -3
  16. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/constant/urls.py +15 -2
  17. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/constants.py +75 -21
  18. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/data.py +1 -20
  19. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/doc_cfg.py +27 -17
  20. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/env.py +5 -3
  21. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/highlighted_group.py +248 -203
  22. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/logging.py +1 -1
  23. epstein_files-1.2.1/epstein_files/util/output.py +306 -0
  24. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/rich.py +20 -35
  25. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/timer.py +14 -0
  26. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/word_count.py +1 -1
  27. {epstein_files-1.1.5 → epstein_files-1.2.1}/pyproject.toml +2 -1
  28. epstein_files-1.1.5/epstein_files/util/output.py +0 -350
  29. {epstein_files-1.1.5 → epstein_files-1.2.1}/LICENSE +0 -0
  30. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/documents/json_file.py +0 -0
  31. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/constant/common_words.py +0 -0
  32. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/constant/html.py +0 -0
  33. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/file_helper.py +0 -0
  34. {epstein_files-1.1.5 → epstein_files-1.2.1}/epstein_files/util/search_result.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: epstein-files
3
- Version: 1.1.5
3
+ Version: 1.2.1
4
4
  Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
5
5
  Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
6
6
  License: GPL-3.0-or-later
@@ -17,6 +17,7 @@ Classifier: Programming Language :: Python :: 3.11
17
17
  Classifier: Programming Language :: Python :: 3.10
18
18
  Classifier: Programming Language :: Python :: 3.12
19
19
  Classifier: Programming Language :: Python :: 3.13
20
+ Requires-Dist: cairosvg (>=2.8.2,<3.0.0)
20
21
  Requires-Dist: datefinder (>=0.7.3,<0.8.0)
21
22
  Requires-Dist: inflection (>=0.5.1,<0.6.0)
22
23
  Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
@@ -31,7 +32,7 @@ Project-URL: TextMessages, https://michelcrypt4d4mus.github.io/epstein_text_mess
31
32
  Project-URL: WordCounts, https://michelcrypt4d4mus.github.io/epstein_text_messages/communication_word_count_epstein_files_nov_2025.html
32
33
  Description-Content-Type: text/markdown
33
34
 
34
- # I Made Epstein's Text Messages Great Again
35
+ # Color Highlighted Epstein Emails and Text Messages
35
36
 
36
37
  ![joi](https://github.com/michelcrypt4d4mus/epstein_text_messages/raw/master/docs/joi_ito_gavin_is_clever_epstein_funds_bitcoin_dev_team.png)
37
38
 
@@ -119,3 +120,6 @@ for file in epstein_files.other_files:
119
120
  do_stuff(file)
120
121
  ```
121
122
 
123
+ # Everyone Who Sent or Received an Email in the November Document Dump
124
+ ![emails](https://github.com/michelcrypt4d4mus/epstein_text_messages/raw/master/docs/emailers_info_table.png)
125
+
@@ -1,4 +1,4 @@
1
- # I Made Epstein's Text Messages Great Again
1
+ # Color Highlighted Epstein Emails and Text Messages
2
2
 
3
3
  ![joi](https://github.com/michelcrypt4d4mus/epstein_text_messages/raw/master/docs/joi_ito_gavin_is_clever_epstein_funds_bitcoin_dev_team.png)
4
4
 
@@ -85,3 +85,6 @@ for json_file in epstein_files.json_files:
85
85
  for file in epstein_files.other_files:
86
86
  do_stuff(file)
87
87
  ```
88
+
89
+ # Everyone Who Sent or Received an Email in the November Document Dump
90
+ ![emails](https://github.com/michelcrypt4d4mus/epstein_text_messages/raw/master/docs/emailers_info_table.png)
@@ -21,7 +21,8 @@ from epstein_files.util.env import args
21
21
  from epstein_files.util.file_helper import coerce_file_path, extract_file_id
22
22
  from epstein_files.util.logging import exit_with_error, logger
23
23
  from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
24
- print_other_files_section, print_text_messages_section, print_email_timeline, print_json_metadata, write_urls)
24
+ print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
25
+ print_json_metadata, write_urls)
25
26
  from epstein_files.util.rich import (build_highlighter, console, print_color_key, print_title_page_header,
26
27
  print_title_page_tables, print_subtitle_panel, write_html)
27
28
  from epstein_files.util.timer import Timer
@@ -37,7 +38,10 @@ def generate_html() -> None:
37
38
  timer = Timer()
38
39
  epstein_files = EpsteinFiles.get_files(timer)
39
40
 
40
- if args.json_metadata:
41
+ if args.emailers_info:
42
+ print_emailers_info(epstein_files)
43
+ exit()
44
+ elif args.json_metadata:
41
45
  print_json_metadata(epstein_files)
42
46
  exit()
43
47
  elif args.json_files:
@@ -55,25 +59,19 @@ def generate_html() -> None:
55
59
  exit()
56
60
 
57
61
  if args.output_texts:
58
- imessage_logs = [log for log in epstein_files.imessage_logs if not args.names or log.author in args.names]
59
- print_text_messages_section(imessage_logs)
60
- timer.print_at_checkpoint(f'Printed {len(imessage_logs)} text message log files')
62
+ printed_logs = print_text_messages_section(epstein_files)
63
+ timer.log_section_complete('MessengerLog', epstein_files.imessage_logs, printed_logs)
61
64
 
62
65
  if args.output_emails:
63
- emails_that_were_printed = print_emails_section(epstein_files)
64
- timer.print_at_checkpoint(f"Printed {len(emails_that_were_printed):,} emails")
66
+ printed_emails = print_emails_section(epstein_files)
67
+ timer.log_section_complete('Email', epstein_files.emails, printed_emails)
65
68
  elif args.email_timeline:
66
69
  print_email_timeline(epstein_files)
67
70
  timer.print_at_checkpoint(f"Printed chronological emails table")
68
71
 
69
72
  if args.output_other:
70
- if args.uninteresting:
71
- files = [f for f in epstein_files.other_files if not f.is_interesting()]
72
- else:
73
- files = [f for f in epstein_files.other_files if args.all_other_files or f.is_interesting()]
74
-
75
- print_other_files_section(files, epstein_files)
76
- timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")
73
+ printed_files = print_other_files_section(epstein_files)
74
+ timer.log_section_complete('OtherFile', epstein_files.other_files, printed_files)
77
75
 
78
76
  write_html(args.build)
79
77
  logger.warning(f"Total time: {timer.seconds_since_start_str()}")
@@ -90,7 +88,6 @@ def epstein_diff():
90
88
 
91
89
  def epstein_search():
92
90
  """Search the cleaned up text of the files."""
93
- _assert_positional_args()
94
91
  epstein_files = EpsteinFiles.get_files()
95
92
 
96
93
  for search_term in args.positional_args:
@@ -113,7 +110,6 @@ def epstein_search():
113
110
 
114
111
  def epstein_show():
115
112
  """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
116
- _assert_positional_args()
117
113
  raw_docs: list[Document] = []
118
114
  console.line()
119
115
 
@@ -138,8 +134,3 @@ def epstein_show():
138
134
 
139
135
  def epstein_word_count() -> None:
140
136
  write_word_counts_html()
141
-
142
-
143
- def _assert_positional_args():
144
- if not args.positional_args:
145
- exit_with_error(f"No positional args provided!\n")
@@ -34,9 +34,6 @@ class Communication(Document):
34
34
  """Overrides super() method to apply self.author_style."""
35
35
  return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
36
36
 
37
- def is_attribution_uncertain(self) -> bool:
38
- return bool(self.config and self.config.is_attribution_uncertain)
39
-
40
37
  def summary(self) -> Text:
41
38
  return self._summary().append(CLOSE_PROPERTIES_CHAR)
42
39
 
@@ -11,17 +11,19 @@ from rich.console import Console, ConsoleOptions, Group, RenderResult
11
11
  from rich.padding import Padding
12
12
  from rich.panel import Panel
13
13
  from rich.text import Text
14
+ from rich.table import Table
14
15
 
15
16
  from epstein_files.util.constant.names import *
16
17
  from epstein_files.util.constant.strings import *
17
18
  from epstein_files.util.constant.urls import *
18
19
  from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
19
- from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time_from_timestamp_str, without_falsey
20
+ from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time, without_falsey
20
21
  from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
21
22
  from epstein_files.util.env import DOCS_DIR, args
22
- from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, is_local_extract_file
23
+ from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, file_size_to_str, is_local_extract_file
23
24
  from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
24
- from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize
25
+ from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table, console,
26
+ highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
25
27
  from epstein_files.util.search_result import MatchedLine
26
28
 
27
29
  ALT_LINK_STYLE = 'white dim'
@@ -55,6 +57,14 @@ OCR_REPAIRS = {
55
57
  'Nil Priell': 'Nili Priell',
56
58
  }
57
59
 
60
+ SUMMARY_TABLE_COLS: list[str | dict] = [
61
+ 'Count',
62
+ {'name': 'Has Author', 'style': 'honeydew2'},
63
+ {'name': 'No Author', 'style': 'wheat4'},
64
+ {'name': 'Uncertain Author', 'style': 'royal_blue1 dim'},
65
+ {'name': 'Size', 'justify': 'right', 'style': 'dim'},
66
+ ]
67
+
58
68
 
59
69
  @dataclass
60
70
  class Document:
@@ -63,7 +73,7 @@ class Document:
63
73
 
64
74
  Attributes:
65
75
  file_path (Path): Local path to file
66
- author (str | None): Who is responsible for the text in the file
76
+ author (Name): Who is responsible for the text in the file
67
77
  config (DocCfg): Information about this fil
68
78
  file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
69
79
  filename (str): File's basename
@@ -74,7 +84,7 @@ class Document:
74
84
  """
75
85
  file_path: Path
76
86
  # Optional fields
77
- author: str | None = None
87
+ author: Name = None
78
88
  config: EmailCfg | DocCfg | TextCfg | None = None
79
89
  file_id: str = field(init=False)
80
90
  filename: str = field(init=False)
@@ -121,6 +131,10 @@ class Document:
121
131
  txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
122
132
  return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))
123
133
 
134
+ def duplicate_of_id(self) -> str | None:
135
+ if self.config and self.config.duplicate_of_id:
136
+ return self.config.duplicate_of_id
137
+
124
138
  def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
125
139
  return self.external_link(epsteinify_doc_url, style, link_txt)
126
140
 
@@ -177,8 +191,11 @@ class Document:
177
191
  """Secondary info about this file (description recipients, etc). Overload in subclasses."""
178
192
  return None
179
193
 
194
+ def is_attribution_uncertain(self) -> bool:
195
+ return bool(self.config and self.config.is_attribution_uncertain)
196
+
180
197
  def is_duplicate(self) -> bool:
181
- return bool(self.config and self.config.duplicate_of_id)
198
+ return bool(self.duplicate_of_id())
182
199
 
183
200
  def is_local_extract_file(self) -> bool:
184
201
  """True if extracted from other file (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
@@ -236,17 +253,6 @@ class Document:
236
253
 
237
254
  return text
238
255
 
239
- def sort_key(self) -> tuple[datetime, str, int]:
240
- """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
241
- if self.is_duplicate():
242
- sort_id = self.config.duplicate_of_id
243
- dupe_idx = 1
244
- else:
245
- sort_id = self.file_id
246
- dupe_idx = 0
247
-
248
- return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
249
-
250
256
  def source_file_id(self) -> str:
251
257
  """Strip off the _1, _2, etc. suffixes for extracted documents."""
252
258
  return self.file_id[0:6]
@@ -257,7 +263,7 @@ class Document:
257
263
  txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)
258
264
 
259
265
  if self.timestamp:
260
- timestamp_str = remove_zero_time_from_timestamp_str(self.timestamp).replace('T', ' ')
266
+ timestamp_str = remove_zero_time(self.timestamp).replace('T', ' ')
261
267
  txt.append(' (', style=SYMBOL_STYLE)
262
268
  txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
263
269
 
@@ -278,6 +284,17 @@ class Document:
278
284
 
279
285
  return Panel(Group(*sentences), border_style=self._class_style(), expand=False)
280
286
 
287
+ def timestamp_sort_key(self) -> tuple[datetime, str, int]:
288
+ """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
289
+ if self.is_duplicate():
290
+ sort_id = self.config.duplicate_of_id
291
+ dupe_idx = 1
292
+ else:
293
+ sort_id = self.file_id
294
+ dupe_idx = 0
295
+
296
+ return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
297
+
281
298
  def top_lines(self, n: int = 10) -> str:
282
299
  """First n lines."""
283
300
  return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
@@ -357,6 +374,32 @@ class Document:
357
374
  def __str__(self) -> str:
358
375
  return self.summary().plain
359
376
 
377
+ @classmethod
378
+ def file_info_table(cls, title: str, first_col_name: str) -> Table:
379
+ """Empty table with appropriate cols for summarizing groups of files."""
380
+ table = build_table(title)
381
+ cols = [{'name': first_col_name, 'min_width': 14}] + SUMMARY_TABLE_COLS
382
+ add_cols_to_table(table, cols, 'right')
383
+ return table
384
+
385
+ @classmethod
386
+ def files_info(cls, files: Sequence['Document'], is_author_na: bool = False) -> dict[str, str | Text]:
387
+ """Summary info about a group of files."""
388
+ file_count = len(files)
389
+ author_count = cls.known_author_count(files)
390
+
391
+ return {
392
+ 'count': str(file_count),
393
+ 'author_count': NA_TXT if is_author_na else str(author_count),
394
+ 'no_author_count': NA_TXT if is_author_na else str(file_count - author_count),
395
+ 'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain()])),
396
+ 'bytes': file_size_to_str(sum([f.file_size() for f in files])),
397
+ }
398
+
399
+ @classmethod
400
+ def files_info_row(cls, files: Sequence['Document'], author_na: bool = False) -> Sequence[str | Text]:
401
+ return [v for v in cls.files_info(files, author_na).values()]
402
+
360
403
  @staticmethod
361
404
  def diff_files(files: list[str]) -> None:
362
405
  """Diff the contents of two Documents after all cleanup, BOM removal, etc."""
@@ -394,14 +437,18 @@ class Document:
394
437
 
395
438
  @staticmethod
396
439
  def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
397
- return sorted(docs, key=lambda doc: doc.sort_key())
440
+ return sorted(docs, key=lambda doc: doc.timestamp_sort_key())
398
441
 
399
- @classmethod
400
- def uniquify(cls, documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
442
+ @staticmethod
443
+ def uniquify(documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
401
444
  """Uniquify by file_id."""
402
445
  id_map = {doc.file_id: doc for doc in documents}
403
446
  return [doc for doc in id_map.values()]
404
447
 
448
+ @staticmethod
449
+ def without_dupes(docs: Sequence['DocumentType']) -> list['DocumentType']:
450
+ return [doc for doc in docs if not doc.is_duplicate()]
451
+
405
452
 
406
453
  DocumentType = TypeVar('DocumentType', bound=Document)
407
454
 
@@ -20,7 +20,7 @@ from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAI
20
20
  from epstein_files.util.constant.names import *
21
21
  from epstein_files.util.constant.strings import REDACTED
22
22
  from epstein_files.util.constants import *
23
- from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
23
+ from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes,
24
24
  flatten, listify, remove_timezone, uniquify)
25
25
  from epstein_files.util.doc_cfg import EmailCfg, Metadata
26
26
  from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
@@ -32,7 +32,7 @@ BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE
32
32
  BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
33
33
  DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
34
34
  LINK_LINE_REGEX = re.compile(f"^(> )?htt")
35
- QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
35
+ QUOTED_REPLY_LINE_REGEX = re.compile(r'(\nFrom:(.*)|wrote:)\n', re.IGNORECASE)
36
36
  REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
37
37
 
38
38
  BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
@@ -55,6 +55,7 @@ REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
55
55
 
56
56
  OCR_REPAIRS: dict[str | re.Pattern, str] = {
57
57
  re.compile(r'grnail\.com'): 'gmail.com',
58
+ 'Newsmax. corn': 'Newsmax.com',
58
59
  re.compile(r"^(From|To)(: )?[_1.]{5,}", re.MULTILINE): rf"\1: {REDACTED}", # Redacted email addresses
59
60
  # These 3 must come in this order!
60
61
  re.compile(r'([/vkT]|Ai|li|(I|7)v)rote:'): 'wrote:',
@@ -79,6 +80,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
79
80
  'twitter glhsummers': 'twitter @lhsummers',
80
81
  re.compile(r"twitter\.com[i/][lI]krauss[1lt]"): "twitter.com/lkrauss1",
81
82
  re.compile(r'from my BlackBerry[0°] wireless device'): 'from my BlackBerry® wireless device',
83
+ re.compile(r'^INW$', re.MULTILINE): REDACTED,
82
84
  # links
83
85
  'Imps ://': 'https://',
84
86
  re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
@@ -112,7 +114,7 @@ EMAIL_SIGNATURE_REGEXES = {
112
114
  DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
113
115
  DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
114
116
  JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
115
- JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*", re.IGNORECASE),
117
+ JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*(\nTel:.*)?(\nEmail:.*)?", re.IGNORECASE),
116
118
  KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
117
119
  LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
118
120
  LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
@@ -127,14 +129,6 @@ EMAIL_SIGNATURE_REGEXES = {
127
129
  UNKNOWN: re.compile(r"(This message is directed to and is for the use of the above-noted addressee only.*\nhereon\.)", re.DOTALL),
128
130
  }
129
131
 
130
- EMAIL_TABLE_COLS = [
131
- {'name': 'Sent At', 'justify': 'left', 'style': TIMESTAMP_DIM},
132
- {'name': 'From', 'justify': 'left', 'max_width': 20},
133
- {'name': 'To', 'justify': 'left', 'max_width': 22},
134
- {'name': 'Length', 'justify': 'right', 'style': 'wheat4'},
135
- {'name': 'Subject', 'justify': 'left', 'min_width': 35, 'style': 'honeydew2'},
136
- ]
137
-
138
132
  MAILING_LISTS = [
139
133
  CAROLYN_RANGEL,
140
134
  INTELLIGENCE_SQUARED,
@@ -142,10 +136,13 @@ MAILING_LISTS = [
142
136
  JP_MORGAN_USGIO,
143
137
  ]
144
138
 
145
- TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
139
+ BBC_LISTS = JUNK_EMAILERS + MAILING_LISTS
140
+
141
+ TRUNCATE_ALL_EMAILS_FROM = BBC_LISTS + [
146
142
  'Alan S Halperin',
147
143
  'Mitchell Bard',
148
144
  'Skip Rimer',
145
+ 'Steven Victor MD',
149
146
  ]
150
147
 
151
148
  TRUNCATION_LENGTHS = {
@@ -253,58 +250,15 @@ TRUNCATE_TERMS = [
253
250
  'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
254
251
  ]
255
252
 
256
- # Some Paul Krassner emails have a ton of CCed parties we don't care about
257
- KRASSNER_RECIPIENTS = uniquify(flatten([ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']]))
258
-
259
- # No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
260
- USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIPIENTS + [
261
- 'Alan Dlugash', # CCed with Richard Kahn
262
- 'Alan Rogers', # Random CC
263
- 'Andrew Friendly', # Presumably some relation of Kelly Friendly
264
- 'BS Stern', # A random fwd of email we have
265
- 'Cheryl Kleen', # Single email from Anne Boyles, displayed under Anne Boyles
266
- 'Connie Zaguirre', # Random CC
267
- 'Dan Fleuette', # CC from sean bannon
268
- 'Danny Goldberg', # Random Paul Krassner emails
269
- GERALD_LEFCOURT, # Single CC
270
- GORDON_GETTY, # Random CC
271
- JEFF_FULLER, # Random Jean Luc Brunel CC
272
- 'Jojo Fontanilla', # Random CC
273
- 'Joseph Vinciguerra', # Random CC
274
- 'Larry Cohen', # Random Bill Gates CC
275
- 'Lyn Fontanilla', # Random CC
276
- 'Mark Albert', # Random CC
277
- 'Matthew Schafer', # Random CC
278
- MICHAEL_BUCHHOLTZ, # Terry Kafka CC
279
- 'Nancy Dahl', # covered by Lawrence Krauss (her husband)
280
- 'Michael Simmons', # Random CC
281
- 'Nancy Portland', # Lawrence Krauss CC
282
- 'Oliver Goodenough', # Robert Trivers CC
283
- 'Peter Aldhous', # Lawrence Krauss CC
284
- 'Players2', # Hoffenberg CC
285
- 'Sam Harris', # Lawrence Krauss CC
286
- SAMUEL_LEFF, # Random CC
287
- 'Sean T Lehane', # Random CC
288
- 'Stephen Rubin', # Random CC
289
- 'Tim Kane', # Random CC
290
- 'Travis Pangburn', # Random CC
291
- 'Vahe Stepanian', # Random CC
292
- # Ross Gow BCC
293
- 'david.brown@thetimes.co.uk',
294
- 'io-anne.pugh@bbc.co.uk',
295
- 'martin.robinson@mailonline.co.uk',
296
- 'nick.alwav@bbc.co.uk'
297
- 'nick.sommerlad@mirror.co.uk',
298
- 'p.peachev@independent.co.uk',
299
- ]
300
-
301
253
  METADATA_FIELDS = [
302
254
  'is_junk_mail',
255
+ 'is_mailing_list',
303
256
  'recipients',
304
257
  'sent_from_device',
305
258
  'subject',
306
259
  ]
307
260
 
261
+ # Note the line repair happens *after* 'Importance: High' is removed
308
262
  LINE_REPAIR_MERGES = {
309
263
  '017523': 4,
310
264
  '019407': [2, 4],
@@ -312,10 +266,14 @@ LINE_REPAIR_MERGES = {
312
266
  '022673': 9,
313
267
  '022684': 9,
314
268
  '022695': 4,
269
+ '029773': [2, 5],
315
270
  '023067': 3,
316
271
  '025790': 2,
272
+ '029841': 3,
317
273
  '026345': 3,
318
274
  '026609': 4,
275
+ '033299': 3,
276
+ '026829': 3,
319
277
  '026924': [2, 4],
320
278
  '028931': [3, 6],
321
279
  '029154': [2, 5],
@@ -326,6 +284,7 @@ LINE_REPAIR_MERGES = {
326
284
  '029501': 2,
327
285
  '029835': [2, 4],
328
286
  '029889': 2,
287
+ '029545': [3, 5],
329
288
  '029976': 3,
330
289
  '030299': [7, 10],
331
290
  '030381': [2, 4],
@@ -359,14 +318,14 @@ class Email(Communication):
359
318
  actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
360
319
  config (EmailCfg | None) - manual config for this email (if it exists)
361
320
  header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
362
- recipients (list[str | None]) - who this email was sent to
321
+ recipients (list[Name]) - who this email was sent to
363
322
  sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
364
323
  signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
365
324
  """
366
325
  actual_text: str = field(init=False)
367
326
  config: EmailCfg | None = None
368
327
  header: EmailHeader = field(init=False)
369
- recipients: list[str | None] = field(default_factory=list)
328
+ recipients: list[Name] = field(default_factory=list)
370
329
  sent_from_device: str | None = None
371
330
  signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
372
331
 
@@ -394,7 +353,7 @@ class Email(Communication):
394
353
  self.recipients.extend(self._extract_emailer_names(recipient))
395
354
 
396
355
  # Assume mailing list emails are to Epstein
397
- if self.author in MAILING_LISTS and (self.is_note_to_self() or not self.recipients):
356
+ if self.author in BBC_LISTS and (self.is_note_to_self() or not self.recipients):
398
357
  self.recipients = [JEFFREY_EPSTEIN]
399
358
 
400
359
  # Remove self CCs but preserve self emails
@@ -423,7 +382,10 @@ class Email(Communication):
423
382
  return bool(self.config and self.config.is_fwded_article)
424
383
 
425
384
  def is_junk_mail(self) -> bool:
426
- return self.author in JUNK_EMAILERS or self.author in MAILING_LISTS
385
+ return self.author in JUNK_EMAILERS
386
+
387
+ def is_mailing_list(self) -> bool:
388
+ return self.author in MAILING_LISTS or self.is_junk_mail()
427
389
 
428
390
  def is_note_to_self(self) -> bool:
429
391
  return self.recipients == [self.author]
@@ -431,6 +393,7 @@ class Email(Communication):
431
393
  def metadata(self) -> Metadata:
432
394
  local_metadata = asdict(self)
433
395
  local_metadata['is_junk_mail'] = self.is_junk_mail()
396
+ local_metadata['is_mailing_list'] = self.is_junk_mail()
434
397
  local_metadata['subject'] = self.subject() or None
435
398
  metadata = super().metadata()
436
399
  metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
@@ -473,9 +436,9 @@ class Email(Communication):
473
436
  elif self.header.num_header_rows == 0:
474
437
  return self.text
475
438
 
476
- reply_text_match = REPLY_TEXT_REGEX.search(text)
477
439
  self.log_top_lines(20, "Raw text:", logging.DEBUG)
478
440
  self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
441
+ reply_text_match = REPLY_TEXT_REGEX.search(text)
479
442
 
480
443
  if reply_text_match:
481
444
  actual_num_chars = len(reply_text_match.group(1))
@@ -550,6 +513,8 @@ class Email(Communication):
550
513
  self.log_top_lines(msg='No email header match found!', level=log_level)
551
514
  self.header = EmailHeader(field_names=[])
552
515
 
516
+ logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
517
+
553
518
  def _extract_timestamp(self) -> datetime:
554
519
  if self.config and self.config.timestamp:
555
520
  return self.config.timestamp
@@ -585,9 +550,15 @@ class Email(Communication):
585
550
 
586
551
  def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
587
552
  """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
588
- for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
553
+ if text is None:
554
+ header_offset = len(self.header.header_chars)
555
+ text = self.text[header_offset:]
556
+ else:
557
+ header_offset = 0
558
+
559
+ for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text)):
589
560
  if i >= n:
590
- return match.end() - 1
561
+ return match.end() + header_offset - 1
591
562
 
592
563
  def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
593
564
  """Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
@@ -674,6 +645,9 @@ class Email(Communication):
674
645
  elif self.file_id in ['025329']:
675
646
  for _i in range(9):
676
647
  self._merge_lines(2)
648
+ elif self.file_id in ['025812']:
649
+ for _i in range(2):
650
+ self._merge_lines(3)
677
651
  elif self.file_id == '014860':
678
652
  self._merge_lines(3)
679
653
  self._merge_lines(4)
@@ -839,19 +813,29 @@ class Email(Communication):
839
813
  self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
840
814
 
841
815
  @staticmethod
842
- def build_emails_table(emails: list['Email'], author: str | None = '', title: str = '', show_length: bool = False) -> Table:
816
+ def build_emails_table(emails: list['Email'], name: Name = '', title: str = '', show_length: bool = False) -> Table:
843
817
  """Turn a set of Emails into a Table."""
844
- if title and author:
818
+ if title and name:
845
819
  raise ValueError(f"Can't provide both 'author' and 'title' args")
846
- elif author == '' and title == '':
820
+ elif name == '' and title == '':
847
821
  raise ValueError(f"Must provide either 'author' or 'title' arg")
848
822
 
849
- author_style = get_style_for_name(author, allow_bold=False)
850
- link_style = author_style if author else ARCHIVE_LINK_COLOR
823
+ author_style = get_style_for_name(name, allow_bold=False)
824
+ link_style = author_style if name else ARCHIVE_LINK_COLOR
825
+ min_width = len(name or UNKNOWN)
826
+ max_width = max(20, min_width)
827
+
828
+ columns = [
829
+ {'name': 'Sent At', 'justify': 'left', 'style': TIMESTAMP_DIM},
830
+ {'name': 'From', 'justify': 'left', 'min_width': min_width, 'max_width': max_width},
831
+ {'name': 'To', 'justify': 'left', 'min_width': min_width, 'max_width': max_width + 2},
832
+ {'name': 'Length', 'justify': 'right', 'style': 'wheat4'},
833
+ {'name': 'Subject', 'justify': 'left', 'min_width': 35, 'style': 'honeydew2'},
834
+ ]
851
835
 
852
836
  table = build_table(
853
837
  title or None,
854
- cols=[col for col in EMAIL_TABLE_COLS if show_length or col['name'] not in ['Length']],
838
+ cols=[col for col in columns if show_length or col['name'] not in ['Length']],
855
839
  border_style=DEFAULT_TABLE_KWARGS['border_style'] if title else author_style,
856
840
  header_style="bold",
857
841
  highlight=True,
@@ -8,13 +8,12 @@ from epstein_files.util.doc_cfg import EmailCfg
8
8
  from epstein_files.util.logging import logger
9
9
  from epstein_files.util.rich import UNKNOWN
10
10
 
11
- FIELD_NAMES = ['From', 'Date', 'Sent', 'Subject']
12
- NON_HEADER_FIELDS = ['field_names', 'num_header_rows', 'was_initially_empty']
11
+ FIELD_NAMES = ['Date', 'From', 'Sent', 'Subject']
13
12
  ON_BEHALF_OF = 'on behalf of'
14
13
  TO_FIELDS = ['bcc', 'cc', 'to']
15
14
  EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
16
15
 
17
- HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
16
+ HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments|Classification|Flag):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
18
17
  EMAIL_SIMPLE_HEADER_REGEX = re.compile(rf'^{HEADER_REGEX_STR}')
19
18
  EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX = re.compile(HEADER_REGEX_STR)
20
19
  EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTALL) # Match up to the next email header section
@@ -28,10 +27,18 @@ CONFIGURED_ACTUAL_TEXTS = [
28
27
  if isinstance(cfg, EmailCfg) and cfg.actual_text is not None
29
28
  ]
30
29
 
30
+ NON_HEADER_FIELDS = [
31
+ 'field_names',
32
+ 'header_chars',
33
+ 'num_header_rows',
34
+ 'was_initially_empty',
35
+ ]
36
+
31
37
 
32
38
  @dataclass(kw_only=True)
33
39
  class EmailHeader:
34
40
  field_names: list[str] # Order is same as the order header fields appear in the email file text
41
+ header_chars: str = ''
35
42
  num_header_rows: int = field(init=False)
36
43
  was_initially_empty: bool = False
37
44
 
@@ -41,6 +48,8 @@ class EmailHeader:
41
48
  subject: str | None = None
42
49
  bcc: list[str] | None = None
43
50
  cc: list[str] | None = None
51
+ classification: str | None = None
52
+ flag: str | None = None
44
53
  importance: str | None = None
45
54
  attachments: str | None = None
46
55
  to: list[str] | None = None
@@ -99,6 +108,7 @@ class EmailHeader:
99
108
  setattr(self, field_name, value)
100
109
 
101
110
  self.num_header_rows = len(self.field_names) + num_headers
111
+ self.header_chars = '\n'.join(email_lines[0:self.num_header_rows])
102
112
  log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
103
113
  logger.debug(f"{log_msg}{self}\n\nTop lines:\n\n%s", '\n'.join(email_lines[0:(num_headers + 1) * 2]))
104
114
 
@@ -161,7 +171,7 @@ class EmailHeader:
161
171
  if should_log_header:
162
172
  logger.debug(f"Header being parsed was this:\n\n{header}\n")
163
173
 
164
- return EmailHeader(field_names=field_names, **kw_args)
174
+ return cls(field_names=field_names, header_chars=header, **kw_args)
165
175
 
166
176
  @staticmethod
167
177
  def cleanup_str(_str: str) -> str: