epstein-files 1.1.0__tar.gz → 1.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {epstein_files-1.1.0 → epstein_files-1.1.3}/PKG-INFO +4 -1
  2. {epstein_files-1.1.0 → epstein_files-1.1.3}/README.md +3 -0
  3. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/__init__.py +16 -27
  4. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/documents/communication.py +10 -14
  5. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/documents/document.py +1 -1
  6. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/documents/email.py +152 -75
  7. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/documents/imessage/text_message.py +42 -25
  8. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/documents/messenger_log.py +31 -12
  9. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/documents/other_file.py +13 -12
  10. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/epstein_files.py +20 -81
  11. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/constant/common_words.py +3 -3
  12. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/constant/html.py +4 -5
  13. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/constant/names.py +18 -6
  14. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/constant/strings.py +6 -2
  15. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/constant/urls.py +1 -1
  16. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/constants.py +19 -23
  17. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/env.py +55 -36
  18. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/file_helper.py +1 -2
  19. epstein_files-1.1.3/epstein_files/util/highlighted_group.py +1525 -0
  20. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/logging.py +8 -1
  21. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/output.py +183 -89
  22. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/rich.py +35 -69
  23. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/timer.py +1 -1
  24. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/word_count.py +3 -4
  25. {epstein_files-1.1.0 → epstein_files-1.1.3}/pyproject.toml +1 -1
  26. epstein_files-1.1.0/epstein_files/util/highlighted_group.py +0 -695
  27. {epstein_files-1.1.0 → epstein_files-1.1.3}/LICENSE +0 -0
  28. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/documents/emails/email_header.py +0 -0
  29. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/documents/json_file.py +0 -0
  30. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/constant/output_files.py +0 -0
  31. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/data.py +0 -0
  32. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/doc_cfg.py +0 -0
  33. {epstein_files-1.1.0 → epstein_files-1.1.3}/epstein_files/util/search_result.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: epstein-files
3
- Version: 1.1.0
3
+ Version: 1.1.3
4
4
  Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
5
5
  Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
6
6
  License: GPL-3.0-or-later
@@ -81,6 +81,9 @@ epstein_diff 030999 020442
81
81
  ```
82
82
 
83
83
  The first time you run anything it will take a few minutes to fix all the janky OCR text, attribute the redacted emails, etc. After that things will be quick.
84
+
85
+ The commands used to build the various sites that are deployed on Github Pages can be found in [`deploy.sh`](./deploy.sh).
86
+
84
87
  Run `epstein_generate --help` for command line option assistance.
85
88
 
86
89
  **Optional:** There are a handful of emails that I extracted from the legal filings they were contained in. If you want to include these files in your local analysis you'll need to copy those files from the repo into your local document directory. Something like:
@@ -48,6 +48,9 @@ epstein_diff 030999 020442
48
48
  ```
49
49
 
50
50
  The first time you run anything it will take a few minutes to fix all the janky OCR text, attribute the redacted emails, etc. After that things will be quick.
51
+
52
+ The commands used to build the various sites that are deployed on Github Pages can be found in [`deploy.sh`](./deploy.sh).
53
+
51
54
  Run `epstein_generate --help` for command line option assistance.
52
55
 
53
56
  **Optional:** There are a handful of emails that I extracted from the legal filings they were contained in. If you want to include these files in your local analysis you'll need to copy those files from the repo into your local document directory. Something like:
@@ -16,13 +16,14 @@ from rich.text import Text
16
16
  from epstein_files.epstein_files import EpsteinFiles, document_cls
17
17
  from epstein_files.documents.document import INFO_PADDING, Document
18
18
  from epstein_files.documents.email import Email
19
- from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, CHRONOLOGICAL_EMAILS_PATH, TEXT_MSGS_HTML_PATH, make_clean
19
+ from epstein_files.util.constant.output_files import make_clean
20
20
  from epstein_files.util.env import args
21
21
  from epstein_files.util.file_helper import coerce_file_path, extract_file_id
22
- from epstein_files.util.logging import logger
22
+ from epstein_files.util.logging import exit_with_error, logger
23
23
  from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
24
- print_other_files_section, print_text_messages_section, write_complete_emails_timeline, write_json_metadata, write_urls)
25
- from epstein_files.util.rich import build_highlighter, console, print_title_page_header, print_title_page_tables, print_panel, write_html
24
+ print_other_files_section, print_text_messages_section, print_email_timeline, print_json_metadata, write_urls)
25
+ from epstein_files.util.rich import (build_highlighter, console, print_color_key, print_title_page_header,
26
+ print_title_page_tables, print_subtitle_panel, write_html)
26
27
  from epstein_files.util.timer import Timer
27
28
  from epstein_files.util.word_count import write_word_counts_html
28
29
 
@@ -37,7 +38,7 @@ def generate_html() -> None:
37
38
  epstein_files = EpsteinFiles.get_files(timer)
38
39
 
39
40
  if args.json_metadata:
40
- write_json_metadata(epstein_files)
41
+ print_json_metadata(epstein_files)
41
42
  exit()
42
43
  elif args.json_files:
43
44
  print_json_files(epstein_files)
@@ -45,21 +46,24 @@ def generate_html() -> None:
45
46
 
46
47
  print_title_page_header(epstein_files)
47
48
 
48
- if not args.email_timeline:
49
+ if args.email_timeline:
50
+ print_color_key()
51
+ else:
49
52
  print_title_page_tables(epstein_files)
50
53
 
51
54
  if args.colors_only:
52
55
  exit()
53
56
 
54
57
  if args.output_texts:
55
- print_text_messages_section(epstein_files)
56
- timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)} text message logs')
58
+ imessage_logs = [log for log in epstein_files.imessage_logs if not args.names or log.author in args.names]
59
+ print_text_messages_section(imessage_logs)
60
+ timer.print_at_checkpoint(f'Printed {len(imessage_logs)} text message log files')
57
61
 
58
62
  if args.output_emails:
59
63
  emails_that_were_printed = print_emails_section(epstein_files)
60
64
  timer.print_at_checkpoint(f"Printed {len(emails_that_were_printed):,} emails")
61
65
  elif args.email_timeline:
62
- write_complete_emails_timeline(epstein_files)
66
+ print_email_timeline(epstein_files)
63
67
  timer.print_at_checkpoint(f"Printed chronological emails table")
64
68
 
65
69
  if args.output_other:
@@ -71,15 +75,7 @@ def generate_html() -> None:
71
75
  print_other_files_section(files, epstein_files)
72
76
  timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")
73
77
 
74
- # Save output
75
- if args.all_emails:
76
- output_path = ALL_EMAILS_PATH
77
- elif args.email_timeline:
78
- output_path = CHRONOLOGICAL_EMAILS_PATH
79
- else:
80
- output_path = TEXT_MSGS_HTML_PATH
81
-
82
- write_html(output_path)
78
+ write_html(args.build)
83
79
  logger.warning(f"Total time: {timer.seconds_since_start_str()}")
84
80
 
85
81
  # JSON stats (mostly used for building pytest checks)
@@ -101,15 +97,12 @@ def epstein_search():
101
97
  temp_highlighter = build_highlighter(search_term)
102
98
  search_results = epstein_files.docs_matching(search_term, args.names)
103
99
  console.line(2)
104
- print_panel(f"Found {len(search_results)} documents matching '{search_term}'", padding=(0, 0, 0, 3))
100
+ print_subtitle_panel(f"Found {len(search_results)} documents matching '{search_term}'", padding=(0, 0, 0, 3))
105
101
 
106
102
  for search_result in search_results:
107
103
  console.line()
108
104
 
109
105
  if args.whole_file:
110
- if isinstance(search_result.document, Email):
111
- search_result.document._truncation_allowed = False
112
-
113
106
  console.print(search_result.document)
114
107
  else:
115
108
  console.print(search_result.document.summary_panel())
@@ -128,9 +121,6 @@ def epstein_show():
128
121
  console.line()
129
122
 
130
123
  for doc in docs:
131
- if isinstance(doc, Email):
132
- doc._truncation_allowed = False
133
-
134
124
  console.print('\n', doc, '\n')
135
125
 
136
126
  if args.raw:
@@ -148,5 +138,4 @@ def epstein_word_count() -> None:
148
138
 
149
139
  def _assert_positional_args():
150
140
  if not args.positional_args:
151
- console.print(f"\n ERROR: No positional args!\n", style='red1')
152
- exit(1)
141
+ exit_with_error(f"No positional args provided!\n")
@@ -18,25 +18,24 @@ TIMESTAMP_SECONDS_REGEX = re.compile(r":\d{2}$")
18
18
  @dataclass
19
19
  class Communication(Document):
20
20
  """Superclass for Email and MessengerLog."""
21
- author_style: str = 'white'
22
- author_txt: Text = field(init=False)
23
21
  config: CommunicationCfg | None = None
24
22
  timestamp: datetime = FALLBACK_TIMESTAMP # TODO this default sucks (though it never happens)
25
23
 
26
- def __post_init__(self):
27
- super().__post_init__()
28
- self.author_style = get_style_for_name(self.author_or_unknown())
29
- self.author_txt = Text(self.author_or_unknown(), style=self.author_style)
30
-
31
24
  def author_or_unknown(self) -> str:
32
25
  return self.author or UNKNOWN
33
26
 
34
- def is_attribution_uncertain(self) -> bool:
35
- return bool(self.config and self.config.is_attribution_uncertain)
27
+ def author_style(self) -> str:
28
+ return get_style_for_name(self.author_or_unknown())
29
+
30
+ def author_txt(self) -> Text:
31
+ return Text(self.author_or_unknown(), style=self.author_style())
36
32
 
37
33
  def external_links_txt(self, _style: str = '', include_alt_links: bool = True) -> Text:
38
34
  """Overrides super() method to apply self.author_style."""
39
- return super().external_links_txt(self.author_style, include_alt_links=include_alt_links)
35
+ return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
36
+
37
+ def is_attribution_uncertain(self) -> bool:
38
+ return bool(self.config and self.config.is_attribution_uncertain)
40
39
 
41
40
  def summary(self) -> Text:
42
41
  return self._summary().append(CLOSE_PROPERTIES_CHAR)
@@ -47,7 +46,4 @@ class Communication(Document):
47
46
  def _summary(self) -> Text:
48
47
  """One line summary mostly for logging."""
49
48
  txt = super().summary().append(', ')
50
- return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style)))
51
-
52
-
53
- CommunicationType = TypeVar('CommunicationType', bound=Document)
49
+ return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style())))
@@ -251,7 +251,7 @@ class Document:
251
251
  def summary(self) -> Text:
252
252
  """Summary of this file for logging. Brackets are left open for subclasses to add stuff."""
253
253
  txt = Text('').append(self._class_name(), style=self._class_style())
254
- txt.append(f" {self.url_slug}", style=FILENAME_STYLE)
254
+ txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)
255
255
 
256
256
  if self.timestamp:
257
257
  timestamp_str = remove_zero_time_from_timestamp_str(self.timestamp).replace('T', ' ')
@@ -1,3 +1,4 @@
1
+ import json
1
2
  import logging
2
3
  import re
3
4
  from copy import deepcopy
@@ -20,7 +21,7 @@ from epstein_files.util.constant.names import *
20
21
  from epstein_files.util.constant.strings import REDACTED
21
22
  from epstein_files.util.constants import *
22
23
  from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
23
- flatten, remove_timezone, uniquify)
24
+ flatten, listify, remove_timezone, uniquify)
24
25
  from epstein_files.util.doc_cfg import EmailCfg, Metadata
25
26
  from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
26
27
  from epstein_files.util.highlighted_group import get_style_for_name
@@ -42,7 +43,7 @@ LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
42
43
  SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
43
44
  REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
44
45
  URL_SIGNIFIERS = ['gclid', 'htm', 'ref=', 'utm']
45
- APPEARS_IN = 'Appears in'
46
+ APPEARS_IN = 'appears in'
46
47
  MAX_CHARS_TO_PRINT = 4000
47
48
  MAX_NUM_HEADER_LINES = 14
48
49
  MAX_QUOTED_REPLIES = 2
@@ -125,15 +126,6 @@ EMAIL_SIGNATURE_REGEXES = {
125
126
  UNKNOWN: re.compile(r"(This message is directed to and is for the use of the above-noted addressee only.*\nhereon\.)", re.DOTALL),
126
127
  }
127
128
 
128
- # Invalid for links to EpsteinWeb
129
- JUNK_EMAILERS = [
130
- 'asmallworld@travel.asmallworld.net',
131
- "digest-noreply@quora.com",
132
- 'editorialstaff@flipboard.com',
133
- 'How To Academy',
134
- 'Jokeland',
135
- ]
136
-
137
129
  MAILING_LISTS = [
138
130
  CAROLYN_RANGEL,
139
131
  INTELLIGENCE_SQUARED,
@@ -152,6 +144,8 @@ TRUNCATION_LENGTHS = {
152
144
  '030245': 7_500, # Epstein rationalizes his behavior in an open letter to the world
153
145
  '030781': 1_700, # Bannon email about crypto coin issues
154
146
  '032906': 750, # David Blaine email
147
+ '026036': 6000, # Gino Yu blockchain mention
148
+ '023208': 350_000, # Long discussion about leon black's finances
155
149
  }
156
150
 
157
151
  # These are long forwarded articles so we force a trim to 1,333 chars if these strings exist
@@ -276,6 +270,7 @@ USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIP
276
270
  'Nancy Portland', # Lawrence Krauss CC
277
271
  'Oliver Goodenough', # Robert Trivers CC
278
272
  'Peter Aldhous', # Lawrence Krauss CC
273
+ 'Players2', # Hoffenberg CC
279
274
  'Sam Harris', # Lawrence Krauss CC
280
275
  SAMUEL_LEFF, # Random CC
281
276
  'Sean T Lehane', # Random CC
@@ -283,6 +278,13 @@ USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIP
283
278
  'Tim Kane', # Random CC
284
279
  'Travis Pangburn', # Random CC
285
280
  'Vahe Stepanian', # Random CC
281
+ # Ross Gow BCC
282
+ 'david.brown@thetimes.co.uk',
283
+ 'io-anne.pugh@bbc.co.uk',
284
+ 'martin.robinson@mailonline.co.uk',
285
+ 'nick.alwav@bbc.co.uk'
286
+ 'nick.sommerlad@mirror.co.uk',
287
+ 'p.peachev@independent.co.uk',
286
288
  ]
287
289
 
288
290
  # Emails sent by epstein to himself that are just notes
@@ -300,6 +302,50 @@ METADATA_FIELDS = [
300
302
  'subject',
301
303
  ]
302
304
 
305
+ LINE_REPAIR_MERGES = {
306
+ '017523': 4,
307
+ '019407': [2, 4],
308
+ '021729': 2,
309
+ '022673': 9,
310
+ '022684': 9,
311
+ '022695': 4,
312
+ '023067': 3,
313
+ '025790': 2,
314
+ '026609': 4,
315
+ '026924': [2, 4],
316
+ '028931': [3, 6],
317
+ '029154': [2, 5],
318
+ '029163': [2, 5],
319
+ '029282': 2,
320
+ '029402': 5,
321
+ '029498': 2,
322
+ '029501': 2,
323
+ '029835': [2, 4],
324
+ '029889': 2,
325
+ '029976': 3,
326
+ '030299': [7, 10],
327
+ '030381': [2, 4],
328
+ '030384': [2, 4],
329
+ '030626': 2,
330
+ '030999': [2, 4],
331
+ '031384': 2,
332
+ '031428': 2,
333
+ '031442': 0,
334
+ '031980': [2, 4],
335
+ '032063': [3, 5],
336
+ '032272': 3,
337
+ '032405': 4,
338
+ '033097': 2,
339
+ '033144': [2, 4],
340
+ '033228': [3, 5],
341
+ '033357': [2, 4],
342
+ '033486': [7, 9],
343
+ '033512': 2,
344
+ '033575': [2, 4],
345
+ '033576': 3,
346
+ '033583': 2,
347
+ }
348
+
303
349
 
304
350
  @dataclass
305
351
  class Email(Communication):
@@ -318,7 +364,6 @@ class Email(Communication):
318
364
  recipients: list[str | None] = field(default_factory=list)
319
365
  sent_from_device: str | None = None
320
366
  signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
321
- _truncation_allowed: bool = True # Hacky way to get __rich_console__() not to truncate in epstein_show script
322
367
 
323
368
  # For logging how many headers we prettified while printing, kind of janky
324
369
  rewritten_header_ids: ClassVar[set[str]] = set([])
@@ -342,7 +387,7 @@ class Email(Communication):
342
387
  self.recipients = self.config.recipients
343
388
  else:
344
389
  for recipient in self.header.recipients():
345
- self.recipients.extend(self._emailer_names(recipient))
390
+ self.recipients.extend(self._extract_emailer_names(recipient))
346
391
 
347
392
  if self.author in MAILING_LISTS and (len(self.recipients) == 0 or self.recipients == [self.author]):
348
393
  self.recipients = [JEFFREY_EPSTEIN] # Assume mailing list emails are to Epstein
@@ -365,7 +410,7 @@ class Email(Communication):
365
410
 
366
411
  def info_txt(self) -> Text:
367
412
  email_type = 'fwded article' if self.is_fwded_article() else 'email'
368
- txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt).append(' to ')
413
+ txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt()).append(' to ')
369
414
  return txt.append(self.recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))
370
415
 
371
416
  def is_fwded_article(self) -> bool:
@@ -446,15 +491,23 @@ class Email(Communication):
446
491
  """Color emails from epstein to others with the color for the first recipient."""
447
492
  if self.author == JEFFREY_EPSTEIN:
448
493
  if len(self.recipients) == 0 or self.recipients == [None]:
449
- style = self.author_style
494
+ style = self.author_style()
450
495
  else:
451
496
  style = get_style_for_name(self.recipients[0])
452
497
  else:
453
- style = self.author_style
498
+ style = self.author_style()
454
499
 
455
500
  return style.replace('bold', '').strip()
456
501
 
457
- def _emailer_names(self, emailer_str: str) -> list[str]:
502
+ def _extract_author(self) -> None:
503
+ self._extract_header()
504
+ super()._extract_author()
505
+
506
+ if not self.author and self.header.author:
507
+ authors = self._extract_emailer_names(self.header.author)
508
+ self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
509
+
510
+ def _extract_emailer_names(self, emailer_str: str) -> list[str]:
458
511
  """Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
459
512
  emailer_str = EmailHeader.cleanup_str(emailer_str)
460
513
 
@@ -474,14 +527,6 @@ class Email(Communication):
474
527
  names_found = names_found or [emailer_str]
475
528
  return [_reverse_first_and_last_names(name) for name in names_found]
476
529
 
477
- def _extract_author(self) -> None:
478
- self._extract_header()
479
- super()._extract_author()
480
-
481
- if not self.author and self.header.author:
482
- authors = self._emailer_names(self.header.author)
483
- self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
484
-
485
530
  def _extract_header(self) -> None:
486
531
  """Extract an EmailHeader object from the OCR text."""
487
532
  header_match = EMAIL_SIMPLE_HEADER_REGEX.search(self.text)
@@ -579,44 +624,47 @@ class Email(Communication):
579
624
  self._set_computed_fields(lines=[line for line in self.lines if not BAD_LINE_REGEX.match(line)])
580
625
  old_text = self.text
581
626
 
582
- if self.file_id in ['031442']:
583
- self._merge_lines(0) # Merge 1st and 2nd rows
584
- elif self.file_id in '021729 025790 029282 029501 029889 030626 031384 031428 033097 033512 033583 029498 033583'.split():
585
- self._merge_lines(2) # Merge 3rd and 4th rows
627
+ if self.file_id in LINE_REPAIR_MERGES:
628
+ merge = LINE_REPAIR_MERGES[self.file_id]
629
+ merge_args = merge if isinstance(merge, list) else [merge]
630
+ self._merge_lines(*merge_args)
586
631
 
587
- if self.file_id in ['030626']: # Merge 6th and 7th (now 5th and 6th) rows
588
- self._merge_lines(4)
589
- elif self.file_id == '029889':
590
- self._merge_lines(2, 5)
591
- elif self.file_id in ['029498', '031428']:
592
- self._merge_lines(2, 4)
593
- elif self.file_id in ['029976', '023067', '033576']:
594
- self._merge_lines(3) # Merge 4th and 5th rows
595
- elif self.file_id in '026609 029402 032405 022695'.split():
596
- self._merge_lines(4) # Merge 5th and 6th rows
597
- elif self.file_id in ['019407', '031980', '030384', '033144', '030999', '033575', '029835', '030381', '033357', '026924']:
598
- self._merge_lines(2, 4)
599
- elif self.file_id in ['029154', '029163']:
632
+ # These already had 2nd line merged
633
+ if self.file_id in ['030626']: # Merge 6th and 7th (now 5th and 6th) rows
634
+ self._merge_lines(4)
635
+ elif self.file_id == '029889':
600
636
  self._merge_lines(2, 5)
601
- elif self.file_id in ['033228', '032063']:
602
- self._merge_lines(3, 5)
603
- elif self.file_id == '028931':
604
- self._merge_lines(3, 6)
605
- elif self.file_id == '013415':
637
+ elif self.file_id in ['029498', '031428']:
638
+ self._merge_lines(2, 4)
639
+
640
+ # Multiline
641
+ if self.file_id == '013415':
642
+ for _i in range(2):
643
+ self._merge_lines(4)
644
+ elif self.file_id == '013405':
645
+ for _i in range(2):
646
+ self._merge_lines(4)
647
+ elif self.file_id == '029458':
648
+ for _i in range(3):
649
+ self._merge_lines(4)
650
+ elif self.file_id in ['025233']:
606
651
  for _i in range(2):
607
652
  self._merge_lines(4)
653
+
654
+ self.lines[4] = f"Attachments: {self.lines[4]}"
655
+ self._set_computed_fields(lines=self.lines)
656
+ elif self.file_id in ['023001']:
657
+ for _i in range(3):
658
+ self._merge_lines(5)
659
+ elif self.file_id in ['019105']:
660
+ for _i in range(4):
661
+ self._merge_lines(5)
608
662
  elif self.file_id in ['033568']:
609
663
  for _i in range(5):
610
664
  self._merge_lines(5)
611
665
  elif self.file_id in ['025329']:
612
666
  for _i in range(9):
613
667
  self._merge_lines(2)
614
- elif self.file_id == '033486':
615
- self._merge_lines(7, 9)
616
- elif self.file_id == '030299':
617
- self._merge_lines(7, 10)
618
- elif self.file_id in ['022673', '022684']:
619
- self._merge_lines(9)
620
668
  elif self.file_id == '014860':
621
669
  self._merge_lines(3)
622
670
  self._merge_lines(4)
@@ -629,7 +677,15 @@ class Email(Communication):
629
677
 
630
678
  self._merge_lines(4)
631
679
  self._merge_lines(2, 4)
632
- elif self.file_id == '025041':
680
+ elif self.file_id in ['033252']:
681
+ for _i in range(2):
682
+ self._merge_lines(9)
683
+ elif self.file_id in ['032637']:
684
+ for _i in range(3):
685
+ self._merge_lines(9)
686
+
687
+ # Bad line removal
688
+ if self.file_id == '025041':
633
689
  self._remove_line(4)
634
690
  self._remove_line(4)
635
691
  elif self.file_id == '029692':
@@ -679,7 +735,7 @@ class Email(Communication):
679
735
  """Copy info from original config for file this document was extracted from."""
680
736
  if self.file_id in ALL_FILE_CONFIGS:
681
737
  self.config = cast(EmailCfg, deepcopy(ALL_FILE_CONFIGS[self.file_id]))
682
- self.warn(f"Merging existing config for {self.file_id} with config for file this document was extracted from")
738
+ self.warn(f"Merging existing cfg for '{self.file_id}' with cfg for extracted document...")
683
739
  else:
684
740
  self.config = EmailCfg(id=self.file_id)
685
741
 
@@ -692,33 +748,55 @@ class Email(Communication):
692
748
  extracted_description += ' email'
693
749
 
694
750
  if self.config.description:
695
- self.warn(f"Overwriting description '{self.config.description}' with extract description '{self.config.description}'")
751
+ self.warn(f"Overwriting description '{self.config.description}' with extract's '{self.config.description}'")
696
752
 
697
753
  self.config.description = extracted_description
698
754
 
699
755
  self.config.is_interesting = self.config.is_interesting or extracted_from_doc_cfg.is_interesting
700
756
  self.log(f"Constructed synthetic config: {self.config}")
701
757
 
702
- def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
703
- logger.debug(f"Printing '{self.filename}'...")
704
- yield self.file_info_panel()
705
- should_rewrite_header = self.header.was_initially_empty and self.header.num_header_rows > 0
758
+ def _truncate_to_length(self) -> int:
759
+ """When printing truncate this email to this length."""
706
760
  quote_cutoff = self._idx_of_nth_quoted_reply(text=self.text) # Trim if there's many quoted replies
707
- num_chars = MAX_CHARS_TO_PRINT
708
- trim_footer_txt = None
709
- text = self.text
761
+ includes_truncate_term = next((term for term in TRUNCATE_TERMS if term in self.text), None)
710
762
 
711
- if self.file_id in TRUNCATION_LENGTHS:
763
+ if args.whole_file:
764
+ num_chars = len(self.text)
765
+ elif self.file_id in TRUNCATION_LENGTHS:
712
766
  num_chars = TRUNCATION_LENGTHS[self.file_id]
713
- elif self.author in TRUNCATE_ALL_EMAILS_FROM or any((term in self.text) for term in TRUNCATE_TERMS):
767
+ elif self.author in TRUNCATE_ALL_EMAILS_FROM or includes_truncate_term:
714
768
  num_chars = int(MAX_CHARS_TO_PRINT / 3)
715
769
  elif quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
716
770
  num_chars = quote_cutoff
771
+ else:
772
+ num_chars = MAX_CHARS_TO_PRINT
773
+
774
+ if num_chars != MAX_CHARS_TO_PRINT and not self.is_duplicate():
775
+ log_args = {
776
+ 'num_chars': num_chars,
777
+ 'author_truncate': self.author in TRUNCATE_ALL_EMAILS_FROM,
778
+ 'is_fwded_article': self.is_fwded_article(),
779
+ 'is_quote_cutoff': quote_cutoff == num_chars,
780
+ 'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
781
+ 'quote_cutoff': quote_cutoff,
782
+ }
783
+
784
+ if quote_cutoff != num_chars:
785
+ logger.debug(f'{self.summary()} truncating: ' + ', '.join([f"{k}={v}" for k, v in log_args.items() if v]) + '\n')
786
+
787
+ return num_chars
788
+
789
+ def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
790
+ logger.debug(f"Printing '{self.filename}'...")
791
+ should_rewrite_header = self.header.was_initially_empty and self.header.num_header_rows > 0
792
+ num_chars = self._truncate_to_length()
793
+ trim_footer_txt = None
794
+ text = self.text
717
795
 
718
796
  # Truncate long emails but leave a note explaining what happened w/link to source document
719
- if len(text) > num_chars and self._truncation_allowed:
797
+ if len(text) > num_chars:
720
798
  text = text[0:num_chars]
721
- doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
799
+ doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style())
722
800
  trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
723
801
  trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
724
802
 
@@ -738,38 +816,37 @@ class Email(Communication):
738
816
  text = _add_line_breaks(text) # This was skipped when _prettify_text() w/a broken header so we do it now
739
817
  self.rewritten_header_ids.add(self.file_id)
740
818
 
741
- panel_txt = highlighter(text)
742
-
743
819
  email_txt_panel = Panel(
744
- panel_txt.append('\n\n').append(trim_footer_txt) if trim_footer_txt else panel_txt,
820
+ highlighter(text).append('\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
745
821
  border_style=self._border_style(),
746
822
  expand=False,
747
823
  subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
748
824
  )
749
825
 
826
+ yield self.file_info_panel()
750
827
  yield Padding(email_txt_panel, (0, 0, 1, INFO_INDENT))
751
828
 
752
829
  if should_rewrite_header:
753
830
  self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
754
831
 
755
832
  @staticmethod
756
- def build_table(emails: list['Email'], _author: str | None) -> Table:
757
- """Turn a set of Email objects into a Table."""
833
+ def build_emails_table(emails: list['Email'], _author: str | None, include_title: bool = False) -> Table:
834
+ """Turn a set of Emails to/from a given _author into a Table."""
758
835
  author = _author or UNKNOWN
759
836
 
760
837
  table = Table(
761
- title=f"Emails to/from {author} starting {emails[0].timestamp.date()}",
838
+ title=f"Emails to/from {author} starting {emails[0].timestamp.date()}" if include_title else None,
762
839
  border_style=get_style_for_name(author, allow_bold=False),
763
840
  header_style="bold"
764
841
  )
765
842
 
766
843
  table.add_column('From', justify='left')
767
844
  table.add_column('Timestamp', justify='center')
768
- table.add_column('Subject', justify='left', style='honeydew2', min_width=60)
845
+ table.add_column('Subject', justify='left', style='honeydew2', min_width=70)
769
846
 
770
847
  for email in emails:
771
848
  table.add_row(
772
- email.author_txt,
849
+ email.author_txt(),
773
850
  email.epstein_media_link(link_txt=email.timestamp_without_seconds()),
774
851
  highlighter(email.subject())
775
852
  )