epstein-files 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
epstein_files/__init__.py CHANGED
@@ -21,7 +21,7 @@ from epstein_files.util.env import args
21
21
  from epstein_files.util.file_helper import coerce_file_path, extract_file_id
22
22
  from epstein_files.util.logging import exit_with_error, logger
23
23
  from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
24
- print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info_png,
24
+ print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
25
25
  print_json_metadata, write_urls)
26
26
  from epstein_files.util.rich import (build_highlighter, console, print_color_key, print_title_page_header,
27
27
  print_title_page_tables, print_subtitle_panel, write_html)
@@ -38,15 +38,15 @@ def generate_html() -> None:
38
38
  timer = Timer()
39
39
  epstein_files = EpsteinFiles.get_files(timer)
40
40
 
41
- if args.json_metadata:
41
+ if args.emailers_info:
42
+ print_emailers_info(epstein_files)
43
+ exit()
44
+ elif args.json_metadata:
42
45
  print_json_metadata(epstein_files)
43
46
  exit()
44
47
  elif args.json_files:
45
48
  print_json_files(epstein_files)
46
49
  exit()
47
- elif args.emailers_info_png:
48
- print_emailers_info_png(epstein_files)
49
- exit()
50
50
 
51
51
  print_title_page_header()
52
52
 
@@ -59,25 +59,19 @@ def generate_html() -> None:
59
59
  exit()
60
60
 
61
61
  if args.output_texts:
62
- imessage_logs = [log for log in epstein_files.imessage_logs if not args.names or log.author in args.names]
63
- print_text_messages_section(imessage_logs)
64
- timer.print_at_checkpoint(f'Printed {len(imessage_logs)} text message log files')
62
+ printed_logs = print_text_messages_section(epstein_files)
63
+ timer.log_section_complete('MessengerLog', epstein_files.imessage_logs, printed_logs)
65
64
 
66
65
  if args.output_emails:
67
- emails_that_were_printed = print_emails_section(epstein_files)
68
- timer.print_at_checkpoint(f"Printed {len(emails_that_were_printed):,} emails")
66
+ printed_emails = print_emails_section(epstein_files)
67
+ timer.log_section_complete('Email', epstein_files.emails, printed_emails)
69
68
  elif args.email_timeline:
70
69
  print_email_timeline(epstein_files)
71
70
  timer.print_at_checkpoint(f"Printed chronological emails table")
72
71
 
73
72
  if args.output_other:
74
- if args.uninteresting:
75
- files = [f for f in epstein_files.other_files if not f.is_interesting()]
76
- else:
77
- files = [f for f in epstein_files.other_files if args.all_other_files or f.is_interesting()]
78
-
79
- print_other_files_section(files, epstein_files)
80
- timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")
73
+ printed_files = print_other_files_section(epstein_files)
74
+ timer.log_section_complete('OtherFile', epstein_files.other_files, printed_files)
81
75
 
82
76
  write_html(args.build)
83
77
  logger.warning(f"Total time: {timer.seconds_since_start_str()}")
@@ -94,7 +88,6 @@ def epstein_diff():
94
88
 
95
89
  def epstein_search():
96
90
  """Search the cleaned up text of the files."""
97
- _assert_positional_args()
98
91
  epstein_files = EpsteinFiles.get_files()
99
92
 
100
93
  for search_term in args.positional_args:
@@ -117,7 +110,6 @@ def epstein_search():
117
110
 
118
111
  def epstein_show():
119
112
  """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
120
- _assert_positional_args()
121
113
  raw_docs: list[Document] = []
122
114
  console.line()
123
115
 
@@ -142,8 +134,3 @@ def epstein_show():
142
134
 
143
135
  def epstein_word_count() -> None:
144
136
  write_word_counts_html()
145
-
146
-
147
- def _assert_positional_args():
148
- if not args.positional_args:
149
- exit_with_error(f"No positional args provided!\n")
@@ -34,9 +34,6 @@ class Communication(Document):
34
34
  """Overrides super() method to apply self.author_style."""
35
35
  return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
36
36
 
37
- def is_attribution_uncertain(self) -> bool:
38
- return bool(self.config and self.config.is_attribution_uncertain)
39
-
40
37
  def summary(self) -> Text:
41
38
  return self._summary().append(CLOSE_PROPERTIES_CHAR)
42
39
 
@@ -11,17 +11,19 @@ from rich.console import Console, ConsoleOptions, Group, RenderResult
11
11
  from rich.padding import Padding
12
12
  from rich.panel import Panel
13
13
  from rich.text import Text
14
+ from rich.table import Table
14
15
 
15
16
  from epstein_files.util.constant.names import *
16
17
  from epstein_files.util.constant.strings import *
17
18
  from epstein_files.util.constant.urls import *
18
19
  from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
19
- from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time_from_timestamp_str, without_falsey
20
+ from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time, without_falsey
20
21
  from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
21
22
  from epstein_files.util.env import DOCS_DIR, args
22
- from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, is_local_extract_file
23
+ from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, file_size_to_str, is_local_extract_file
23
24
  from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
24
- from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize
25
+ from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table, console,
26
+ highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
25
27
  from epstein_files.util.search_result import MatchedLine
26
28
 
27
29
  ALT_LINK_STYLE = 'white dim'
@@ -55,6 +57,14 @@ OCR_REPAIRS = {
55
57
  'Nil Priell': 'Nili Priell',
56
58
  }
57
59
 
60
+ SUMMARY_TABLE_COLS: list[str | dict] = [
61
+ 'Count',
62
+ {'name': 'Has Author', 'style': 'honeydew2'},
63
+ {'name': 'No Author', 'style': 'wheat4'},
64
+ {'name': 'Uncertain Author', 'style': 'royal_blue1 dim'},
65
+ {'name': 'Size', 'justify': 'right', 'style': 'dim'},
66
+ ]
67
+
58
68
 
59
69
  @dataclass
60
70
  class Document:
@@ -181,6 +191,9 @@ class Document:
181
191
  """Secondary info about this file (description recipients, etc). Overload in subclasses."""
182
192
  return None
183
193
 
194
+ def is_attribution_uncertain(self) -> bool:
195
+ return bool(self.config and self.config.is_attribution_uncertain)
196
+
184
197
  def is_duplicate(self) -> bool:
185
198
  return bool(self.duplicate_of_id())
186
199
 
@@ -240,17 +253,6 @@ class Document:
240
253
 
241
254
  return text
242
255
 
243
- def sort_key(self) -> tuple[datetime, str, int]:
244
- """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
245
- if self.is_duplicate():
246
- sort_id = self.config.duplicate_of_id
247
- dupe_idx = 1
248
- else:
249
- sort_id = self.file_id
250
- dupe_idx = 0
251
-
252
- return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
253
-
254
256
  def source_file_id(self) -> str:
255
257
  """Strip off the _1, _2, etc. suffixes for extracted documents."""
256
258
  return self.file_id[0:6]
@@ -261,7 +263,7 @@ class Document:
261
263
  txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)
262
264
 
263
265
  if self.timestamp:
264
- timestamp_str = remove_zero_time_from_timestamp_str(self.timestamp).replace('T', ' ')
266
+ timestamp_str = remove_zero_time(self.timestamp).replace('T', ' ')
265
267
  txt.append(' (', style=SYMBOL_STYLE)
266
268
  txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
267
269
 
@@ -282,6 +284,17 @@ class Document:
282
284
 
283
285
  return Panel(Group(*sentences), border_style=self._class_style(), expand=False)
284
286
 
287
+ def timestamp_sort_key(self) -> tuple[datetime, str, int]:
288
+ """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
289
+ if self.is_duplicate():
290
+ sort_id = self.config.duplicate_of_id
291
+ dupe_idx = 1
292
+ else:
293
+ sort_id = self.file_id
294
+ dupe_idx = 0
295
+
296
+ return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
297
+
285
298
  def top_lines(self, n: int = 10) -> str:
286
299
  """First n lines."""
287
300
  return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
@@ -361,6 +374,32 @@ class Document:
361
374
  def __str__(self) -> str:
362
375
  return self.summary().plain
363
376
 
377
+ @classmethod
378
+ def file_info_table(cls, title: str, first_col_name: str) -> Table:
379
+ """Empty table with appropriate cols for summarizing groups of files."""
380
+ table = build_table(title)
381
+ cols = [{'name': first_col_name, 'min_width': 14}] + SUMMARY_TABLE_COLS
382
+ add_cols_to_table(table, cols, 'right')
383
+ return table
384
+
385
+ @classmethod
386
+ def files_info(cls, files: Sequence['Document'], is_author_na: bool = False) -> dict[str, str | Text]:
387
+ """Summary info about a group of files."""
388
+ file_count = len(files)
389
+ author_count = cls.known_author_count(files)
390
+
391
+ return {
392
+ 'count': str(file_count),
393
+ 'author_count': NA_TXT if is_author_na else str(author_count),
394
+ 'no_author_count': NA_TXT if is_author_na else str(file_count - author_count),
395
+ 'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain()])),
396
+ 'bytes': file_size_to_str(sum([f.file_size() for f in files])),
397
+ }
398
+
399
+ @classmethod
400
+ def files_info_row(cls, files: Sequence['Document'], author_na: bool = False) -> Sequence[str | Text]:
401
+ return [v for v in cls.files_info(files, author_na).values()]
402
+
364
403
  @staticmethod
365
404
  def diff_files(files: list[str]) -> None:
366
405
  """Diff the contents of two Documents after all cleanup, BOM removal, etc."""
@@ -398,14 +437,18 @@ class Document:
398
437
 
399
438
  @staticmethod
400
439
  def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
401
- return sorted(docs, key=lambda doc: doc.sort_key())
440
+ return sorted(docs, key=lambda doc: doc.timestamp_sort_key())
402
441
 
403
- @classmethod
404
- def uniquify(cls, documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
442
+ @staticmethod
443
+ def uniquify(documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
405
444
  """Uniquify by file_id."""
406
445
  id_map = {doc.file_id: doc for doc in documents}
407
446
  return [doc for doc in id_map.values()]
408
447
 
448
+ @staticmethod
449
+ def without_dupes(docs: Sequence['DocumentType']) -> list['DocumentType']:
450
+ return [doc for doc in docs if not doc.is_duplicate()]
451
+
409
452
 
410
453
  DocumentType = TypeVar('DocumentType', bound=Document)
411
454
 
@@ -32,7 +32,7 @@ BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE
32
32
  BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
33
33
  DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
34
34
  LINK_LINE_REGEX = re.compile(f"^(> )?htt")
35
- QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
35
+ QUOTED_REPLY_LINE_REGEX = re.compile(r'(\nFrom:(.*)|wrote:)\n', re.IGNORECASE)
36
36
  REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
37
37
 
38
38
  BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
@@ -114,7 +114,7 @@ EMAIL_SIGNATURE_REGEXES = {
114
114
  DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
115
115
  DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
116
116
  JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
117
- JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*", re.IGNORECASE),
117
+ JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*(\nTel:.*)?(\nEmail:.*)?", re.IGNORECASE),
118
118
  KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
119
119
  LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
120
120
  LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
@@ -436,9 +436,9 @@ class Email(Communication):
436
436
  elif self.header.num_header_rows == 0:
437
437
  return self.text
438
438
 
439
- reply_text_match = REPLY_TEXT_REGEX.search(text)
440
439
  self.log_top_lines(20, "Raw text:", logging.DEBUG)
441
440
  self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
441
+ reply_text_match = REPLY_TEXT_REGEX.search(text)
442
442
 
443
443
  if reply_text_match:
444
444
  actual_num_chars = len(reply_text_match.group(1))
@@ -550,9 +550,15 @@ class Email(Communication):
550
550
 
551
551
  def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
552
552
  """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
553
- for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
553
+ if text is None:
554
+ header_offset = len(self.header.header_chars)
555
+ text = self.text[header_offset:]
556
+ else:
557
+ header_offset = 0
558
+
559
+ for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text)):
554
560
  if i >= n:
555
- return match.end() - 1
561
+ return match.end() + header_offset - 1
556
562
 
557
563
  def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
558
564
  """Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
@@ -9,7 +9,6 @@ from epstein_files.util.logging import logger
9
9
  from epstein_files.util.rich import UNKNOWN
10
10
 
11
11
  FIELD_NAMES = ['Date', 'From', 'Sent', 'Subject']
12
- NON_HEADER_FIELDS = ['field_names', 'num_header_rows', 'was_initially_empty']
13
12
  ON_BEHALF_OF = 'on behalf of'
14
13
  TO_FIELDS = ['bcc', 'cc', 'to']
15
14
  EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
@@ -28,10 +27,18 @@ CONFIGURED_ACTUAL_TEXTS = [
28
27
  if isinstance(cfg, EmailCfg) and cfg.actual_text is not None
29
28
  ]
30
29
 
30
+ NON_HEADER_FIELDS = [
31
+ 'field_names',
32
+ 'header_chars',
33
+ 'num_header_rows',
34
+ 'was_initially_empty',
35
+ ]
36
+
31
37
 
32
38
  @dataclass(kw_only=True)
33
39
  class EmailHeader:
34
40
  field_names: list[str] # Order is same as the order header fields appear in the email file text
41
+ header_chars: str = ''
35
42
  num_header_rows: int = field(init=False)
36
43
  was_initially_empty: bool = False
37
44
 
@@ -101,6 +108,7 @@ class EmailHeader:
101
108
  setattr(self, field_name, value)
102
109
 
103
110
  self.num_header_rows = len(self.field_names) + num_headers
111
+ self.header_chars = '\n'.join(email_lines[0:self.num_header_rows])
104
112
  log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
105
113
  logger.debug(f"{log_msg}{self}\n\nTop lines:\n\n%s", '\n'.join(email_lines[0:(num_headers + 1) * 2]))
106
114
 
@@ -163,7 +171,7 @@ class EmailHeader:
163
171
  if should_log_header:
164
172
  logger.debug(f"Header being parsed was this:\n\n{header}\n")
165
173
 
166
- return EmailHeader(field_names=field_names, **kw_args)
174
+ return cls(field_names=field_names, header_chars=header, **kw_args)
167
175
 
168
176
  @staticmethod
169
177
  def cleanup_str(_str: str) -> str:
@@ -4,7 +4,7 @@ from datetime import datetime
4
4
 
5
5
  from rich.text import Text
6
6
 
7
- from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN, Name, extract_last_name
7
+ from epstein_files.util.constant.names import ANTHONY_SCARAMUCCI, JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN, Name, extract_last_name
8
8
  from epstein_files.util.constant.strings import TIMESTAMP_DIM
9
9
  from epstein_files.util.data import iso_timestamp
10
10
  from epstein_files.util.highlighted_group import get_style_for_name
@@ -17,6 +17,7 @@ PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
17
17
  UNCERTAIN_SUFFIX = ' (?)'
18
18
 
19
19
  DISPLAY_LAST_NAME_ONLY = [
20
+ ANTHONY_SCARAMUCCI,
20
21
  JEFFREY_EPSTEIN,
21
22
  STEVE_BANNON,
22
23
  ]
@@ -59,7 +60,7 @@ class TextMessage:
59
60
  try:
60
61
  timestamp_str = iso_timestamp(self.parse_timestamp())
61
62
  except Exception as e:
62
- logger.warning(f"Failed to parse timestamp for {self}")
63
+ logger.info(f"Failed to parse timestamp for {self}")
63
64
  timestamp_str = self.timestamp_str
64
65
 
65
66
  return Text(f"[{timestamp_str}]", style=TIMESTAMP_DIM)
@@ -22,7 +22,7 @@ from epstein_files.util.data import days_between, escape_single_quotes, remove_t
22
22
  from epstein_files.util.file_helper import FILENAME_LENGTH, file_size_to_str
23
23
  from epstein_files.util.env import args
24
24
  from epstein_files.util.highlighted_group import QUESTION_MARKS_TXT, styled_category
25
- from epstein_files.util.rich import build_table, highlighter
25
+ from epstein_files.util.rich import add_cols_to_table, build_table, highlighter
26
26
  from epstein_files.util.logging import logger
27
27
 
28
28
  FIRST_FEW_LINES = 'First Few Lines'
@@ -209,39 +209,8 @@ class OtherFile(Document):
209
209
  if num_days_spanned > MAX_DAYS_SPANNED_TO_BE_VALID and VAST_HOUSE not in self.text:
210
210
  self.log_top_lines(15, msg=timestamps_log_msg, level=logging.DEBUG)
211
211
 
212
- @staticmethod
213
- def count_by_category_table(files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
214
- counts = defaultdict(int)
215
- category_bytes = defaultdict(int)
216
-
217
- for file in files:
218
- if file.category() is None:
219
- logger.warning(f"file {file.file_id} has no category")
220
-
221
- counts[file.category()] += 1
222
- category_bytes[file.category()] += file.file_size()
223
-
224
- table = build_table(f'{title_pfx}Other Files Summary', ['Category', 'Count', 'Has Author', 'No Author', 'Size'])
225
- table.columns[-1].justify = 'right'
226
- table.columns[0].min_width = 14
227
- table.columns[-1].style = 'dim'
228
-
229
- for (category, count) in sort_dict(counts):
230
- category_files = [f for f in files if f.category() == category]
231
- known_author_count = Document.known_author_count(category_files)
232
-
233
- table.add_row(
234
- styled_category(category),
235
- str(count),
236
- str(known_author_count),
237
- str(count - known_author_count),
238
- file_size_to_str(category_bytes[category]),
239
- )
240
-
241
- return table
242
-
243
- @staticmethod
244
- def files_preview_table(files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
212
+ @classmethod
213
+ def files_preview_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
245
214
  """Build a table of OtherFile documents."""
246
215
  table = build_table(f'{title_pfx}Other Files Details in Chronological Order', show_lines=True)
247
216
  table.add_column('File', justify='center', width=FILENAME_LENGTH)
@@ -272,3 +241,16 @@ class OtherFile(Document):
272
241
  )
273
242
 
274
243
  return table
244
+
245
+ @classmethod
246
+ def summary_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
247
+ categories = uniquify([f.category() for f in files])
248
+ categories = sorted(categories, key=lambda c: -len([f for f in files if f.category() == c]))
249
+ table = cls.file_info_table(f'{title_pfx}Other Files Summary', 'Category')
250
+
251
+ for category in categories:
252
+ category_files = [f for f in files if f.category() == category]
253
+ table.add_row(styled_category(category), *cls.files_info_row(category_files))
254
+
255
+ table.columns = table.columns[:-2] + [table.columns[-1]] # Remove the 'Uncertain Author' col
256
+ return table
@@ -9,6 +9,8 @@ from datetime import datetime
9
9
  from pathlib import Path
10
10
  from typing import Sequence, Type, cast
11
11
 
12
+ from rich.table import Table
13
+
12
14
  from epstein_files.documents.document import Document
13
15
  from epstein_files.documents.email import DETECT_EMAIL_REGEX, Email
14
16
  from epstein_files.documents.json_file import JsonFile
@@ -22,7 +24,6 @@ from epstein_files.util.doc_cfg import EmailCfg, Metadata
22
24
  from epstein_files.util.env import DOCS_DIR, args, logger
23
25
  from epstein_files.util.file_helper import file_size_str
24
26
  from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames
25
- from epstein_files.util.rich import NA_TXT, add_cols_to_table, build_table, console, print_centered
26
27
  from epstein_files.util.search_result import SearchResult
27
28
  from epstein_files.util.timer import Timer
28
29
 
@@ -31,9 +32,13 @@ PICKLED_PATH = Path("the_epstein_files.pkl.gz")
31
32
  SLOW_FILE_SECONDS = 1.0
32
33
 
33
34
  EMAILS_WITH_UNINTERESTING_CCS = [
34
- '025329', # Krassner
35
- '024923', # Krassner
36
- '033568', # Krassner
35
+ '025329', # Krassner
36
+ '024923', # Krassner
37
+ '033568', # Krassner
38
+ ]
39
+
40
+ EMAILS_WITH_UNINTERESTING_BCCS = [
41
+ '014797_1', # Ross Gow
37
42
  ]
38
43
 
39
44
 
@@ -45,7 +50,7 @@ class EpsteinFiles:
45
50
  json_files: list[JsonFile] = field(default_factory=list)
46
51
  other_files: list[OtherFile] = field(default_factory=list)
47
52
  timer: Timer = field(default_factory=lambda: Timer())
48
- uninteresting_ccs: list[Name] = field(init=False)
53
+ uninteresting_ccs: list[Name] = field(default_factory=list)
49
54
 
50
55
  def __post_init__(self):
51
56
  """Iterate through files and build appropriate objects."""
@@ -88,13 +93,12 @@ class EpsteinFiles:
88
93
  if PICKLED_PATH.exists() and not args.overwrite_pickle and not args.skip_other_files:
89
94
  with gzip.open(PICKLED_PATH, 'rb') as file:
90
95
  epstein_files = pickle.load(file)
91
- epstein_files.timer = timer
92
96
  timer_msg = f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}'"
93
- epstein_files.timer.print_at_checkpoint(f"{timer_msg} ({file_size_str(PICKLED_PATH)})")
97
+ timer.print_at_checkpoint(f"{timer_msg} ({file_size_str(PICKLED_PATH)})")
94
98
  return epstein_files
95
99
 
96
100
  logger.warning(f"Building new cache file, this will take a few minutes...")
97
- epstein_files = EpsteinFiles(timer=timer)
101
+ epstein_files = EpsteinFiles()
98
102
 
99
103
  if args.skip_other_files:
100
104
  logger.warning(f"Not writing pickled data because --skip-other-files")
@@ -235,7 +239,7 @@ class EpsteinFiles:
235
239
  return json.dumps(metadata, indent=4, sort_keys=True)
236
240
 
237
241
  def non_duplicate_emails(self) -> list[Email]:
238
- return [email for email in self.emails if not email.is_duplicate()]
242
+ return Document.without_dupes(self.emails)
239
243
 
240
244
  def non_json_other_files(self) -> list[OtherFile]:
241
245
  return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]
@@ -253,34 +257,20 @@ class EpsteinFiles:
253
257
  for name in names
254
258
  ]
255
259
 
256
- def print_files_summary(self) -> None:
257
- table = build_table('File Overview')
258
- add_cols_to_table(table, ['File Type', 'Count', 'Author Known', 'Author Unknown', 'Duplicates'])
259
- table.columns[1].justify = 'right'
260
-
261
- def add_row(label: str, docs: list):
262
- known = None if isinstance(docs[0], JsonFile) else Document.known_author_count(docs)
263
-
264
- table.add_row(
265
- label,
266
- f"{len(docs):,}",
267
- f"{known:,}" if known is not None else NA_TXT,
268
- f"{len(docs) - known:,}" if known is not None else NA_TXT,
269
- f"{len([d for d in docs if d.is_duplicate()])}",
270
- )
271
-
272
- add_row('Emails', self.emails)
273
- add_row('iMessage Logs', self.imessage_logs)
274
- add_row('JSON Data', self.json_files)
275
- add_row('Other', self.non_json_other_files())
276
- print_centered(table)
277
- console.line()
260
+ def overview_table(self) -> Table:
261
+ table = Document.file_info_table('Files Overview', 'File Type')
262
+ table.add_row('Emails', *Document.files_info_row(self.emails))
263
+ table.add_row('iMessage Logs', *Document.files_info_row(self.imessage_logs))
264
+ table.add_row('JSON Data', *Document.files_info_row(self.json_files, True))
265
+ table.add_row('Other', *Document.files_info_row(self.non_json_other_files()))
266
+ return table
278
267
 
279
268
  def unknown_recipient_ids(self) -> list[str]:
280
269
  """IDs of emails whose recipient is not known."""
281
270
  return sorted([e.file_id for e in self.emails if None in e.recipients or not e.recipients])
282
271
 
283
272
  def uninteresting_emailers(self) -> list[Name]:
273
+ """Emailers whom we don't want to print a separate section for because they're just CCed."""
284
274
  if '_uninteresting_emailers' not in vars(self):
285
275
  self._uninteresting_emailers = sorted(uniquify(UNINTERESTING_EMAILERS + self.uninteresting_ccs))
286
276
 
@@ -306,8 +296,8 @@ class EpsteinFiles:
306
296
  self.emails = Document.sort_by_timestamp(self.emails)
307
297
 
308
298
  def _set_uninteresting_ccs(self) -> None:
309
- ross_gow_email = self.email_for_id('014797_1')
310
- self.uninteresting_ccs = copy(cast(list[Name], ross_gow_email.header.bcc))
299
+ for id in EMAILS_WITH_UNINTERESTING_BCCS:
300
+ self.uninteresting_ccs += copy(cast(list[Name], self.email_for_id(id).header.bcc))
311
301
 
312
302
  for id in EMAILS_WITH_UNINTERESTING_CCS:
313
303
  self.uninteresting_ccs += self.email_for_id(id).recipients