epstein-files 1.2.0__py3-none-any.whl → 1.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
epstein_files/__init__.py CHANGED
@@ -16,14 +16,18 @@ from rich.text import Text
  from epstein_files.epstein_files import EpsteinFiles, document_cls
  from epstein_files.documents.document import INFO_PADDING, Document
  from epstein_files.documents.email import Email
+ from epstein_files.documents.messenger_log import MessengerLog
+ from epstein_files.documents.other_file import OtherFile
  from epstein_files.util.constant.output_files import make_clean
+ from epstein_files.util.constant.strings import ID_REGEX
+ from epstein_files.util.data import flatten
  from epstein_files.util.env import args
  from epstein_files.util.file_helper import coerce_file_path, extract_file_id
  from epstein_files.util.logging import exit_with_error, logger
  from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
-     print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info_png,
+     print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
      print_json_metadata, write_urls)
- from epstein_files.util.rich import (build_highlighter, console, print_color_key, print_title_page_header,
+ from epstein_files.util.rich import (build_highlighter, console, highlighter, print_color_key, print_title_page_header,
      print_title_page_tables, print_subtitle_panel, write_html)
  from epstein_files.util.timer import Timer
  from epstein_files.util.word_count import write_word_counts_html
@@ -38,15 +42,15 @@ def generate_html() -> None:
      timer = Timer()
      epstein_files = EpsteinFiles.get_files(timer)

-     if args.json_metadata:
+     if args.emailers_info:
+         print_emailers_info(epstein_files)
+         exit()
+     elif args.json_metadata:
          print_json_metadata(epstein_files)
          exit()
      elif args.json_files:
          print_json_files(epstein_files)
          exit()
-     elif args.emailers_info_png:
-         print_emailers_info_png(epstein_files)
-         exit()

      print_title_page_header()

@@ -59,29 +63,26 @@ def generate_html() -> None:
          exit()

      if args.output_texts:
-         imessage_logs = [log for log in epstein_files.imessage_logs if not args.names or log.author in args.names]
-         print_text_messages_section(imessage_logs)
-         timer.print_at_checkpoint(f'Printed {len(imessage_logs)} text message log files')
+         printed_logs = print_text_messages_section(epstein_files)
+         timer.log_section_complete('MessengerLog', epstein_files.imessage_logs, printed_logs)

      if args.output_emails:
-         emails_that_were_printed = print_emails_section(epstein_files)
-         timer.print_at_checkpoint(f"Printed {len(emails_that_were_printed):,} emails")
+         printed_emails = print_emails_section(epstein_files)
+         timer.log_section_complete('Email', epstein_files.emails, printed_emails)
      elif args.email_timeline:
          print_email_timeline(epstein_files)
          timer.print_at_checkpoint(f"Printed chronological emails table")

      if args.output_other:
-         if args.uninteresting:
-             files = [f for f in epstein_files.other_files if not f.is_interesting()]
-         else:
-             files = [f for f in epstein_files.other_files if args.all_other_files or f.is_interesting()]
-
-         print_other_files_section(files, epstein_files)
-         timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")
+         printed_files = print_other_files_section(epstein_files)
+         timer.log_section_complete('OtherFile', epstein_files.other_files, printed_files)

      write_html(args.build)
      logger.warning(f"Total time: {timer.seconds_since_start_str()}")

+     if args.debug:
+         highlighter.print_highlight_counts(console)
+
      # JSON stats (mostly used for building pytest checks)
      if args.json_stats:
          print_json_stats(epstein_files)
@@ -94,36 +95,52 @@ def epstein_diff():

  def epstein_search():
      """Search the cleaned up text of the files."""
-     _assert_positional_args()
      epstein_files = EpsteinFiles.get_files()

+     if ID_REGEX.match(args.positional_args[0]):
+         logger.warning(f"'{args.positional_args[0]}' seems to be an ID, running epstein_show instead...")
+         epstein_show()
+         return
+
      for search_term in args.positional_args:
          temp_highlighter = build_highlighter(search_term)
          search_results = epstein_files.docs_matching(search_term, args.names)
          print_subtitle_panel(f"Found {len(search_results)} documents matching '{search_term}'")

          for search_result in search_results:
-             console.line()
+             document = search_result.document
+
+             if (isinstance(document, Email) and not args.output_emails) \
+                     or (isinstance(document, OtherFile) and not args.output_other) \
+                     or (isinstance(document, MessengerLog) and not args.output_texts):
+                 document.warn(f"{type(document).__name__} Skipping search result...")
+                 continue

              if args.whole_file:
-                 console.print(search_result.document)
+                 console.print(document)
              else:
-                 console.print(search_result.document.summary_panel())
+                 console.print(document.summary_panel())

              for matching_line in search_result.lines:
                  line_txt = matching_line.__rich__()
                  console.print(Padding(temp_highlighter(line_txt), INFO_PADDING), style='gray37')

+             console.line()
+

  def epstein_show():
      """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
-     _assert_positional_args()
      raw_docs: list[Document] = []
      console.line()

      try:
-         ids = [extract_file_id(arg) for arg in args.positional_args]
-         raw_docs = [Document(coerce_file_path(id)) for id in ids]
+         if args.names:
+             people = EpsteinFiles.get_files().person_objs(args.names)
+             raw_docs = [doc for doc in flatten([p.emails for p in people])]
+         else:
+             ids = [extract_file_id(arg) for arg in args.positional_args]
+             raw_docs = [Document(coerce_file_path(id)) for id in ids]
+
          docs = Document.sort_by_timestamp([document_cls(doc)(doc.file_path) for doc in raw_docs])
      except Exception as e:
          exit_with_error(str(e))
@@ -142,8 +159,3 @@ def epstein_show():

  def epstein_word_count() -> None:
      write_word_counts_html()
-
-
- def _assert_positional_args():
-     if not args.positional_args:
-         exit_with_error(f"No positional args provided!\n")
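Note on the generate_html() hunks above: the per-section "Printed N ..." messages are now produced by Timer.log_section_complete(), whose implementation is not part of this diff. A minimal sketch of what such a method might do, inferred only from the call sites (document type name, the full document list, and the subset that was printed); the body and message format here are assumptions, not the package's actual code:

    def log_section_complete(self, doc_type: str, all_docs, printed_docs) -> None:
        # Hypothetical sketch; the real method lives in epstein_files.util.timer
        # and is not shown in this diff.
        skipped = len(all_docs) - len(printed_docs)
        self.print_at_checkpoint(f"Printed {len(printed_docs):,} {doc_type} files ({skipped} skipped)")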
@@ -34,9 +34,6 @@ class Communication(Document):
          """Overrides super() method to apply self.author_style."""
          return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)

-     def is_attribution_uncertain(self) -> bool:
-         return bool(self.config and self.config.is_attribution_uncertain)
-
      def summary(self) -> Text:
          return self._summary().append(CLOSE_PROPERTIES_CHAR)

@@ -11,17 +11,19 @@ from rich.console import Console, ConsoleOptions, Group, RenderResult
  from rich.padding import Padding
  from rich.panel import Panel
  from rich.text import Text
+ from rich.table import Table

  from epstein_files.util.constant.names import *
  from epstein_files.util.constant.strings import *
  from epstein_files.util.constant.urls import *
  from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
- from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time_from_timestamp_str, without_falsey
+ from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time, without_falsey
  from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
  from epstein_files.util.env import DOCS_DIR, args
- from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, is_local_extract_file
+ from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, file_size_to_str, is_local_extract_file
  from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
- from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize
+ from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table, console,
+     highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
  from epstein_files.util.search_result import MatchedLine

  ALT_LINK_STYLE = 'white dim'
@@ -55,6 +57,14 @@ OCR_REPAIRS = {
      'Nil Priell': 'Nili Priell',
  }

+ SUMMARY_TABLE_COLS: list[str | dict] = [
+     'Count',
+     {'name': 'Has Author', 'style': 'honeydew2'},
+     {'name': 'No Author', 'style': 'wheat4'},
+     {'name': 'Uncertain Author', 'style': 'royal_blue1 dim'},
+     {'name': 'Size', 'justify': 'right', 'style': 'dim'},
+ ]
+

  @dataclass
  class Document:
@@ -181,6 +191,9 @@ class Document:
          """Secondary info about this file (description recipients, etc). Overload in subclasses."""
          return None

+     def is_attribution_uncertain(self) -> bool:
+         return bool(self.config and self.config.is_attribution_uncertain)
+
      def is_duplicate(self) -> bool:
          return bool(self.duplicate_of_id())

@@ -240,17 +253,6 @@

          return text

-     def sort_key(self) -> tuple[datetime, str, int]:
-         """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
-         if self.is_duplicate():
-             sort_id = self.config.duplicate_of_id
-             dupe_idx = 1
-         else:
-             sort_id = self.file_id
-             dupe_idx = 0
-
-         return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
-
      def source_file_id(self) -> str:
          """Strip off the _1, _2, etc. suffixes for extracted documents."""
          return self.file_id[0:6]
@@ -261,7 +263,7 @@
          txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)

          if self.timestamp:
-             timestamp_str = remove_zero_time_from_timestamp_str(self.timestamp).replace('T', ' ')
+             timestamp_str = remove_zero_time(self.timestamp).replace('T', ' ')
              txt.append(' (', style=SYMBOL_STYLE)
              txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)

@@ -269,7 +271,7 @@
          txt.append(", ").append(key_value_txt('lines', self.num_lines()))

          if self.config and self.config.duplicate_of_id:
-             txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='magenta')))
+             txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='cyan dim')))

          return txt

@@ -282,6 +284,17 @@

          return Panel(Group(*sentences), border_style=self._class_style(), expand=False)

+     def timestamp_sort_key(self) -> tuple[datetime, str, int]:
+         """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
+         if self.is_duplicate():
+             sort_id = self.config.duplicate_of_id
+             dupe_idx = 1
+         else:
+             sort_id = self.file_id
+             dupe_idx = 0
+
+         return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
+
      def top_lines(self, n: int = 10) -> str:
          """First n lines."""
          return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
@@ -361,6 +374,32 @@
      def __str__(self) -> str:
          return self.summary().plain

+     @classmethod
+     def file_info_table(cls, title: str, first_col_name: str) -> Table:
+         """Empty table with appropriate cols for summarizing groups of files."""
+         table = build_table(title)
+         cols = [{'name': first_col_name, 'min_width': 14}] + SUMMARY_TABLE_COLS
+         add_cols_to_table(table, cols, 'right')
+         return table
+
+     @classmethod
+     def files_info(cls, files: Sequence['Document'], is_author_na: bool = False) -> dict[str, str | Text]:
+         """Summary info about a group of files."""
+         file_count = len(files)
+         author_count = cls.known_author_count(files)
+
+         return {
+             'count': str(file_count),
+             'author_count': NA_TXT if is_author_na else str(author_count),
+             'no_author_count': NA_TXT if is_author_na else str(file_count - author_count),
+             'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain()])),
+             'bytes': file_size_to_str(sum([f.file_size() for f in files])),
+         }
+
+     @classmethod
+     def files_info_row(cls, files: Sequence['Document'], author_na: bool = False) -> Sequence[str | Text]:
+         return [v for v in cls.files_info(files, author_na).values()]
+
      @staticmethod
      def diff_files(files: list[str]) -> None:
          """Diff the contents of two Documents after all cleanup, BOM removal, etc."""
@@ -396,16 +435,24 @@
          """Count of how many Document objects have an author attribution."""
          return len([doc for doc in docs if doc.author])

+     @staticmethod
+     def sort_by_id(docs: Sequence['DocumentType']) -> list['DocumentType']:
+         return sorted(docs, key=lambda d: d.file_id)
+
      @staticmethod
      def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
-         return sorted(docs, key=lambda doc: doc.sort_key())
+         return sorted(docs, key=lambda doc: doc.timestamp_sort_key())

-     @classmethod
-     def uniquify(cls, documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
+     @staticmethod
+     def uniquify(documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
          """Uniquify by file_id."""
          id_map = {doc.file_id: doc for doc in documents}
          return [doc for doc in id_map.values()]

+     @staticmethod
+     def without_dupes(docs: Sequence['DocumentType']) -> list['DocumentType']:
+         return [doc for doc in docs if not doc.is_duplicate()]
+

  DocumentType = TypeVar('DocumentType', bound=Document)
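Taken together, the sorting and de-duplication helpers in this final hunk compose naturally. A hypothetical combination (the variable names and the choice of epstein_files.emails as input are illustrative only):

    # Chronological order; timestamp_sort_key() sorts a duplicate under its
    # original's file_id with a trailing dupe index, so dupes land directly
    # after the documents they duplicate.
    chronological = Document.sort_by_timestamp(epstein_files.emails)

    # Drop configured duplicates, collapse repeated file_ids, then order by file_id.
    unique_by_id = Document.sort_by_id(Document.uniquify(Document.without_dupes(epstein_files.emails)))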