epstein-files 1.2.1__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
epstein_files/__init__.py CHANGED
@@ -16,15 +16,19 @@ from rich.text import Text
16
16
  from epstein_files.epstein_files import EpsteinFiles, document_cls
17
17
  from epstein_files.documents.document import INFO_PADDING, Document
18
18
  from epstein_files.documents.email import Email
19
+ from epstein_files.documents.messenger_log import MessengerLog
20
+ from epstein_files.documents.other_file import OtherFile
19
21
  from epstein_files.util.constant.output_files import make_clean
22
+ from epstein_files.util.constant.strings import ID_REGEX
23
+ from epstein_files.util.data import flatten
20
24
  from epstein_files.util.env import args
21
25
  from epstein_files.util.file_helper import coerce_file_path, extract_file_id
22
26
  from epstein_files.util.logging import exit_with_error, logger
23
27
  from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
24
28
  print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
25
29
  print_json_metadata, write_urls)
26
- from epstein_files.util.rich import (build_highlighter, console, print_color_key, print_title_page_header,
27
- print_title_page_tables, print_subtitle_panel, write_html)
30
+ from epstein_files.util.rich import (build_highlighter, console, highlighter, print_color_key, print_json,
31
+ print_title_page_header, print_title_page_tables, print_subtitle_panel, write_html)
28
32
  from epstein_files.util.timer import Timer
29
33
  from epstein_files.util.word_count import write_word_counts_html
30
34
 
@@ -76,6 +80,9 @@ def generate_html() -> None:
76
80
  write_html(args.build)
77
81
  logger.warning(f"Total time: {timer.seconds_since_start_str()}")
78
82
 
83
+ if args.debug:
84
+ highlighter.print_highlight_counts(console)
85
+
79
86
  # JSON stats (mostly used for building pytest checks)
80
87
  if args.json_stats:
81
88
  print_json_stats(epstein_files)
@@ -86,27 +93,54 @@ def epstein_diff():
86
93
  Document.diff_files(args.positional_args)
87
94
 
88
95
 
89
- def epstein_search():
96
+ def epstein_grep():
90
97
  """Search the cleaned up text of the files."""
91
98
  epstein_files = EpsteinFiles.get_files()
92
99
 
100
+ if ID_REGEX.match(args.positional_args[0]):
101
+ logger.warning(f"'{args.positional_args[0]}' seems to be an ID, running epstein_show instead...")
102
+ epstein_show()
103
+ return
104
+
93
105
  for search_term in args.positional_args:
94
106
  temp_highlighter = build_highlighter(search_term)
95
107
  search_results = epstein_files.docs_matching(search_term, args.names)
96
108
  print_subtitle_panel(f"Found {len(search_results)} documents matching '{search_term}'")
109
+ last_document = None
97
110
 
98
111
  for search_result in search_results:
99
- console.line()
100
-
101
- if args.whole_file:
102
- console.print(search_result.document)
112
+ doc = search_result.document
113
+ lines = search_result.lines
114
+
115
+ if (isinstance(doc, Email) and not args.output_emails) \
116
+ or (isinstance(doc, OtherFile) and not args.output_other) \
117
+ or (isinstance(doc, MessengerLog) and not args.output_texts):
118
+ doc.log(f"{type(doc).__name__} Skipping search result...")
119
+ continue
120
+ elif isinstance(doc, Email) and args.email_body:
121
+ lines = [l for l in search_result.lines if l.line_number > doc.header.num_header_rows]
122
+
123
+ if not lines:
124
+ doc.log(f"None of the matches for '{search_term}' seem to be in the body of the email")
125
+ continue
126
+
127
+ if doc.is_duplicate():
128
+ if last_document and not last_document.is_duplicate():
129
+ console.line()
130
+
131
+ last_document = doc
132
+ console.print(doc.duplicate_file_txt())
133
+ elif args.whole_file:
134
+ console.print(doc)
103
135
  else:
104
- console.print(search_result.document.summary_panel())
136
+ console.print(doc.summary_panel())
105
137
 
106
- for matching_line in search_result.lines:
138
+ for matching_line in lines:
107
139
  line_txt = matching_line.__rich__()
108
140
  console.print(Padding(temp_highlighter(line_txt), INFO_PADDING), style='gray37')
109
141
 
142
+ console.line()
143
+
110
144
 
111
145
  def epstein_show():
112
146
  """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
@@ -114,8 +148,13 @@ def epstein_show():
114
148
  console.line()
115
149
 
116
150
  try:
117
- ids = [extract_file_id(arg) for arg in args.positional_args]
118
- raw_docs = [Document(coerce_file_path(id)) for id in ids]
151
+ if args.names:
152
+ people = EpsteinFiles.get_files().person_objs(args.names)
153
+ raw_docs = [doc for doc in flatten([p.emails for p in people])]
154
+ else:
155
+ ids = [extract_file_id(arg.strip().strip('_')) for arg in args.positional_args]
156
+ raw_docs = [Document(coerce_file_path(id)) for id in ids]
157
+
119
158
  docs = Document.sort_by_timestamp([document_cls(doc)(doc.file_path) for doc in raw_docs])
120
159
  except Exception as e:
121
160
  exit_with_error(str(e))
@@ -130,6 +169,11 @@ def epstein_show():
130
169
  if isinstance(doc, Email):
131
170
  console.print(Panel(Text("actual_text: ").append(doc.summary()), expand=False, style=doc._border_style()))
132
171
  console.print(escape(doc._actual_text()), '\n')
172
+ metadata = doc.metadata()
173
+ metadata['is_fwded_article'] = doc.is_fwded_article()
174
+ metadata['is_word_count_worthy'] = doc.is_word_count_worthy()
175
+ metadata['_is_first_for_user'] = doc._is_first_for_user
176
+ print_json(f"{doc.file_id} Metadata", metadata)
133
177
 
134
178
 
135
179
  def epstein_word_count() -> None:
@@ -197,6 +197,9 @@ class Document:
197
197
  def is_duplicate(self) -> bool:
198
198
  return bool(self.duplicate_of_id())
199
199
 
200
+ def is_interesting(self) -> bool:
201
+ return bool(self.config and self.config.is_interesting)
202
+
200
203
  def is_local_extract_file(self) -> bool:
201
204
  """True if extracted from other file (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
202
205
  return is_local_extract_file(self.filename)
@@ -267,11 +270,11 @@ class Document:
267
270
  txt.append(' (', style=SYMBOL_STYLE)
268
271
  txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
269
272
 
270
- txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(0), style='aquamarine1')))
273
+ txt.append(' [').append(key_value_txt('size', Text(str(self.length()), style='aquamarine1')))
271
274
  txt.append(", ").append(key_value_txt('lines', self.num_lines()))
272
275
 
273
276
  if self.config and self.config.duplicate_of_id:
274
- txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='magenta')))
277
+ txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='cyan dim')))
275
278
 
276
279
  return txt
277
280
 
@@ -435,6 +438,14 @@ class Document:
435
438
  """Count of how many Document objects have an author attribution."""
436
439
  return len([doc for doc in docs if doc.author])
437
440
 
441
+ @staticmethod
442
+ def sort_by_id(docs: Sequence['DocumentType']) -> list['DocumentType']:
443
+ return sorted(docs, key=lambda d: d.file_id)
444
+
445
+ @staticmethod
446
+ def sort_by_length(docs: Sequence['DocumentType']) -> list['DocumentType']:
447
+ return sorted(docs, key=lambda d: d.file_size(), reverse=True)
448
+
438
449
  @staticmethod
439
450
  def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
440
451
  return sorted(docs, key=lambda doc: doc.timestamp_sort_key())