epstein-files 1.2.5__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
epstein_files/__init__.py CHANGED
@@ -27,8 +27,8 @@ from epstein_files.util.logging import exit_with_error, logger
27
27
  from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
28
28
  print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
29
29
  print_json_metadata, write_urls)
30
- from epstein_files.util.rich import (build_highlighter, console, highlighter, print_color_key, print_title_page_header,
31
- print_title_page_tables, print_subtitle_panel, write_html)
30
+ from epstein_files.util.rich import (build_highlighter, console, highlighter, print_color_key, print_json,
31
+ print_title_page_header, print_title_page_tables, print_subtitle_panel, write_html)
32
32
  from epstein_files.util.timer import Timer
33
33
  from epstein_files.util.word_count import write_word_counts_html
34
34
 
@@ -93,7 +93,7 @@ def epstein_diff():
93
93
  Document.diff_files(args.positional_args)
94
94
 
95
95
 
96
- def epstein_search():
96
+ def epstein_grep():
97
97
  """Search the cleaned up text of the files."""
98
98
  epstein_files = EpsteinFiles.get_files()
99
99
 
@@ -106,22 +106,36 @@ def epstein_search():
106
106
  temp_highlighter = build_highlighter(search_term)
107
107
  search_results = epstein_files.docs_matching(search_term, args.names)
108
108
  print_subtitle_panel(f"Found {len(search_results)} documents matching '{search_term}'")
109
+ last_document = None
109
110
 
110
111
  for search_result in search_results:
111
- document = search_result.document
112
+ doc = search_result.document
113
+ lines = search_result.lines
112
114
 
113
- if (isinstance(document, Email) and not args.output_emails) \
114
- or (isinstance(document, OtherFile) and not args.output_other) \
115
- or (isinstance(document, MessengerLog) and not args.output_texts):
116
- document.warn(f"{type(document).__name__} Skipping search result...")
115
+ if (isinstance(doc, Email) and not args.output_emails) \
116
+ or (isinstance(doc, OtherFile) and not args.output_other) \
117
+ or (isinstance(doc, MessengerLog) and not args.output_texts):
118
+ doc.log(f"{type(doc).__name__} Skipping search result...")
117
119
  continue
120
+ elif isinstance(doc, Email) and args.email_body:
121
+ lines = [l for l in search_result.lines if l.line_number > doc.header.num_header_rows]
118
122
 
119
- if args.whole_file:
120
- console.print(document)
123
+ if not lines:
124
+ doc.log(f"None of the matches for '{search_term}' seem to be in the body of the email")
125
+ continue
126
+
127
+ if doc.is_duplicate():
128
+ if last_document and not last_document.is_duplicate():
129
+ console.line()
130
+
131
+ last_document = doc
132
+ console.print(doc.duplicate_file_txt())
133
+ elif args.whole_file:
134
+ console.print(doc)
121
135
  else:
122
- console.print(document.summary_panel())
136
+ console.print(doc.summary_panel())
123
137
 
124
- for matching_line in search_result.lines:
138
+ for matching_line in lines:
125
139
  line_txt = matching_line.__rich__()
126
140
  console.print(Padding(temp_highlighter(line_txt), INFO_PADDING), style='gray37')
127
141
 
@@ -138,7 +152,7 @@ def epstein_show():
138
152
  people = EpsteinFiles.get_files().person_objs(args.names)
139
153
  raw_docs = [doc for doc in flatten([p.emails for p in people])]
140
154
  else:
141
- ids = [extract_file_id(arg) for arg in args.positional_args]
155
+ ids = [extract_file_id(arg.strip().strip('_')) for arg in args.positional_args]
142
156
  raw_docs = [Document(coerce_file_path(id)) for id in ids]
143
157
 
144
158
  docs = Document.sort_by_timestamp([document_cls(doc)(doc.file_path) for doc in raw_docs])
@@ -155,6 +169,11 @@ def epstein_show():
155
169
  if isinstance(doc, Email):
156
170
  console.print(Panel(Text("actual_text: ").append(doc.summary()), expand=False, style=doc._border_style()))
157
171
  console.print(escape(doc._actual_text()), '\n')
172
+ metadata = doc.metadata()
173
+ metadata['is_fwded_article'] = doc.is_fwded_article()
174
+ metadata['is_word_count_worthy'] = doc.is_word_count_worthy()
175
+ metadata['_is_first_for_user'] = doc._is_first_for_user
176
+ print_json(f"{doc.file_id} Metadata", metadata)
158
177
 
159
178
 
160
179
  def epstein_word_count() -> None:
@@ -197,6 +197,9 @@ class Document:
197
197
  def is_duplicate(self) -> bool:
198
198
  return bool(self.duplicate_of_id())
199
199
 
200
+ def is_interesting(self) -> bool:
201
+ return bool(self.config and self.config.is_interesting)
202
+
200
203
  def is_local_extract_file(self) -> bool:
201
204
  """True if extracted from other file (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
202
205
  return is_local_extract_file(self.filename)
@@ -267,7 +270,7 @@ class Document:
267
270
  txt.append(' (', style=SYMBOL_STYLE)
268
271
  txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
269
272
 
270
- txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(0), style='aquamarine1')))
273
+ txt.append(' [').append(key_value_txt('size', Text(str(self.length()), style='aquamarine1')))
271
274
  txt.append(", ").append(key_value_txt('lines', self.num_lines()))
272
275
 
273
276
  if self.config and self.config.duplicate_of_id:
@@ -439,6 +442,10 @@ class Document:
439
442
  def sort_by_id(docs: Sequence['DocumentType']) -> list['DocumentType']:
440
443
  return sorted(docs, key=lambda d: d.file_id)
441
444
 
445
+ @staticmethod
446
+ def sort_by_length(docs: Sequence['DocumentType']) -> list['DocumentType']:
447
+ return sorted(docs, key=lambda d: d.file_size(), reverse=True)
448
+
442
449
  @staticmethod
443
450
  def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
444
451
  return sorted(docs, key=lambda doc: doc.timestamp_sort_key())