epstein-files 1.0.10__py3-none-any.whl → 1.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,10 @@
 import re
 import logging
 import warnings
+from collections import defaultdict
 from dataclasses import asdict, dataclass
 from datetime import datetime
+from typing import ClassVar, Sequence
 
 import datefinder
 import dateutil
@@ -16,11 +18,11 @@ from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, WHITESPACE_R
 from epstein_files.util.constant.strings import *
 from epstein_files.util.constants import *
 from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg, Metadata
-from epstein_files.util.data import escape_single_quotes, remove_timezone, uniquify
-from epstein_files.util.file_helper import FILENAME_LENGTH
+from epstein_files.util.data import escape_single_quotes, remove_timezone, sort_dict, uniquify
+from epstein_files.util.file_helper import FILENAME_LENGTH, file_size_to_str
 from epstein_files.util.env import args
-from epstein_files.util.highlighted_group import get_style_for_category
-from epstein_files.util.rich import QUESTION_MARK_TXT, build_table, highlighter
+from epstein_files.util.highlighted_group import styled_category
+from epstein_files.util.rich import QUESTION_MARK_TXT, add_cols_to_table, build_table, highlighter
 from epstein_files.util.logging import logger
 
 MAX_DAYS_SPANNED_TO_BE_VALID = 10
@@ -38,14 +40,11 @@ UNINTERESTING_CATEGORES = [
     ARTS,
     BOOK,
     JUNK,
+    SKYPE_LOG,
     SPEECH,
 ]
 
-UNINTERESTING_IDS = [
-    '031794',
-]
-
-# OtherFiles whose description/hints match these prefixes are not displayed unless --all-other-files is used
+# OtherFiles whose descriptions/info match these prefixes are not displayed unless --all-other-files is used
 UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
     'article about',
     ARTICLE_DRAFT,
@@ -60,7 +59,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
     CVRA,
     DAILY_MAIL,
     DAILY_TELEGRAPH,
-    DAVID_SCHOEN_CVRA_LEXIS_SEARCH[0:-12], # Because date at end :(
+    CVRA_LEXIS_SEARCH[0:-12], # Because date at end :(
     DERSH_GIUFFRE_TWEET,
     'Financial Times',
     'Forbes',
@@ -78,8 +77,10 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
     LA_TIMES,
     'Litigation Daily',
     LAWRENCE_KRAUSS,
+    LAWRENCE_KRAUSS_ASU_ORIGINS,
     'MarketWatch',
     MARTIN_NOWAK,
+    'Morning News',
     NOBEL_CHARITABLE_TRUST,
     'Nautilus',
     'New Yorker',
@@ -122,24 +123,25 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
 class OtherFile(Document):
     """File that is not an email, an iMessage log, or JSON data."""
 
+    include_description_in_summary_panel: ClassVar[bool] = True
+
     def __post_init__(self):
         super().__post_init__()
 
         if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
-            self.log(f"Creating synthetic config for VI Daily News article...", logging.INFO)
+            self.log(f"Creating synthetic config for VI Daily News article...")
             self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
 
     def category(self) -> str | None:
         return self.config and self.config.category
 
-    def configured_description(self) -> str | None:
+    def category_txt(self) -> Text | None:
+        return styled_category(self.category() or UNKNOWN)
+
+    def config_description(self) -> str | None:
         """Overloads superclass method."""
         if self.config is not None:
-            return self.config.info_str()
-
-    def description_panel(self, include_hints=True) -> Panel:
-        """Panelized description() with info_txt(), used in search results."""
-        return super().description_panel(include_hints=include_hints)
+            return self.config.complete_description()
 
     def highlighted_preview_text(self) -> Text:
         try:
@@ -153,13 +155,11 @@ class OtherFile(Document):
 
     def is_interesting(self):
         """False for lame prefixes, duplicates, and other boring files."""
-        hints = self.hints()
+        info_sentences = self.info()
 
-        if self.is_duplicate:
+        if self.is_duplicate():
             return False
-        elif self.file_id in UNINTERESTING_IDS:
-            return False
-        elif len(hints) == 0:
+        elif len(info_sentences) == 0:
             return True
         elif self.config:
             if self.config.is_interesting:
@@ -170,7 +170,7 @@ class OtherFile(Document):
                 return False
 
         for prefix in UNINTERESTING_PREFIXES:
-            if hints[0].plain.startswith(prefix):
+            if info_sentences[0].plain.startswith(prefix):
                 return False
 
         return True
@@ -195,7 +195,6 @@ class OtherFile(Document):
         timestamps: list[datetime] = []
 
         with warnings.catch_warnings():
-            warnings.filterwarnings("ignore", module="datefinder")
             warnings.filterwarnings("ignore", module="dateutil")
 
             try:
@@ -208,11 +207,11 @@ class OtherFile(Document):
                     if len(timestamps) >= MAX_EXTRACTED_TIMESTAMPS:
                         break
             except ValueError as e:
-                logger.warning(f"Error while iterating through datefinder.find_dates(): {e}")
+                self.log(f"Error while iterating through datefinder.find_dates(): {e}", logging.WARNING)
 
         if len(timestamps) == 0:
-            if not self.is_duplicate and VAST_HOUSE not in self.text:
-                self.log_top_lines(15, msg=f"No timestamps found", level=logging.INFO)
+            if not (self.is_duplicate() or VAST_HOUSE in self.text):
+                self.log_top_lines(15, msg=f"No timestamps found")
 
             return None
         elif len(timestamps) == 1:
@@ -231,7 +230,7 @@ class OtherFile(Document):
         self.log_top_lines(15, msg=timestamps_log_msg, level=logging.DEBUG)
 
     @staticmethod
-    def build_table(docs: list['OtherFile']) -> Table:
+    def build_table(files: Sequence['OtherFile']) -> Table:
         """Build a table of OtherFile documents."""
         table = build_table(None, show_lines=True)
         table.add_column('File', justify='center', width=FILENAME_LENGTH)
@@ -240,31 +239,55 @@ class OtherFile(Document):
         table.add_column('Type', justify='center')
         table.add_column(FIRST_FEW_LINES, justify='left', style='pale_turquoise4')
 
-        for doc in docs:
-            link_and_info = [doc.raw_document_link_txt()]
-            category = doc.category()
-            date_str = doc.date_str()
+        for file in files:
+            link_and_info = [file.raw_document_link_txt()]
+            date_str = file.date_str()
 
-            if doc.is_duplicate:
-                preview_text = doc.duplicate_file_txt()
+            if file.is_duplicate():
+                preview_text = file.duplicate_file_txt()
                 row_style = ' dim'
             else:
-                link_and_info += doc.hints()
-                preview_text = doc.highlighted_preview_text()
+                link_and_info += file.info()
+                preview_text = file.highlighted_preview_text()
                 row_style = ''
 
-            if category:
-                category_txt = Text(category, get_style_for_category(category) or 'wheat4')
-            else:
-                category_txt = Text('')
-
             table.add_row(
                 Group(*link_and_info),
                 Text(date_str, style=TIMESTAMP_DIM) if date_str else QUESTION_MARK_TXT,
-                doc.file_size_str(),
-                category_txt,
+                file.file_size_str(),
+                file.category_txt(),
                 preview_text,
                 style=row_style
             )
 
         return table
+
+    @staticmethod
+    def count_by_category_table(files: Sequence['OtherFile']) -> Table:
+        counts = defaultdict(int)
+        category_bytes = defaultdict(int)
+
+        for file in files:
+            if file.category() is None:
+                logger.warning(f"file {file.file_id} has no category")
+
+            counts[file.category()] += 1
+            category_bytes[file.category()] += file.length
+
+        table = build_table('Other Files Summary')
+        add_cols_to_table(table, ['Category', 'Count', 'Has Author', 'No Author', 'Size'])
+        table.columns[-1].style = 'dim'
+
+        for (category, count) in sort_dict(counts):
+            category_files = [f for f in files if f.category() == category]
+            known_author_count = Document.known_author_count(category_files)
+
+            table.add_row(
+                styled_category(category or UNKNOWN),
+                str(count),
+                str(known_author_count),
+                str(count - known_author_count),
+                file_size_to_str(category_bytes[category]),
+            )
+
+        return table
@@ -23,12 +23,12 @@ from epstein_files.util.constant.strings import *
 from epstein_files.util.constant.urls import (EPSTEIN_MEDIA, EPSTEIN_WEB, JMAIL, epstein_media_person_url,
     epsteinify_name_url, epstein_web_person_url, search_jmail_url, search_twitter_url)
 from epstein_files.util.constants import *
-from epstein_files.util.data import dict_sets_to_lists, json_safe, sort_dict
+from epstein_files.util.data import dict_sets_to_lists, json_safe, listify, sort_dict
 from epstein_files.util.doc_cfg import EmailCfg, Metadata
-from epstein_files.util.env import args, logger
-from epstein_files.util.file_helper import DOCS_DIR, file_size_str
+from epstein_files.util.env import DOCS_DIR, args, logger
+from epstein_files.util.file_helper import file_size_str
 from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
-from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, TABLE_BORDER_STYLE, add_cols_to_table,
+from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, add_cols_to_table,
     build_table, console, highlighter, link_text_obj, link_markup, print_author_header, print_centered,
     print_other_site_link, print_panel, print_section_header, vertically_pad)
 from epstein_files.util.search_result import SearchResult
@@ -66,7 +66,7 @@ class EpsteinFiles:
 
     def __post_init__(self):
         """Iterate through files and build appropriate objects."""
-        self.all_files = [f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')]
+        self.all_files = sorted([f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')])
         documents = []
         file_type_count = defaultdict(int)
 
@@ -74,12 +74,15 @@ class EpsteinFiles:
         for file_arg in self.all_files:
             doc_timer = Timer(decimals=4)
             document = Document(file_arg)
+            cls = document_cls(document)
 
             if document.length == 0:
                 logger.warning(f"Skipping empty file: {document}]")
                 continue
+            elif args.skip_other_files and cls == OtherFile and file_type_count[cls.__name__] > 1:
+                logger.warning(f"Skipping {document.filename}...")
+                continue
 
-            cls = document_cls(document)
             documents.append(cls(file_arg, text=document.text))
             logger.info(str(documents[-1]))
             file_type_count[cls.__name__] += 1
@@ -186,7 +189,8 @@ class EpsteinFiles:
         else:
             return [e for e in self.emails if author in e.recipients]
 
-    def get_documents_by_id(self, file_ids: list[str]) -> list[Document]:
+    def get_documents_by_id(self, file_ids: str | list[str]) -> list[Document]:
+        file_ids = listify(file_ids)
         docs = [doc for doc in self.all_documents() if doc.file_id in file_ids]
 
         if len(docs) != len(file_ids):
@@ -223,7 +227,7 @@ class EpsteinFiles:
                 f"{len(docs):,}",
                 f"{known:,}" if known is not None else NA_TXT,
                 f"{len(docs) - known:,}" if known is not None else NA_TXT,
-                f"{len([d for d in docs if d.is_duplicate])}",
+                f"{len([d for d in docs if d.is_duplicate()])}",
             )
 
         add_row('iMessage Logs', self.imessage_logs)
@@ -237,7 +241,7 @@ class EpsteinFiles:
         """Print complete emails to or from a particular 'author'. Returns the Emails that were printed."""
         conversation_length = self.email_conversation_length_in_days(_author)
         emails = self.emails_for(_author)
-        unique_emails = [email for email in emails if not email.is_duplicate]
+        unique_emails = [email for email in emails if not email.is_duplicate()]
         author = _author or UNKNOWN
 
         print_author_header(
@@ -250,7 +254,7 @@ class EpsteinFiles:
         last_printed_email_was_duplicate = False
 
         for email in emails:
-            if email.is_duplicate:
+            if email.is_duplicate():
                 console.print(Padding(email.duplicate_file_txt().append('...'), (0, 0, 0, 4)))
                 last_printed_email_was_duplicate = True
             else:
@@ -263,7 +267,7 @@ class EpsteinFiles:
         return emails
 
     def print_emails_table_for(self, author: str | None) -> None:
-        emails = [email for email in self.emails_for(author) if not email.is_duplicate] # Remove dupes
+        emails = [email for email in self.emails_for(author) if not email.is_duplicate()] # Remove dupes
         console.print(Align.center(Email.build_table(emails, author)), '\n')
 
     def print_email_device_info(self) -> None:
@@ -272,7 +276,7 @@ class EpsteinFiles:
         console.print(_build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))
 
     def print_emailer_counts_table(self) -> None:
-        footer = f"Identified authors of {self.attributed_email_count():,} emails out of {len(self.emails):,}."
+        footer = f"Identified authors of {self.attributed_email_count():,} out of {len(self.emails):,} emails ."
         counts_table = build_table("Email Counts", caption=footer)
         add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_MEDIA, EPSTEIN_WEB, 'Twitter'])
 
@@ -303,7 +307,7 @@ class EpsteinFiles:
         text_summary_msg = f"\nDeanonymized {Document.known_author_count(self.imessage_logs)} of "
         text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files):,} files."
         console.print(text_summary_msg)
-        imessage_msg_count = sum([len(log.messages()) for log in self.imessage_logs])
+        imessage_msg_count = sum([len(log.messages) for log in self.imessage_logs])
         console.print(f"Found {imessage_msg_count} text messages in {len(self.imessage_logs)} iMessage log files.")
 
     def print_other_files_table(self) -> list[OtherFile]:
@@ -318,17 +322,18 @@ class EpsteinFiles:
         console.line(2)
 
         console.print(OtherFile.build_table(interesting_files))
+        console.print(Padding(OtherFile.count_by_category_table(interesting_files), (2, 0, 2, 2)))
         skipped_file_count = len(self.other_files) - len(interesting_files)
 
         if skipped_file_count > 0:
-            logger.warning(f"Skipped {skipped_file_count} uninteresting files...")
+            logger.warning(f"Skipped {skipped_file_count} uninteresting other files...")
 
         return interesting_files
 
     def _tally_email_data(self) -> None:
         """Tally up summary info about Email objects."""
         for email in self.emails:
-            if email.is_duplicate:
+            if email.is_duplicate():
                 continue
 
             self.email_author_counts[email.author] += 1
@@ -360,6 +365,8 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
 def document_cls(doc: Document) -> Type[Document]:
     search_area = doc.text[0:5000] # Limit search area to avoid pointless scans of huge files
 
+    if doc.length == 0:
+        return Document
     if doc.text[0] == '{':
         return JsonFile
     elif isinstance(doc.config, EmailCfg) or (DETECT_EMAIL_REGEX.match(search_area) and doc.config is None):
@@ -233,14 +233,14 @@ OTHER_NAMES = NAMES_TO_NOT_HIGHLIGHT + """
     ferguson flachsbart francis franco frank
     gardner gary geoff geoffrey gilbert gloria goldberg gonzalez gould graham greene guarino gwyneth
     hancock harold harrison harry helen hirsch hofstadter horowitz hussein
-    isaac isaacson
+    ian isaac isaacson
     jamie jane janet jason jen jim joe johnson jones josh julie justin
     karl kate kathy kelly kim kruger kyle
     leo leonard lenny leslie lieberman louis lynch lynn
     marcus marianne matt matthew melissa michele michelle moore moscowitz
     nicole nussbaum
     paulson philippe
-    rafael ray richardson rob robin ron rudolph ryan
+    rafael ray richard richardson rob robin ron rubin rudolph ryan
     sara sarah seligman serge sergey silverman sloman smith snowden sorkin steele stevie stewart
     ted theresa thompson tiffany timothy tony
     valeria