epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. epstein_files/__init__.py +55 -23
  2. epstein_files/documents/communication.py +9 -5
  3. epstein_files/documents/document.py +231 -135
  4. epstein_files/documents/doj_file.py +242 -0
  5. epstein_files/documents/doj_files/full_text.py +166 -0
  6. epstein_files/documents/email.py +289 -232
  7. epstein_files/documents/emails/email_header.py +35 -16
  8. epstein_files/documents/emails/emailers.py +223 -0
  9. epstein_files/documents/imessage/text_message.py +2 -3
  10. epstein_files/documents/json_file.py +18 -14
  11. epstein_files/documents/messenger_log.py +23 -39
  12. epstein_files/documents/other_file.py +54 -48
  13. epstein_files/epstein_files.py +65 -29
  14. epstein_files/person.py +151 -94
  15. epstein_files/util/constant/names.py +37 -10
  16. epstein_files/util/constant/output_files.py +2 -0
  17. epstein_files/util/constant/strings.py +14 -7
  18. epstein_files/util/constant/urls.py +17 -0
  19. epstein_files/util/constants.py +556 -391
  20. epstein_files/util/data.py +2 -0
  21. epstein_files/util/doc_cfg.py +44 -33
  22. epstein_files/util/env.py +34 -19
  23. epstein_files/util/file_helper.py +30 -6
  24. epstein_files/util/helpers/debugging_helper.py +13 -0
  25. epstein_files/util/helpers/env_helpers.py +21 -0
  26. epstein_files/util/highlighted_group.py +121 -37
  27. epstein_files/util/layout/left_bar_panel.py +26 -0
  28. epstein_files/util/logging.py +28 -13
  29. epstein_files/util/output.py +49 -40
  30. epstein_files/util/rich.py +30 -3
  31. epstein_files/util/word_count.py +7 -7
  32. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/METADATA +16 -3
  33. epstein_files-1.5.0.dist-info/RECORD +40 -0
  34. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +1 -1
  35. epstein_files-1.2.5.dist-info/RECORD +0 -34
  36. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
  37. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
@@ -30,7 +30,6 @@ MAX_DAYS_SPANNED_TO_BE_VALID = 10
30
30
  MAX_EXTRACTED_TIMESTAMPS = 100
31
31
  MIN_TIMESTAMP = datetime(2000, 1, 1)
32
32
  MID_TIMESTAMP = datetime(2007, 1, 1)
33
- MAX_TIMESTAMP = datetime(2022, 12, 31)
34
33
  PREVIEW_CHARS = int(580 * (1 if args.all_other_files else 1.5))
35
34
  LOG_INDENT = '\n '
36
35
  TIMESTAMP_LOG_INDENT = f'{LOG_INDENT} '
@@ -93,40 +92,28 @@ class OtherFile(Document):
93
92
  """
94
93
  was_timestamp_extracted: bool = False
95
94
  include_description_in_summary_panel: ClassVar[bool] = True # Class var for logging output
95
+ max_timestamp: ClassVar[datetime] = datetime(2022, 12, 31) # Overloaded in DojFile
96
96
 
97
- def __post_init__(self):
98
- super().__post_init__()
99
-
100
- if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
101
- self.log(f"Creating synthetic config for VI Daily News article...")
102
- self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
97
+ @property
98
+ def config_description(self) -> str | None:
99
+ """Overloads superclass property."""
100
+ if self.config and self.config.description:
101
+ return self.config.complete_description
103
102
 
103
+ @property
104
104
  def category(self) -> str | None:
105
105
  return self.config and self.config.category
106
106
 
107
+ @property
107
108
  def category_txt(self) -> Text | None:
108
- return styled_category(self.category())
109
-
110
- def config_description(self) -> str | None:
111
- """Overloads superclass method."""
112
- if self.config is not None:
113
- return self.config.complete_description()
114
-
115
- def highlighted_preview_text(self) -> Text:
116
- try:
117
- return highlighter(escape(self.preview_text()))
118
- except Exception as e:
119
- logger.error(f"Failed to apply markup in string '{escape_single_quotes(self.preview_text())}'\n"
120
- f"Original string: '{escape_single_quotes(self.preview_text())}'\n"
121
- f"File: '{self.filename}'\n")
122
-
123
- return Text(escape(self.preview_text()))
109
+ return styled_category(self.category)
124
110
 
125
- def is_interesting(self):
126
- """False for lame prefixes, duplicates, and other boring files."""
127
- info_sentences = self.info()
111
+ @property
112
+ def is_interesting(self) -> bool:
113
+ """Overloaded. False for lame prefixes, duplicates, and other boring files."""
114
+ info_sentences = self.info
128
115
 
129
- if self.is_duplicate():
116
+ if self.is_duplicate:
130
117
  return False
131
118
  elif len(info_sentences) == 0:
132
119
  return True
@@ -135,9 +122,9 @@ class OtherFile(Document):
135
122
  return self.config.is_interesting
136
123
  elif self.config.author in INTERESTING_AUTHORS:
137
124
  return True
138
- elif self.category() == FINANCE and self.author is not None:
125
+ elif self.category == FINANCE and self.author is not None:
139
126
  return False
140
- elif self.category() in UNINTERESTING_CATEGORIES:
127
+ elif self.category in UNINTERESTING_CATEGORIES:
141
128
  return False
142
129
 
143
130
  for prefix in UNINTERESTING_PREFIXES:
@@ -146,15 +133,33 @@ class OtherFile(Document):
146
133
 
147
134
  return True
148
135
 
136
+ @property
149
137
  def metadata(self) -> Metadata:
150
- metadata = super().metadata()
151
- metadata['is_interesting'] = self.is_interesting()
138
+ metadata = super().metadata
139
+ metadata['is_interesting'] = self.is_interesting
152
140
 
153
141
  if self.was_timestamp_extracted:
154
142
  metadata['was_timestamp_extracted'] = self.was_timestamp_extracted
155
143
 
156
144
  return metadata
157
145
 
146
+ def __post_init__(self):
147
+ super().__post_init__()
148
+
149
+ if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
150
+ self.log(f"Creating synthetic config for VI Daily News article...")
151
+ self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
152
+
153
+ def highlighted_preview_text(self) -> Text:
154
+ try:
155
+ return highlighter(escape(self.preview_text()))
156
+ except Exception as e:
157
+ logger.error(f"Failed to apply markup in string '{escape_single_quotes(self.preview_text())}'\n"
158
+ f"Original string: '{escape_single_quotes(self.preview_text())}'\n"
159
+ f"File: '{self.filename}'\n")
160
+
161
+ return Text(escape(self.preview_text()))
162
+
158
163
  def preview_text(self) -> str:
159
164
  return WHITESPACE_REGEX.sub(' ', self.text)[0:PREVIEW_CHARS]
160
165
 
@@ -164,9 +169,7 @@ class OtherFile(Document):
164
169
 
165
170
  def _extract_timestamp(self) -> datetime | None:
166
171
  """Return configured timestamp or value extracted by scanning text with datefinder."""
167
- if self.config and self.config.timestamp:
168
- return self.config.timestamp
169
- elif self.config and any([s in (self.config_description() or '') for s in SKIP_TIMESTAMP_EXTRACT]):
172
+ if self.config and any([s in (self.config_description or '') for s in SKIP_TIMESTAMP_EXTRACT]):
170
173
  return None
171
174
 
172
175
  timestamps: list[datetime] = []
@@ -175,10 +178,11 @@ class OtherFile(Document):
175
178
  warnings.filterwarnings("ignore", module="dateutil")
176
179
 
177
180
  try:
178
- for timestamp in datefinder.find_dates(self.text, strict=True):
181
+ # TODO: datefinder.find_dates() cannot find 08/29/2019 style e.g. in EFTA00005783 :(
182
+ for timestamp in datefinder.find_dates(self.text, strict=False):
179
183
  timestamp = remove_timezone(timestamp)
180
184
 
181
- if MIN_TIMESTAMP < timestamp < MAX_TIMESTAMP:
185
+ if MIN_TIMESTAMP < timestamp < self.max_timestamp:
182
186
  timestamps.append(timestamp)
183
187
 
184
188
  if len(timestamps) >= MAX_EXTRACTED_TIMESTAMPS:
@@ -187,7 +191,7 @@ class OtherFile(Document):
187
191
  self.warn(f"Error while iterating through datefinder.find_dates(): {e}")
188
192
 
189
193
  if len(timestamps) == 0:
190
- if not (self.is_duplicate() or VAST_HOUSE in self.text):
194
+ if not (self.is_duplicate or VAST_HOUSE in self.text):
191
195
  self.log_top_lines(15, msg=f"No timestamps found")
192
196
 
193
197
  return None
@@ -210,9 +214,10 @@ class OtherFile(Document):
210
214
  self.log_top_lines(15, msg=timestamps_log_msg, level=logging.DEBUG)
211
215
 
212
216
  @classmethod
213
- def files_preview_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
217
+ def files_preview_table(cls, files: Sequence['OtherFile'], title_pfx: str = '', title: str = '') -> Table:
214
218
  """Build a table of OtherFile documents."""
215
- table = build_table(f'{title_pfx}Other Files Details in Chronological Order', show_lines=True)
219
+ title = title or f'{title_pfx}Other Files Details in Chronological Order'
220
+ table = build_table(title, show_lines=True, title_justify='left' if title else 'center')
216
221
  table.add_column('File', justify='center', width=FILENAME_LENGTH)
217
222
  table.add_column('Date', justify='center')
218
223
  table.add_column('Size', justify='right', style='dim')
@@ -221,21 +226,21 @@ class OtherFile(Document):
221
226
 
222
227
  for file in files:
223
228
  link_and_info = [file.external_links_txt()]
224
- date_str = file.date_str()
229
+ date_str = file.date_str
225
230
 
226
- if file.is_duplicate():
227
- preview_text = file.duplicate_file_txt()
231
+ if file.is_duplicate:
232
+ preview_text = file.duplicate_file_txt
228
233
  row_style = ' dim'
229
234
  else:
230
- link_and_info += file.info()
235
+ link_and_info += file.info
231
236
  preview_text = file.highlighted_preview_text()
232
237
  row_style = ''
233
238
 
234
239
  table.add_row(
235
240
  Group(*link_and_info),
236
241
  Text(date_str, style=TIMESTAMP_STYLE) if date_str else QUESTION_MARKS_TXT,
237
- file.file_size_str(),
238
- file.category_txt(),
242
+ file.file_size_str,
243
+ file.category_txt,
239
244
  preview_text,
240
245
  style=row_style
241
246
  )
@@ -244,12 +249,13 @@ class OtherFile(Document):
244
249
 
245
250
  @classmethod
246
251
  def summary_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
247
- categories = uniquify([f.category() for f in files])
248
- categories = sorted(categories, key=lambda c: -len([f for f in files if f.category() == c]))
252
+ """Table showing file count by category."""
253
+ categories = uniquify([f.category for f in files])
254
+ categories = sorted(categories, key=lambda c: -len([f for f in files if f.category == c]))
249
255
  table = cls.file_info_table(f'{title_pfx}Other Files Summary', 'Category')
250
256
 
251
257
  for category in categories:
252
- category_files = [f for f in files if f.category() == category]
258
+ category_files = [f for f in files if f.category == category]
253
259
  table.add_row(styled_category(category), *cls.files_info_row(category_files))
254
260
 
255
261
  table.columns = table.columns[:-2] + [table.columns[-1]] # Removee unknown author col
@@ -12,7 +12,8 @@ from typing import Sequence, Type, cast
12
12
  from rich.table import Table
13
13
 
14
14
  from epstein_files.documents.document import Document
15
- from epstein_files.documents.email import DETECT_EMAIL_REGEX, Email
15
+ from epstein_files.documents.doj_file import DojFile
16
+ from epstein_files.documents.email import Email
16
17
  from epstein_files.documents.json_file import JsonFile
17
18
  from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
18
19
  from epstein_files.documents.other_file import OtherFile
@@ -21,7 +22,7 @@ from epstein_files.util.constant.strings import *
21
22
  from epstein_files.util.constants import *
22
23
  from epstein_files.util.data import flatten, json_safe, listify, uniquify
23
24
  from epstein_files.util.doc_cfg import EmailCfg, Metadata
24
- from epstein_files.util.env import DOCS_DIR, args, logger
25
+ from epstein_files.util.env import DOCS_DIR, DOJ_PDFS_20260130_DIR, args, logger
25
26
  from epstein_files.util.file_helper import file_size_str
26
27
  from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames
27
28
  from epstein_files.util.search_result import SearchResult
@@ -49,14 +50,28 @@ class EpsteinFiles:
49
50
  imessage_logs: list[MessengerLog] = field(default_factory=list)
50
51
  json_files: list[JsonFile] = field(default_factory=list)
51
52
  other_files: list[OtherFile] = field(default_factory=list)
53
+ doj_files: list[DojFile] = field(default_factory=list)
52
54
  timer: Timer = field(default_factory=lambda: Timer())
53
55
  uninteresting_ccs: list[Name] = field(default_factory=list)
54
56
 
57
+ @property
58
+ def all_documents(self) -> Sequence[Document]:
59
+ return self.imessage_logs + self.emails + self.other_files + self.doj_files
60
+
61
+ @property
62
+ def all_doj_files(self) -> Sequence[DojFile | Email]:
63
+ """All files with the filename EFTAXXXXXX."""
64
+ return [doc for doc in self.all_documents if doc.is_doj_file]
65
+
55
66
  def __post_init__(self):
56
67
  """Iterate through files and build appropriate objects."""
57
68
  self.all_files = sorted([f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')])
58
- documents = []
59
- file_type_count = defaultdict(int) # Hack used by --skip-other-files option
69
+
70
+ if DOJ_PDFS_20260130_DIR:
71
+ self.all_files += sorted([f for f in DOJ_PDFS_20260130_DIR.glob('**/*.txt')])
72
+
73
+ docs = []
74
+ file_type_count = defaultdict(int) # Hack used by --skip-other-files option to get a few files parsed before skipping the rest
60
75
 
61
76
  # Read through and classify all the files
62
77
  for file_arg in self.all_files:
@@ -64,26 +79,28 @@ class EpsteinFiles:
64
79
  document = Document(file_arg)
65
80
  cls = document_cls(document)
66
81
 
67
- if document.length() == 0:
82
+ if document.length == 0:
68
83
  logger.warning(f"Skipping empty file: {document}]")
69
84
  continue
70
85
  elif args.skip_other_files and cls == OtherFile and file_type_count[cls.__name__] > 1:
71
86
  document.log(f"Skipping OtherFile...")
72
87
  continue
73
88
 
74
- documents.append(cls(file_arg, lines=document.lines, text=document.text))
75
- logger.info(str(documents[-1]))
89
+ docs.append(cls(file_arg, lines=document.lines, text=document.text).printable_document())
90
+ logger.info(str(docs[-1]))
76
91
  file_type_count[cls.__name__] += 1
77
92
 
78
93
  if doc_timer.seconds_since_start() > SLOW_FILE_SECONDS:
79
- doc_timer.print_at_checkpoint(f"Slow file: {documents[-1]} processed")
94
+ doc_timer.print_at_checkpoint(f"Slow file: {docs[-1]} processed")
80
95
 
81
- self.emails = Document.sort_by_timestamp([d for d in documents if isinstance(d, Email)])
82
- self.imessage_logs = Document.sort_by_timestamp([d for d in documents if isinstance(d, MessengerLog)])
83
- self.other_files = Document.sort_by_timestamp([d for d in documents if isinstance(d, (JsonFile, OtherFile))])
84
- self.json_files = [doc for doc in self.other_files if isinstance(doc, JsonFile)]
96
+ self.doj_files = Document.sort_by_timestamp([d for d in docs if isinstance(d, DojFile)])
97
+ self.emails = Document.sort_by_timestamp([d for d in docs if isinstance(d, Email)])
98
+ self.imessage_logs = Document.sort_by_timestamp([d for d in docs if isinstance(d, MessengerLog)])
99
+ self.json_files = Document.sort_by_timestamp([d for d in docs if isinstance(d, JsonFile)])
100
+ self.other_files = Document.sort_by_timestamp([d for d in docs if isinstance(d, OtherFile) and not isinstance(d, DojFile)])
85
101
  self._set_uninteresting_ccs()
86
102
  self._copy_duplicate_email_properties()
103
+ self._find_email_attachments_and_set_is_first_for_user()
87
104
 
88
105
  @classmethod
89
106
  def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
@@ -110,19 +127,19 @@ class EpsteinFiles:
110
127
  timer.print_at_checkpoint(f'Processed {len(epstein_files.all_files):,} documents')
111
128
  return epstein_files
112
129
 
113
- def all_documents(self) -> Sequence[Document]:
114
- return self.imessage_logs + self.emails + self.other_files
115
-
116
130
  def docs_matching(self, pattern: re.Pattern | str, names: list[Name] | None = None) -> list[SearchResult]:
117
131
  """Find documents whose text matches a pattern (file_type and names args limit the documents searched)."""
118
132
  results: list[SearchResult] = []
119
133
 
120
- for doc in self.all_documents():
134
+ for doc in self.all_documents:
121
135
  if names and doc.author not in names:
122
136
  continue
123
137
 
124
138
  lines = doc.matching_lines(pattern)
125
139
 
140
+ if args.min_line_length:
141
+ lines = [line for line in lines if len(line.line) > args.min_line_length]
142
+
126
143
  if len(lines) > 0:
127
144
  results.append(SearchResult(doc, lines))
128
145
 
@@ -136,15 +153,15 @@ class EpsteinFiles:
136
153
 
137
154
  def email_author_counts(self) -> dict[Name, int]:
138
155
  return {
139
- person.name: len(person.unique_emails_by())
140
- for person in self.emailers() if len(person.unique_emails_by()) > 0
156
+ person.name: len(person.unique_emails_by)
157
+ for person in self.emailers() if len(person.unique_emails_by) > 0
141
158
  }
142
159
 
143
160
  def email_authors_to_device_signatures(self) -> dict[str, set[str]]:
144
161
  signatures = defaultdict(set)
145
162
 
146
163
  for email in [e for e in self.non_duplicate_emails() if e.sent_from_device]:
147
- signatures[email.author_or_unknown()].add(email.sent_from_device)
164
+ signatures[email.author_or_unknown].add(email.sent_from_device)
148
165
 
149
166
  return signatures
150
167
 
@@ -152,14 +169,14 @@ class EpsteinFiles:
152
169
  signatures = defaultdict(set)
153
170
 
154
171
  for email in [e for e in self.non_duplicate_emails() if e.sent_from_device]:
155
- signatures[email.sent_from_device].add(email.author_or_unknown())
172
+ signatures[email.sent_from_device].add(email.author_or_unknown)
156
173
 
157
174
  return signatures
158
175
 
159
176
  def email_recipient_counts(self) -> dict[Name, int]:
160
177
  return {
161
- person.name: len(person.unique_emails_to())
162
- for person in self.emailers() if len(person.unique_emails_to()) > 0
178
+ person.name: len(person.unique_emails_to)
179
+ for person in self.emailers() if len(person.unique_emails_to) > 0
163
180
  }
164
181
 
165
182
  def email_signature_substitution_counts(self) -> dict[str, int]:
@@ -208,7 +225,7 @@ class EpsteinFiles:
208
225
 
209
226
  def for_ids(self, file_ids: str | list[str]) -> list[Document]:
210
227
  file_ids = listify(file_ids)
211
- docs = [doc for doc in self.all_documents() if doc.file_id in file_ids]
228
+ docs = [doc for doc in (list(self.all_documents) + self.doj_files) if doc.file_id in file_ids]
212
229
 
213
230
  if len(docs) != len(file_ids):
214
231
  logger.warning(f"{len(file_ids)} file IDs provided but only {len(docs)} Epstein files found!")
@@ -251,7 +268,7 @@ class EpsteinFiles:
251
268
  name=name,
252
269
  emails=self.emails_for(name),
253
270
  imessage_logs=self.imessage_logs_for(name),
254
- is_uninteresting_cc=name in self.uninteresting_emailers(),
271
+ is_uninteresting=name in self.uninteresting_emailers(),
255
272
  other_files=[f for f in self.other_files if name and name == f.author]
256
273
  )
257
274
  for name in names
@@ -276,13 +293,30 @@ class EpsteinFiles:
276
293
 
277
294
  return self._uninteresting_emailers
278
295
 
296
+ def _find_email_attachments_and_set_is_first_for_user(self) -> None:
297
+ for other_file in self.other_files:
298
+ if other_file.config and other_file.config.attached_to_email_id:
299
+ email = self.email_for_id(other_file.config.attached_to_email_id)
300
+ email.attached_docs.append(other_file)
301
+
302
+ if other_file.timestamp \
303
+ and other_file.timestamp != email.timestamp \
304
+ and not other_file.config_timestamp:
305
+ other_file.warn(f"Overwriting '{other_file.timestamp}' with {email}'s timestamp {email.timestamp}")
306
+
307
+ other_file.timestamp = email.timestamp
308
+
309
+ for emailer in self.emailers():
310
+ first_email = emailer.emails[0]
311
+ first_email._is_first_for_user = True
312
+
279
313
  def _copy_duplicate_email_properties(self) -> None:
280
314
  """Ensure dupe emails have the properties of the emails they duplicate to capture any repairs, config etc."""
281
315
  for email in self.emails:
282
- if not email.is_duplicate():
316
+ if not email.is_duplicate:
283
317
  continue
284
318
 
285
- original = self.email_for_id(email.duplicate_of_id())
319
+ original = self.email_for_id(email.duplicate_of_id)
286
320
 
287
321
  for field_name in DUPLICATE_PROPS_TO_COPY:
288
322
  original_prop = getattr(original, field_name)
@@ -321,11 +355,13 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
321
355
  def document_cls(doc: Document) -> Type[Document]:
322
356
  search_area = doc.text[0:5000] # Limit search area to avoid pointless scans of huge files
323
357
 
324
- if doc.length() == 0:
358
+ if doc.length == 0:
325
359
  return Document
360
+ elif doc.is_doj_file:
361
+ return DojFile
326
362
  if doc.text[0] == '{':
327
363
  return JsonFile
328
- elif isinstance(doc.config, EmailCfg) or (DETECT_EMAIL_REGEX.match(search_area) and doc.config is None):
364
+ elif Document.is_email(doc): # TODO: right now we setup the DojFile which makes an Email obj only later at print time
329
365
  return Email
330
366
  elif MSG_REGEX.search(search_area):
331
367
  return MessengerLog
@@ -334,4 +370,4 @@ def document_cls(doc: Document) -> Type[Document]:
334
370
 
335
371
 
336
372
  def _sorted_metadata(docs: Sequence[Document]) -> list[Metadata]:
337
- return [json_safe(d.metadata()) for d in Document.sort_by_id(docs)]
373
+ return [json_safe(d.metadata) for d in Document.sort_by_id(docs)]