epstein-files 1.0.12__py3-none-any.whl → 1.0.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
epstein_files/__init__.py CHANGED
@@ -21,7 +21,7 @@ from epstein_files.util.env import args, specified_names
  from epstein_files.util.file_helper import coerce_file_path, extract_file_id
  from epstein_files.util.logging import logger
  from epstein_files.util.output import (print_emails, print_json_files, print_json_stats,
-                                        print_text_messages, write_json_metadata, write_urls)
+                                        write_json_metadata, write_urls)
  from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
  from epstein_files.util.timer import Timer
  from epstein_files.util.word_count import write_word_counts_html
@@ -49,7 +49,7 @@ def generate_html() -> None:
          exit()

      if args.output_texts:
-         print_text_messages(epstein_files)
+         epstein_files.print_text_messages_section()
          timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)} text message logs')

      if args.output_emails:
@@ -57,8 +57,13 @@ def generate_html() -> None:
          timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")

      if args.output_other:
-         files_printed = epstein_files.print_other_files_table()
-         timer.print_at_checkpoint(f"Printed {len(files_printed)} other files")
+         if args.uninteresting:
+             files = [f for f in epstein_files.other_files if not f.is_interesting()]
+         else:
+             files = [f for f in epstein_files.other_files if args.all_other_files or f.is_interesting()]
+
+         epstein_files.print_other_files_section(files)
+         timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")

      # Save output
      write_html(ALL_EMAILS_PATH if args.all_emails else TEXT_MSGS_HTML_PATH)
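The new branch above boils down to a small selection rule over `other_files`: with `--uninteresting` only the normally-skipped files are shown, otherwise only interesting files unless `--all-other-files` is passed. A minimal standalone sketch of that rule, using a made-up `FakeOtherFile` stand-in rather than the package's real classes:

    from dataclasses import dataclass

    @dataclass
    class FakeOtherFile:
        name: str
        interesting: bool

        def is_interesting(self) -> bool:
            return self.interesting

    def select_other_files(other_files, uninteresting: bool, all_other_files: bool):
        """Mirror of the filtering logic added to generate_html() (illustrative only)."""
        if uninteresting:
            # --uninteresting: show only the files that are normally skipped
            return [f for f in other_files if not f.is_interesting()]
        # default: interesting files only, unless --all-other-files is set
        return [f for f in other_files if all_other_files or f.is_interesting()]

    files = [FakeOtherFile('a', True), FakeOtherFile('b', False)]
    assert [f.name for f in select_other_files(files, uninteresting=True, all_other_files=False)] == ['b']
    assert [f.name for f in select_other_files(files, uninteresting=False, all_other_files=False)] == ['a']
    assert [f.name for f in select_other_files(files, uninteresting=False, all_other_files=True)] == ['a', 'b']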
@@ -90,7 +95,7 @@ def epstein_search():

      if args.whole_file:
          if isinstance(search_result.document, Email):
-             search_result.document.truncation_allowed = False
+             search_result.document._truncation_allowed = False

          console.print(search_result.document)
      else:
@@ -111,7 +116,7 @@ def epstein_show():

      for doc in docs:
          if isinstance(doc, Email):
-             doc.truncation_allowed = False
+             doc._truncation_allowed = False

          console.print('\n', doc, '\n')

@@ -34,9 +34,9 @@ class Communication(Document):
      def is_attribution_uncertain(self) -> bool:
          return bool(self.config and self.config.is_attribution_uncertain)

-     def external_links(self, _style: str = '', include_alt_link: bool = True) -> Text:
+     def external_links_txt(self, _style: str = '', include_alt_links: bool = True) -> Text:
          """Overrides super() method to apply self.author_style."""
-         return super().external_links(self.author_style, include_alt_link=include_alt_link)
+         return super().external_links_txt(self.author_style, include_alt_links=include_alt_links)

      def summary(self) -> Text:
          return self._summary().append(CLOSE_PROPERTIES_CHAR)
@@ -5,7 +5,7 @@ from dataclasses import asdict, dataclass, field
  from datetime import datetime
  from pathlib import Path
  from subprocess import run
- from typing import ClassVar, Sequence, TypeVar
+ from typing import Callable, ClassVar, Sequence, TypeVar

  from rich.console import Console, ConsoleOptions, Group, RenderResult
  from rich.padding import Padding
@@ -16,15 +16,15 @@ from epstein_files.util.constant.names import *
  from epstein_files.util.constant.strings import *
  from epstein_files.util.constant.urls import *
  from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
- from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_time_from_timestamp_str, without_falsey
+ from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time_from_timestamp_str, without_falsey
  from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
  from epstein_files.util.env import DOCS_DIR, args
- from epstein_files.util.file_helper import (file_stem_for_id, extract_file_id, file_size,
-                                             file_size_str, is_local_extract_file)
+ from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, is_local_extract_file
  from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
- from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, key_value_txt, link_text_obj
+ from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize
  from epstein_files.util.search_result import MatchedLine

+ ALT_LINK_STYLE = 'white dim'
  CLOSE_PROPERTIES_CHAR = ']'
  HOUSE_OVERSIGHT = HOUSE_OVERSIGHT_PREFIX.replace('_', ' ').strip()
  INFO_INDENT = 2
@@ -46,7 +46,6 @@ FILENAME_MATCH_STYLES = [
  METADATA_FIELDS = [
      'author',
      'file_id',
-     'num_lines',
      'timestamp'
  ]

@@ -68,7 +67,6 @@ class Document:
          config (DocCfg): Information about this fil
          file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
          filename (str): File's basename
-         length (int): Number of characters in the file after all the cleanup
          lines (str): Number of lines in the file after all the cleanup
          text (str): Contents of the file
          timestamp (datetime | None): When the file was originally created
@@ -80,12 +78,10 @@ class Document:
      config: EmailCfg | DocCfg | TextCfg | None = None
      file_id: str = field(init=False)
      filename: str = field(init=False)
-     length: int = field(init=False)
-     lines: list[str] = field(init=False)
-     num_lines: int = field(init=False)
+     lines: list[str] = field(default_factory=list)
      text: str = ''
      timestamp: datetime | None = None
-     url_slug: str = field(init=False) # e.g. 'HOUSE_OVERSIGHT_123456
+     url_slug: str = ''

      # Class variables
      include_description_in_summary_panel: ClassVar[bool] = False
@@ -94,12 +90,13 @@ class Document:
      def __post_init__(self):
          self.filename = self.file_path.name
          self.file_id = extract_file_id(self.filename)
+         # config and url_slug could have been pre-set in Email
          self.config = self.config or deepcopy(ALL_FILE_CONFIGS.get(self.file_id))
+         self.url_slug = self.url_slug or self.filename.split('.')[0]

-         if 'url_slug' not in vars(self):
-             self.url_slug = self.file_path.stem
+         if not self.text:
+             self._load_file()

-         self._set_computed_fields(text=self.text or self._load_file())
          self._repair()
          self._extract_author()
          self.timestamp = self._extract_timestamp()
@@ -114,47 +111,49 @@ class Document:

      def duplicate_file_txt(self) -> Text:
          """If the file is a dupe make a nice message to explain what file it's a duplicate of."""
-         if not self.config or not self.config.dupe_of_id or self.config.dupe_type is None:
+         if not self.is_duplicate():
              raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")

          txt = Text(f"Not showing ", style=INFO_STYLE).append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
          txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
-         return txt.append(epstein_media_doc_link_txt(self.config.dupe_of_id, style='royal_blue1'))
+         return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))

      def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
-         """Create a Text obj link to this document on epsteinify.com."""
-         return link_text_obj(epsteinify_doc_url(self.url_slug), link_txt or self.file_path.stem, style)
+         return self.external_link(epsteinify_doc_url, style, link_txt)

      def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
-         """Create a Text obj link to this document on epstein.media."""
-         return link_text_obj(epstein_media_doc_url(self.url_slug), link_txt or self.file_path.stem, style)
+         return self.external_link(epstein_media_doc_url, style, link_txt)

      def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
-         """Create a Text obj link to this document on EpsteinWeb."""
-         return link_text_obj(epstein_web_doc_url(self.url_slug), link_txt or self.file_path.stem, style)
+         return self.external_link(epstein_web_doc_url, style, link_txt)

-     def external_links(self, style: str = '', include_alt_link: bool = False) -> Text:
-         """Returns colored links to epstein.media and and epsteinweb in a Text object."""
-         txt = Text('', style='white' if include_alt_link else ARCHIVE_LINK_COLOR)
+     def rollcall_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
+         return self.external_link(rollcall_doc_url, style, link_txt)

-         if args.use_epstein_web:
-             txt.append(self.epstein_web_link(style=style))
+     def external_link(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
+         return link_text_obj(fxn(self.url_slug), link_txt or self.file_path.stem, style)

-             if include_alt_link:
-                 txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
-                 txt.append(' (').append(self.epstein_media_link(style='white dim', link_txt=EPSTEIN_MEDIA)).append(')')
-         else:
-             txt.append(self.epstein_media_link(style=style))
+     def external_links_txt(self, style: str = '', include_alt_links: bool = False) -> Text:
+         """Returns colored links to epstein.media and alternates in a Text object."""
+         links = [self.epstein_media_link(style=style)]

-             if include_alt_link:
-                 txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
-                 txt.append(' (').append(self.epstein_web_link(style='white dim', link_txt=EPSTEIN_WEB)).append(')')
+         if include_alt_links:
+             links.append(self.epsteinify_link(style=ALT_LINK_STYLE, link_txt=EPSTEINIFY))
+             links.append(self.epstein_web_link(style=ALT_LINK_STYLE, link_txt=EPSTEIN_WEB))

-         return txt
+         if self._class_name() == 'Email':
+             links.append(self.rollcall_link(style=ALT_LINK_STYLE, link_txt=ROLLCALL))
+
+         links = [links[0]] + [parenthesize(link) for link in links[1:]]
+         base_txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
+         return base_txt.append(join_texts(links))
+
+     def file_id_debug_info(self) -> str:
+         return ', '.join([f"{prop}={getattr(self, prop)}" for prop in ['file_id', 'filename', 'url_slug']])

      def file_info_panel(self) -> Group:
          """Panel with filename linking to raw file plus any additional info about the file."""
-         panel = Panel(self.external_links(include_alt_link=True), border_style=self._border_style(), expand=False)
+         panel = Panel(self.external_links_txt(include_alt_links=True), border_style=self._border_style(), expand=False)
          padded_info = [Padding(sentence, INFO_PADDING) for sentence in self.info()]
          return Group(*([panel] + padded_info))

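The link-building refactor above replaces four near-identical methods with a single `external_link()` that takes a URL-builder callable. A minimal sketch of the same pattern outside the package, with made-up URL builders standing in for `epstein_media_doc_url` and friends, and plain strings instead of `rich.text.Text`:

    from typing import Callable

    # Made-up URL builders standing in for epstein_media_doc_url / epstein_web_doc_url etc.
    def make_media_url(slug: str) -> str:
        return f"https://example.org/media/{slug}"

    def make_web_url(slug: str) -> str:
        return f"https://example.org/web/{slug}"

    class Doc:
        def __init__(self, url_slug: str):
            self.url_slug = url_slug

        def external_link(self, fxn: Callable[[str], str], label: str | None = None) -> str:
            # One generic helper replaces several copy-pasted per-site link methods
            return f"[{label or self.url_slug}]({fxn(self.url_slug)})"

        def media_link(self) -> str:
            return self.external_link(make_media_url)

        def web_link(self) -> str:
            return self.external_link(make_web_url)

    doc = Doc('HOUSE_OVERSIGHT_123456')
    print(doc.media_link())  # [HOUSE_OVERSIGHT_123456](https://example.org/media/HOUSE_OVERSIGHT_123456)
    print(doc.web_link())    # [HOUSE_OVERSIGHT_123456](https://example.org/web/HOUSE_OVERSIGHT_123456)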
@@ -176,12 +175,15 @@ class Document:
          return None

      def is_duplicate(self) -> bool:
-         return bool(self.config and self.config.dupe_of_id)
+         return bool(self.config and self.config.duplicate_of_id)

      def is_local_extract_file(self) -> bool:
          """True if file created by extracting text from a court doc (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
          return is_local_extract_file(self.filename)

+     def length(self) -> int:
+         return len(self.text)
+
      def log(self, msg: str, level: int = logging.INFO):
          """Log with filename as a prefix."""
          logger.log(level, f"{self.file_path.stem} {msg}")
@@ -202,17 +204,21 @@ class Document:
          metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
          metadata['bytes'] = self.file_size()
          metadata['filename'] = f"{self.url_slug}.txt"
+         metadata['num_lines'] = self.num_lines()
          metadata['type'] = self._class_name()

          if self.is_local_extract_file():
              metadata['extracted_file'] = {
-                 'explanation': 'Manually extracted from one of the court filings.',
+                 'explanation': 'manually extracted from one of the other files',
                  'extracted_from': self.url_slug + '.txt',
                  'url': extracted_file_url(self.filename),
              }

          return metadata

+     def num_lines(self) -> int:
+         return len(self.lines)
+
      def raw_text(self) -> str:
          with open(self.file_path) as f:
              return f.read()
@@ -229,7 +235,7 @@ class Document:

      def sort_key(self) -> tuple[datetime, str, int]:
          if self.is_duplicate():
-             sort_id = self.config.dupe_of_id
+             sort_id = self.config.duplicate_of_id
              dupe_idx = 1
          else:
              sort_id = self.file_id
@@ -243,15 +249,15 @@ class Document:
          txt.append(f" {self.url_slug}", style=FILENAME_STYLE)

          if self.timestamp:
-             timestamp_str = remove_time_from_timestamp_str(self.timestamp)
+             timestamp_str = remove_zero_time_from_timestamp_str(self.timestamp).replace('T', ' ')
              txt.append(' (', style=SYMBOL_STYLE)
              txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)

          txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
-         txt.append(", ").append(key_value_txt('lines', self.num_lines))
+         txt.append(", ").append(key_value_txt('lines', self.num_lines()))

-         if self.config and self.config.dupe_of_id:
-             txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.dupe_of_id, style='magenta')))
+         if self.config and self.config.duplicate_of_id:
+             txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='magenta')))

          return txt

@@ -290,13 +296,19 @@ class Document:
          """Should be implemented in subclasses."""
          pass

-     def _load_file(self) -> str:
+     def _load_file(self) -> None:
          """Remove BOM and HOUSE OVERSIGHT lines, strip whitespace."""
          text = self.raw_text()
          text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text # remove BOM
          text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
-         lines = [l.strip() for l in text.split('\n') if not l.startswith(HOUSE_OVERSIGHT)]
-         return collapse_newlines('\n'.join(lines))
+
+         lines = [
+             line.strip() if self.strip_whitespace else line for line in text.split('\n')
+             if not line.startswith(HOUSE_OVERSIGHT)
+         ]
+
+         self.text = collapse_newlines('\n'.join(lines))
+         self.lines = self.text.split('\n')

      def _repair(self) -> None:
          """Can optionally be overloaded in subclasses to further improve self.text."""
@@ -313,9 +325,7 @@ class Document:
          else:
              raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (neither was)")

-         self.length = len(self.text)
          self.lines = [line.strip() if self.strip_whitespace else line for line in self.text.split('\n')]
-         self.num_lines = len(self.lines)

      def _write_clean_text(self, output_path: Path) -> None:
          """Write self.text to 'output_path'. Used only for diffing files."""
@@ -328,7 +338,7 @@ class Document:
          with open(output_path, 'w') as f:
              f.write(self.text)

-         logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
+         logger.warning(f"Wrote {self.length()} chars of cleaned {self.filename} to {output_path}.")

      def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
          yield self.file_info_panel()
@@ -131,13 +131,12 @@ JUNK_EMAILERS = [
      'editorialstaff@flipboard.com',
      'How To Academy',
      'Jokeland',
-     JP_MORGAN_USGIO,
-     'Saved by Internet Explorer 11',
  ]

  MAILING_LISTS = [
      INTELLIGENCE_SQUARED,
      'middle.east.update@hotmail.com',
+     JP_MORGAN_USGIO,
  ]

  TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
@@ -274,11 +273,9 @@ USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIP
      'Michael Simmons', # Random CC
      'Nancy Portland', # Lawrence Krauss CC
      'Oliver Goodenough', # Robert Trivers CC
-     'Owen Blicksilver', # Landon Thomas CC
      'Peter Aldhous', # Lawrence Krauss CC
      'Sam Harris', # Lawrence Krauss CC
      SAMUEL_LEFF, # Random CC
-     "Saved by Internet Explorer 11",
      'Sean T Lehane', # Random CC
      'Stephen Rubin', # Random CC
      'Tim Kane', # Random CC
@@ -319,7 +316,7 @@ class Email(Communication):
      recipients: list[str | None] = field(default_factory=list)
      sent_from_device: str | None = None
      signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
-     truncation_allowed: bool = True
+     _truncation_allowed: bool = True # Hacky way to get __rich_console__() not to truncate in epstein_show script

      # For logging how many headers we prettified while printing, kind of janky
      rewritten_header_ids: ClassVar[set[str]] = set([])
@@ -340,10 +337,10 @@ class Email(Communication):

          try:
              if self.config and self.config.recipients:
-                 self.recipients = cast(list[str | None], self.config.recipients)
+                 self.recipients = self.config.recipients
              else:
                  for recipient in self.header.recipients():
-                     self.recipients.extend(self._get_names(recipient))
+                     self.recipients.extend(self._emailer_names(recipient))
          except Exception as e:
              console.print_exception()
              console.line(2)
@@ -358,8 +355,12 @@ class Email(Communication):
          self.actual_text = self._actual_text()
          self.sent_from_device = self._sent_from_device()

+     def attachments(self) -> list[str]:
+         return (self.header.attachments or '').split(';')
+
      def info_txt(self) -> Text:
-         txt = Text("OCR text of email from ", style='grey46').append(self.author_txt).append(' to ')
+         email_type = 'fwded article' if self.is_fwded_article() else 'email'
+         txt = Text(f"OCR text of {email_type} from ", style='grey46').append(self.author_txt).append(' to ')
          return txt.append(self._recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))

      def is_fwded_article(self) -> bool:
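One subtlety in the new `attachments()` helper above: splitting an empty string in Python still yields one element, so a missing `header.attachments` gives `['']` rather than `[]`. A tiny sketch, assuming `header.attachments` is a ';'-delimited string or None:

    def attachments(header_attachments: str | None) -> list[str]:
        # Same expression as Email.attachments(), lifted out for illustration
        return (header_attachments or '').split(';')

    assert attachments('a.pdf;b.jpg') == ['a.pdf', 'b.jpg']
    assert attachments(None) == ['']  # not [] -- callers that need "no attachments" must check for this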
@@ -401,8 +402,8 @@ class Email(Communication):
              return self.text

          reply_text_match = REPLY_TEXT_REGEX.search(text)
-         # logger.info(f"Raw text:\n" + self.top_lines(20) + '\n\n')
-         # logger.info(f"With header removed:\n" + text[0:500] + '\n\n')
+         self.log_top_lines(20, "Raw text:", logging.DEBUG)
+         self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)

          if reply_text_match:
              actual_num_chars = len(reply_text_match.group(1))
@@ -438,12 +439,32 @@ class Email(Communication):

          return style.replace('bold', '').strip()

+     def _emailer_names(self, emailer_str: str) -> list[str]:
+         """Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
+         emailer_str = EmailHeader.cleanup_str(emailer_str)
+
+         if len(emailer_str) == 0:
+             return []
+
+         names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
+
+         if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
+             if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
+                 logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
+             else:
+                 logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
+
+             return names_found
+
+         names_found = names_found or [emailer_str]
+         return [_reverse_first_and_last_names(name) for name in names_found]
+
      def _extract_author(self) -> None:
          self._extract_header()
          super()._extract_author()

          if not self.author and self.header.author:
-             authors = self._get_names(self.header.author)
+             authors = self._emailer_names(self.header.author)
              self.author = authors[0] if (len(authors) > 0 and authors[0]) else None

      def _extract_header(self) -> None:
@@ -493,26 +514,6 @@ class Email(Communication):

          raise RuntimeError(f"No timestamp found in '{self.file_path.name}' top lines:\n{searchable_text}")

-     def _get_names(self, emailer_str: str) -> list[str]:
-         """Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
-         emailer_str = EmailHeader.cleanup_str(emailer_str)
-
-         if len(emailer_str) == 0:
-             return []
-
-         names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
-
-         if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
-             if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
-                 logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
-             else:
-                 logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
-
-             return names_found
-
-         names_found = names_found or [emailer_str]
-         return [_reverse_first_and_last_names(name) for name in names_found]
-
      def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
          """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
          for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
@@ -584,7 +585,7 @@ class Email(Communication):
              self._merge_lines(2, 5)
          elif self.file_id in ['029498', '031428']:
              self._merge_lines(2, 4)
-         elif self.file_id in ['029976', '023067']:
+         elif self.file_id in ['029976', '023067', '033576']:
              self._merge_lines(3) # Merge 4th and 5th rows
          elif self.file_id in '026609 029402 032405 022695'.split():
              self._merge_lines(4) # Merge 5th and 6th rows
@@ -609,6 +610,8 @@ class Email(Communication):
              self._merge_lines(7, 9)
          elif self.file_id == '030299':
              self._merge_lines(7, 10)
+         elif self.file_id in ['022673', '022684']:
+             self._merge_lines(9)
          elif self.file_id == '014860':
              self._merge_lines(3)
              self._merge_lines(4)
@@ -680,6 +683,9 @@ class Email(Communication):
          if extracted_from_description:
              extracted_description = f"{APPEARS_IN} {extracted_from_description}"

+         if isinstance(extracted_from_doc_cfg, EmailCfg):
+             extracted_description += ' email'
+
          if self.config.description:
              self.warn(f"Overwriting description '{self.config.description}' with extract description '{self.config.description}'")

@@ -705,10 +711,10 @@ class Email(Communication):
              num_chars = quote_cutoff

          # Truncate long emails but leave a note explaining what happened w/link to source document
-         if len(text) > num_chars and self.truncation_allowed:
+         if len(text) > num_chars and self._truncation_allowed:
              text = text[0:num_chars]
              doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
-             trim_note = f"<...trimmed to {num_chars} characters of {self.length}, read the rest at {doc_link_markup}...>"
+             trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
              trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))

          # Rewrite broken headers where the values are on separate lines from the field names
@@ -5,6 +5,7 @@ from datetime import datetime
  from rich.text import Text

  from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN
+ from epstein_files.util.constant.strings import TIMESTAMP_DIM
  from epstein_files.util.data import extract_last_name
  from epstein_files.util.highlighted_group import get_style_for_name
  from epstein_files.util.logging import logger
@@ -12,7 +13,6 @@ from epstein_files.util.rich import TEXT_LINK, highlighter

  MSG_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
  PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
- TIMESTAMP_STYLE = 'turquoise4 dim'

  DISPLAY_LAST_NAME_ONLY = [
      JEFFREY_EPSTEIN,
@@ -29,7 +29,7 @@ TEXTER_MAPPING = {
  class TextMessage:
      """Class representing a single iMessage text message."""
      author: str | None
-     author_str: str | None = None
+     author_str: str = ''
      id_confirmed: bool = False
      text: str
      timestamp_str: str
@@ -37,7 +37,7 @@ class TextMessage:
      def __post_init__(self):
          self.author = TEXTER_MAPPING.get(self.author or UNKNOWN, self.author)

-         if self.author is None:
+         if not self.author:
              self.author_str = UNKNOWN
          elif self.author in DISPLAY_LAST_NAME_ONLY and not self.author_str:
              self.author_str = extract_last_name(self.author)
@@ -77,5 +77,5 @@ class TextMessage:
      def __rich__(self) -> Text:
          author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
          author_txt = Text(self.author_str, style=author_style)
-         timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_STYLE).append(' ')
+         timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_DIM).append(' ')
          return Text('').append(timestamp_txt).append(author_txt).append(': ', style='dim').append(self._message())
@@ -6,10 +6,12 @@ from typing import ClassVar

  from rich.text import Text

- from epstein_files.documents.other_file import OtherFile
+ from epstein_files.documents.other_file import Metadata, OtherFile
  from epstein_files.util.constant.strings import JSON
  from epstein_files.util.rich import INFO_STYLE

+ DESCRIPTION = "JSON data containing preview info for links sent in a messaging app like iMessage"
+
  TEXT_FIELDS = [
      'caption',
      'standard',
@@ -23,7 +25,6 @@ TEXT_FIELDS = [
  @dataclass
  class JsonFile(OtherFile):
      """File containing JSON data."""
-
      include_description_in_summary_panel: ClassVar[bool] = False
      strip_whitespace: ClassVar[bool] = False

@@ -39,7 +40,7 @@ class JsonFile(OtherFile):
          return JSON

      def info_txt(self) -> Text | None:
-         return Text(f"JSON file, contains preview data for links sent a messaging app", style=INFO_STYLE)
+         return Text(DESCRIPTION, style=INFO_STYLE)

      def is_interesting(self):
          return False
@@ -48,5 +49,10 @@ class JsonFile(OtherFile):
          with open(self.file_path, encoding='utf-8-sig') as f:
              return json.load(f)

+     def metadata(self) -> Metadata:
+         metadata = super().metadata()
+         metadata['description'] = DESCRIPTION
+         return metadata
+
      def json_str(self) -> str:
          return json.dumps(self.json_data(), indent=4)
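The `metadata()` override above shows the layering pattern used by these document classes: call `super().metadata()` and add subclass-specific keys. A minimal sketch with stand-in classes (the `Metadata` alias and base fields below are invented for the example, not the package's real ones):

    from typing import Any

    Metadata = dict[str, Any]  # stand-in for the package's Metadata alias

    class BaseFile:
        def __init__(self, filename: str):
            self.filename = filename

        def metadata(self) -> Metadata:
            return {'filename': self.filename, 'type': type(self).__name__}

    class JsonLikeFile(BaseFile):
        DESCRIPTION = "JSON data containing preview info for links sent in a messaging app like iMessage"

        def metadata(self) -> Metadata:
            # Layer subclass-specific keys on top of the base metadata
            metadata = super().metadata()
            metadata['description'] = self.DESCRIPTION
            return metadata

    print(JsonLikeFile('HOUSE_OVERSIGHT_123456.txt').metadata())
    # {'filename': 'HOUSE_OVERSIGHT_123456.txt', 'type': 'JsonLikeFile', 'description': 'JSON data containing ...'}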