epstein-files 1.0.13.tar.gz → 1.0.14.tar.gz

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (32)
  1. {epstein_files-1.0.13 → epstein_files-1.0.14}/PKG-INFO +10 -3
  2. {epstein_files-1.0.13 → epstein_files-1.0.14}/README.md +9 -2
  3. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/__init__.py +11 -6
  4. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/communication.py +2 -2
  5. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/document.py +52 -46
  6. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/email.py +32 -29
  7. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/imessage/text_message.py +4 -4
  8. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/json_file.py +9 -3
  9. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/messenger_log.py +20 -17
  10. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/other_file.py +50 -71
  11. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/epstein_files.py +89 -67
  12. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/constant/names.py +1 -1
  13. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/constant/strings.py +1 -1
  14. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/constants.py +62 -44
  15. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/data.py +2 -0
  16. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/doc_cfg.py +7 -7
  17. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/env.py +2 -5
  18. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/highlighted_group.py +7 -15
  19. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/output.py +15 -30
  20. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/rich.py +29 -29
  21. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/word_count.py +1 -1
  22. {epstein_files-1.0.13 → epstein_files-1.0.14}/pyproject.toml +1 -1
  23. {epstein_files-1.0.13 → epstein_files-1.0.14}/LICENSE +0 -0
  24. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/emails/email_header.py +0 -0
  25. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/constant/common_words.py +0 -0
  26. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/constant/html.py +0 -0
  27. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/constant/output_files.py +0 -0
  28. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/constant/urls.py +0 -0
  29. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/file_helper.py +0 -0
  30. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/logging.py +0 -0
  31. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/search_result.py +0 -0
  32. {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/timer.py +0 -0
{epstein_files-1.0.13 → epstein_files-1.0.14}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: epstein-files
- Version: 1.0.13
+ Version: 1.0.14
  Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
  Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
  License: GPL-3.0-or-later
@@ -43,11 +43,12 @@ Description-Content-Type: text/markdown


  ## Usage
-
  #### Installation
  1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8) (make sure you grab both the `001/` and `002/` folders).
  1. Use `poetry install` for easiest time installing. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.

+
+ #### Command Line Tools
  You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:

  ```bash
@@ -63,7 +64,7 @@ epstein_generate
  # Search for a string:
  epstein_search Bannon
  # Or a regex:
- epstein_search '\bSteve\s*Bannon\b'
+ epstein_search '\bSteve\s*Bannon|Jeffrey\s*Epstein\b'

  # Show a file with color highlighting of keywords:
  epstein_show 030999
@@ -82,6 +83,12 @@ epstein_diff 030999 020442
  The first time you run anything it will take a few minutes to fix all the janky OCR text, attribute the redacted emails, etc. After that things will be quick.
  Run `epstein_generate --help` for command line option assistance.

+ **Optional:** There are a handful of emails that I extracted from the legal filings they were contained in. If you want to include these files in your local analysis you'll need to copy those files from the repo into your local document directory. Something like:
+
+ ```bash
+ cp ./emails_extracted_from_legal_filings/*.txt "$EPSTEIN_DOCS_DIR"
+ ```
+

  #### As A Library
  ```python
{epstein_files-1.0.13 → epstein_files-1.0.14}/README.md

@@ -10,11 +10,12 @@


  ## Usage
-
  #### Installation
  1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8) (make sure you grab both the `001/` and `002/` folders).
  1. Use `poetry install` for easiest time installing. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.

+
+ #### Command Line Tools
  You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:

  ```bash
@@ -30,7 +31,7 @@ epstein_generate
  # Search for a string:
  epstein_search Bannon
  # Or a regex:
- epstein_search '\bSteve\s*Bannon\b'
+ epstein_search '\bSteve\s*Bannon|Jeffrey\s*Epstein\b'

  # Show a file with color highlighting of keywords:
  epstein_show 030999
@@ -49,6 +50,12 @@ epstein_diff 030999 020442
  The first time you run anything it will take a few minutes to fix all the janky OCR text, attribute the redacted emails, etc. After that things will be quick.
  Run `epstein_generate --help` for command line option assistance.

+ **Optional:** There are a handful of emails that I extracted from the legal filings they were contained in. If you want to include these files in your local analysis you'll need to copy those files from the repo into your local document directory. Something like:
+
+ ```bash
+ cp ./emails_extracted_from_legal_filings/*.txt "$EPSTEIN_DOCS_DIR"
+ ```
+

  #### As A Library
  ```python
{epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/__init__.py

@@ -21,7 +21,7 @@ from epstein_files.util.env import args, specified_names
  from epstein_files.util.file_helper import coerce_file_path, extract_file_id
  from epstein_files.util.logging import logger
  from epstein_files.util.output import (print_emails, print_json_files, print_json_stats,
- print_text_messages, write_json_metadata, write_urls)
+ write_json_metadata, write_urls)
  from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
  from epstein_files.util.timer import Timer
  from epstein_files.util.word_count import write_word_counts_html
@@ -49,7 +49,7 @@ def generate_html() -> None:
  exit()

  if args.output_texts:
- print_text_messages(epstein_files)
+ epstein_files.print_text_messages_section()
  timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)} text message logs')

  if args.output_emails:
@@ -57,8 +57,13 @@
  timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")

  if args.output_other:
- files_printed = epstein_files.print_other_files_table()
- timer.print_at_checkpoint(f"Printed {len(files_printed)} other files")
+ if args.uninteresting:
+ files = [f for f in epstein_files.other_files if not f.is_interesting()]
+ else:
+ files = [f for f in epstein_files.other_files if args.all_other_files or f.is_interesting()]
+
+ epstein_files.print_other_files_section(files)
+ timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")

  # Save output
  write_html(ALL_EMAILS_PATH if args.all_emails else TEXT_MSGS_HTML_PATH)
@@ -90,7 +95,7 @@ def epstein_search():

  if args.whole_file:
  if isinstance(search_result.document, Email):
- search_result.document.truncation_allowed = False
+ search_result.document._truncation_allowed = False

  console.print(search_result.document)
  else:
@@ -111,7 +116,7 @@ def epstein_show():

  for doc in docs:
  if isinstance(doc, Email):
- doc.truncation_allowed = False
+ doc._truncation_allowed = False

  console.print('\n', doc, '\n')

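The new `args.uninteresting` / `args.all_other_files` branching in `generate_html()` boils down to a simple selection rule. Here is a minimal standalone sketch of that rule; the surrounding `EpsteinFiles` and `args` wiring is assumed, not shown in this diff:

```python
# Sketch of the selection logic added to generate_html() in 1.0.14.
# `other_files` is assumed to be a list of objects exposing is_interesting().
def select_other_files(other_files, uninteresting=False, all_other_files=False):
    if uninteresting:
        # Only the files the package considers uninteresting
        return [f for f in other_files if not f.is_interesting()]
    # Otherwise everything when all_other_files is set, else just the interesting ones
    return [f for f in other_files if all_other_files or f.is_interesting()]
```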
{epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/communication.py

@@ -34,9 +34,9 @@ class Communication(Document):
  def is_attribution_uncertain(self) -> bool:
  return bool(self.config and self.config.is_attribution_uncertain)

- def external_links(self, _style: str = '', include_alt_links: bool = True) -> Text:
+ def external_links_txt(self, _style: str = '', include_alt_links: bool = True) -> Text:
  """Overrides super() method to apply self.author_style."""
- return super().external_links(self.author_style, include_alt_links=include_alt_links)
+ return super().external_links_txt(self.author_style, include_alt_links=include_alt_links)

  def summary(self) -> Text:
  return self._summary().append(CLOSE_PROPERTIES_CHAR)
{epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/document.py

@@ -19,12 +19,12 @@ from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
  from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time_from_timestamp_str, without_falsey
  from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
  from epstein_files.util.env import DOCS_DIR, args
- from epstein_files.util.file_helper import (file_stem_for_id, extract_file_id, file_size,
- file_size_str, is_local_extract_file)
+ from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, is_local_extract_file
  from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
- from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, key_value_txt, link_text_obj
+ from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize
  from epstein_files.util.search_result import MatchedLine

+ ALT_LINK_STYLE = 'white dim'
  CLOSE_PROPERTIES_CHAR = ']'
  HOUSE_OVERSIGHT = HOUSE_OVERSIGHT_PREFIX.replace('_', ' ').strip()
  INFO_INDENT = 2
@@ -46,7 +46,6 @@ FILENAME_MATCH_STYLES = [
  METADATA_FIELDS = [
  'author',
  'file_id',
- 'num_lines',
  'timestamp'
  ]

@@ -68,7 +67,6 @@ class Document:
  config (DocCfg): Information about this fil
  file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
  filename (str): File's basename
- length (int): Number of characters in the file after all the cleanup
  lines (str): Number of lines in the file after all the cleanup
  text (str): Contents of the file
  timestamp (datetime | None): When the file was originally created
@@ -80,12 +78,10 @@
  config: EmailCfg | DocCfg | TextCfg | None = None
  file_id: str = field(init=False)
  filename: str = field(init=False)
- length: int = field(init=False)
- lines: list[str] = field(init=False)
- num_lines: int = field(init=False)
+ lines: list[str] = field(default_factory=list)
  text: str = ''
  timestamp: datetime | None = None
- url_slug: str = field(init=False) # e.g. 'HOUSE_OVERSIGHT_123456
+ url_slug: str = ''

  # Class variables
  include_description_in_summary_panel: ClassVar[bool] = False
@@ -94,12 +90,13 @@
  def __post_init__(self):
  self.filename = self.file_path.name
  self.file_id = extract_file_id(self.filename)
+ # config and url_slug could have been pre-set in Email
  self.config = self.config or deepcopy(ALL_FILE_CONFIGS.get(self.file_id))
+ self.url_slug = self.url_slug or self.filename.split('.')[0]

- if 'url_slug' not in vars(self):
- self.url_slug = self.file_path.stem
+ if not self.text:
+ self._load_file()

- self._set_computed_fields(text=self.text or self._load_file())
  self._repair()
  self._extract_author()
  self.timestamp = self._extract_timestamp()
@@ -114,51 +111,49 @@

  def duplicate_file_txt(self) -> Text:
  """If the file is a dupe make a nice message to explain what file it's a duplicate of."""
- if not self.config or not self.config.dupe_of_id or self.config.dupe_type is None:
+ if not self.is_duplicate():
  raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")

  txt = Text(f"Not showing ", style=INFO_STYLE).append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
  txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
- return txt.append(epstein_media_doc_link_txt(self.config.dupe_of_id, style='royal_blue1'))
+ return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))

  def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
- return self.external_url(epsteinify_doc_url, style, link_txt)
+ return self.external_link(epsteinify_doc_url, style, link_txt)

  def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
- return self.external_url(epstein_media_doc_url, style, link_txt)
+ return self.external_link(epstein_media_doc_url, style, link_txt)

  def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
- return self.external_url(epstein_web_doc_url, style, link_txt)
+ return self.external_link(epstein_web_doc_url, style, link_txt)

  def rollcall_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
- return self.external_url(rollcall_doc_url, style, link_txt)
+ return self.external_link(rollcall_doc_url, style, link_txt)

- def external_url(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
+ def external_link(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
  return link_text_obj(fxn(self.url_slug), link_txt or self.file_path.stem, style)

- def external_links(self, style: str = '', include_alt_links: bool = False) -> Text:
- """Returns colored links to epstein.media and and epsteinweb in a Text object."""
- txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
-
- if args.use_epstein_web:
- txt.append(self.epstein_web_link(style=style))
- alt_link = self.epstein_media_link(style='white dim', link_txt=EPSTEIN_MEDIA)
- else:
- txt.append(self.epstein_media_link(style=style))
- alt_link = self.epstein_web_link(style='white dim', link_txt=EPSTEIN_WEB)
+ def external_links_txt(self, style: str = '', include_alt_links: bool = False) -> Text:
+ """Returns colored links to epstein.media and alternates in a Text object."""
+ links = [self.epstein_media_link(style=style)]

  if include_alt_links:
- txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
- txt.append(' (').append(alt_link).append(')')
+ links.append(self.epsteinify_link(style=ALT_LINK_STYLE, link_txt=EPSTEINIFY))
+ links.append(self.epstein_web_link(style=ALT_LINK_STYLE, link_txt=EPSTEIN_WEB))

  if self._class_name() == 'Email':
- txt.append(' (').append(self.rollcall_link(style='white dim', link_txt=ROLLCALL)).append(')')
+ links.append(self.rollcall_link(style=ALT_LINK_STYLE, link_txt=ROLLCALL))

- return txt
+ links = [links[0]] + [parenthesize(link) for link in links[1:]]
+ base_txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
+ return base_txt.append(join_texts(links))
+
+ def file_id_debug_info(self) -> str:
+ return ', '.join([f"{prop}={getattr(self, prop)}" for prop in ['file_id', 'filename', 'url_slug']])

  def file_info_panel(self) -> Group:
  """Panel with filename linking to raw file plus any additional info about the file."""
- panel = Panel(self.external_links(include_alt_links=True), border_style=self._border_style(), expand=False)
+ panel = Panel(self.external_links_txt(include_alt_links=True), border_style=self._border_style(), expand=False)
  padded_info = [Padding(sentence, INFO_PADDING) for sentence in self.info()]
  return Group(*([panel] + padded_info))

@@ -180,12 +175,15 @@
  return None

  def is_duplicate(self) -> bool:
- return bool(self.config and self.config.dupe_of_id)
+ return bool(self.config and self.config.duplicate_of_id)

  def is_local_extract_file(self) -> bool:
  """True if file created by extracting text from a court doc (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
  return is_local_extract_file(self.filename)

+ def length(self) -> int:
+ return len(self.text)
+
  def log(self, msg: str, level: int = logging.INFO):
  """Log with filename as a prefix."""
  logger.log(level, f"{self.file_path.stem} {msg}")
@@ -206,17 +204,21 @@
  metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
  metadata['bytes'] = self.file_size()
  metadata['filename'] = f"{self.url_slug}.txt"
+ metadata['num_lines'] = self.num_lines()
  metadata['type'] = self._class_name()

  if self.is_local_extract_file():
  metadata['extracted_file'] = {
- 'explanation': 'Manually extracted from one of the court filings.',
+ 'explanation': 'manually extracted from one of the other files',
  'extracted_from': self.url_slug + '.txt',
  'url': extracted_file_url(self.filename),
  }

  return metadata

+ def num_lines(self) -> int:
+ return len(self.lines)
+
  def raw_text(self) -> str:
  with open(self.file_path) as f:
  return f.read()
@@ -233,7 +235,7 @@

  def sort_key(self) -> tuple[datetime, str, int]:
  if self.is_duplicate():
- sort_id = self.config.dupe_of_id
+ sort_id = self.config.duplicate_of_id
  dupe_idx = 1
  else:
  sort_id = self.file_id
@@ -252,10 +254,10 @@
  txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)

  txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
- txt.append(", ").append(key_value_txt('lines', self.num_lines))
+ txt.append(", ").append(key_value_txt('lines', self.num_lines()))

- if self.config and self.config.dupe_of_id:
- txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.dupe_of_id, style='magenta')))
+ if self.config and self.config.duplicate_of_id:
+ txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='magenta')))

  return txt

@@ -294,13 +296,19 @@
  """Should be implemented in subclasses."""
  pass

- def _load_file(self) -> str:
+ def _load_file(self) -> None:
  """Remove BOM and HOUSE OVERSIGHT lines, strip whitespace."""
  text = self.raw_text()
  text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text # remove BOM
  text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
- lines = [l.strip() for l in text.split('\n') if not l.startswith(HOUSE_OVERSIGHT)]
- return collapse_newlines('\n'.join(lines))
+
+ lines = [
+ line.strip() if self.strip_whitespace else line for line in text.split('\n')
+ if not line.startswith(HOUSE_OVERSIGHT)
+ ]
+
+ self.text = collapse_newlines('\n'.join(lines))
+ self.lines = self.text.split('\n')

  def _repair(self) -> None:
  """Can optionally be overloaded in subclasses to further improve self.text."""
@@ -317,9 +325,7 @@
  else:
  raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (neither was)")

- self.length = len(self.text)
  self.lines = [line.strip() if self.strip_whitespace else line for line in self.text.split('\n')]
- self.num_lines = len(self.lines)

  def _write_clean_text(self, output_path: Path) -> None:
  """Write self.text to 'output_path'. Used only for diffing files."""
@@ -332,7 +338,7 @@
  with open(output_path, 'w') as f:
  f.write(self.text)

- logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
+ logger.warning(f"Wrote {self.length()} chars of cleaned {self.filename} to {output_path}.")

  def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
  yield self.file_info_panel()
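Taken together, the `Document` changes above replace the precomputed `length` / `num_lines` fields with methods and rename the link helpers. A hypothetical usage sketch follows; the `file_path` constructor argument and the example filename are assumptions, not confirmed by this diff:

```python
from pathlib import Path

from epstein_files.documents.document import Document

# Hypothetical: constructing a Document directly from a downloaded OCR text file.
doc = Document(file_path=Path("HOUSE_OVERSIGHT_030999.txt"))

doc.length()                                    # character count, formerly the `length` field
doc.num_lines()                                 # line count, formerly the `num_lines` field
doc.file_id_debug_info()                        # "file_id=..., filename=..., url_slug=..."
doc.external_links_txt(include_alt_links=True)  # renamed from external_links()
```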
{epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/email.py

@@ -131,12 +131,12 @@ JUNK_EMAILERS = [
  'editorialstaff@flipboard.com',
  'How To Academy',
  'Jokeland',
- JP_MORGAN_USGIO,
  ]

  MAILING_LISTS = [
  INTELLIGENCE_SQUARED,
  'middle.east.update@hotmail.com',
+ JP_MORGAN_USGIO,
  ]

  TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
@@ -316,7 +316,7 @@ class Email(Communication):
  recipients: list[str | None] = field(default_factory=list)
  sent_from_device: str | None = None
  signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
- truncation_allowed: bool = True
+ _truncation_allowed: bool = True # Hacky way to get __rich_console__() not to truncate in epstein_show script

  # For logging how many headers we prettified while printing, kind of janky
  rewritten_header_ids: ClassVar[set[str]] = set([])
@@ -337,10 +337,10 @@

  try:
  if self.config and self.config.recipients:
- self.recipients = cast(list[str | None], self.config.recipients)
+ self.recipients = self.config.recipients
  else:
  for recipient in self.header.recipients():
- self.recipients.extend(self._get_names(recipient))
+ self.recipients.extend(self._emailer_names(recipient))
  except Exception as e:
  console.print_exception()
  console.line(2)
@@ -402,8 +402,8 @@
  return self.text

  reply_text_match = REPLY_TEXT_REGEX.search(text)
- # logger.info(f"Raw text:\n" + self.top_lines(20) + '\n\n')
- # logger.info(f"With header removed:\n" + text[0:500] + '\n\n')
+ self.log_top_lines(20, "Raw text:", logging.DEBUG)
+ self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)

  if reply_text_match:
  actual_num_chars = len(reply_text_match.group(1))
@@ -439,12 +439,32 @@

  return style.replace('bold', '').strip()

+ def _emailer_names(self, emailer_str: str) -> list[str]:
+ """Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
+ emailer_str = EmailHeader.cleanup_str(emailer_str)
+
+ if len(emailer_str) == 0:
+ return []
+
+ names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
+
+ if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
+ if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
+ logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
+ else:
+ logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
+
+ return names_found
+
+ names_found = names_found or [emailer_str]
+ return [_reverse_first_and_last_names(name) for name in names_found]
+
  def _extract_author(self) -> None:
  self._extract_header()
  super()._extract_author()

  if not self.author and self.header.author:
- authors = self._get_names(self.header.author)
+ authors = self._emailer_names(self.header.author)
  self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
  def _extract_header(self) -> None:
@@ -494,26 +514,6 @@

  raise RuntimeError(f"No timestamp found in '{self.file_path.name}' top lines:\n{searchable_text}")

- def _get_names(self, emailer_str: str) -> list[str]:
- """Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
- emailer_str = EmailHeader.cleanup_str(emailer_str)
-
- if len(emailer_str) == 0:
- return []
-
- names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
-
- if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
- if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
- logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
- else:
- logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
-
- return names_found
-
- names_found = names_found or [emailer_str]
- return [_reverse_first_and_last_names(name) for name in names_found]
-
  def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
  """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
  for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
@@ -683,6 +683,9 @@
  if extracted_from_description:
  extracted_description = f"{APPEARS_IN} {extracted_from_description}"

+ if isinstance(extracted_from_doc_cfg, EmailCfg):
+ extracted_description += ' email'
+
  if self.config.description:
  self.warn(f"Overwriting description '{self.config.description}' with extract description '{self.config.description}'")

@@ -708,10 +711,10 @@
  num_chars = quote_cutoff

  # Truncate long emails but leave a note explaining what happened w/link to source document
- if len(text) > num_chars and self.truncation_allowed:
+ if len(text) > num_chars and self._truncation_allowed:
  text = text[0:num_chars]
  doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
- trim_note = f"<...trimmed to {num_chars} characters of {self.length}, read the rest at {doc_link_markup}...>"
+ trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
  trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))

  # Rewrite broken headers where the values are on separate lines from the field names
{epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/imessage/text_message.py

@@ -5,6 +5,7 @@ from datetime import datetime
  from rich.text import Text

  from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN
+ from epstein_files.util.constant.strings import TIMESTAMP_DIM
  from epstein_files.util.data import extract_last_name
  from epstein_files.util.highlighted_group import get_style_for_name
  from epstein_files.util.logging import logger
@@ -12,7 +13,6 @@ from epstein_files.util.rich import TEXT_LINK, highlighter

  MSG_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
  PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
- TIMESTAMP_STYLE = 'turquoise4 dim'

  DISPLAY_LAST_NAME_ONLY = [
  JEFFREY_EPSTEIN,
@@ -29,7 +29,7 @@ TEXTER_MAPPING = {
  class TextMessage:
  """Class representing a single iMessage text message."""
  author: str | None
- author_str: str | None = None
+ author_str: str = ''
  id_confirmed: bool = False
  text: str
  timestamp_str: str
@@ -37,7 +37,7 @@
  def __post_init__(self):
  self.author = TEXTER_MAPPING.get(self.author or UNKNOWN, self.author)

- if self.author is None:
+ if not self.author:
  self.author_str = UNKNOWN
  elif self.author in DISPLAY_LAST_NAME_ONLY and not self.author_str:
  self.author_str = extract_last_name(self.author)
@@ -77,5 +77,5 @@
  def __rich__(self) -> Text:
  author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
  author_txt = Text(self.author_str, style=author_style)
- timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_STYLE).append(' ')
+ timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_DIM).append(' ')
  return Text('').append(timestamp_txt).append(author_txt).append(': ', style='dim').append(self._message())
{epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/json_file.py

@@ -6,10 +6,12 @@ from typing import ClassVar

  from rich.text import Text

- from epstein_files.documents.other_file import OtherFile
+ from epstein_files.documents.other_file import Metadata, OtherFile
  from epstein_files.util.constant.strings import JSON
  from epstein_files.util.rich import INFO_STYLE

+ DESCRIPTION = "JSON data containing preview info for links sent in a messaging app like iMessage"
+
  TEXT_FIELDS = [
  'caption',
  'standard',
@@ -23,7 +25,6 @@
  @dataclass
  class JsonFile(OtherFile):
  """File containing JSON data."""
-
  include_description_in_summary_panel: ClassVar[bool] = False
  strip_whitespace: ClassVar[bool] = False

@@ -39,7 +40,7 @@
  return JSON

  def info_txt(self) -> Text | None:
- return Text(f"JSON file, contains preview data for links sent a messaging app", style=INFO_STYLE)
+ return Text(DESCRIPTION, style=INFO_STYLE)

  def is_interesting(self):
  return False
@@ -48,5 +49,10 @@
  with open(self.file_path, encoding='utf-8-sig') as f:
  return json.load(f)

+ def metadata(self) -> Metadata:
+ metadata = super().metadata()
+ metadata['description'] = DESCRIPTION
+ return metadata
+
  def json_str(self) -> str:
  return json.dumps(self.json_data(), indent=4)
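Finally, a similarly hedged sketch of the new `JsonFile.metadata()` override shown above; the constructor argument and the example filename are assumptions:

```python
from pathlib import Path

from epstein_files.documents.json_file import JsonFile

# Hypothetical: JsonFile is constructed like any other Document subclass here.
json_doc = JsonFile(file_path=Path("HOUSE_OVERSIGHT_023456.txt"))

meta = json_doc.metadata()    # super().metadata() plus the new 'description' key
print(meta['description'])    # the DESCRIPTION constant added in 1.0.14
print(json_doc.json_str())    # json.dumps(self.json_data(), indent=4)
```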