epstein-files 1.4.1__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {epstein_files-1.4.1 → epstein_files-1.5.0}/PKG-INFO +14 -1
- {epstein_files-1.4.1 → epstein_files-1.5.0}/README.md +12 -0
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/__init__.py +31 -18
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/communication.py +9 -5
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/document.py +225 -136
- epstein_files-1.5.0/epstein_files/documents/doj_file.py +242 -0
- epstein_files-1.5.0/epstein_files/documents/doj_files/full_text.py +166 -0
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/email.py +138 -163
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/emails/email_header.py +21 -11
- epstein_files-1.5.0/epstein_files/documents/emails/emailers.py +223 -0
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/imessage/text_message.py +2 -3
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/json_file.py +18 -14
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/messenger_log.py +23 -39
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/other_file.py +48 -44
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/epstein_files.py +54 -33
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/person.py +142 -110
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/constant/names.py +29 -6
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/constant/output_files.py +2 -0
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/constant/strings.py +12 -6
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/constant/urls.py +17 -0
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/constants.py +101 -174
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/data.py +2 -0
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/doc_cfg.py +20 -15
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/env.py +24 -16
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/file_helper.py +28 -6
- epstein_files-1.5.0/epstein_files/util/helpers/debugging_helper.py +13 -0
- epstein_files-1.5.0/epstein_files/util/helpers/env_helpers.py +21 -0
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/highlighted_group.py +57 -16
- epstein_files-1.5.0/epstein_files/util/layout/left_bar_panel.py +26 -0
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/logging.py +28 -13
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/output.py +33 -10
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/rich.py +28 -2
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/word_count.py +7 -7
- {epstein_files-1.4.1 → epstein_files-1.5.0}/pyproject.toml +6 -1
- {epstein_files-1.4.1 → epstein_files-1.5.0}/LICENSE +0 -0
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/constant/common_words.py +0 -0
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/constant/html.py +0 -0
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/search_result.py +0 -0
- {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/timer.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: epstein-files
|
|
3
|
-
Version: 1.4.1
|
|
3
|
+
Version: 1.5.0
|
|
4
4
|
Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
|
|
5
5
|
Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
|
|
6
6
|
License: GPL-3.0-or-later
|
|
@@ -20,6 +20,7 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
20
20
|
Requires-Dist: cairosvg (>=2.8.2,<3.0.0)
|
|
21
21
|
Requires-Dist: datefinder (>=0.7.3,<0.8.0)
|
|
22
22
|
Requires-Dist: inflection (>=0.5.1,<0.6.0)
|
|
23
|
+
Requires-Dist: pdfalyzer[extract] (>=1.19.6,<2.0.0)
|
|
23
24
|
Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
|
|
24
25
|
Requires-Dist: python-dotenv (>=1.2.1,<2.0.0)
|
|
25
26
|
Requires-Dist: requests (>=2.32.5,<3.0.0)
|
|
@@ -47,6 +48,7 @@ Description-Content-Type: text/markdown
|
|
|
47
48
|
#### Installation
|
|
48
49
|
1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8) (make sure you grab both the `001/` and `002/` folders).
|
|
49
50
|
1. Use `poetry install` for easiest time installing. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
|
|
51
|
+
1. (Optional) If you want to work with the documents released by DOJ on January 30th 2026 you'll need to also download the PDF collections from [the DOJ site](https://www.justice.gov/epstein/doj-disclosures) (they're in the "Epstein Files Transparency Act" section) and OCR them or find another way to get the OCR text.
|
|
50
52
|
|
|
51
53
|
|
|
52
54
|
#### Command Line Tools
|
|
@@ -56,6 +58,13 @@ You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the
|
|
|
56
58
|
EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate --help
|
|
57
59
|
```
|
|
58
60
|
|
|
61
|
+
To work with the January 2026 DOJ documents you'll also need to set the `EPSTEIN_DOJ_TXTS_20260130_DIR` env var to point at folders full of OCR extracted texts from the raw DOJ PDFs. If you have the PDFs but not the text files there's [a script](scripts/extract_doj_pdfs.py) that can help you take care of that.
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files EPSTEIN_DOJ_TXTS_20260130_DIR=/path/to/doj/files epstein_generate --help
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
|
|
59
68
|
All the tools that come with the package require `EPSTEIN_DOCS_DIR` to be set. These are the available tools:
|
|
60
69
|
|
|
61
70
|
```bash
|
|
@@ -123,3 +132,7 @@ for file in epstein_files.other_files:
|
|
|
123
132
|
# Everyone Who Sent or Received an Email in the November Document Dump
|
|
124
133
|

|
|
125
134
|
|
|
135
|
+
|
|
136
|
+
# TODO List
|
|
137
|
+
See [TODO.md](TODO.md).
|
|
138
|
+
|
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
#### Installation
|
|
14
14
|
1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8) (make sure you grab both the `001/` and `002/` folders).
|
|
15
15
|
1. Use `poetry install` for easiest time installing. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
|
|
16
|
+
1. (Optional) If you want to work with the documents released by DOJ on January 30th 2026 you'll need to also download the PDF collections from [the DOJ site](https://www.justice.gov/epstein/doj-disclosures) (they're in the "Epstein Files Transparency Act" section) and OCR them or find another way to get the OCR text.
|
|
16
17
|
|
|
17
18
|
|
|
18
19
|
#### Command Line Tools
|
|
@@ -22,6 +23,13 @@ You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the
|
|
|
22
23
|
EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate --help
|
|
23
24
|
```
|
|
24
25
|
|
|
26
|
+
To work with the January 2026 DOJ documents you'll also need to set the `EPSTEIN_DOJ_TXTS_20260130_DIR` env var to point at folders full of OCR extracted texts from the raw DOJ PDFs. If you have the PDFs but not the text files there's [a script](scripts/extract_doj_pdfs.py) that can help you take care of that.
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files EPSTEIN_DOJ_TXTS_20260130_DIR=/path/to/doj/files epstein_generate --help
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
|
|
25
33
|
All the tools that come with the package require `EPSTEIN_DOCS_DIR` to be set. These are the available tools:
|
|
26
34
|
|
|
27
35
|
```bash
|
|
@@ -88,3 +96,7 @@ for file in epstein_files.other_files:
|
|
|
88
96
|
|
|
89
97
|
# Everyone Who Sent or Received an Email in the November Document Dump
|
|
90
98
|

|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# TODO List
|
|
102
|
+
See [TODO.md](TODO.md).
|
|
@@ -4,6 +4,7 @@ Reformat Epstein text message files for readability and count email senders.
|
|
|
4
4
|
|
|
5
5
|
Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT epstein_generate'
|
|
6
6
|
"""
|
|
7
|
+
import re
|
|
7
8
|
from sys import exit
|
|
8
9
|
|
|
9
10
|
from dotenv import load_dotenv
|
|
@@ -15,16 +16,17 @@ from rich.text import Text
|
|
|
15
16
|
|
|
16
17
|
from epstein_files.epstein_files import EpsteinFiles, document_cls
|
|
17
18
|
from epstein_files.documents.document import INFO_PADDING, Document
|
|
19
|
+
from epstein_files.documents.doj_file import DojFile
|
|
18
20
|
from epstein_files.documents.email import Email
|
|
19
21
|
from epstein_files.documents.messenger_log import MessengerLog
|
|
20
22
|
from epstein_files.documents.other_file import OtherFile
|
|
21
23
|
from epstein_files.util.constant.output_files import make_clean
|
|
22
|
-
from epstein_files.util.constant.strings import
|
|
24
|
+
from epstein_files.util.constant.strings import HOUSE_OVERSIGHT_NOV_2025_ID_REGEX
|
|
23
25
|
from epstein_files.util.data import flatten
|
|
24
26
|
from epstein_files.util.env import args
|
|
25
27
|
from epstein_files.util.file_helper import coerce_file_path, extract_file_id
|
|
26
28
|
from epstein_files.util.logging import exit_with_error, logger
|
|
27
|
-
from epstein_files.util.output import (print_emails_section, print_json_files,
|
|
29
|
+
from epstein_files.util.output import (print_doj_files, print_emails_section, print_json_files, print_stats,
|
|
28
30
|
print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
|
|
29
31
|
print_json_metadata, write_urls)
|
|
30
32
|
from epstein_files.util.rich import (build_highlighter, console, highlighter, print_color_key, print_json,
|
|
@@ -62,6 +64,10 @@ def generate_html() -> None:
|
|
|
62
64
|
if args.colors_only:
|
|
63
65
|
exit()
|
|
64
66
|
|
|
67
|
+
if args.output_doj_files:
|
|
68
|
+
printed_doj_files = print_doj_files(epstein_files)
|
|
69
|
+
timer.log_section_complete('DojFile', epstein_files.doj_files, printed_doj_files)
|
|
70
|
+
|
|
65
71
|
if args.output_texts:
|
|
66
72
|
printed_logs = print_text_messages_section(epstein_files)
|
|
67
73
|
timer.log_section_complete('MessengerLog', epstein_files.imessage_logs, printed_logs)
|
|
@@ -83,9 +89,8 @@ def generate_html() -> None:
|
|
|
83
89
|
if args.debug:
|
|
84
90
|
highlighter.print_highlight_counts(console)
|
|
85
91
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
print_json_stats(epstein_files)
|
|
92
|
+
if args.stats:
|
|
93
|
+
print_stats(epstein_files) # Used for building pytest checks
|
|
89
94
|
|
|
90
95
|
|
|
91
96
|
def epstein_diff():
|
|
@@ -97,7 +102,7 @@ def epstein_grep():
|
|
|
97
102
|
"""Search the cleaned up text of the files."""
|
|
98
103
|
epstein_files = EpsteinFiles.get_files()
|
|
99
104
|
|
|
100
|
-
if
|
|
105
|
+
if HOUSE_OVERSIGHT_NOV_2025_ID_REGEX.match(args.positional_args[0]):
|
|
101
106
|
logger.warning(f"'{args.positional_args[0]}' seems to be an ID, running epstein_show instead...")
|
|
102
107
|
epstein_show()
|
|
103
108
|
return
|
|
@@ -113,7 +118,7 @@ def epstein_grep():
|
|
|
113
118
|
lines = search_result.lines
|
|
114
119
|
|
|
115
120
|
if (isinstance(doc, Email) and not args.output_emails) \
|
|
116
|
-
or (isinstance(doc, OtherFile) and not args.output_other) \
|
|
121
|
+
or (isinstance(doc, (DojFile, OtherFile)) and not args.output_other) \
|
|
117
122
|
or (isinstance(doc, MessengerLog) and not args.output_texts):
|
|
118
123
|
doc.log(f"{type(doc).__name__} Skipping search result...")
|
|
119
124
|
continue
|
|
@@ -124,22 +129,23 @@ def epstein_grep():
|
|
|
124
129
|
doc.log(f"None of the matches for '{search_term}' seem to be in the body of the email")
|
|
125
130
|
continue
|
|
126
131
|
|
|
127
|
-
if doc.is_duplicate():
|
|
128
|
-
if last_document and not last_document.is_duplicate():
|
|
132
|
+
if doc.is_duplicate:
|
|
133
|
+
if last_document and not last_document.is_duplicate:
|
|
129
134
|
console.line()
|
|
130
135
|
|
|
131
136
|
last_document = doc
|
|
132
|
-
console.print(doc.duplicate_file_txt())
|
|
137
|
+
console.print(doc.duplicate_file_txt)
|
|
133
138
|
elif args.whole_file:
|
|
134
139
|
console.print(doc)
|
|
135
140
|
else:
|
|
136
|
-
console.print(doc.summary_panel())
|
|
141
|
+
console.print(doc.summary_panel)
|
|
137
142
|
|
|
138
143
|
for matching_line in lines:
|
|
139
144
|
line_txt = matching_line.__rich__()
|
|
140
145
|
console.print(Padding(temp_highlighter(line_txt), INFO_PADDING), style='gray37')
|
|
141
146
|
|
|
142
147
|
console.line()
|
|
148
|
+
console.print(doc.local_path_and_url + '\n', style='dim')
|
|
143
149
|
|
|
144
150
|
|
|
145
151
|
def epstein_show():
|
|
@@ -153,28 +159,35 @@ def epstein_show():
|
|
|
153
159
|
raw_docs = [doc for doc in flatten([p.emails for p in people])]
|
|
154
160
|
else:
|
|
155
161
|
ids = [extract_file_id(arg.strip().strip('_')) for arg in args.positional_args]
|
|
156
|
-
|
|
162
|
+
logger.info(f"extracted IDs: {ids}")
|
|
163
|
+
raw_docs = [Document.from_file_id(id) for id in ids]
|
|
164
|
+
logger.info(f"raw docs: {raw_docs}")
|
|
157
165
|
|
|
166
|
+
# Rebuild the Document objs so we can see result of latest processing
|
|
158
167
|
docs = Document.sort_by_timestamp([document_cls(doc)(doc.file_path) for doc in raw_docs])
|
|
168
|
+
logger.info(f"Document types: {[doc._class_name for doc in docs]}")
|
|
159
169
|
except Exception as e:
|
|
170
|
+
console.print_exception()
|
|
160
171
|
exit_with_error(str(e))
|
|
161
172
|
|
|
162
173
|
for doc in docs:
|
|
163
174
|
console.print('\n', doc, '\n')
|
|
164
175
|
|
|
165
176
|
if args.raw:
|
|
166
|
-
console.print(Panel(Text("RAW: ").append(doc.summary()), expand=False, style=doc.
|
|
177
|
+
console.print(Panel(Text("RAW: ").append(doc.summary()), expand=False, style=doc.border_style))
|
|
167
178
|
console.print(escape(doc.raw_text()), '\n')
|
|
168
179
|
|
|
169
180
|
if isinstance(doc, Email):
|
|
170
|
-
console.print(Panel(Text("actual_text: ").append(doc.summary()), expand=False, style=doc.
|
|
171
|
-
console.print(escape(doc.
|
|
172
|
-
metadata = doc.metadata
|
|
173
|
-
metadata['is_fwded_article'] = doc.is_fwded_article
|
|
174
|
-
metadata['is_word_count_worthy'] = doc.is_word_count_worthy
|
|
181
|
+
console.print(Panel(Text("actual_text: ").append(doc.summary()), expand=False, style=doc.border_style))
|
|
182
|
+
console.print(escape(doc._extract_actual_text()), '\n')
|
|
183
|
+
metadata = doc.metadata
|
|
184
|
+
metadata['is_fwded_article'] = doc.is_fwded_article
|
|
185
|
+
metadata['is_word_count_worthy'] = doc.is_word_count_worthy
|
|
175
186
|
metadata['_is_first_for_user'] = doc._is_first_for_user
|
|
176
187
|
print_json(f"{doc.file_id} Metadata", metadata)
|
|
177
188
|
|
|
189
|
+
console.print(doc.local_path_and_url, style='dim')
|
|
190
|
+
|
|
178
191
|
|
|
179
192
|
def epstein_word_count() -> None:
|
|
180
193
|
write_word_counts_html()
|
|
@@ -21,26 +21,30 @@ class Communication(Document):
|
|
|
21
21
|
config: CommunicationCfg | None = None
|
|
22
22
|
timestamp: datetime = FALLBACK_TIMESTAMP # TODO this default sucks (though it never happens)
|
|
23
23
|
|
|
24
|
+
@property
|
|
24
25
|
def author_or_unknown(self) -> str:
|
|
25
26
|
return self.author or UNKNOWN
|
|
26
27
|
|
|
28
|
+
@property
|
|
27
29
|
def author_style(self) -> str:
|
|
28
30
|
return get_style_for_name(self.author)
|
|
29
31
|
|
|
32
|
+
@property
|
|
30
33
|
def author_txt(self) -> Text:
|
|
31
34
|
return styled_name(self.author)
|
|
32
35
|
|
|
36
|
+
@property
|
|
37
|
+
def timestamp_without_seconds(self) -> str:
|
|
38
|
+
return TIMESTAMP_SECONDS_REGEX.sub('', str(self.timestamp))
|
|
39
|
+
|
|
33
40
|
def external_links_txt(self, _style: str = '', include_alt_links: bool = True) -> Text:
|
|
34
41
|
"""Overrides super() method to apply self.author_style."""
|
|
35
|
-
return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
|
|
42
|
+
return super().external_links_txt(self.author_style, include_alt_links=include_alt_links)
|
|
36
43
|
|
|
37
44
|
def summary(self) -> Text:
|
|
38
45
|
return self._summary().append(CLOSE_PROPERTIES_CHAR)
|
|
39
46
|
|
|
40
|
-
def timestamp_without_seconds(self) -> str:
|
|
41
|
-
return TIMESTAMP_SECONDS_REGEX.sub('', str(self.timestamp))
|
|
42
|
-
|
|
43
47
|
def _summary(self) -> Text:
|
|
44
48
|
"""One line summary mostly for logging."""
|
|
45
49
|
txt = super().summary().append(', ')
|
|
46
|
-
return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style())))
|
|
50
|
+
return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown}'", style=self.author_style)))
|