epstein-files 1.2.5__tar.gz → 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {epstein_files-1.2.5 → epstein_files-1.5.0}/PKG-INFO +16 -3
  2. {epstein_files-1.2.5 → epstein_files-1.5.0}/README.md +14 -2
  3. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/__init__.py +55 -23
  4. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/documents/communication.py +9 -5
  5. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/documents/document.py +231 -135
  6. epstein_files-1.5.0/epstein_files/documents/doj_file.py +242 -0
  7. epstein_files-1.5.0/epstein_files/documents/doj_files/full_text.py +166 -0
  8. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/documents/email.py +289 -232
  9. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/documents/emails/email_header.py +35 -16
  10. epstein_files-1.5.0/epstein_files/documents/emails/emailers.py +223 -0
  11. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/documents/imessage/text_message.py +2 -3
  12. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/documents/json_file.py +18 -14
  13. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/documents/messenger_log.py +23 -39
  14. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/documents/other_file.py +54 -48
  15. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/epstein_files.py +65 -29
  16. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/person.py +151 -94
  17. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/constant/names.py +37 -10
  18. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/constant/output_files.py +2 -0
  19. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/constant/strings.py +14 -7
  20. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/constant/urls.py +17 -0
  21. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/constants.py +556 -391
  22. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/data.py +2 -0
  23. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/doc_cfg.py +44 -33
  24. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/env.py +34 -19
  25. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/file_helper.py +30 -6
  26. epstein_files-1.5.0/epstein_files/util/helpers/debugging_helper.py +13 -0
  27. epstein_files-1.5.0/epstein_files/util/helpers/env_helpers.py +21 -0
  28. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/highlighted_group.py +121 -37
  29. epstein_files-1.5.0/epstein_files/util/layout/left_bar_panel.py +26 -0
  30. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/logging.py +28 -13
  31. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/output.py +49 -40
  32. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/rich.py +30 -3
  33. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/word_count.py +7 -7
  34. {epstein_files-1.2.5 → epstein_files-1.5.0}/pyproject.toml +7 -2
  35. {epstein_files-1.2.5 → epstein_files-1.5.0}/LICENSE +0 -0
  36. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/constant/common_words.py +0 -0
  37. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/constant/html.py +0 -0
  38. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/search_result.py +0 -0
  39. {epstein_files-1.2.5 → epstein_files-1.5.0}/epstein_files/util/timer.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: epstein-files
3
- Version: 1.2.5
3
+ Version: 1.5.0
4
4
  Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
5
5
  Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
6
6
  License: GPL-3.0-or-later
@@ -20,6 +20,7 @@ Classifier: Programming Language :: Python :: 3.13
20
20
  Requires-Dist: cairosvg (>=2.8.2,<3.0.0)
21
21
  Requires-Dist: datefinder (>=0.7.3,<0.8.0)
22
22
  Requires-Dist: inflection (>=0.5.1,<0.6.0)
23
+ Requires-Dist: pdfalyzer[extract] (>=1.19.6,<2.0.0)
23
24
  Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
24
25
  Requires-Dist: python-dotenv (>=1.2.1,<2.0.0)
25
26
  Requires-Dist: requests (>=2.32.5,<3.0.0)
@@ -47,6 +48,7 @@ Description-Content-Type: text/markdown
47
48
  #### Installation
48
49
  1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8) (make sure you grab both the `001/` and `002/` folders).
49
50
  1. Use `poetry install` for easiest time installing. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
51
+ 1. (Optional) If you want to work with the documents released by DOJ on January 30th 2026 you'll need to also download the PDF collections from [the DOJ site](https://www.justice.gov/epstein/doj-disclosures) (they're in the "Epstein Files Transparency Act" section) and OCR them or find another way to get the OCR text.
50
52
 
51
53
 
52
54
  #### Command Line Tools
@@ -56,6 +58,13 @@ You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the
56
58
  EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate --help
57
59
  ```
58
60
 
61
+ To work with the January 2026 DOJ documents you'll also need to set the `EPSTEIN_DOJ_TXTS_20260130_DIR` env var to point at folders full of OCR extracted texts from the raw DOJ PDFs. If you have the PDFs but not the text files there's [a script](scripts/extract_doj_pdfs.py) that can help you take care of that.
62
+
63
+ ```bash
64
+ EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files EPSTEIN_DOJ_TXTS_20260130_DIR=/path/to/doj/files epstein_generate --help
65
+ ```
66
+
67
+
59
68
  All the tools that come with the package require `EPSTEIN_DOCS_DIR` to be set. These are the available tools:
60
69
 
61
70
  ```bash
@@ -63,9 +72,9 @@ All the tools that come with the package require `EPSTEIN_DOCS_DIR` to be set. T
63
72
  epstein_generate
64
73
 
65
74
  # Search for a string:
66
- epstein_search Bannon
75
+ epstein_grep Bannon
67
76
  # Or a regex:
68
- epstein_search '\bSteve\s*Bannon|Jeffrey\s*Epstein\b'
77
+ epstein_grep '\bSteve\s*Bannon|Jeffrey\s*Epstein\b'
69
78
 
70
79
  # Show a file with color highlighting of keywords:
71
80
  epstein_show 030999
@@ -123,3 +132,7 @@ for file in epstein_files.other_files:
123
132
  # Everyone Who Sent or Received an Email in the November Document Dump
124
133
  ![emails](https://github.com/michelcrypt4d4mus/epstein_text_messages/raw/master/docs/emailers_info_table.png)
125
134
 
135
+
136
+ # TODO List
137
+ See [TODO.md](TODO.md).
138
+
@@ -13,6 +13,7 @@
13
13
  #### Installation
14
14
  1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8) (make sure you grab both the `001/` and `002/` folders).
15
15
  1. Use `poetry install` for easiest time installing. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
16
+ 1. (Optional) If you want to work with the documents released by DOJ on January 30th 2026 you'll need to also download the PDF collections from [the DOJ site](https://www.justice.gov/epstein/doj-disclosures) (they're in the "Epstein Files Transparency Act" section) and OCR them or find another way to get the OCR text.
16
17
 
17
18
 
18
19
  #### Command Line Tools
@@ -22,6 +23,13 @@ You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the
22
23
  EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate --help
23
24
  ```
24
25
 
26
+ To work with the January 2026 DOJ documents you'll also need to set the `EPSTEIN_DOJ_TXTS_20260130_DIR` env var to point at folders full of OCR extracted texts from the raw DOJ PDFs. If you have the PDFs but not the text files there's [a script](scripts/extract_doj_pdfs.py) that can help you take care of that.
27
+
28
+ ```bash
29
+ EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files EPSTEIN_DOJ_TXTS_20260130_DIR=/path/to/doj/files epstein_generate --help
30
+ ```
31
+
32
+
25
33
  All the tools that come with the package require `EPSTEIN_DOCS_DIR` to be set. These are the available tools:
26
34
 
27
35
  ```bash
@@ -29,9 +37,9 @@ All the tools that come with the package require `EPSTEIN_DOCS_DIR` to be set. T
29
37
  epstein_generate
30
38
 
31
39
  # Search for a string:
32
- epstein_search Bannon
40
+ epstein_grep Bannon
33
41
  # Or a regex:
34
- epstein_search '\bSteve\s*Bannon|Jeffrey\s*Epstein\b'
42
+ epstein_grep '\bSteve\s*Bannon|Jeffrey\s*Epstein\b'
35
43
 
36
44
  # Show a file with color highlighting of keywords:
37
45
  epstein_show 030999
@@ -88,3 +96,7 @@ for file in epstein_files.other_files:
88
96
 
89
97
  # Everyone Who Sent or Received an Email in the November Document Dump
90
98
  ![emails](https://github.com/michelcrypt4d4mus/epstein_text_messages/raw/master/docs/emailers_info_table.png)
99
+
100
+
101
+ # TODO List
102
+ See [TODO.md](TODO.md).
@@ -4,6 +4,7 @@ Reformat Epstein text message files for readability and count email senders.
4
4
 
5
5
  Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT epstein_generate'
6
6
  """
7
+ import re
7
8
  from sys import exit
8
9
 
9
10
  from dotenv import load_dotenv
@@ -15,20 +16,21 @@ from rich.text import Text
15
16
 
16
17
  from epstein_files.epstein_files import EpsteinFiles, document_cls
17
18
  from epstein_files.documents.document import INFO_PADDING, Document
19
+ from epstein_files.documents.doj_file import DojFile
18
20
  from epstein_files.documents.email import Email
19
21
  from epstein_files.documents.messenger_log import MessengerLog
20
22
  from epstein_files.documents.other_file import OtherFile
21
23
  from epstein_files.util.constant.output_files import make_clean
22
- from epstein_files.util.constant.strings import ID_REGEX
24
+ from epstein_files.util.constant.strings import HOUSE_OVERSIGHT_NOV_2025_ID_REGEX
23
25
  from epstein_files.util.data import flatten
24
26
  from epstein_files.util.env import args
25
27
  from epstein_files.util.file_helper import coerce_file_path, extract_file_id
26
28
  from epstein_files.util.logging import exit_with_error, logger
27
- from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
29
+ from epstein_files.util.output import (print_doj_files, print_emails_section, print_json_files, print_stats,
28
30
  print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
29
31
  print_json_metadata, write_urls)
30
- from epstein_files.util.rich import (build_highlighter, console, highlighter, print_color_key, print_title_page_header,
31
- print_title_page_tables, print_subtitle_panel, write_html)
32
+ from epstein_files.util.rich import (build_highlighter, console, highlighter, print_color_key, print_json,
33
+ print_title_page_header, print_title_page_tables, print_subtitle_panel, write_html)
32
34
  from epstein_files.util.timer import Timer
33
35
  from epstein_files.util.word_count import write_word_counts_html
34
36
 
@@ -62,6 +64,10 @@ def generate_html() -> None:
62
64
  if args.colors_only:
63
65
  exit()
64
66
 
67
+ if args.output_doj_files:
68
+ printed_doj_files = print_doj_files(epstein_files)
69
+ timer.log_section_complete('DojFile', epstein_files.doj_files, printed_doj_files)
70
+
65
71
  if args.output_texts:
66
72
  printed_logs = print_text_messages_section(epstein_files)
67
73
  timer.log_section_complete('MessengerLog', epstein_files.imessage_logs, printed_logs)
@@ -83,9 +89,8 @@ def generate_html() -> None:
83
89
  if args.debug:
84
90
  highlighter.print_highlight_counts(console)
85
91
 
86
- # JSON stats (mostly used for building pytest checks)
87
- if args.json_stats:
88
- print_json_stats(epstein_files)
92
+ if args.stats:
93
+ print_stats(epstein_files) # Used for building pytest checks
89
94
 
90
95
 
91
96
  def epstein_diff():
@@ -93,11 +98,11 @@ def epstein_diff():
93
98
  Document.diff_files(args.positional_args)
94
99
 
95
100
 
96
- def epstein_search():
101
+ def epstein_grep():
97
102
  """Search the cleaned up text of the files."""
98
103
  epstein_files = EpsteinFiles.get_files()
99
104
 
100
- if ID_REGEX.match(args.positional_args[0]):
105
+ if HOUSE_OVERSIGHT_NOV_2025_ID_REGEX.match(args.positional_args[0]):
101
106
  logger.warning(f"'{args.positional_args[0]}' seems to be an ID, running epstein_show instead...")
102
107
  epstein_show()
103
108
  return
@@ -106,26 +111,41 @@ def epstein_search():
106
111
  temp_highlighter = build_highlighter(search_term)
107
112
  search_results = epstein_files.docs_matching(search_term, args.names)
108
113
  print_subtitle_panel(f"Found {len(search_results)} documents matching '{search_term}'")
114
+ last_document = None
109
115
 
110
116
  for search_result in search_results:
111
- document = search_result.document
117
+ doc = search_result.document
118
+ lines = search_result.lines
112
119
 
113
- if (isinstance(document, Email) and not args.output_emails) \
114
- or (isinstance(document, OtherFile) and not args.output_other) \
115
- or (isinstance(document, MessengerLog) and not args.output_texts):
116
- document.warn(f"{type(document).__name__} Skipping search result...")
120
+ if (isinstance(doc, Email) and not args.output_emails) \
121
+ or (isinstance(doc, (DojFile, OtherFile)) and not args.output_other) \
122
+ or (isinstance(doc, MessengerLog) and not args.output_texts):
123
+ doc.log(f"{type(doc).__name__} Skipping search result...")
117
124
  continue
125
+ elif isinstance(doc, Email) and args.email_body:
126
+ lines = [l for l in search_result.lines if l.line_number > doc.header.num_header_rows]
127
+
128
+ if not lines:
129
+ doc.log(f"None of the matches for '{search_term}' seem to be in the body of the email")
130
+ continue
131
+
132
+ if doc.is_duplicate:
133
+ if last_document and not last_document.is_duplicate:
134
+ console.line()
118
135
 
119
- if args.whole_file:
120
- console.print(document)
136
+ last_document = doc
137
+ console.print(doc.duplicate_file_txt)
138
+ elif args.whole_file:
139
+ console.print(doc)
121
140
  else:
122
- console.print(document.summary_panel())
141
+ console.print(doc.summary_panel)
123
142
 
124
- for matching_line in search_result.lines:
143
+ for matching_line in lines:
125
144
  line_txt = matching_line.__rich__()
126
145
  console.print(Padding(temp_highlighter(line_txt), INFO_PADDING), style='gray37')
127
146
 
128
147
  console.line()
148
+ console.print(doc.local_path_and_url + '\n', style='dim')
129
149
 
130
150
 
131
151
  def epstein_show():
@@ -138,23 +158,35 @@ def epstein_show():
138
158
  people = EpsteinFiles.get_files().person_objs(args.names)
139
159
  raw_docs = [doc for doc in flatten([p.emails for p in people])]
140
160
  else:
141
- ids = [extract_file_id(arg) for arg in args.positional_args]
142
- raw_docs = [Document(coerce_file_path(id)) for id in ids]
161
+ ids = [extract_file_id(arg.strip().strip('_')) for arg in args.positional_args]
162
+ logger.info(f"extracted IDs: {ids}")
163
+ raw_docs = [Document.from_file_id(id) for id in ids]
164
+ logger.info(f"raw docs: {raw_docs}")
143
165
 
166
+ # Rebuild the Document objs so we can see result of latest processing
144
167
  docs = Document.sort_by_timestamp([document_cls(doc)(doc.file_path) for doc in raw_docs])
168
+ logger.info(f"Document types: {[doc._class_name for doc in docs]}")
145
169
  except Exception as e:
170
+ console.print_exception()
146
171
  exit_with_error(str(e))
147
172
 
148
173
  for doc in docs:
149
174
  console.print('\n', doc, '\n')
150
175
 
151
176
  if args.raw:
152
- console.print(Panel(Text("RAW: ").append(doc.summary()), expand=False, style=doc._border_style()))
177
+ console.print(Panel(Text("RAW: ").append(doc.summary()), expand=False, style=doc.border_style))
153
178
  console.print(escape(doc.raw_text()), '\n')
154
179
 
155
180
  if isinstance(doc, Email):
156
- console.print(Panel(Text("actual_text: ").append(doc.summary()), expand=False, style=doc._border_style()))
157
- console.print(escape(doc._actual_text()), '\n')
181
+ console.print(Panel(Text("actual_text: ").append(doc.summary()), expand=False, style=doc.border_style))
182
+ console.print(escape(doc._extract_actual_text()), '\n')
183
+ metadata = doc.metadata
184
+ metadata['is_fwded_article'] = doc.is_fwded_article
185
+ metadata['is_word_count_worthy'] = doc.is_word_count_worthy
186
+ metadata['_is_first_for_user'] = doc._is_first_for_user
187
+ print_json(f"{doc.file_id} Metadata", metadata)
188
+
189
+ console.print(doc.local_path_and_url, style='dim')
158
190
 
159
191
 
160
192
  def epstein_word_count() -> None:
@@ -21,26 +21,30 @@ class Communication(Document):
21
21
  config: CommunicationCfg | None = None
22
22
  timestamp: datetime = FALLBACK_TIMESTAMP # TODO this default sucks (though it never happens)
23
23
 
24
+ @property
24
25
  def author_or_unknown(self) -> str:
25
26
  return self.author or UNKNOWN
26
27
 
28
+ @property
27
29
  def author_style(self) -> str:
28
30
  return get_style_for_name(self.author)
29
31
 
32
+ @property
30
33
  def author_txt(self) -> Text:
31
34
  return styled_name(self.author)
32
35
 
36
+ @property
37
+ def timestamp_without_seconds(self) -> str:
38
+ return TIMESTAMP_SECONDS_REGEX.sub('', str(self.timestamp))
39
+
33
40
  def external_links_txt(self, _style: str = '', include_alt_links: bool = True) -> Text:
34
41
  """Overrides super() method to apply self.author_style."""
35
- return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
42
+ return super().external_links_txt(self.author_style, include_alt_links=include_alt_links)
36
43
 
37
44
  def summary(self) -> Text:
38
45
  return self._summary().append(CLOSE_PROPERTIES_CHAR)
39
46
 
40
- def timestamp_without_seconds(self) -> str:
41
- return TIMESTAMP_SECONDS_REGEX.sub('', str(self.timestamp))
42
-
43
47
  def _summary(self) -> Text:
44
48
  """One line summary mostly for logging."""
45
49
  txt = super().summary().append(', ')
46
- return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style())))
50
+ return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown}'", style=self.author_style)))