epstein-files 1.4.1__tar.gz → 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. {epstein_files-1.4.1 → epstein_files-1.5.0}/PKG-INFO +14 -1
  2. {epstein_files-1.4.1 → epstein_files-1.5.0}/README.md +12 -0
  3. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/__init__.py +31 -18
  4. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/communication.py +9 -5
  5. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/document.py +225 -136
  6. epstein_files-1.5.0/epstein_files/documents/doj_file.py +242 -0
  7. epstein_files-1.5.0/epstein_files/documents/doj_files/full_text.py +166 -0
  8. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/email.py +138 -163
  9. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/emails/email_header.py +21 -11
  10. epstein_files-1.5.0/epstein_files/documents/emails/emailers.py +223 -0
  11. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/imessage/text_message.py +2 -3
  12. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/json_file.py +18 -14
  13. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/messenger_log.py +23 -39
  14. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/documents/other_file.py +48 -44
  15. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/epstein_files.py +54 -33
  16. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/person.py +142 -110
  17. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/constant/names.py +29 -6
  18. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/constant/output_files.py +2 -0
  19. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/constant/strings.py +12 -6
  20. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/constant/urls.py +17 -0
  21. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/constants.py +101 -174
  22. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/data.py +2 -0
  23. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/doc_cfg.py +20 -15
  24. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/env.py +24 -16
  25. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/file_helper.py +28 -6
  26. epstein_files-1.5.0/epstein_files/util/helpers/debugging_helper.py +13 -0
  27. epstein_files-1.5.0/epstein_files/util/helpers/env_helpers.py +21 -0
  28. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/highlighted_group.py +57 -16
  29. epstein_files-1.5.0/epstein_files/util/layout/left_bar_panel.py +26 -0
  30. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/logging.py +28 -13
  31. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/output.py +33 -10
  32. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/rich.py +28 -2
  33. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/word_count.py +7 -7
  34. {epstein_files-1.4.1 → epstein_files-1.5.0}/pyproject.toml +6 -1
  35. {epstein_files-1.4.1 → epstein_files-1.5.0}/LICENSE +0 -0
  36. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/constant/common_words.py +0 -0
  37. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/constant/html.py +0 -0
  38. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/search_result.py +0 -0
  39. {epstein_files-1.4.1 → epstein_files-1.5.0}/epstein_files/util/timer.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: epstein-files
3
- Version: 1.4.1
3
+ Version: 1.5.0
4
4
  Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
5
5
  Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
6
6
  License: GPL-3.0-or-later
@@ -20,6 +20,7 @@ Classifier: Programming Language :: Python :: 3.13
20
20
  Requires-Dist: cairosvg (>=2.8.2,<3.0.0)
21
21
  Requires-Dist: datefinder (>=0.7.3,<0.8.0)
22
22
  Requires-Dist: inflection (>=0.5.1,<0.6.0)
23
+ Requires-Dist: pdfalyzer[extract] (>=1.19.6,<2.0.0)
23
24
  Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
24
25
  Requires-Dist: python-dotenv (>=1.2.1,<2.0.0)
25
26
  Requires-Dist: requests (>=2.32.5,<3.0.0)
@@ -47,6 +48,7 @@ Description-Content-Type: text/markdown
47
48
  #### Installation
48
49
  1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8) (make sure you grab both the `001/` and `002/` folders).
49
50
  1. Use `poetry install` for the easiest installation. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
51
+ 1. (Optional) If you want to work with the documents released by the DOJ on January 30th, 2026, you'll also need to download the PDF collections from [the DOJ site](https://www.justice.gov/epstein/doj-disclosures) (they're in the "Epstein Files Transparency Act" section) and OCR them or find another way to get the OCR text.
50
52
 
51
53
 
52
54
  #### Command Line Tools
@@ -56,6 +58,13 @@ You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the
56
58
  EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate --help
57
59
  ```
58
60
 
61
+ To work with the January 2026 DOJ documents you'll also need to set the `EPSTEIN_DOJ_TXTS_20260130_DIR` environment variable to point at folders full of OCR-extracted text from the raw DOJ PDFs. If you have the PDFs but not the text files, there's [a script](scripts/extract_doj_pdfs.py) that can help you take care of that.
62
+
63
+ ```bash
64
+ EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files EPSTEIN_DOJ_TXTS_20260130_DIR=/path/to/doj/files epstein_generate --help
65
+ ```
66
+
67
+
59
68
  All the tools that come with the package require `EPSTEIN_DOCS_DIR` to be set. These are the available tools:
60
69
 
61
70
  ```bash
@@ -123,3 +132,7 @@ for file in epstein_files.other_files:
123
132
  # Everyone Who Sent or Received an Email in the November Document Dump
124
133
  ![emails](https://github.com/michelcrypt4d4mus/epstein_text_messages/raw/master/docs/emailers_info_table.png)
125
134
 
135
+
136
+ # TODO List
137
+ See [TODO.md](TODO.md).
138
+
@@ -13,6 +13,7 @@
13
13
  #### Installation
14
14
  1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8) (make sure you grab both the `001/` and `002/` folders).
15
15
  1. Use `poetry install` for the easiest installation. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
16
+ 1. (Optional) If you want to work with the documents released by the DOJ on January 30th, 2026, you'll also need to download the PDF collections from [the DOJ site](https://www.justice.gov/epstein/doj-disclosures) (they're in the "Epstein Files Transparency Act" section) and OCR them or find another way to get the OCR text.
16
17
 
17
18
 
18
19
  #### Command Line Tools
@@ -22,6 +23,13 @@ You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the
22
23
  EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate --help
23
24
  ```
24
25
 
26
+ To work with the January 2026 DOJ documents you'll also need to set the `EPSTEIN_DOJ_TXTS_20260130_DIR` environment variable to point at folders full of OCR-extracted text from the raw DOJ PDFs. If you have the PDFs but not the text files, there's [a script](scripts/extract_doj_pdfs.py) that can help you take care of that.
27
+
28
+ ```bash
29
+ EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files EPSTEIN_DOJ_TXTS_20260130_DIR=/path/to/doj/files epstein_generate --help
30
+ ```
31
+
32
+
25
33
  All the tools that come with the package require `EPSTEIN_DOCS_DIR` to be set. These are the available tools:
26
34
 
27
35
  ```bash
@@ -88,3 +96,7 @@ for file in epstein_files.other_files:
88
96
 
89
97
  # Everyone Who Sent or Received an Email in the November Document Dump
90
98
  ![emails](https://github.com/michelcrypt4d4mus/epstein_text_messages/raw/master/docs/emailers_info_table.png)
99
+
100
+
101
+ # TODO List
102
+ See [TODO.md](TODO.md).
@@ -4,6 +4,7 @@ Reformat Epstein text message files for readability and count email senders.
4
4
 
5
5
  Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT epstein_generate'
6
6
  """
7
+ import re
7
8
  from sys import exit
8
9
 
9
10
  from dotenv import load_dotenv
@@ -15,16 +16,17 @@ from rich.text import Text
15
16
 
16
17
  from epstein_files.epstein_files import EpsteinFiles, document_cls
17
18
  from epstein_files.documents.document import INFO_PADDING, Document
19
+ from epstein_files.documents.doj_file import DojFile
18
20
  from epstein_files.documents.email import Email
19
21
  from epstein_files.documents.messenger_log import MessengerLog
20
22
  from epstein_files.documents.other_file import OtherFile
21
23
  from epstein_files.util.constant.output_files import make_clean
22
- from epstein_files.util.constant.strings import ID_REGEX
24
+ from epstein_files.util.constant.strings import HOUSE_OVERSIGHT_NOV_2025_ID_REGEX
23
25
  from epstein_files.util.data import flatten
24
26
  from epstein_files.util.env import args
25
27
  from epstein_files.util.file_helper import coerce_file_path, extract_file_id
26
28
  from epstein_files.util.logging import exit_with_error, logger
27
- from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
29
+ from epstein_files.util.output import (print_doj_files, print_emails_section, print_json_files, print_stats,
28
30
  print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
29
31
  print_json_metadata, write_urls)
30
32
  from epstein_files.util.rich import (build_highlighter, console, highlighter, print_color_key, print_json,
@@ -62,6 +64,10 @@ def generate_html() -> None:
62
64
  if args.colors_only:
63
65
  exit()
64
66
 
67
+ if args.output_doj_files:
68
+ printed_doj_files = print_doj_files(epstein_files)
69
+ timer.log_section_complete('DojFile', epstein_files.doj_files, printed_doj_files)
70
+
65
71
  if args.output_texts:
66
72
  printed_logs = print_text_messages_section(epstein_files)
67
73
  timer.log_section_complete('MessengerLog', epstein_files.imessage_logs, printed_logs)
@@ -83,9 +89,8 @@ def generate_html() -> None:
83
89
  if args.debug:
84
90
  highlighter.print_highlight_counts(console)
85
91
 
86
- # JSON stats (mostly used for building pytest checks)
87
- if args.json_stats:
88
- print_json_stats(epstein_files)
92
+ if args.stats:
93
+ print_stats(epstein_files) # Used for building pytest checks
89
94
 
90
95
 
91
96
  def epstein_diff():
@@ -97,7 +102,7 @@ def epstein_grep():
97
102
  """Search the cleaned up text of the files."""
98
103
  epstein_files = EpsteinFiles.get_files()
99
104
 
100
- if ID_REGEX.match(args.positional_args[0]):
105
+ if HOUSE_OVERSIGHT_NOV_2025_ID_REGEX.match(args.positional_args[0]):
101
106
  logger.warning(f"'{args.positional_args[0]}' seems to be an ID, running epstein_show instead...")
102
107
  epstein_show()
103
108
  return
@@ -113,7 +118,7 @@ def epstein_grep():
113
118
  lines = search_result.lines
114
119
 
115
120
  if (isinstance(doc, Email) and not args.output_emails) \
116
- or (isinstance(doc, OtherFile) and not args.output_other) \
121
+ or (isinstance(doc, (DojFile, OtherFile)) and not args.output_other) \
117
122
  or (isinstance(doc, MessengerLog) and not args.output_texts):
118
123
  doc.log(f"{type(doc).__name__} Skipping search result...")
119
124
  continue
@@ -124,22 +129,23 @@ def epstein_grep():
124
129
  doc.log(f"None of the matches for '{search_term}' seem to be in the body of the email")
125
130
  continue
126
131
 
127
- if doc.is_duplicate():
128
- if last_document and not last_document.is_duplicate():
132
+ if doc.is_duplicate:
133
+ if last_document and not last_document.is_duplicate:
129
134
  console.line()
130
135
 
131
136
  last_document = doc
132
- console.print(doc.duplicate_file_txt())
137
+ console.print(doc.duplicate_file_txt)
133
138
  elif args.whole_file:
134
139
  console.print(doc)
135
140
  else:
136
- console.print(doc.summary_panel())
141
+ console.print(doc.summary_panel)
137
142
 
138
143
  for matching_line in lines:
139
144
  line_txt = matching_line.__rich__()
140
145
  console.print(Padding(temp_highlighter(line_txt), INFO_PADDING), style='gray37')
141
146
 
142
147
  console.line()
148
+ console.print(doc.local_path_and_url + '\n', style='dim')
143
149
 
144
150
 
145
151
  def epstein_show():
@@ -153,28 +159,35 @@ def epstein_show():
153
159
  raw_docs = [doc for doc in flatten([p.emails for p in people])]
154
160
  else:
155
161
  ids = [extract_file_id(arg.strip().strip('_')) for arg in args.positional_args]
156
- raw_docs = [Document(coerce_file_path(id)) for id in ids]
162
+ logger.info(f"extracted IDs: {ids}")
163
+ raw_docs = [Document.from_file_id(id) for id in ids]
164
+ logger.info(f"raw docs: {raw_docs}")
157
165
 
166
+ # Rebuild the Document objs so we can see result of latest processing
158
167
  docs = Document.sort_by_timestamp([document_cls(doc)(doc.file_path) for doc in raw_docs])
168
+ logger.info(f"Document types: {[doc._class_name for doc in docs]}")
159
169
  except Exception as e:
170
+ console.print_exception()
160
171
  exit_with_error(str(e))
161
172
 
162
173
  for doc in docs:
163
174
  console.print('\n', doc, '\n')
164
175
 
165
176
  if args.raw:
166
- console.print(Panel(Text("RAW: ").append(doc.summary()), expand=False, style=doc._border_style()))
177
+ console.print(Panel(Text("RAW: ").append(doc.summary()), expand=False, style=doc.border_style))
167
178
  console.print(escape(doc.raw_text()), '\n')
168
179
 
169
180
  if isinstance(doc, Email):
170
- console.print(Panel(Text("actual_text: ").append(doc.summary()), expand=False, style=doc._border_style()))
171
- console.print(escape(doc._actual_text()), '\n')
172
- metadata = doc.metadata()
173
- metadata['is_fwded_article'] = doc.is_fwded_article()
174
- metadata['is_word_count_worthy'] = doc.is_word_count_worthy()
181
+ console.print(Panel(Text("actual_text: ").append(doc.summary()), expand=False, style=doc.border_style))
182
+ console.print(escape(doc._extract_actual_text()), '\n')
183
+ metadata = doc.metadata
184
+ metadata['is_fwded_article'] = doc.is_fwded_article
185
+ metadata['is_word_count_worthy'] = doc.is_word_count_worthy
175
186
  metadata['_is_first_for_user'] = doc._is_first_for_user
176
187
  print_json(f"{doc.file_id} Metadata", metadata)
177
188
 
189
+ console.print(doc.local_path_and_url, style='dim')
190
+
178
191
 
179
192
  def epstein_word_count() -> None:
180
193
  write_word_counts_html()
@@ -21,26 +21,30 @@ class Communication(Document):
21
21
  config: CommunicationCfg | None = None
22
22
  timestamp: datetime = FALLBACK_TIMESTAMP # TODO this default sucks (though it never happens)
23
23
 
24
+ @property
24
25
  def author_or_unknown(self) -> str:
25
26
  return self.author or UNKNOWN
26
27
 
28
+ @property
27
29
  def author_style(self) -> str:
28
30
  return get_style_for_name(self.author)
29
31
 
32
+ @property
30
33
  def author_txt(self) -> Text:
31
34
  return styled_name(self.author)
32
35
 
36
+ @property
37
+ def timestamp_without_seconds(self) -> str:
38
+ return TIMESTAMP_SECONDS_REGEX.sub('', str(self.timestamp))
39
+
33
40
  def external_links_txt(self, _style: str = '', include_alt_links: bool = True) -> Text:
34
41
  """Overrides super() method to apply self.author_style."""
35
- return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
42
+ return super().external_links_txt(self.author_style, include_alt_links=include_alt_links)
36
43
 
37
44
  def summary(self) -> Text:
38
45
  return self._summary().append(CLOSE_PROPERTIES_CHAR)
39
46
 
40
- def timestamp_without_seconds(self) -> str:
41
- return TIMESTAMP_SECONDS_REGEX.sub('', str(self.timestamp))
42
-
43
47
  def _summary(self) -> Text:
44
48
  """One line summary mostly for logging."""
45
49
  txt = super().summary().append(', ')
46
- return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style())))
50
+ return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown}'", style=self.author_style)))