epstein-files 1.0.4__tar.gz → 1.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {epstein_files-1.0.4 → epstein_files-1.0.5}/PKG-INFO +34 -18
  2. {epstein_files-1.0.4 → epstein_files-1.0.5}/README.md +33 -17
  3. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/__init__.py +5 -10
  4. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/documents/document.py +1 -2
  5. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/documents/email.py +15 -10
  6. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/epstein_files.py +18 -18
  7. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/constant/output_files.py +2 -3
  8. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/constant/strings.py +8 -7
  9. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/constant/urls.py +1 -1
  10. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/constants.py +19 -18
  11. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/doc_cfg.py +3 -1
  12. epstein_files-1.0.5/epstein_files/util/env.py +84 -0
  13. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/highlighted_group.py +4 -3
  14. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/output.py +5 -10
  15. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/rich.py +9 -5
  16. {epstein_files-1.0.4 → epstein_files-1.0.5}/pyproject.toml +1 -1
  17. epstein_files-1.0.4/epstein_files/util/env.py +0 -80
  18. {epstein_files-1.0.4 → epstein_files-1.0.5}/LICENSE +0 -0
  19. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/documents/communication.py +0 -0
  20. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/documents/emails/email_header.py +0 -0
  21. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/documents/imessage/text_message.py +0 -0
  22. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/documents/json_file.py +0 -0
  23. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/documents/messenger_log.py +0 -0
  24. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/documents/other_file.py +0 -0
  25. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/constant/common_words.py +0 -0
  26. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/constant/html.py +0 -0
  27. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/constant/names.py +0 -0
  28. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/data.py +0 -0
  29. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/file_helper.py +0 -0
  30. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/logging.py +0 -0
  31. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/search_result.py +0 -0
  32. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/timer.py +0 -0
  33. {epstein_files-1.0.4 → epstein_files-1.0.5}/epstein_files/util/word_count.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: epstein-files
3
- Version: 1.0.4
3
+ Version: 1.0.5
4
4
  Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
5
5
  Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
6
6
  License: GPL-3.0-or-later
@@ -32,6 +32,8 @@ Description-Content-Type: text/markdown
32
32
 
33
33
  # I Made Epstein's Text Messages Great Again
34
34
 
35
+ ![joi_ito](docs/joi_ito_gavin_is_clever_epstein_funds_bitcoin_dev_team.png)
36
+
35
37
  * [I Made Epstein's Text Messages Great Again (And You Should Read Them)](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great) post on [Substack](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great)
36
38
  * The Epstein text messages (and some of the emails along with summary information) generated by this code can be viewed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/).
37
39
  * All of His Emails along with descriptions of the 496 files that were neither emails nor text messages can be read at [another page also generated by this code](https://michelcrypt4d4mus.github.io/epstein_text_messages/all_emails_epstein_files_nov_2025.html).
@@ -40,27 +42,41 @@ Description-Content-Type: text/markdown
40
42
  * Configuration variables assigning specific `HOUSE_OVERSIGHT_XXXXXX.txt` file IDs (the `111111` part) as being emails to or from particular people based on various research and contributions can be found in [constants.py](./epstein_files/util/constants.py). Everything in `constants.py` appears in the JSON metadata linked above.
41
43
 
42
44
 
43
- ### Usage
44
- 1. Requires you have a local copy of OCR text from the House Oversight document dump in a directory `/path/to/epstein/ocr_txt_files`. You can download them from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8).
45
- 1. Dependencies are in [pyproject.toml](./pyproject.toml). Use `poetry install` for easiest time installing. `pip install .` may or may not work.
45
+ ## Usage
46
+ 1. You need a local copy of the OCR text files from the House Oversight document release in a directory such as `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8).
47
+ 1. Dependencies are listed in [pyproject.toml](./pyproject.toml). Use `poetry install` for the easiest installation. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
46
48
 
47
- You need to set the `DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
49
+ When running any of the tools, you need to set the `EPSTEIN_DOCS_DIR` environment variable to the path of the folder of files you just downloaded. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
50
+
51
+ ```bash
52
+ EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate --help
53
+ ```
54
+
55
+ All the tools that come with the package require `EPSTEIN_DOCS_DIR` to be set. These are the available tools:
48
56
 
49
57
  ```bash
50
58
  # Generate color highlighted texts/emails/other files
51
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate
59
+ epstein_generate
60
+
61
+ # Search for a string:
62
+ epstein_search Bannon
63
+ # Or a regex:
64
+ epstein_search '\bSteve\s*Bannon\b'
52
65
 
53
- # Search
54
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_search Bannon
66
+ # Show a file with color highlighting of keywords
67
+ epstein_show 030999
68
+ # Show both the highlighted and raw versions of the file:
69
+ epstein_show --raw 030999
70
+ # This also works:
71
+ epstein_show HOUSE_OVERSIGHT_030999
55
72
 
56
- # Show a color highlighted file
57
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show 030999
58
- # This also works
59
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show HOUSE_OVERSIGHT_030999
73
+ # Diff two epstein files after all the cleanup (stripping BOMs, matching newline chars, etc):
74
+ epstein_diff 030999 020442
60
75
  ```
61
76
 
77
+ The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc.
62
78
  Run `epstein_generate --help` for command line option assistance.
63
- The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc. Once you've run things once you can run the `epstein_generate --pickled` to load the cached fixed up data and things will be quick.
79
+
64
80
 
65
81
  #### As A Library
66
82
  ```python
@@ -69,18 +85,18 @@ epstein_files = EpsteinFiles.get_files()
69
85
 
70
86
  # All files
71
87
  for document in epstein_files.all_documents():
72
- do_stuff()
88
+ do_stuff(document)
73
89
 
74
90
  # Emails
75
91
  for email in epstein_files.emails:
76
- do_stuff()
92
+ do_stuff(email)
77
93
 
78
94
  # iMessage Logs
79
95
  for imessage_log in epstein_files.imessage_logs:
80
- do_stuff()
96
+ do_stuff(imessage_log)
81
97
 
82
98
  # Other Files
83
- for document in epstein_files.other_files:
84
- do_stuff()
99
+ for file in epstein_files.other_files:
100
+ do_stuff(file)
85
101
  ```
86
102
 
@@ -1,5 +1,7 @@
1
1
  # I Made Epstein's Text Messages Great Again
2
2
 
3
+ ![joi_ito](docs/joi_ito_gavin_is_clever_epstein_funds_bitcoin_dev_team.png)
4
+
3
5
  * [I Made Epstein's Text Messages Great Again (And You Should Read Them)](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great) post on [Substack](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great)
4
6
  * The Epstein text messages (and some of the emails along with summary information) generated by this code can be viewed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/).
5
7
  * All of His Emails along with descriptions of the 496 files that were neither emails nor text messages can be read at [another page also generated by this code](https://michelcrypt4d4mus.github.io/epstein_text_messages/all_emails_epstein_files_nov_2025.html).
@@ -8,27 +10,41 @@
8
10
  * Configuration variables assigning specific `HOUSE_OVERSIGHT_XXXXXX.txt` file IDs (the `111111` part) as being emails to or from particular people based on various research and contributions can be found in [constants.py](./epstein_files/util/constants.py). Everything in `constants.py` appears in the JSON metadata linked above.
9
11
 
10
12
 
11
- ### Usage
12
- 1. Requires you have a local copy of OCR text from the House Oversight document dump in a directory `/path/to/epstein/ocr_txt_files`. You can download them from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8).
13
- 1. Dependencies are in [pyproject.toml](./pyproject.toml). Use `poetry install` for easiest time installing. `pip install .` may or may not work.
13
+ ## Usage
14
+ 1. You need a local copy of the OCR text files from the House Oversight document release in a directory such as `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8).
15
+ 1. Dependencies are listed in [pyproject.toml](./pyproject.toml). Use `poetry install` for the easiest installation. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
14
16
 
15
- You need to set the `DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
17
+ When running any of the tools, you need to set the `EPSTEIN_DOCS_DIR` environment variable to the path of the folder of files you just downloaded. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
18
+
19
+ ```bash
20
+ EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate --help
21
+ ```
22
+
23
+ All the tools that come with the package require `EPSTEIN_DOCS_DIR` to be set. These are the available tools:
16
24
 
17
25
  ```bash
18
26
  # Generate color highlighted texts/emails/other files
19
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate
27
+ epstein_generate
28
+
29
+ # Search for a string:
30
+ epstein_search Bannon
31
+ # Or a regex:
32
+ epstein_search '\bSteve\s*Bannon\b'
20
33
 
21
- # Search
22
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_search Bannon
34
+ # Show a file with color highlighting of keywords
35
+ epstein_show 030999
36
+ # Show both the highlighted and raw versions of the file:
37
+ epstein_show --raw 030999
38
+ # This also works:
39
+ epstein_show HOUSE_OVERSIGHT_030999
23
40
 
24
- # Show a color highlighted file
25
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show 030999
26
- # This also works
27
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show HOUSE_OVERSIGHT_030999
41
+ # Diff two epstein files after all the cleanup (stripping BOMs, matching newline chars, etc):
42
+ epstein_diff 030999 020442
28
43
  ```
29
44
 
45
+ The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc.
30
46
  Run `epstein_generate --help` for command line option assistance.
31
- The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc. Once you've run things once you can run the `epstein_generate --pickled` to load the cached fixed up data and things will be quick.
47
+
32
48
 
33
49
  #### As A Library
34
50
  ```python
@@ -37,17 +53,17 @@ epstein_files = EpsteinFiles.get_files()
37
53
 
38
54
  # All files
39
55
  for document in epstein_files.all_documents():
40
- do_stuff()
56
+ do_stuff(document)
41
57
 
42
58
  # Emails
43
59
  for email in epstein_files.emails:
44
- do_stuff()
60
+ do_stuff(email)
45
61
 
46
62
  # iMessage Logs
47
63
  for imessage_log in epstein_files.imessage_logs:
48
- do_stuff()
64
+ do_stuff(imessage_log)
49
65
 
50
66
  # Other Files
51
- for document in epstein_files.other_files:
52
- do_stuff()
67
+ for file in epstein_files.other_files:
68
+ do_stuff(file)
53
69
  ```
@@ -75,7 +75,7 @@ def epstein_diff():
75
75
  def epstein_search():
76
76
  """Search the cleaned up text of the files."""
77
77
  _assert_positional_args()
78
- epstein_files = EpsteinFiles.get_files(use_pickled=True)
78
+ epstein_files = EpsteinFiles.get_files()
79
79
 
80
80
  for search_term in args.positional_args:
81
81
  temp_highlighter = build_highlighter(search_term)
@@ -103,27 +103,22 @@ def epstein_show():
103
103
  """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
104
104
  _assert_positional_args()
105
105
  ids = [extract_file_id(arg) for arg in args.positional_args]
106
+ raw_docs = [Document(coerce_file_path(id)) for id in ids]
107
+ docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
106
108
  console.line()
107
109
 
108
- if args.pickled:
109
- epstein_files = EpsteinFiles.get_files(use_pickled=True)
110
- docs = epstein_files.get_documents_by_id(ids)
111
- else:
112
- raw_docs = [Document(coerce_file_path(id)) for id in ids]
113
- docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
114
-
115
110
  for doc in docs:
116
111
  console.line()
117
112
  console.print(doc)
118
113
 
119
114
  if args.raw:
120
115
  console.line()
121
- console.print(Panel(f"*** {doc.url_slug} RAW ***", expand=False, style=doc._border_style()))
116
+ console.print(Panel(f"RAW {doc.filename} RAW", expand=False, style=doc._border_style()))
122
117
  console.print(escape(doc.raw_text()))
123
118
 
124
119
  if isinstance(doc, Email):
125
120
  console.line()
126
- console.print(Panel(f"*** {doc.url_slug} actual_text ***", expand=False, style=doc._border_style()))
121
+ console.print(Panel(f"{doc.filename}: actual_text() output", expand=False, style=doc._border_style()))
127
122
  console.print(escape(doc._actual_text()))
128
123
 
129
124
 
@@ -85,10 +85,9 @@ class Document:
85
85
 
86
86
  if self.is_local_extract_file():
87
87
  self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
88
- cfg_type = type(self.config).__name__ if self.config else None
89
88
 
90
89
  # Coerce FileConfig for court docs etc. to MessageCfg for email files extracted from that document
91
- if self.class_name() == EMAIL_CLASS and self.config and cfg_type != EmailCfg.__name__:
90
+ if self.class_name() == EMAIL_CLASS and self.config and not isinstance(self.config, EmailCfg):
92
91
  self.config = EmailCfg.from_doc_cfg(self.config)
93
92
  else:
94
93
  self.url_slug = self.file_path.stem
@@ -26,7 +26,7 @@ from epstein_files.util.logging import logger
26
26
  from epstein_files.util.rich import *
27
27
 
28
28
  BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
29
- BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
29
+ BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
30
30
  DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
31
31
  LINK_LINE_REGEX = re.compile(f"^(> )?htt")
32
32
  QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
@@ -245,12 +245,10 @@ TRUNCATE_TERMS = [
245
245
  ]
246
246
 
247
247
  # Some Paul Krassner emails have a ton of CCed parties we don't care about
248
- KRASSNER_RECIPIENTS = uniquify(flatten(ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']))
248
+ KRASSNER_RECIPIENTS = uniquify(flatten([ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']]))
249
249
 
250
250
  # No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
251
- USELESS_EMAILERS = IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS + \
252
- KRASSNER_RECIPIENTS + \
253
- FLIGHT_IN_2012_PEOPLE + [
251
+ USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIPIENTS + [
254
252
  'Alan Rogers', # Random CC
255
253
  'Andrew Friendly', # Presumably some relation of Kelly Friendly
256
254
  'BS Stern', # A random fwd of email we have
@@ -322,11 +320,18 @@ class Email(Communication):
322
320
  def __post_init__(self):
323
321
  super().__post_init__()
324
322
 
325
- if self.config and self.config.recipients:
326
- self.recipients = cast(list[str | None], self.config.recipients)
327
- else:
328
- for recipient in self.header.recipients():
329
- self.recipients.extend(self._get_names(recipient))
323
+ try:
324
+ if self.config and self.config.recipients:
325
+ self.recipients = cast(list[str | None], self.config.recipients)
326
+ else:
327
+ for recipient in self.header.recipients():
328
+ self.recipients.extend(self._get_names(recipient))
329
+ except Exception as e:
330
+ console.print_exception()
331
+ console.line(2)
332
+ logger.fatal(f"Failed on {self.file_id}")
333
+ console.line(2)
334
+ raise e
330
335
 
331
336
  # Remove self CCs
332
337
  recipients = [r for r in self.recipients if r != self.author or self.file_id in SELF_EMAILS_FILE_IDS]
@@ -19,7 +19,6 @@ from epstein_files.documents.emails.email_header import AUTHOR
19
19
  from epstein_files.documents.json_file import JsonFile
20
20
  from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
21
21
  from epstein_files.documents.other_file import OtherFile
22
- from epstein_files.util.constant.output_files import PICKLED_PATH
23
22
  from epstein_files.util.constant.strings import *
24
23
  from epstein_files.util.constant.urls import (EPSTEIN_MEDIA, EPSTEIN_WEB, JMAIL, epstein_media_person_url,
25
24
  epsteinify_name_url, epstein_web_person_url, search_jmail_url, search_twitter_url)
@@ -35,9 +34,10 @@ from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, add_cols_to_tab
35
34
  from epstein_files.util.search_result import SearchResult
36
35
  from epstein_files.util.timer import Timer
37
36
 
37
+ EXCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]
38
+ PICKLED_PATH = Path("the_epstein_files.pkl.gz")
38
39
  DEVICE_SIGNATURE = 'Device Signature'
39
40
  DEVICE_SIGNATURE_PADDING = (1, 0)
40
- NOT_INCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]
41
41
  SLOW_FILE_SECONDS = 1.0
42
42
 
43
43
  INVALID_FOR_EPSTEIN_WEB = JUNK_EMAILERS + KRASSNER_RECIPIENTS + [
@@ -94,23 +94,23 @@ class EpsteinFiles:
94
94
  self._tally_email_data()
95
95
 
96
96
  @classmethod
97
- def get_files(cls, timer: Timer | None = None, use_pickled: bool = False) -> 'EpsteinFiles':
97
+ def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
98
98
  """Alternate constructor that reads/writes a pickled version of the data ('timer' arg is for logging)."""
99
99
  timer = timer or Timer()
100
100
 
101
- if ((args.pickled or use_pickled) and PICKLED_PATH.exists()) and not args.overwrite_pickle:
101
+ if PICKLED_PATH.exists() and not args.overwrite_pickle:
102
102
  with gzip.open(PICKLED_PATH, 'rb') as file:
103
103
  epstein_files = pickle.load(file)
104
104
  timer.print_at_checkpoint(f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})")
105
105
  epstein_files.timer = timer
106
106
  return epstein_files
107
107
 
108
+ logger.warning(f"Building new cache file, this will take a few minutes...")
108
109
  epstein_files = EpsteinFiles(timer=timer)
109
110
 
110
- if args.overwrite_pickle or not PICKLED_PATH.exists():
111
- with gzip.open(PICKLED_PATH, 'wb') as file:
112
- pickle.dump(epstein_files, file)
113
- logger.warning(f"Pickled data to '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})...")
111
+ with gzip.open(PICKLED_PATH, 'wb') as file:
112
+ pickle.dump(epstein_files, file)
113
+ logger.warning(f"Pickled data to '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})...")
114
114
 
115
115
  timer.print_at_checkpoint(f'Processed {len(epstein_files.all_files):,} documents')
116
116
  return epstein_files
@@ -119,9 +119,9 @@ class EpsteinFiles:
119
119
  return self.imessage_logs + self.emails + self.other_files
120
120
 
121
121
  def all_emailers(self, include_useless: bool = False) -> list[str | None]:
122
- """Returns all emailers except Epstein and USELESS_EMAILERS, sorted from least frequent to most."""
122
+ """Returns all emailers except Epstein and EXCLUDED_EMAILERS, sorted from least frequent to most."""
123
123
  names = [a for a in self.email_author_counts.keys()] + [r for r in self.email_recipient_counts.keys()]
124
- names = names if include_useless else [e for e in names if e is None or e.lower() not in NOT_INCLUDED_EMAILERS]
124
+ names = names if include_useless else [e for e in names if e is None or e.lower() not in EXCLUDED_EMAILERS]
125
125
  return sorted(list(set(names)), key=lambda e: self.email_author_counts[e] + self.email_recipient_counts[e])
126
126
 
127
127
  def attributed_email_count(self) -> int:
@@ -200,10 +200,10 @@ class EpsteinFiles:
200
200
  def json_metadata(self) -> str:
201
201
  """Create a JSON string containing metadata for all the files."""
202
202
  metadata = {
203
- EMAIL_CLASS: _sorted_metadata(self.emails),
204
- JSON_FILE_CLASS: _sorted_metadata(self.json_files),
205
- MESSENGER_LOG_CLASS: _sorted_metadata(self.imessage_logs),
206
- OTHER_FILE_CLASS: _sorted_metadata(self.non_json_other_files()),
203
+ Email.__name__: _sorted_metadata(self.emails),
204
+ JsonFile.__name__: _sorted_metadata(self.json_files),
205
+ MessengerLog.__name__: _sorted_metadata(self.imessage_logs),
206
+ OtherFile.__name__: _sorted_metadata(self.non_json_other_files()),
207
207
  }
208
208
 
209
209
  return json.dumps(metadata, indent=4, sort_keys=True)
@@ -372,12 +372,12 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
372
372
  return counts
373
373
 
374
374
 
375
- def document_cls(document: Document) -> Type[Document]:
376
- search_area = document.text[0:5000] # Limit search area to avoid pointless scans of huge files
375
+ def document_cls(doc: Document) -> Type[Document]:
376
+ search_area = doc.text[0:5000] # Limit search area to avoid pointless scans of huge files
377
377
 
378
- if document.text[0] == '{':
378
+ if doc.text[0] == '{':
379
379
  return JsonFile
380
- elif isinstance(document.config, EmailCfg) or DETECT_EMAIL_REGEX.match(search_area):
380
+ elif isinstance(doc.config, EmailCfg) or (DETECT_EMAIL_REGEX.match(search_area) and doc.config is None):
381
381
  return Email
382
382
  elif MSG_REGEX.search(search_area):
383
383
  return MessengerLog
@@ -1,11 +1,10 @@
1
1
  from pathlib import Path
2
2
 
3
- PICKLED_PATH = Path("the_epstein_files.pkl.gz")
4
-
5
- EPSTEIN_FILES_NOV_2025 = 'epstein_files_nov_2025'
6
3
  URLS_ENV = '.urls.env'
7
4
 
5
+ # Files output by the code
8
6
  HTML_DIR = Path('docs')
7
+ EPSTEIN_FILES_NOV_2025 = 'epstein_files_nov_2025'
9
8
  ALL_EMAILS_PATH = HTML_DIR.joinpath(f'all_emails_{EPSTEIN_FILES_NOV_2025}.html')
10
9
  JSON_METADATA_PATH = HTML_DIR.joinpath(f'file_metadata_{EPSTEIN_FILES_NOV_2025}.json')
11
10
  TEXT_MSGS_HTML_PATH = HTML_DIR.joinpath('index.html')
@@ -2,13 +2,6 @@ import re
2
2
  from typing import Literal
3
3
 
4
4
 
5
- # Document subclass names (this sucks)
6
- DOCUMENT_CLASS = 'Document'
7
- EMAIL_CLASS = 'Email'
8
- JSON_FILE_CLASS = 'JsonFile'
9
- MESSENGER_LOG_CLASS = 'MessengerLog'
10
- OTHER_FILE_CLASS = 'OtherFile'
11
-
12
5
  # categories
13
6
  ACADEMIA = 'academia'
14
7
  ARTS = 'arts'
@@ -27,6 +20,7 @@ POLITICS = 'politics'
27
20
  PROPERTY = 'property'
28
21
  PUBLICIST = 'publicist'
29
22
  REPUTATION = 'reputation'
23
+ SKYPE_LOG= 'skype log'
30
24
  SOCIAL = 'social'
31
25
  SPEECH = 'speech'
32
26
 
@@ -76,5 +70,12 @@ FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}(\d{{6}}(_\d{{1,2}})?)")
76
70
  FILE_NAME_REGEX = re.compile(fr"{FILE_STEM_REGEX.pattern}(\.txt(\.json)?)?")
77
71
  QUESTION_MARKS_REGEX = re.compile(fr' {re.escape(QUESTION_MARKS)}$')
78
72
 
73
+ # Document subclass names (this sucks)
74
+ DOCUMENT_CLASS = 'Document'
75
+ EMAIL_CLASS = 'Email'
76
+ JSON_FILE_CLASS = 'JsonFile'
77
+ MESSENGER_LOG_CLASS = 'MessengerLog'
78
+ OTHER_FILE_CLASS = 'OtherFile'
79
+
79
80
 
80
81
  remove_question_marks = lambda name: QUESTION_MARKS_REGEX.sub('', name)
@@ -47,7 +47,7 @@ extracted_file_url = lambda f: f"{EXTRACTS_BASE_URL}/{f}"
47
47
  COFFEEZILLA_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=061ce61c9e70bdfd'
48
48
  COURIER_NEWSROOM_ARCHIVE_URL = 'https://journaliststudio.google.com/pinpoint/search?collection=092314e384a58618'
49
49
  EPSTEINIFY_URL = 'https://epsteinify.com'
50
- EPSTEIN_MEDIA_URL = 'https://www.epstein.media'
50
+ EPSTEIN_MEDIA_URL = 'https://epstein.media'
51
51
  EPSTEIN_WEB_URL = 'https://epsteinweb.org'
52
52
  JMAIL_URL = 'https://jmail.world'
53
53
  OVERSIGHT_REPUBLICANS_PRESSER_URL = 'https://oversight.house.gov/release/oversight-committee-releases-additional-epstein-estate-documents/'
@@ -1,5 +1,6 @@
1
1
  import re
2
2
  from copy import deepcopy
3
+ from typing import cast
3
4
 
4
5
  from dateutil.parser import parse
5
6
 
@@ -84,7 +85,7 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
84
85
  JAMES_HILL: re.compile(r"hill, james e.|james.e.hill@abc.com", re.IGNORECASE),
85
86
  JEAN_LUC_BRUNEL: re.compile(r'Jean[- ]Luc Brunel?', re.IGNORECASE),
86
87
  JEFF_FULLER: re.compile(r"jeff@mc2mm.com|Jeff Fuller", re.IGNORECASE),
87
- JEFFREY_EPSTEIN: re.compile(r'[djl]ee[vy]acation[©@]?g?(mail.com)?|Epstine|\bJEE?\b|Jeffrey E((sp|ps)tein?)?|jeeproject@yahoo.com|J Jep|Jeffery Edwards|(?<!Mark L. )Epstein', re.IGNORECASE),
88
+ JEFFREY_EPSTEIN: re.compile(r'[djl]\s?ee[vy]acation[©@]?g?(mail.com)?|Epstine|\bJEE?\b|Jeffrey E((sp|ps)tein?)?|jeeproject@yahoo.com|J Jep|Jeffery Edwards|(?<!Mark L. )Epstein', re.IGNORECASE),
88
89
  JESSICA_CADWELL: re.compile(r'Jessica Cadwell?', re.IGNORECASE),
89
90
  JOHNNY_EL_HACHEM: re.compile(r'el hachem johnny|johnny el hachem', re.IGNORECASE),
90
91
  JOI_ITO: re.compile(r'ji@media.mit.?edu|(joichi|joi)( Ito)?', re.IGNORECASE),
@@ -94,7 +95,7 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
94
95
  LANDON_THOMAS: re.compile(r'lando[nr] thomas( jr)?|thomas jr.?, lando[nr]', re.IGNORECASE),
95
96
  LARRY_SUMMERS: re.compile(r'(La(wrence|rry).{1,5})?Summers?|^LH$|LHS|Ihsofficel', re.IGNORECASE),
96
97
  LAWRANCE_VISOSKI: re.compile(r'La(rry|wrance) Visoski?|Lvjet', re.IGNORECASE),
97
- LAWRENCE_KRAUSS: re.compile(r'Lawrence Kraus|lawkrauss', re.IGNORECASE),
98
+ LAWRENCE_KRAUSS: re.compile(r'Lawrence Kraus|[jl]awkrauss', re.IGNORECASE),
98
99
  LEON_BLACK: re.compile(r'Leon Black?', re.IGNORECASE),
99
100
  MANUELA_MARTINEZ: re.compile(fr'Manuela (- Mega Partners|Martinez)', re.IGNORECASE),
100
101
  MARIANA_IDZKOWSKA: re.compile(r'Mariana [Il]d[źi]kowska?', re.IGNORECASE),
@@ -268,7 +269,7 @@ SHIMON_POST = 'The Shimon Post'
268
269
  SHIMON_POST_ARTICLE = f'selection of articles about the mideast'
269
270
  SINGLE_PAGE = 'single page of'
270
271
  STRANGE_BEDFELLOWS = "'Strange Bedfellows' list of invitees f. Johnny Depp, Woody Allen, Obama, and more"
271
- SWEDISH_LIFE_SCIENCES_SUMMIT = f"{BARBRO_C_EHNBOM}'s Swedish American Life Science Summit"
272
+ SWEDISH_LIFE_SCIENCES_SUMMIT = f"{BARBRO_C_EHNBOM}'s Swedish American Life Science Summit (SALSS)"
272
273
  THE_REAL_DEAL_ARTICLE = 'article by Keith Larsen'
273
274
  TRUMP_DISCLOSURES = f"Donald Trump financial disclosures from U.S. Office of Government Ethics"
274
275
  UBS_CIO_REPORT = 'CIO Monthly Extended report'
@@ -371,8 +372,8 @@ TEXTS_CONFIG = CONFIRMED_TEXTS_CONFIG + UNCONFIRMED_TEXTS_CONFIG
371
372
  ########################################################################################################
372
373
 
373
374
  # Some emails have a lot of uninteresting CCs
374
- IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS: list[str | None] = ['Allen West', 'Rafael Bardaji', 'Philip Kafka', 'Herb Goodman', 'Grant Seeger', 'Lisa Albert', 'Janet Kafka', 'James Ramsey', 'ACT for America', 'John Zouzelka', 'Joel Dunn', 'Nate McClain', 'Bennet Greenwald', 'Taal Safdie', 'Uri Fouzailov', 'Neil Anderson', 'Nate White', 'Rita Hortenstine', 'Henry Hortenstine', 'Gary Gross', 'Forrest Miller', 'Bennett Schmidt', 'Val Sherman', 'Marcie Brown', 'Michael Horowitz', 'Marshall Funk']
375
- FLIGHT_IN_2012_PEOPLE: list[str | None] = ['Francis Derby', 'Januiz Banasiak', 'Louella Rabuyo', 'Richard Barnnet']
375
+ IRAN_DEAL_RECIPIENTS = ['Allen West', 'Rafael Bardaji', 'Philip Kafka', 'Herb Goodman', 'Grant Seeger', 'Lisa Albert', 'Janet Kafka', 'James Ramsey', 'ACT for America', 'John Zouzelka', 'Joel Dunn', 'Nate McClain', 'Bennet Greenwald', 'Taal Safdie', 'Uri Fouzailov', 'Neil Anderson', 'Nate White', 'Rita Hortenstine', 'Henry Hortenstine', 'Gary Gross', 'Forrest Miller', 'Bennett Schmidt', 'Val Sherman', 'Marcie Brown', 'Michael Horowitz', 'Marshall Funk']
376
+ FLIGHT_IN_2012_PEOPLE = ['Francis Derby', 'Januiz Banasiak', 'Louella Rabuyo', 'Richard Barnnet']
376
377
 
377
378
  EMAILS_CONFIG = [
378
379
  EmailCfg(id='032436', author=ALIREZA_ITTIHADIEH, attribution_reason='Signature'),
@@ -491,9 +492,6 @@ EMAILS_CONFIG = [
491
492
  EmailCfg(id='032727', author=KATHRYN_RUEMMLER, attribution_reason=KATHY_REASON, is_attribution_uncertain=True),
492
493
  EmailCfg(id='030478', author=LANDON_THOMAS),
493
494
  EmailCfg(id='029013', author=LARRY_SUMMERS, recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
494
- EmailCfg(id='032206', author=LAWRENCE_KRAUSS), # More of a text convo?
495
- EmailCfg(id='032208', author=LAWRENCE_KRAUSS, recipients=[JEFFREY_EPSTEIN]), # More of a text convo?
496
- EmailCfg(id='032209', author=LAWRENCE_KRAUSS, recipients=[JEFFREY_EPSTEIN]), # More of a text convo?
497
495
  EmailCfg(id='029196', author=LAWRENCE_KRAUSS, recipients=[JEFFREY_EPSTEIN], actual_text='Talk in 40?'),
498
496
  EmailCfg(id='033593', author=LAWRANCE_VISOSKI, attribution_reason='Signature'),
499
497
  EmailCfg(id='033370', author=LAWRANCE_VISOSKI, attribution_reason=LARRY_REASON),
@@ -575,7 +573,7 @@ EMAILS_CONFIG = [
575
573
  attribution_reason='ends with "Respectfully, terry"',
576
574
  author=TERRY_KAFKA,
577
575
  fwded_text_after='From: Mike Cohen',
578
- recipients=[JEFFREY_EPSTEIN, MARK_EPSTEIN, MICHAEL_BUCHHOLTZ] + IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS,
576
+ recipients=[JEFFREY_EPSTEIN, MARK_EPSTEIN, MICHAEL_BUCHHOLTZ] + IRAN_DEAL_RECIPIENTS,
579
577
  duplicate_ids=['028482'],
580
578
  ),
581
579
  EmailCfg(id='029992', author=TERRY_KAFKA, attribution_reason='Quoted reply'),
@@ -600,7 +598,6 @@ EMAILS_CONFIG = [
600
598
  EmailCfg(id='022202', recipients=[JEAN_LUC_BRUNEL], attribution_reason='Follow up / reply', duplicate_ids=['029975']),
601
599
  EmailCfg(id='022187', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
602
600
  EmailCfg(id='031489', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (unfixable)
603
- EmailCfg(id='032210', recipients=[JEFFREY_EPSTEIN]), # More of a text convo?
604
601
  EmailCfg(id='030347', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
605
602
  EmailCfg(id='030367', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
606
603
  EmailCfg(id='033274', recipients=[JEFFREY_EPSTEIN]), # this is a note sent to self
@@ -751,7 +748,7 @@ EMAILS_CONFIG = [
751
748
  EmailCfg(id='031118', duplicate_ids=['019465']),
752
749
  EmailCfg(id='031912', duplicate_ids=['032158']),
753
750
  EmailCfg(id='030587', duplicate_ids=['030514']),
754
- EmailCfg(id='029773', duplicate_ids=['012685']),
751
+ EmailCfg(id='029773', duplicate_ids=['012685'], fwded_text_after='Omar Quadhafi'),
755
752
  EmailCfg(id='033297', duplicate_ids=['033586']),
756
753
  EmailCfg(id='031089', duplicate_ids=['018084']),
757
754
  EmailCfg(id='031088', duplicate_ids=['030885']),
@@ -1195,7 +1192,7 @@ OTHER_FILES_CONFERENCES = [
1195
1192
  DocCfg(id='019300', author=SVETLANA_POZHIDAEVA, description=f'{WOMEN_EMPOWERMENT} f. {KATHRYN_RUEMMLER}', date='2019-04-05'),
1196
1193
  DocCfg(id='022267', author=SVETLANA_POZHIDAEVA, description=f'{WOMEN_EMPOWERMENT} founder essay about growing the seminar business'),
1197
1194
  DocCfg(id='022407', author=SVETLANA_POZHIDAEVA, description=f'{WOMEN_EMPOWERMENT} seminar pitch deck'),
1198
- DocCfg(id='017524', author=SWEDISH_LIFE_SCIENCES_SUMMIT, description=f"2012 program"),
1195
+ DocCfg(id='017524', author=SWEDISH_LIFE_SCIENCES_SUMMIT, description=f"2012 program emailed to Epstein by {BARBRO_C_EHNBOM} in 031226", date='2012-08-18'),
1199
1196
  DocCfg(id='026747', author=SWEDISH_LIFE_SCIENCES_SUMMIT, description=f"2017 program", date='2017-08-23'),
1200
1197
  DocCfg(id='014951', author='TED Talks', description=f"2017 program", date='2017-04-20'),
1201
1198
  DocCfg(id='024179', author=UN_GENERAL_ASSEMBLY, description=f'president and first lady schedule', date='2012-09-21'),
@@ -1326,7 +1323,7 @@ OTHER_FILES_LETTERS = [
1326
1323
  ]
1327
1324
 
1328
1325
  OTHER_FILES_PROPERTY = [
1329
- DocCfg(id='026759', author='Great Bay Condominium Owners Association', description=f'{PRESS_RELEASE} by about Hurricane Irma damage', date='2017-09-13'),
1326
+ DocCfg(id='026759', author='Great Bay Condominium Owners Association', description=f'{PRESS_RELEASE} about Hurricane Irma damage', date='2017-09-13'),
1330
1327
  DocCfg(id='016602', author=PALM_BEACH_CODE_ENFORCEMENT, description='board minutes', date='2008-04-17'),
1331
1328
  DocCfg(id='016554', author=PALM_BEACH_CODE_ENFORCEMENT, description='board minutes', date='2008-07-17', duplicate_ids=['016616', '016574']),
1332
1329
  DocCfg(id='027068', author=THE_REAL_DEAL, description=f"{THE_REAL_DEAL_ARTICLE} Palm House Hotel Bankruptcy and EB-5 Visa Fraud Allegations"),
@@ -1379,8 +1376,8 @@ OTHER_FILES_SOCIAL = [
1379
1376
  ]
1380
1377
 
1381
1378
  OTHER_FILES_POLITICS = [
1382
- DocCfg(id='029918', author=DIANA_DEGETTE_CAMPAIGN, description=f"bio", date='2012-01-01'),
1383
- DocCfg(id='031184', author=DIANA_DEGETTE_CAMPAIGN, description=f"fundraiser invitation"),
1379
+ DocCfg(id='029918', author=DIANA_DEGETTE_CAMPAIGN, description=f"bio", date='2012-09-27'),
1380
+ DocCfg(id='031184', author=DIANA_DEGETTE_CAMPAIGN, description=f"invitation to fundraiser hosted by {BARBRO_C_EHNBOM}", date='2012-09-27'),
1384
1381
  DocCfg(id='026827', author='Scowcroft Group', description=f'report on ISIS', date='2015-11-14'),
1385
1382
  DocCfg(id='024294', author=STACEY_PLASKETT, description=f"campaign flier", date='2016-10-01'),
1386
1383
  DocCfg(
@@ -1482,6 +1479,11 @@ OTHER_FILES_ARTS = [
1482
1479
  OTHER_FILES_MISC = [
1483
1480
  DocCfg(id='022780', category=FLIGHT_LOGS),
1484
1481
  DocCfg(id='022816', category=FLIGHT_LOGS),
1482
+ DocCfg(id='032206', category=SKYPE_LOG, author=LAWRENCE_KRAUSS),
1483
+ DocCfg(id='032208', category=SKYPE_LOG, author=LAWRENCE_KRAUSS),
1484
+ DocCfg(id='032209', category=SKYPE_LOG, author=LAWRENCE_KRAUSS),
1485
+ DocCfg(id='018224', category=SKYPE_LOG, author=LAWRENCE_KRAUSS, description=f'conversations with linkspirit (French?) and {LAWRENCE_KRAUSS}'),
1486
+ DocCfg(id='032210', category=SKYPE_LOG, description=f'conversation with linkspirit'),
1485
1487
  DocCfg(
1486
1488
  id='025147',
1487
1489
  author=BROCKMAN_INC,
@@ -1496,7 +1498,6 @@ OTHER_FILES_MISC = [
1496
1498
  DocCfg(id='027074', author=FEMALE_HEALTH_COMPANY, description=f"pitch deck (USAID was a customer)"),
1497
1499
  DocCfg(id='032735', author=GORDON_GETTY, description=f"on Trump", date='2018-03-20'), # Dated based on concurrent emails from Getty
1498
1500
  DocCfg(id='025540', author=JEFFREY_EPSTEIN, description=f"rough draft of Epstein's side of the story?"),
1499
- DocCfg(id='018224', author=LAWRENCE_KRAUSS, description=f"Skype conversation log"),
1500
1501
  DocCfg(id='026634', author='Michael Carrier', description=f"comments about an Apollo linked hedge fund 'DE Fund VIII'"),
1501
1502
  DocCfg(id='031425', author=SCOTT_J_LINK, description=f'completely redacted email from'),
1502
1503
  DocCfg(id='020447', author='Working Group on Chinese Influence Activities in the U.S.', description=f'Promoting Constructive Vigilance'),
@@ -1589,8 +1590,8 @@ SENT_FROM_REGEX = re.compile(r'^(?:(Please forgive|Sorry for all the) typos.{1,4
1589
1590
 
1590
1591
 
1591
1592
  # Error checking.
1592
- if len(OTHER_FILES_CONFIG) != 438:
1593
- logger.warning(f"Only {len(OTHER_FILES_CONFIG)} configured other files!")
1593
+ if len(OTHER_FILES_CONFIG) != 442:
1594
+ logger.warning(f"Found {len(OTHER_FILES_CONFIG)} configured other files!")
1594
1595
 
1595
1596
  encountered_file_ids = set()
1596
1597
 
@@ -109,7 +109,9 @@ class DocCfg:
109
109
 
110
110
  def info_str(self) -> str | None:
111
111
  """String that summarizes what is known about this document."""
112
- if self.category == REPUTATION:
112
+ if self.category and not self.description:
113
+ return self.category
114
+ elif self.category == REPUTATION:
113
115
  return f"{REPUTATION_MGMT}: {self.description}"
114
116
  elif self.author and self.description:
115
117
  if self.category in [ACADEMIA, BOOK]:
@@ -0,0 +1,84 @@
1
+ import logging
2
+ from argparse import ArgumentParser
3
+ from os import environ
4
+ from pathlib import Path
5
+ from sys import argv
6
+
7
+ from epstein_files.util.logging import datefinder_logger, env_log_level, logger
8
+
9
+ COUNT_WORDS_SCRIPT = 'count_words.py'
10
+ DEFAULT_WIDTH = 145
11
+ HTML_SCRIPTS = ['epstein_generate', 'generate_html.py', COUNT_WORDS_SCRIPT]
12
+
13
+
14
+ parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML page.")
15
+ parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
16
+ parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='overwrite cached EpsteinFiles')
17
+
18
+ output = parser.add_argument_group('OUTPUT')
19
+ output.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
20
+ output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
21
+ output.add_argument('--build', '-b', action='store_true', help='write output to HTML file')
22
+ output.add_argument('--make-clean', '-mc', action='store_true', help='delete all build artifact HTML and JSON files')
23
+ output.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
24
+ output.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
25
+ output.add_argument('--output-texts', '-ot', action='store_true', help='generate text messages section')
26
+ output.add_argument('--suppress-output', action='store_true', help='no output to terminal (use with --build)')
27
+ output.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use (in characters)')
28
+ output.add_argument('--use-epstein-web-links', action='store_true', help='use epsteinweb.org links instead of epstein.media')
29
+
30
+ scripts = parser.add_argument_group('SCRIPTS', 'Arguments used only by epstein_search, epstein_show, epstein_diff')
31
+ scripts.add_argument('positional_args', nargs='*', help='strings to search for, file IDs to show or diff, etc.')
32
+ scripts.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (only used by scripts)')
33
+ scripts.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (only used by epstein_search)')
34
+
35
+ debug = parser.add_argument_group('DEBUG')
36
+ debug.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
37
+ debug.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
38
+ debug.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
39
+ debug.add_argument('--json-metadata', '-jm', action='store_true', help='dump JSON metadata for all files')
40
+ debug.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats at the end')
41
+ debug.add_argument('--sort-alphabetical', action='store_true', help='sort emailers alphabetically in counts table')
42
+ debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
43
+ args = parser.parse_args()
44
+
45
+ current_script = Path(argv[0]).name
46
+ is_env_var_set = lambda s: len(environ.get(s) or '') > 0
47
+ is_html_script = current_script in HTML_SCRIPTS
48
+
49
+ args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
50
+ args.output_emails = args.output_emails or args.all_emails
51
+ args.output_other_files = args.output_other_files or args.all_other_files
52
+ args.overwrite_pickle = args.overwrite_pickle or (is_env_var_set('OVERWRITE_PICKLE') and not is_env_var_set('PICKLED'))
53
+ args.width = args.width if is_html_script else None
54
+ specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]
55
+
56
+
57
+ # Log level args
58
+ if args.deep_debug:
59
+ logger.setLevel(logging.DEBUG)
60
+ elif args.debug:
61
+ logger.setLevel(logging.INFO)
62
+ elif args.suppress_logs:
63
+ logger.setLevel(logging.FATAL)
64
+ elif not env_log_level:
65
+ logger.setLevel(logging.WARNING)
66
+
67
+ logger.info(f'Log level set to {logger.level}...')
68
+ datefinder_logger.setLevel(logger.level)
69
+
70
+
71
+ # Massage args that depend on other args to the appropriate state
72
+ if not (args.json_metadata or args.output_texts or args.output_emails or args.output_other_files):
73
+ if is_html_script and current_script != COUNT_WORDS_SCRIPT and not args.make_clean and not args.colors_only:
74
+ logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
75
+
76
+ args.output_texts = True
77
+ args.output_emails = True
78
+ args.output_other_files = True
79
+
80
+ if args.use_epstein_web_links:
81
+ logger.warning(f"Using links to epsteinweb.org instead of epsteinify.com...")
82
+
83
+ if args.debug:
84
+ logger.warning(f"Invocation args:\ncurrent_script={current_script}\nis_html_script={is_html_script},\nspecified_names={specified_names},\nargs={args}")
@@ -159,7 +159,7 @@ HIGHLIGHTED_NAMES = [
159
159
  pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
160
160
  emailers = {
161
161
  ALIREZA_ITTIHADIEH: 'CEO Freestream Aircraft Limited',
162
- BARBRO_C_EHNBOM: 'Swedish pharmaceuticals',
162
+ BARBRO_C_EHNBOM: 'Swedish pharmaceuticals, SALSS',
163
163
  FRED_HADDAD: "co-founder of Heck's in West Virginia",
164
164
  GERALD_BARTON: "Maryland property developer Landmark Land Company, fan of Trump's Irish golf course",
165
165
  GORDON_GETTY: 'heir of oil tycoon J. Paul Getty',
@@ -296,6 +296,7 @@ HIGHLIGHTED_NAMES = [
296
296
  emailers = {
297
297
  DAVID_STERN: f'emailed Epstein from Moscow, appears to know chairman of {DEUTSCHE_BANK}',
298
298
  JONATHAN_FARKAS: "heir to the Alexander's department store fortune",
299
+ 'linkspirit': "Skype username of someone Epstein communicated with",
299
300
  'Peter Thomas Roth': 'student of Epstein at Dalton, skincare company founder',
300
301
  STEPHEN_HANSON: None,
301
302
  TOM_BARRACK: 'long time friend of Trump',
@@ -304,7 +305,7 @@ HIGHLIGHTED_NAMES = [
304
305
  HighlightedNames(
305
306
  label='finance',
306
307
  style='green',
307
- pattern=r'Apollo|Ari\s*Glass|(Bernie\s*)?Madoff|Black(rock|stone)|BofA|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
308
+ pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
308
309
  emailers={
309
310
  AMANDA_ENS: 'Citigroup',
310
311
  DANIEL_SABBA: 'UBS Investment Bank',
@@ -587,7 +588,7 @@ HIGHLIGHTED_NAMES = [
587
588
  HighlightedText(
588
589
  label='phone_number',
589
590
  style='bright_green',
590
- pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|[\d+]{10,12}",
591
+ pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|\b[\d+]{10,12}\b",
591
592
  ),
592
593
  ]
593
594
 
@@ -7,7 +7,6 @@ from epstein_files.util.constant.output_files import JSON_METADATA_PATH
7
7
  from epstein_files.util.constant import urls
8
8
  from epstein_files.util.constant.html import *
9
9
  from epstein_files.util.constant.names import *
10
- from epstein_files.util.constant.strings import EMAIL_CLASS, MESSENGER_LOG_CLASS
11
10
  from epstein_files.util.data import dict_sets_to_lists
12
11
  from epstein_files.util.env import args, specified_names
13
12
  from epstein_files.util.logging import log_file_write, logger
@@ -122,9 +121,9 @@ def print_json_metadata(epstein_files: EpsteinFiles) -> None:
122
121
  def print_json_stats(epstein_files: EpsteinFiles) -> None:
123
122
  console.line(5)
124
123
  console.print(Panel('JSON Stats Dump', expand=True, style='reverse bold'), '\n')
125
- print_json(f"{MESSENGER_LOG_CLASS} Sender Counts", MessengerLog.count_authors(epstein_files.imessage_logs), skip_falsey=True)
126
- print_json(f"{EMAIL_CLASS} Author Counts", epstein_files.email_author_counts, skip_falsey=True)
127
- print_json(f"{EMAIL_CLASS} Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
124
+ print_json(f"MessengerLog Sender Counts", MessengerLog.count_authors(epstein_files.imessage_logs), skip_falsey=True)
125
+ print_json(f"Email Author Counts", epstein_files.email_author_counts, skip_falsey=True)
126
+ print_json(f"Email Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
128
127
  print_json("Email signature_substitution_countss", epstein_files.email_signature_substitution_counts(), skip_falsey=True)
129
128
  print_json("email_author_device_signatures", dict_sets_to_lists(epstein_files.email_authors_to_device_signatures))
130
129
  print_json("email_sent_from_devices", dict_sets_to_lists(epstein_files.email_device_signatures_to_authors))
@@ -147,16 +146,12 @@ def print_text_messages(epstein_files: EpsteinFiles) -> None:
147
146
 
148
147
  def write_urls() -> None:
149
148
  """Write _URL style constant variables to a file bash scripts can load as env vars."""
150
- if args.output_file == 'index.html':
151
- logger.warning(f"Can't write env vars to '{args.output_file}', writing to '{URLS_ENV}' instead.\n")
152
- args.output_file = URLS_ENV
153
-
154
149
  url_vars = {
155
150
  k: v for k, v in vars(urls).items()
156
151
  if isinstance(v, str) and k.split('_')[-1] in ['URL'] and 'github.io' in v and 'BASE' not in k
157
152
  }
158
153
 
159
- with open(args.output_file, 'w') as f:
154
+ with open(URLS_ENV, 'w') as f:
160
155
  for var_name, url in url_vars.items():
161
156
  key_value = f"{var_name}='{url}'"
162
157
 
@@ -166,7 +161,7 @@ def write_urls() -> None:
166
161
  f.write(f"{key_value}\n")
167
162
 
168
163
  console.line()
169
- logger.warning(f"Wrote {len(url_vars)} URL variables to '{args.output_file}'\n")
164
+ logger.warning(f"Wrote {len(url_vars)} URL variables to '{URLS_ENV}'\n")
170
165
 
171
166
 
172
167
  def _verify_all_emails_were_printed(epstein_files: EpsteinFiles, already_printed_emails: list[Email]) -> None:
@@ -231,10 +231,13 @@ def print_other_site_link(is_header: bool = True) -> None:
231
231
  other_site_msg += f" Epstein's {other_site_type}s also generated by this code"
232
232
  markup_msg = link_markup(SITE_URLS[other_site_type], other_site_msg, OTHER_SITE_LINK_STYLE)
233
233
  print_centered(parenthesize(Text.from_markup(markup_msg)), style='bold')
234
- word_count_link = link_text_obj(WORD_COUNT_URL, 'site showing the most frequently used words in these communiques', OTHER_SITE_LINK_STYLE)
235
- print_centered(parenthesize(word_count_link))
236
- metadata_link = link_text_obj(JSON_METADATA_URL, 'metadata with author attribution explanations', OTHER_SITE_LINK_STYLE)
237
- print_centered(parenthesize(metadata_link))
234
+
235
+ if is_header:
236
+ metadata_link = link_text_obj(JSON_METADATA_URL, 'metadata with author attribution explanations', OTHER_SITE_LINK_STYLE)
237
+ print_centered(parenthesize(metadata_link))
238
+ word_count_link = link_text_obj(WORD_COUNT_URL, 'most frequently used words', OTHER_SITE_LINK_STYLE)
239
+ print_centered(parenthesize(word_count_link))
240
+ print_centered(parenthesize(link_text_obj(GH_PROJECT_URL, '@github', 'dark_orange3 bold')))
238
241
 
239
242
 
240
243
  def print_page_title(expand: bool = True, width: int | None = None) -> None:
@@ -247,8 +250,8 @@ def print_page_title(expand: bool = True, width: int | None = None) -> None:
247
250
  def print_panel(msg: str, style: str = 'black on white', padding: tuple | None = None, centered: bool = False) -> None:
248
251
  _padding: list[int] = list(padding or [0, 0, 0, 0])
249
252
  _padding[2] += 1 # Bottom pad
250
- panel = Panel(Text.from_markup(msg, justify='center'), width=70, style=style)
251
253
  actual_padding: tuple[int, int, int, int] = tuple(_padding)
254
+ panel = Panel(Text.from_markup(msg, justify='center'), width=70, style=style)
252
255
 
253
256
  if centered:
254
257
  console.print(Align.center(Padding(panel, actual_padding)))
@@ -335,6 +338,7 @@ def _print_external_links() -> None:
335
338
  print_centered(link_markup(COURIER_NEWSROOM_ARCHIVE_URL, 'Searchable Archive') + " (Courier Newsroom)")
336
339
  print_centered(link_markup(EPSTEINIFY_URL) + " (raw document images)")
337
340
  print_centered(link_markup(EPSTEIN_WEB_URL) + " (character summaries)")
341
+ print_centered(link_markup(EPSTEIN_MEDIA_URL) + " (raw document images)")
338
342
 
339
343
 
340
344
  # if args.deep_debug:
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "epstein-files"
3
- version = "1.0.4"
3
+ version = "1.0.5"
4
4
  description = "Tools for working with the Jeffrey Epstein documents released in November 2025."
5
5
  authors = ["Michel de Cryptadamus"]
6
6
  readme = "README.md"
@@ -1,80 +0,0 @@
1
- import logging
2
- from argparse import ArgumentParser
3
- from os import environ
4
- from pathlib import Path
5
- from sys import argv
6
-
7
- from epstein_files.util.logging import datefinder_logger, env_log_level, logger
8
-
9
- COUNT_WORDS_SCRIPT = 'count_words.py'
10
- DEFAULT_WIDTH = 154
11
- HTML_SCRIPTS = ['epstein_generate', 'generate_html.py', COUNT_WORDS_SCRIPT]
12
-
13
-
14
- parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML page.")
15
- parser.add_argument('--build', '-b', action='store_true', help='write output to file')
16
- parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
17
- parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just interesting ones')
18
- parser.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
19
- parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
20
- parser.add_argument('--output-file', '-out', metavar='FILE', default='index.html', help='write output to FILE in docs/ (default=index.html)')
21
- parser.add_argument('--output-emails', '-oe', action='store_true', help='generate other files section')
22
- parser.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
23
- parser.add_argument('--output-texts', '-ot', action='store_true', help='generate other files section')
24
- parser.add_argument('--pickled', '-p', action='store_true', help='use pickled EpsteinFiles object')
25
- parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='generate new pickled EpsteinFiles object')
26
- parser.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (only used by scripts)')
27
- parser.add_argument('--sort-alphabetical', '-alpha', action='store_true', help='sort emailers alphabetically in counts table')
28
- parser.add_argument('--suppress-output', '-s', action='store_true', help='no output to terminal (use with --build)')
29
- parser.add_argument('--use-epstein-web-links', '-use', action='store_true', help='use epsteinweb.org links instead of epstein.media')
30
- parser.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use')
31
- parser.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (only used by search script)')
32
- parser.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
33
- parser.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
34
- parser.add_argument('--make-clean', '-mc', action='store_true', help='delete all build artifact HTML and JSON files')
35
- parser.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
36
- parser.add_argument('--json-metadata', '-jm', action='store_true', help='dump JSON metadata for all files')
37
- parser.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats at the end')
38
- parser.add_argument('positional_args', nargs='*', help='Optional args (only used by helper scripts)')
39
- args = parser.parse_args()
40
-
41
- current_script = Path(argv[0]).name
42
- is_env_var_set = lambda s: len(environ.get(s) or '') > 0
43
- is_html_script = current_script in HTML_SCRIPTS
44
-
45
- args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
46
- args.output_emails = args.output_emails or args.all_emails
47
- args.output_other_files = args.output_other_files or args.all_other_files
48
- args.pickled = args.pickled or is_env_var_set('PICKLED') or args.colors_only or len(args.names or []) > 0
49
- args.width = args.width if is_html_script else None
50
- specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]
51
-
52
-
53
- # Log level args
54
- if args.deep_debug:
55
- logger.setLevel(logging.DEBUG)
56
- elif args.debug:
57
- logger.setLevel(logging.INFO)
58
- elif args.suppress_logs:
59
- logger.setLevel(logging.FATAL)
60
- elif not env_log_level:
61
- logger.setLevel(logging.WARNING)
62
-
63
- logger.info(f'Log level set to {logger.level}...')
64
- datefinder_logger.setLevel(logger.level)
65
-
66
-
67
- # Massage args that depend on other args to the appropriate state
68
- if not (args.json_metadata or args.output_texts or args.output_emails or args.output_other_files):
69
- if is_html_script and current_script != COUNT_WORDS_SCRIPT and not args.make_clean:
70
- logger.warning(f"No output section chosen; outputting default of texts, selected emails, and other files...")
71
-
72
- args.output_texts = True
73
- args.output_emails = True
74
- args.output_other_files = True
75
-
76
- if args.use_epstein_web_links:
77
- logger.warning(f"Using links to epsteinweb.org links instead of epsteinify.com...")
78
-
79
- if args.debug:
80
- logger.warning(f"Invocation args:\nis_html_script={is_html_script},\nspecified_names={specified_names},\nargs={args}")
File without changes