epstein-files 1.0.4__tar.gz → 1.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. {epstein_files-1.0.4 → epstein_files-1.0.6}/PKG-INFO +37 -18
  2. {epstein_files-1.0.4 → epstein_files-1.0.6}/README.md +36 -17
  3. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/__init__.py +24 -25
  4. epstein_files-1.0.6/epstein_files/count_words.py +72 -0
  5. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/document.py +1 -2
  6. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/email.py +15 -10
  7. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/json_file.py +4 -4
  8. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/messenger_log.py +2 -1
  9. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/other_file.py +2 -2
  10. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/epstein_files.py +40 -40
  11. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/constant/output_files.py +20 -4
  12. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/constant/strings.py +8 -8
  13. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/constant/urls.py +6 -21
  14. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/constants.py +19 -18
  15. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/doc_cfg.py +3 -1
  16. epstein_files-1.0.6/epstein_files/util/env.py +85 -0
  17. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/highlighted_group.py +4 -3
  18. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/output.py +29 -16
  19. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/rich.py +56 -28
  20. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/word_count.py +10 -10
  21. {epstein_files-1.0.4 → epstein_files-1.0.6}/pyproject.toml +2 -2
  22. epstein_files-1.0.4/epstein_files/util/env.py +0 -80
  23. {epstein_files-1.0.4 → epstein_files-1.0.6}/LICENSE +0 -0
  24. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/communication.py +0 -0
  25. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/emails/email_header.py +0 -0
  26. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/imessage/text_message.py +0 -0
  27. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/constant/common_words.py +0 -0
  28. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/constant/html.py +0 -0
  29. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/constant/names.py +0 -0
  30. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/data.py +0 -0
  31. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/file_helper.py +0 -0
  32. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/logging.py +0 -0
  33. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/search_result.py +0 -0
  34. {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/timer.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: epstein-files
3
- Version: 1.0.4
3
+ Version: 1.0.6
4
4
  Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
5
5
  Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
6
6
  License: GPL-3.0-or-later
@@ -32,6 +32,8 @@ Description-Content-Type: text/markdown
32
32
 
33
33
  # I Made Epstein's Text Messages Great Again
34
34
 
35
+ ![joi_ito](docs/joi_ito_gavin_is_clever_epstein_funds_bitcoin_dev_team.png)
36
+
35
37
  * [I Made Epstein's Text Messages Great Again (And You Should Read Them)](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great) post on [Substack](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great)
36
38
  * The Epstein text messages (and some of the emails along with summary information) generated by this code can be viewed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/).
37
39
  * All of His Emails along with descriptions of the 496 files that were neither emails nor text messages can be read at [another page also generated by this code](https://michelcrypt4d4mus.github.io/epstein_text_messages/all_emails_epstein_files_nov_2025.html).
@@ -40,27 +42,44 @@ Description-Content-Type: text/markdown
40
42
  * Configuration variables assigning specific `HOUSE_OVERSIGHT_XXXXXX.txt` file IDs (the `111111` part) as being emails to or from particular people based on various research and contributions can be found in [constants.py](./epstein_files/util/constants.py). Everything in `constants.py` appears in the JSON metadata linked above.
41
43
 
42
44
 
43
- ### Usage
44
- 1. Requires you have a local copy of OCR text from the House Oversight document dump in a directory `/path/to/epstein/ocr_txt_files`. You can download them from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8).
45
- 1. Dependencies are in [pyproject.toml](./pyproject.toml). Use `poetry install` for easiest time installing. `pip install .` may or may not work.
45
+ ## Usage
46
+ 1. You need a local copy of the OCR text files from the House Oversight document release in a directory such as `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8).
47
+ 1. Dependencies are listed in [pyproject.toml](./pyproject.toml). Use `poetry install` for the easiest installation. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
48
+
49
+ When running any of the tools, the `EPSTEIN_DOCS_DIR` environment variable must be set to the path of the folder of files you just downloaded. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
50
+
51
+ ```bash
52
+ EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate --help
53
+ ```
46
54
 
47
- You need to set the `DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
55
+ All the tools that come with the package require `EPSTEIN_DOCS_DIR` to be set. These are the available tools:
48
56
 
49
57
  ```bash
50
58
  # Generate color highlighted texts/emails/other files
51
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate
59
+ epstein_generate
60
+
61
+ # Search for a string:
62
+ epstein_search Bannon
63
+ # Or a regex:
64
+ epstein_search '\bSteve\s*Bannon\b'
52
65
 
53
- # Search
54
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_search Bannon
66
+ # Show a file with color highlighting of keywords
67
+ epstein_show 030999
68
+ # Show both the highlighted and raw versions of the file:
69
+ epstein_show --raw 030999
70
+ # This also works:
71
+ epstein_show HOUSE_OVERSIGHT_030999
55
72
 
56
- # Show a color highlighted file
57
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show 030999
58
- # This also works
59
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show HOUSE_OVERSIGHT_030999
73
+ # Count words used by Epstein and Bannon
74
+ epstein_word_count --name 'Jeffrey Epstein' --name 'Steve Bannon'
75
+
76
+ # Diff two epstein files after all the cleanup (stripping BOMs, matching newline chars, etc):
77
+ epstein_diff 030999 020442
60
78
  ```
61
79
 
80
+ The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc.
62
81
  Run `epstein_generate --help` for command line option assistance.
63
- The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc. Once you've run things once you can run the `epstein_generate --pickled` to load the cached fixed up data and things will be quick.
82
+
64
83
 
65
84
  #### As A Library
66
85
  ```python
@@ -69,18 +88,18 @@ epstein_files = EpsteinFiles.get_files()
69
88
 
70
89
  # All files
71
90
  for document in epstein_files.all_documents():
72
- do_stuff()
91
+ do_stuff(document)
73
92
 
74
93
  # Emails
75
94
  for email in epstein_files.emails:
76
- do_stuff()
95
+ do_stuff(email)
77
96
 
78
97
  # iMessage Logs
79
98
  for imessage_log in epstein_files.imessage_logs:
80
- do_stuff()
99
+ do_stuff(imessage_log)
81
100
 
82
101
  # Other Files
83
- for document in epstein_files.other_files:
84
- do_stuff()
102
+ for file in epstein_files.other_files:
103
+ do_stuff(file)
85
104
  ```
86
105
 
@@ -1,5 +1,7 @@
1
1
  # I Made Epstein's Text Messages Great Again
2
2
 
3
+ ![joi_ito](docs/joi_ito_gavin_is_clever_epstein_funds_bitcoin_dev_team.png)
4
+
3
5
  * [I Made Epstein's Text Messages Great Again (And You Should Read Them)](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great) post on [Substack](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great)
4
6
  * The Epstein text messages (and some of the emails along with summary information) generated by this code can be viewed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/).
5
7
  * All of His Emails along with descriptions of the 496 files that were neither emails nor text messages can be read at [another page also generated by this code](https://michelcrypt4d4mus.github.io/epstein_text_messages/all_emails_epstein_files_nov_2025.html).
@@ -8,27 +10,44 @@
8
10
  * Configuration variables assigning specific `HOUSE_OVERSIGHT_XXXXXX.txt` file IDs (the `111111` part) as being emails to or from particular people based on various research and contributions can be found in [constants.py](./epstein_files/util/constants.py). Everything in `constants.py` appears in the JSON metadata linked above.
9
11
 
10
12
 
11
- ### Usage
12
- 1. Requires you have a local copy of OCR text from the House Oversight document dump in a directory `/path/to/epstein/ocr_txt_files`. You can download them from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8).
13
- 1. Dependencies are in [pyproject.toml](./pyproject.toml). Use `poetry install` for easiest time installing. `pip install .` may or may not work.
13
+ ## Usage
14
+ 1. You need a local copy of the OCR text files from the House Oversight document release in a directory such as `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8).
15
+ 1. Dependencies are listed in [pyproject.toml](./pyproject.toml). Use `poetry install` for the easiest installation. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
16
+
17
+ When running any of the tools, the `EPSTEIN_DOCS_DIR` environment variable must be set to the path of the folder of files you just downloaded. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
18
+
19
+ ```bash
20
+ EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate --help
21
+ ```
14
22
 
15
- You need to set the `DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
23
+ All the tools that come with the package require `EPSTEIN_DOCS_DIR` to be set. These are the available tools:
16
24
 
17
25
  ```bash
18
26
  # Generate color highlighted texts/emails/other files
19
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate
27
+ epstein_generate
28
+
29
+ # Search for a string:
30
+ epstein_search Bannon
31
+ # Or a regex:
32
+ epstein_search '\bSteve\s*Bannon\b'
20
33
 
21
- # Search
22
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_search Bannon
34
+ # Show a file with color highlighting of keywords
35
+ epstein_show 030999
36
+ # Show both the highlighted and raw versions of the file:
37
+ epstein_show --raw 030999
38
+ # This also works:
39
+ epstein_show HOUSE_OVERSIGHT_030999
23
40
 
24
- # Show a color highlighted file
25
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show 030999
26
- # This also works
27
- DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show HOUSE_OVERSIGHT_030999
41
+ # Count words used by Epstein and Bannon
42
+ epstein_word_count --name 'Jeffrey Epstein' --name 'Steve Bannon'
43
+
44
+ # Diff two epstein files after all the cleanup (stripping BOMs, matching newline chars, etc):
45
+ epstein_diff 030999 020442
28
46
  ```
29
47
 
48
+ The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc.
30
49
  Run `epstein_generate --help` for command line option assistance.
31
- The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc. Once you've run things once you can run the `epstein_generate --pickled` to load the cached fixed up data and things will be quick.
50
+
32
51
 
33
52
  #### As A Library
34
53
  ```python
@@ -37,17 +56,17 @@ epstein_files = EpsteinFiles.get_files()
37
56
 
38
57
  # All files
39
58
  for document in epstein_files.all_documents():
40
- do_stuff()
59
+ do_stuff(document)
41
60
 
42
61
  # Emails
43
62
  for email in epstein_files.emails:
44
- do_stuff()
63
+ do_stuff(email)
45
64
 
46
65
  # iMessage Logs
47
66
  for imessage_log in epstein_files.imessage_logs:
48
- do_stuff()
67
+ do_stuff(imessage_log)
49
68
 
50
69
  # Other Files
51
- for document in epstein_files.other_files:
52
- do_stuff()
70
+ for file in epstein_files.other_files:
71
+ do_stuff(file)
53
72
  ```
@@ -10,11 +10,12 @@ from sys import exit
10
10
 
11
11
  from dotenv import load_dotenv
12
12
  load_dotenv()
13
-
14
13
  from rich.markup import escape
15
14
  from rich.padding import Padding
16
15
  from rich.panel import Panel
16
+ from rich.text import Text
17
17
 
18
+ from epstein_files.count_words import write_word_counts_html
18
19
  from epstein_files.epstein_files import EpsteinFiles, document_cls
19
20
  from epstein_files.documents.document import INFO_PADDING, Document
20
21
  from epstein_files.documents.email import Email
@@ -24,22 +25,25 @@ from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, TEXT_MSGS_
24
25
  from epstein_files.util.env import args, specified_names
25
26
  from epstein_files.util.file_helper import coerce_file_path, extract_file_id
26
27
  from epstein_files.util.logging import logger
27
- from epstein_files.util.output import print_emails, print_json_metadata, print_json_stats, print_text_messages, write_urls
28
+ from epstein_files.util.output import print_emails, print_json_files, print_json_metadata, print_json_stats, print_text_messages, write_urls
28
29
  from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
29
30
  from epstein_files.util.timer import Timer
30
31
 
32
+ timer = Timer()
33
+ epstein_files = EpsteinFiles.get_files(timer)
34
+
31
35
 
32
36
  def generate_html() -> None:
33
37
  if args.make_clean:
34
38
  make_clean()
39
+ write_urls()
35
40
  exit()
36
-
37
- timer = Timer()
38
- epstein_files = EpsteinFiles.get_files(timer)
39
-
40
- if args.json_metadata:
41
+ elif args.json_metadata:
41
42
  print_json_metadata(epstein_files)
42
43
  exit()
44
+ elif args.output_json_files:
45
+ print_json_files(epstein_files)
46
+ exit()
43
47
 
44
48
  print_header(epstein_files)
45
49
 
@@ -75,7 +79,7 @@ def epstein_diff():
75
79
  def epstein_search():
76
80
  """Search the cleaned up text of the files."""
77
81
  _assert_positional_args()
78
- epstein_files = EpsteinFiles.get_files(use_pickled=True)
82
+ epstein_files = EpsteinFiles.get_files()
79
83
 
80
84
  for search_term in args.positional_args:
81
85
  temp_highlighter = build_highlighter(search_term)
@@ -103,32 +107,27 @@ def epstein_show():
103
107
  """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
104
108
  _assert_positional_args()
105
109
  ids = [extract_file_id(arg) for arg in args.positional_args]
110
+ raw_docs = [Document(coerce_file_path(id)) for id in ids]
111
+ docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
106
112
  console.line()
107
113
 
108
- if args.pickled:
109
- epstein_files = EpsteinFiles.get_files(use_pickled=True)
110
- docs = epstein_files.get_documents_by_id(ids)
111
- else:
112
- raw_docs = [Document(coerce_file_path(id)) for id in ids]
113
- docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
114
-
115
114
  for doc in docs:
116
- console.line()
117
- console.print(doc)
115
+ if isinstance(doc, Email):
116
+ doc.truncation_allowed = False
117
+
118
+ console.print('\n', doc, '\n')
118
119
 
119
120
  if args.raw:
120
- console.line()
121
- console.print(Panel(f"*** {doc.url_slug} RAW ***", expand=False, style=doc._border_style()))
122
- console.print(escape(doc.raw_text()))
121
+ console.print(Panel(Text("RAW: ").append(doc.summary()), expand=False, style=doc._border_style()))
122
+ console.print(escape(doc.raw_text()), '\n')
123
123
 
124
124
  if isinstance(doc, Email):
125
- console.line()
126
- console.print(Panel(f"*** {doc.url_slug} actual_text ***", expand=False, style=doc._border_style()))
127
- console.print(escape(doc._actual_text()))
125
+ console.print(Panel(Text("actual_text: ").append(doc.summary()), expand=False, style=doc._border_style()))
126
+ console.print(escape(doc._actual_text()), '\n')
128
127
 
129
128
 
130
- def epstein_dump_urls() -> None:
131
- write_urls()
129
+ def epstein_word_count() -> None:
130
+ write_word_counts_html()
132
131
 
133
132
 
134
133
  def _assert_positional_args():
@@ -0,0 +1,72 @@
1
+ # Count word usage in emails and texts
2
+ import re
3
+
4
+ from epstein_files.epstein_files import EpsteinFiles
5
+ from epstein_files.util.constant.common_words import COMMON_WORDS_LIST
6
+ from epstein_files.util.constant.output_files import WORD_COUNT_HTML_PATH
7
+ from epstein_files.util.env import args, specified_names
8
+ from epstein_files.util.logging import logger
9
+ from epstein_files.util.rich import (console, print_centered, print_color_key, print_page_title, print_panel,
10
+ print_starred_header, write_html)
11
+ from epstein_files.util.search_result import MatchedLine, SearchResult
12
+ from epstein_files.util.timer import Timer
13
+ from epstein_files.util.word_count import WordCount
14
+
15
+ HTML_REGEX = re.compile(r"^http|#yiv")
16
+
17
+
18
+ def write_word_counts_html() -> None:
19
+ timer = Timer()
20
+ epstein_files = EpsteinFiles.get_files(timer)
21
+ email_subjects: set[str] = set()
22
+ word_count = WordCount()
23
+
24
+ # Remove dupes, junk mail, and fwded articles from emails
25
+ emails = [
26
+ e for e in epstein_files.emails
27
+ if not (e.is_duplicate or e.is_junk_mail() or (e.config and e.config.is_fwded_article)) \
28
+ and (len(specified_names) == 0 or e.author in specified_names)
29
+ ]
30
+
31
+ for email in emails:
32
+ logger.info(f"Counting words in {email}\n [SUBJECT] {email.subject()}")
33
+ lines = email.actual_text.split('\n')
34
+
35
+ if email.subject() not in email_subjects and f'Re: {email.subject()}' not in email_subjects:
36
+ email_subjects.add(email.subject())
37
+ lines.append(email.subject())
38
+
39
+ for i, line in enumerate(lines):
40
+ if HTML_REGEX.search(line):
41
+ continue
42
+
43
+ for word in line.split():
44
+ word_count.tally_word(word, SearchResult(email, [MatchedLine(line, i)]))
45
+
46
+ # Add in iMessage conversation words
47
+ imessage_logs = epstein_files.imessage_logs_for(specified_names) if specified_names else epstein_files.imessage_logs
48
+
49
+ for imessage_log in imessage_logs:
50
+ logger.info(f"Counting words in {imessage_log}")
51
+
52
+ for msg in imessage_log.messages():
53
+ if len(specified_names) > 0 and msg.author not in specified_names:
54
+ continue
55
+ elif HTML_REGEX.search(line):
56
+ continue
57
+
58
+ for word in msg.text.split():
59
+ word_count.tally_word(word, SearchResult(imessage_log, [msg.text]))
60
+
61
+ print_page_title(expand=False)
62
+ print_starred_header(f"Most Common Words in {len(emails):,} Emails and {len(imessage_logs)} iMessage Logs")
63
+ print_centered(f"(excluding {len(COMMON_WORDS_LIST)} particularly common words at bottom)", style='dim')
64
+ console.line()
65
+ print_color_key()
66
+ console.line()
67
+ console.print(word_count)
68
+ console.line(2)
69
+ print_panel(f"{len(COMMON_WORDS_LIST):,} Excluded Words", centered=True)
70
+ console.print(', '.join(COMMON_WORDS_LIST), highlight=False)
71
+ write_html(WORD_COUNT_HTML_PATH)
72
+ timer.print_at_checkpoint(f"Finished counting words")
@@ -85,10 +85,9 @@ class Document:
85
85
 
86
86
  if self.is_local_extract_file():
87
87
  self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
88
- cfg_type = type(self.config).__name__ if self.config else None
89
88
 
90
89
  # Coerce FileConfig for court docs etc. to MessageCfg for email files extracted from that document
91
- if self.class_name() == EMAIL_CLASS and self.config and cfg_type != EmailCfg.__name__:
90
+ if self.class_name() == EMAIL_CLASS and self.config and not isinstance(self.config, EmailCfg):
92
91
  self.config = EmailCfg.from_doc_cfg(self.config)
93
92
  else:
94
93
  self.url_slug = self.file_path.stem
@@ -26,7 +26,7 @@ from epstein_files.util.logging import logger
26
26
  from epstein_files.util.rich import *
27
27
 
28
28
  BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
29
- BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
29
+ BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
30
30
  DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
31
31
  LINK_LINE_REGEX = re.compile(f"^(> )?htt")
32
32
  QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
@@ -245,12 +245,10 @@ TRUNCATE_TERMS = [
245
245
  ]
246
246
 
247
247
  # Some Paul Krassner emails have a ton of CCed parties we don't care about
248
- KRASSNER_RECIPIENTS = uniquify(flatten(ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']))
248
+ KRASSNER_RECIPIENTS = uniquify(flatten([ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']]))
249
249
 
250
250
  # No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
251
- USELESS_EMAILERS = IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS + \
252
- KRASSNER_RECIPIENTS + \
253
- FLIGHT_IN_2012_PEOPLE + [
251
+ USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIPIENTS + [
254
252
  'Alan Rogers', # Random CC
255
253
  'Andrew Friendly', # Presumably some relation of Kelly Friendly
256
254
  'BS Stern', # A random fwd of email we have
@@ -322,11 +320,18 @@ class Email(Communication):
322
320
  def __post_init__(self):
323
321
  super().__post_init__()
324
322
 
325
- if self.config and self.config.recipients:
326
- self.recipients = cast(list[str | None], self.config.recipients)
327
- else:
328
- for recipient in self.header.recipients():
329
- self.recipients.extend(self._get_names(recipient))
323
+ try:
324
+ if self.config and self.config.recipients:
325
+ self.recipients = cast(list[str | None], self.config.recipients)
326
+ else:
327
+ for recipient in self.header.recipients():
328
+ self.recipients.extend(self._get_names(recipient))
329
+ except Exception as e:
330
+ console.print_exception()
331
+ console.line(2)
332
+ logger.fatal(f"Failed on {self.file_id}")
333
+ console.line(2)
334
+ raise e
330
335
 
331
336
  # Remove self CCs
332
337
  recipients = [r for r in self.recipients if r != self.author or self.file_id in SELF_EMAILS_FILE_IDS]
@@ -21,14 +21,11 @@ class JsonFile(OtherFile):
21
21
  if self.url_slug.endswith('.txt') or self.url_slug.endswith('.json'):
22
22
  self.url_slug = Path(self.url_slug).stem
23
23
 
24
- self._set_computed_fields(text=self.formatted_json())
24
+ self._set_computed_fields(text=self.json_str())
25
25
 
26
26
  def category(self) -> str:
27
27
  return JSON
28
28
 
29
- def formatted_json(self) -> str:
30
- return json.dumps(self.json_data(), indent=4)
31
-
32
29
  def info_txt(self) -> Text | None:
33
30
  return Text(f"JSON file, possibly iMessage or similar app metadata", style='white dim italic')
34
31
 
@@ -38,3 +35,6 @@ class JsonFile(OtherFile):
38
35
  def json_data(self) -> object:
39
36
  with open(self.file_path, encoding='utf-8-sig') as f:
40
37
  return json.load(f)
38
+
39
+ def json_str(self) -> str:
40
+ return json.dumps(self.json_data(), indent=4)
@@ -15,6 +15,7 @@ from epstein_files.util.data import iso_timestamp, listify, sort_dict
15
15
  from epstein_files.util.doc_cfg import Metadata, TextCfg
16
16
  from epstein_files.util.highlighted_group import get_style_for_name
17
17
  from epstein_files.util.logging import logger
18
+ from epstein_files.util.rich import build_table
18
19
 
19
20
  CONFIRMED_MSG = 'Found confirmed counterparty'
20
21
  GUESSED_MSG = 'This is probably a conversation with'
@@ -111,7 +112,7 @@ class MessengerLog(Communication):
111
112
  @classmethod
112
113
  def summary_table(cls, imessage_logs: list['MessengerLog']) -> Table:
113
114
  """Build a table summarizing the text messages in 'imessage_logs'."""
114
- counts_table = Table(title="Text Message Counts By Author", header_style="bold")
115
+ counts_table = build_table("Text Message Counts By Author")
115
116
  counts_table.add_column(AUTHOR.title(), justify='left', style="steel_blue bold", width=30)
116
117
  counts_table.add_column('Files', justify='right', style='white')
117
118
  counts_table.add_column("Msgs", justify='right')
@@ -20,7 +20,7 @@ from epstein_files.util.data import escape_single_quotes, remove_timezone, uniqu
20
20
  from epstein_files.util.file_helper import FILENAME_LENGTH
21
21
  from epstein_files.util.env import args
22
22
  from epstein_files.util.highlighted_group import get_style_for_category
23
- from epstein_files.util.rich import QUESTION_MARK_TXT, highlighter
23
+ from epstein_files.util.rich import QUESTION_MARK_TXT, build_table, highlighter
24
24
  from epstein_files.util.logging import logger
25
25
 
26
26
  MAX_DAYS_SPANNED_TO_BE_VALID = 10
@@ -233,7 +233,7 @@ class OtherFile(Document):
233
233
  @staticmethod
234
234
  def build_table(docs: list['OtherFile']) -> Table:
235
235
  """Build a table of OtherFile documents."""
236
- table = Table(header_style='bold', show_lines=True)
236
+ table = build_table(None, show_lines=True)
237
237
  table.add_column('File', justify='center', width=FILENAME_LENGTH)
238
238
  table.add_column('Date', justify='center')
239
239
  table.add_column('Size', justify='center')
@@ -19,7 +19,6 @@ from epstein_files.documents.emails.email_header import AUTHOR
19
19
  from epstein_files.documents.json_file import JsonFile
20
20
  from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
21
21
  from epstein_files.documents.other_file import OtherFile
22
- from epstein_files.util.constant.output_files import PICKLED_PATH
23
22
  from epstein_files.util.constant.strings import *
24
23
  from epstein_files.util.constant.urls import (EPSTEIN_MEDIA, EPSTEIN_WEB, JMAIL, epstein_media_person_url,
25
24
  epsteinify_name_url, epstein_web_person_url, search_jmail_url, search_twitter_url)
@@ -29,15 +28,16 @@ from epstein_files.util.doc_cfg import EmailCfg, Metadata
29
28
  from epstein_files.util.env import args, logger
30
29
  from epstein_files.util.file_helper import DOCS_DIR, file_size_str
31
30
  from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
32
- from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, add_cols_to_table, console, highlighter,
33
- link_text_obj, link_markup, print_author_header, print_centered, print_other_site_link, print_panel,
34
- print_section_header, vertically_pad)
31
+ from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, TABLE_BORDER_STYLE, add_cols_to_table,
32
+ build_table, console, highlighter, link_text_obj, link_markup, print_author_header, print_centered,
33
+ print_other_site_link, print_panel, print_section_header, vertically_pad)
35
34
  from epstein_files.util.search_result import SearchResult
36
35
  from epstein_files.util.timer import Timer
37
36
 
37
+ EXCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]
38
+ PICKLED_PATH = Path("the_epstein_files.pkl.gz")
38
39
  DEVICE_SIGNATURE = 'Device Signature'
39
40
  DEVICE_SIGNATURE_PADDING = (1, 0)
40
- NOT_INCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]
41
41
  SLOW_FILE_SECONDS = 1.0
42
42
 
43
43
  INVALID_FOR_EPSTEIN_WEB = JUNK_EMAILERS + KRASSNER_RECIPIENTS + [
@@ -94,23 +94,23 @@ class EpsteinFiles:
94
94
  self._tally_email_data()
95
95
 
96
96
  @classmethod
97
- def get_files(cls, timer: Timer | None = None, use_pickled: bool = False) -> 'EpsteinFiles':
97
+ def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
98
98
  """Alternate constructor that reads/writes a pickled version of the data ('timer' arg is for logging)."""
99
99
  timer = timer or Timer()
100
100
 
101
- if ((args.pickled or use_pickled) and PICKLED_PATH.exists()) and not args.overwrite_pickle:
101
+ if PICKLED_PATH.exists() and not args.overwrite_pickle:
102
102
  with gzip.open(PICKLED_PATH, 'rb') as file:
103
103
  epstein_files = pickle.load(file)
104
104
  timer.print_at_checkpoint(f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})")
105
105
  epstein_files.timer = timer
106
106
  return epstein_files
107
107
 
108
+ logger.warning(f"Building new cache file, this will take a few minutes...")
108
109
  epstein_files = EpsteinFiles(timer=timer)
109
110
 
110
- if args.overwrite_pickle or not PICKLED_PATH.exists():
111
- with gzip.open(PICKLED_PATH, 'wb') as file:
112
- pickle.dump(epstein_files, file)
113
- logger.warning(f"Pickled data to '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})...")
111
+ with gzip.open(PICKLED_PATH, 'wb') as file:
112
+ pickle.dump(epstein_files, file)
113
+ logger.warning(f"Pickled data to '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})...")
114
114
 
115
115
  timer.print_at_checkpoint(f'Processed {len(epstein_files.all_files):,} documents')
116
116
  return epstein_files
@@ -119,9 +119,9 @@ class EpsteinFiles:
119
119
  return self.imessage_logs + self.emails + self.other_files
120
120
 
121
121
  def all_emailers(self, include_useless: bool = False) -> list[str | None]:
122
- """Returns all emailers except Epstein and USELESS_EMAILERS, sorted from least frequent to most."""
122
+ """Returns all emailers except Epstein and EXCLUDED_EMAILERS, sorted from least frequent to most."""
123
123
  names = [a for a in self.email_author_counts.keys()] + [r for r in self.email_recipient_counts.keys()]
124
- names = names if include_useless else [e for e in names if e is None or e.lower() not in NOT_INCLUDED_EMAILERS]
124
+ names = names if include_useless else [e for e in names if e is None or e.lower() not in EXCLUDED_EMAILERS]
125
125
  return sorted(list(set(names)), key=lambda e: self.email_author_counts[e] + self.email_recipient_counts[e])
126
126
 
127
127
  def attributed_email_count(self) -> int:
@@ -200,10 +200,10 @@ class EpsteinFiles:
200
200
  def json_metadata(self) -> str:
201
201
  """Create a JSON string containing metadata for all the files."""
202
202
  metadata = {
203
- EMAIL_CLASS: _sorted_metadata(self.emails),
204
- JSON_FILE_CLASS: _sorted_metadata(self.json_files),
205
- MESSENGER_LOG_CLASS: _sorted_metadata(self.imessage_logs),
206
- OTHER_FILE_CLASS: _sorted_metadata(self.non_json_other_files()),
203
+ Email.__name__: _sorted_metadata(self.emails),
204
+ JsonFile.__name__: _sorted_metadata(self.json_files),
205
+ MessengerLog.__name__: _sorted_metadata(self.imessage_logs),
206
+ OtherFile.__name__: _sorted_metadata(self.non_json_other_files()),
207
207
  }
208
208
 
209
209
  return json.dumps(metadata, indent=4, sort_keys=True)
@@ -212,7 +212,7 @@ class EpsteinFiles:
212
212
  return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]
213
213
 
214
214
  def print_files_summary(self) -> None:
215
- table = Table(title='Summary of Document Types')
215
+ table = build_table('Summary of Document Types')
216
216
  add_cols_to_table(table, ['File Type', 'Files', 'Author Known', 'Author Unknown', 'Duplicates'])
217
217
 
218
218
  def add_row(label: str, docs: list):
@@ -268,12 +268,12 @@ class EpsteinFiles:
268
268
 
269
269
  def print_email_device_info(self) -> None:
270
270
  print_panel(f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown", padding=(4, 0, 0, 0), centered=True)
271
- console.print(build_signature_table(self.email_authors_to_device_signatures, (AUTHOR, DEVICE_SIGNATURE)))
272
- console.print(build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))
271
+ console.print(_build_signature_table(self.email_authors_to_device_signatures, (AUTHOR, DEVICE_SIGNATURE)))
272
+ console.print(_build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))
273
273
 
274
274
  def print_emailer_counts_table(self) -> None:
275
275
  footer = f"Identified authors of {self.attributed_email_count():,} emails out of {len(self.emails):,}."
276
- counts_table = Table(title=f"Email Counts", caption=footer, header_style="bold")
276
+ counts_table = build_table("Email Counts", caption=footer)
277
277
  add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_MEDIA, EPSTEIN_WEB, 'Twitter'])
278
278
 
279
279
  emailer_counts = {
@@ -345,21 +345,6 @@ class EpsteinFiles:
345
345
  self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())
346
346
 
347
347
 
348
- def build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
349
- title = 'Signatures Used By Authors' if cols[0] == AUTHOR else 'Authors Seen Using Signatures'
350
- table = Table(header_style="bold reverse", show_lines=True, title=title)
351
-
352
- for i, col in enumerate(cols):
353
- table.add_column(col.title() + ('s' if i == 1 else ''))
354
-
355
- new_dict = dict_sets_to_lists(keyed_sets)
356
-
357
- for k in sorted(new_dict.keys()):
358
- table.add_row(highlighter(k or UNKNOWN), highlighter(join_char.join(sorted(new_dict[k]))))
359
-
360
- return Padding(table, DEVICE_SIGNATURE_PADDING)
361
-
362
-
363
348
  def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
364
349
  counts: dict[str | None, int] = defaultdict(int)
365
350
 
@@ -372,12 +357,12 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
372
357
  return counts
373
358
 
374
359
 
375
- def document_cls(document: Document) -> Type[Document]:
376
- search_area = document.text[0:5000] # Limit search area to avoid pointless scans of huge files
360
+ def document_cls(doc: Document) -> Type[Document]:
361
+ search_area = doc.text[0:5000] # Limit search area to avoid pointless scans of huge files
377
362
 
378
- if document.text[0] == '{':
363
+ if doc.text[0] == '{':
379
364
  return JsonFile
380
- elif isinstance(document.config, EmailCfg) or DETECT_EMAIL_REGEX.match(search_area):
365
+ elif isinstance(doc.config, EmailCfg) or (DETECT_EMAIL_REGEX.match(search_area) and doc.config is None):
381
366
  return Email
382
367
  elif MSG_REGEX.search(search_area):
383
368
  return MessengerLog
@@ -397,6 +382,21 @@ def is_ok_for_epstein_web(name: str | None) -> bool:
397
382
  return True
398
383
 
399
384
 
385
+ def _build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
386
+ title = 'Signatures Used By Authors' if cols[0] == AUTHOR else 'Authors Seen Using Signatures'
387
+ table = build_table(title, header_style="bold reverse", show_lines=True)
388
+
389
+ for i, col in enumerate(cols):
390
+ table.add_column(col.title() + ('s' if i == 1 else ''))
391
+
392
+ new_dict = dict_sets_to_lists(keyed_sets)
393
+
394
+ for k in sorted(new_dict.keys()):
395
+ table.add_row(highlighter(k or UNKNOWN), highlighter(join_char.join(sorted(new_dict[k]))))
396
+
397
+ return Padding(table, DEVICE_SIGNATURE_PADDING)
398
+
399
+
400
400
  def _sorted_metadata(docs: Sequence[Document]) -> list[Metadata]:
401
401
  docs_sorted_by_id = sorted(docs, key=lambda d: d.file_id)
402
402
  return [json_safe(d.metadata()) for d in docs_sorted_by_id]