epstein-files 1.0.4__tar.gz → 1.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {epstein_files-1.0.4 → epstein_files-1.0.6}/PKG-INFO +37 -18
- {epstein_files-1.0.4 → epstein_files-1.0.6}/README.md +36 -17
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/__init__.py +24 -25
- epstein_files-1.0.6/epstein_files/count_words.py +72 -0
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/document.py +1 -2
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/email.py +15 -10
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/json_file.py +4 -4
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/messenger_log.py +2 -1
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/other_file.py +2 -2
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/epstein_files.py +40 -40
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/constant/output_files.py +20 -4
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/constant/strings.py +8 -8
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/constant/urls.py +6 -21
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/constants.py +19 -18
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/doc_cfg.py +3 -1
- epstein_files-1.0.6/epstein_files/util/env.py +85 -0
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/highlighted_group.py +4 -3
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/output.py +29 -16
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/rich.py +56 -28
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/word_count.py +10 -10
- {epstein_files-1.0.4 → epstein_files-1.0.6}/pyproject.toml +2 -2
- epstein_files-1.0.4/epstein_files/util/env.py +0 -80
- {epstein_files-1.0.4 → epstein_files-1.0.6}/LICENSE +0 -0
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/communication.py +0 -0
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/emails/email_header.py +0 -0
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/documents/imessage/text_message.py +0 -0
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/constant/common_words.py +0 -0
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/constant/html.py +0 -0
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/constant/names.py +0 -0
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/data.py +0 -0
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/file_helper.py +0 -0
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/logging.py +0 -0
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/search_result.py +0 -0
- {epstein_files-1.0.4 → epstein_files-1.0.6}/epstein_files/util/timer.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: epstein-files
|
|
3
|
-
Version: 1.0.
|
|
3
|
+
Version: 1.0.6
|
|
4
4
|
Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
|
|
5
5
|
Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
|
|
6
6
|
License: GPL-3.0-or-later
|
|
@@ -32,6 +32,8 @@ Description-Content-Type: text/markdown
|
|
|
32
32
|
|
|
33
33
|
# I Made Epstein's Text Messages Great Again
|
|
34
34
|
|
|
35
|
+

|
|
36
|
+
|
|
35
37
|
* [I Made Epstein's Text Messages Great Again (And You Should Read Them)](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great) post on [Substack](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great)
|
|
36
38
|
* The Epstein text messages (and some of the emails along with summary information) generated by this code can be viewed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/).
|
|
37
39
|
* All of His Emails along with descriptions of the 496 files that were neither emails nor text messages can be read at [another page also generated by this code](https://michelcrypt4d4mus.github.io/epstein_text_messages/all_emails_epstein_files_nov_2025.html).
|
|
@@ -40,27 +42,44 @@ Description-Content-Type: text/markdown
|
|
|
40
42
|
* Configuration variables assigning specific `HOUSE_OVERSIGHT_XXXXXX.txt` file IDs (the `111111` part) as being emails to or from particular people based on various research and contributions can be found in [constants.py](./epstein_files/util/constants.py). Everything in `constants.py` appears in the JSON metadata linked above.
|
|
41
43
|
|
|
42
44
|
|
|
43
|
-
|
|
44
|
-
1. Requires you have a local copy of OCR text from the House Oversight document
|
|
45
|
-
1. Dependencies are in [pyproject.toml](./pyproject.toml). Use `poetry install` for easiest time installing. `pip install
|
|
45
|
+
## Usage
|
|
46
|
+
1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8).
|
|
47
|
+
1. Dependencies are in [pyproject.toml](./pyproject.toml). Use `poetry install` for easiest time installing. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
|
|
48
|
+
|
|
49
|
+
You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate --help
|
|
53
|
+
```
|
|
46
54
|
|
|
47
|
-
|
|
55
|
+
All the tools that come with the package require `EPSTEIN_DOCS_DIR` to be set. These are the available tools:
|
|
48
56
|
|
|
49
57
|
```bash
|
|
50
58
|
# Generate color highlighted texts/emails/other files
|
|
51
|
-
|
|
59
|
+
epstein_generate
|
|
60
|
+
|
|
61
|
+
# Search for a string:
|
|
62
|
+
epstein_search Bannon
|
|
63
|
+
# Or a regex:
|
|
64
|
+
epstein_search '\bSteve\s*Bannon\b'
|
|
52
65
|
|
|
53
|
-
#
|
|
54
|
-
|
|
66
|
+
# Show a file with color highlighting of keywords
|
|
67
|
+
epstein_show 030999
|
|
68
|
+
# Show both the highlighted and raw versions of the file:
|
|
69
|
+
epstein_show --raw 030999
|
|
70
|
+
# This also works:
|
|
71
|
+
epstein_show HOUSE_OVERSIGHT_030999
|
|
55
72
|
|
|
56
|
-
#
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
73
|
+
# Count words used by Epstein and Bannon
|
|
74
|
+
epstein_word_count --name 'Jeffrey Epstein' --name 'Steve Bannon'
|
|
75
|
+
|
|
76
|
+
# Diff two epstein files after all the cleanup (stripping BOMs, matching newline chars, etc):
|
|
77
|
+
epstein_diff 030999 020442
|
|
60
78
|
```
|
|
61
79
|
|
|
80
|
+
The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc.
|
|
62
81
|
Run `epstein_generate --help` for command line option assistance.
|
|
63
|
-
|
|
82
|
+
|
|
64
83
|
|
|
65
84
|
#### As A Library
|
|
66
85
|
```python
|
|
@@ -69,18 +88,18 @@ epstein_files = EpsteinFiles.get_files()
|
|
|
69
88
|
|
|
70
89
|
# All files
|
|
71
90
|
for document in epstein_files.all_documents():
|
|
72
|
-
do_stuff()
|
|
91
|
+
do_stuff(document)
|
|
73
92
|
|
|
74
93
|
# Emails
|
|
75
94
|
for email in epstein_files.emails:
|
|
76
|
-
do_stuff()
|
|
95
|
+
do_stuff(email)
|
|
77
96
|
|
|
78
97
|
# iMessage Logs
|
|
79
98
|
for imessage_log in epstein_files.imessage_logs:
|
|
80
|
-
do_stuff()
|
|
99
|
+
do_stuff(imessage_log)
|
|
81
100
|
|
|
82
101
|
# Other Files
|
|
83
|
-
for
|
|
84
|
-
do_stuff()
|
|
102
|
+
for file in epstein_files.other_files:
|
|
103
|
+
do_stuff(file)
|
|
85
104
|
```
|
|
86
105
|
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# I Made Epstein's Text Messages Great Again
|
|
2
2
|
|
|
3
|
+

|
|
4
|
+
|
|
3
5
|
* [I Made Epstein's Text Messages Great Again (And You Should Read Them)](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great) post on [Substack](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great)
|
|
4
6
|
* The Epstein text messages (and some of the emails along with summary information) generated by this code can be viewed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/).
|
|
5
7
|
* All of His Emails along with descriptions of the 496 files that were neither emails nor text messages can be read at [another page also generated by this code](https://michelcrypt4d4mus.github.io/epstein_text_messages/all_emails_epstein_files_nov_2025.html).
|
|
@@ -8,27 +10,44 @@
|
|
|
8
10
|
* Configuration variables assigning specific `HOUSE_OVERSIGHT_XXXXXX.txt` file IDs (the `111111` part) as being emails to or from particular people based on various research and contributions can be found in [constants.py](./epstein_files/util/constants.py). Everything in `constants.py` appears in the JSON metadata linked above.
|
|
9
11
|
|
|
10
12
|
|
|
11
|
-
|
|
12
|
-
1. Requires you have a local copy of OCR text from the House Oversight document
|
|
13
|
-
1. Dependencies are in [pyproject.toml](./pyproject.toml). Use `poetry install` for easiest time installing. `pip install
|
|
13
|
+
## Usage
|
|
14
|
+
1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8).
|
|
15
|
+
1. Dependencies are in [pyproject.toml](./pyproject.toml). Use `poetry install` for easiest time installing. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
|
|
16
|
+
|
|
17
|
+
You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
EPSTEIN_DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate --help
|
|
21
|
+
```
|
|
14
22
|
|
|
15
|
-
|
|
23
|
+
All the tools that come with the package require `EPSTEIN_DOCS_DIR` to be set. These are the available tools:
|
|
16
24
|
|
|
17
25
|
```bash
|
|
18
26
|
# Generate color highlighted texts/emails/other files
|
|
19
|
-
|
|
27
|
+
epstein_generate
|
|
28
|
+
|
|
29
|
+
# Search for a string:
|
|
30
|
+
epstein_search Bannon
|
|
31
|
+
# Or a regex:
|
|
32
|
+
epstein_search '\bSteve\s*Bannon\b'
|
|
20
33
|
|
|
21
|
-
#
|
|
22
|
-
|
|
34
|
+
# Show a file with color highlighting of keywords
|
|
35
|
+
epstein_show 030999
|
|
36
|
+
# Show both the highlighted and raw versions of the file:
|
|
37
|
+
epstein_show --raw 030999
|
|
38
|
+
# This also works:
|
|
39
|
+
epstein_show HOUSE_OVERSIGHT_030999
|
|
23
40
|
|
|
24
|
-
#
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
41
|
+
# Count words used by Epstein and Bannon
|
|
42
|
+
epstein_word_count --name 'Jeffrey Epstein' --name 'Steve Bannon'
|
|
43
|
+
|
|
44
|
+
# Diff two epstein files after all the cleanup (stripping BOMs, matching newline chars, etc):
|
|
45
|
+
epstein_diff 030999 020442
|
|
28
46
|
```
|
|
29
47
|
|
|
48
|
+
The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc.
|
|
30
49
|
Run `epstein_generate --help` for command line option assistance.
|
|
31
|
-
|
|
50
|
+
|
|
32
51
|
|
|
33
52
|
#### As A Library
|
|
34
53
|
```python
|
|
@@ -37,17 +56,17 @@ epstein_files = EpsteinFiles.get_files()
|
|
|
37
56
|
|
|
38
57
|
# All files
|
|
39
58
|
for document in epstein_files.all_documents():
|
|
40
|
-
do_stuff()
|
|
59
|
+
do_stuff(document)
|
|
41
60
|
|
|
42
61
|
# Emails
|
|
43
62
|
for email in epstein_files.emails:
|
|
44
|
-
do_stuff()
|
|
63
|
+
do_stuff(email)
|
|
45
64
|
|
|
46
65
|
# iMessage Logs
|
|
47
66
|
for imessage_log in epstein_files.imessage_logs:
|
|
48
|
-
do_stuff()
|
|
67
|
+
do_stuff(imessage_log)
|
|
49
68
|
|
|
50
69
|
# Other Files
|
|
51
|
-
for
|
|
52
|
-
do_stuff()
|
|
70
|
+
for file in epstein_files.other_files:
|
|
71
|
+
do_stuff(file)
|
|
53
72
|
```
|
|
@@ -10,11 +10,12 @@ from sys import exit
|
|
|
10
10
|
|
|
11
11
|
from dotenv import load_dotenv
|
|
12
12
|
load_dotenv()
|
|
13
|
-
|
|
14
13
|
from rich.markup import escape
|
|
15
14
|
from rich.padding import Padding
|
|
16
15
|
from rich.panel import Panel
|
|
16
|
+
from rich.text import Text
|
|
17
17
|
|
|
18
|
+
from epstein_files.count_words import write_word_counts_html
|
|
18
19
|
from epstein_files.epstein_files import EpsteinFiles, document_cls
|
|
19
20
|
from epstein_files.documents.document import INFO_PADDING, Document
|
|
20
21
|
from epstein_files.documents.email import Email
|
|
@@ -24,22 +25,25 @@ from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, TEXT_MSGS_
|
|
|
24
25
|
from epstein_files.util.env import args, specified_names
|
|
25
26
|
from epstein_files.util.file_helper import coerce_file_path, extract_file_id
|
|
26
27
|
from epstein_files.util.logging import logger
|
|
27
|
-
from epstein_files.util.output import print_emails, print_json_metadata, print_json_stats, print_text_messages, write_urls
|
|
28
|
+
from epstein_files.util.output import print_emails, print_json_files, print_json_metadata, print_json_stats, print_text_messages, write_urls
|
|
28
29
|
from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
|
|
29
30
|
from epstein_files.util.timer import Timer
|
|
30
31
|
|
|
32
|
+
timer = Timer()
|
|
33
|
+
epstein_files = EpsteinFiles.get_files(timer)
|
|
34
|
+
|
|
31
35
|
|
|
32
36
|
def generate_html() -> None:
|
|
33
37
|
if args.make_clean:
|
|
34
38
|
make_clean()
|
|
39
|
+
write_urls()
|
|
35
40
|
exit()
|
|
36
|
-
|
|
37
|
-
timer = Timer()
|
|
38
|
-
epstein_files = EpsteinFiles.get_files(timer)
|
|
39
|
-
|
|
40
|
-
if args.json_metadata:
|
|
41
|
+
elif args.json_metadata:
|
|
41
42
|
print_json_metadata(epstein_files)
|
|
42
43
|
exit()
|
|
44
|
+
elif args.output_json_files:
|
|
45
|
+
print_json_files(epstein_files)
|
|
46
|
+
exit()
|
|
43
47
|
|
|
44
48
|
print_header(epstein_files)
|
|
45
49
|
|
|
@@ -75,7 +79,7 @@ def epstein_diff():
|
|
|
75
79
|
def epstein_search():
|
|
76
80
|
"""Search the cleaned up text of the files."""
|
|
77
81
|
_assert_positional_args()
|
|
78
|
-
epstein_files = EpsteinFiles.get_files(
|
|
82
|
+
epstein_files = EpsteinFiles.get_files()
|
|
79
83
|
|
|
80
84
|
for search_term in args.positional_args:
|
|
81
85
|
temp_highlighter = build_highlighter(search_term)
|
|
@@ -103,32 +107,27 @@ def epstein_show():
|
|
|
103
107
|
"""Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
|
|
104
108
|
_assert_positional_args()
|
|
105
109
|
ids = [extract_file_id(arg) for arg in args.positional_args]
|
|
110
|
+
raw_docs = [Document(coerce_file_path(id)) for id in ids]
|
|
111
|
+
docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
|
|
106
112
|
console.line()
|
|
107
113
|
|
|
108
|
-
if args.pickled:
|
|
109
|
-
epstein_files = EpsteinFiles.get_files(use_pickled=True)
|
|
110
|
-
docs = epstein_files.get_documents_by_id(ids)
|
|
111
|
-
else:
|
|
112
|
-
raw_docs = [Document(coerce_file_path(id)) for id in ids]
|
|
113
|
-
docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
|
|
114
|
-
|
|
115
114
|
for doc in docs:
|
|
116
|
-
|
|
117
|
-
|
|
115
|
+
if isinstance(doc, Email):
|
|
116
|
+
doc.truncation_allowed = False
|
|
117
|
+
|
|
118
|
+
console.print('\n', doc, '\n')
|
|
118
119
|
|
|
119
120
|
if args.raw:
|
|
120
|
-
console.
|
|
121
|
-
console.print(
|
|
122
|
-
console.print(escape(doc.raw_text()))
|
|
121
|
+
console.print(Panel(Text("RAW: ").append(doc.summary()), expand=False, style=doc._border_style()))
|
|
122
|
+
console.print(escape(doc.raw_text()), '\n')
|
|
123
123
|
|
|
124
124
|
if isinstance(doc, Email):
|
|
125
|
-
console.
|
|
126
|
-
console.print(
|
|
127
|
-
console.print(escape(doc._actual_text()))
|
|
125
|
+
console.print(Panel(Text("actual_text: ").append(doc.summary()), expand=False, style=doc._border_style()))
|
|
126
|
+
console.print(escape(doc._actual_text()), '\n')
|
|
128
127
|
|
|
129
128
|
|
|
130
|
-
def
|
|
131
|
-
|
|
129
|
+
def epstein_word_count() -> None:
|
|
130
|
+
write_word_counts_html()
|
|
132
131
|
|
|
133
132
|
|
|
134
133
|
def _assert_positional_args():
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# Count word usage in emails and texts
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
from epstein_files.epstein_files import EpsteinFiles
|
|
5
|
+
from epstein_files.util.constant.common_words import COMMON_WORDS_LIST
|
|
6
|
+
from epstein_files.util.constant.output_files import WORD_COUNT_HTML_PATH
|
|
7
|
+
from epstein_files.util.env import args, specified_names
|
|
8
|
+
from epstein_files.util.logging import logger
|
|
9
|
+
from epstein_files.util.rich import (console, print_centered, print_color_key, print_page_title, print_panel,
|
|
10
|
+
print_starred_header, write_html)
|
|
11
|
+
from epstein_files.util.search_result import MatchedLine, SearchResult
|
|
12
|
+
from epstein_files.util.timer import Timer
|
|
13
|
+
from epstein_files.util.word_count import WordCount
|
|
14
|
+
|
|
15
|
+
HTML_REGEX = re.compile(r"^http|#yiv")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def write_word_counts_html() -> None:
|
|
19
|
+
timer = Timer()
|
|
20
|
+
epstein_files = EpsteinFiles.get_files(timer)
|
|
21
|
+
email_subjects: set[str] = set()
|
|
22
|
+
word_count = WordCount()
|
|
23
|
+
|
|
24
|
+
# Remove dupes, junk mail, and fwded articles from emails
|
|
25
|
+
emails = [
|
|
26
|
+
e for e in epstein_files.emails
|
|
27
|
+
if not (e.is_duplicate or e.is_junk_mail() or (e.config and e.config.is_fwded_article)) \
|
|
28
|
+
and (len(specified_names) == 0 or e.author in specified_names)
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
for email in emails:
|
|
32
|
+
logger.info(f"Counting words in {email}\n [SUBJECT] {email.subject()}")
|
|
33
|
+
lines = email.actual_text.split('\n')
|
|
34
|
+
|
|
35
|
+
if email.subject() not in email_subjects and f'Re: {email.subject()}' not in email_subjects:
|
|
36
|
+
email_subjects.add(email.subject())
|
|
37
|
+
lines.append(email.subject())
|
|
38
|
+
|
|
39
|
+
for i, line in enumerate(lines):
|
|
40
|
+
if HTML_REGEX.search(line):
|
|
41
|
+
continue
|
|
42
|
+
|
|
43
|
+
for word in line.split():
|
|
44
|
+
word_count.tally_word(word, SearchResult(email, [MatchedLine(line, i)]))
|
|
45
|
+
|
|
46
|
+
# Add in iMessage conversation words
|
|
47
|
+
imessage_logs = epstein_files.imessage_logs_for(specified_names) if specified_names else epstein_files.imessage_logs
|
|
48
|
+
|
|
49
|
+
for imessage_log in imessage_logs:
|
|
50
|
+
logger.info(f"Counting words in {imessage_log}")
|
|
51
|
+
|
|
52
|
+
for msg in imessage_log.messages():
|
|
53
|
+
if len(specified_names) > 0 and msg.author not in specified_names:
|
|
54
|
+
continue
|
|
55
|
+
elif HTML_REGEX.search(line):
|
|
56
|
+
continue
|
|
57
|
+
|
|
58
|
+
for word in msg.text.split():
|
|
59
|
+
word_count.tally_word(word, SearchResult(imessage_log, [msg.text]))
|
|
60
|
+
|
|
61
|
+
print_page_title(expand=False)
|
|
62
|
+
print_starred_header(f"Most Common Words in {len(emails):,} Emails and {len(imessage_logs)} iMessage Logs")
|
|
63
|
+
print_centered(f"(excluding {len(COMMON_WORDS_LIST)} particularly common words at bottom)", style='dim')
|
|
64
|
+
console.line()
|
|
65
|
+
print_color_key()
|
|
66
|
+
console.line()
|
|
67
|
+
console.print(word_count)
|
|
68
|
+
console.line(2)
|
|
69
|
+
print_panel(f"{len(COMMON_WORDS_LIST):,} Excluded Words", centered=True)
|
|
70
|
+
console.print(', '.join(COMMON_WORDS_LIST), highlight=False)
|
|
71
|
+
write_html(WORD_COUNT_HTML_PATH)
|
|
72
|
+
timer.print_at_checkpoint(f"Finished counting words")
|
|
@@ -85,10 +85,9 @@ class Document:
|
|
|
85
85
|
|
|
86
86
|
if self.is_local_extract_file():
|
|
87
87
|
self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
|
|
88
|
-
cfg_type = type(self.config).__name__ if self.config else None
|
|
89
88
|
|
|
90
89
|
# Coerce FileConfig for court docs etc. to MessageCfg for email files extracted from that document
|
|
91
|
-
if self.class_name() == EMAIL_CLASS and self.config and
|
|
90
|
+
if self.class_name() == EMAIL_CLASS and self.config and not isinstance(self.config, EmailCfg):
|
|
92
91
|
self.config = EmailCfg.from_doc_cfg(self.config)
|
|
93
92
|
else:
|
|
94
93
|
self.url_slug = self.file_path.stem
|
|
@@ -26,7 +26,7 @@ from epstein_files.util.logging import logger
|
|
|
26
26
|
from epstein_files.util.rich import *
|
|
27
27
|
|
|
28
28
|
BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
|
|
29
|
-
BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
|
|
29
|
+
BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
|
|
30
30
|
DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
|
|
31
31
|
LINK_LINE_REGEX = re.compile(f"^(> )?htt")
|
|
32
32
|
QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
|
|
@@ -245,12 +245,10 @@ TRUNCATE_TERMS = [
|
|
|
245
245
|
]
|
|
246
246
|
|
|
247
247
|
# Some Paul Krassner emails have a ton of CCed parties we don't care about
|
|
248
|
-
KRASSNER_RECIPIENTS = uniquify(flatten(ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']))
|
|
248
|
+
KRASSNER_RECIPIENTS = uniquify(flatten([ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']]))
|
|
249
249
|
|
|
250
250
|
# No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
|
|
251
|
-
USELESS_EMAILERS =
|
|
252
|
-
KRASSNER_RECIPIENTS + \
|
|
253
|
-
FLIGHT_IN_2012_PEOPLE + [
|
|
251
|
+
USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIPIENTS + [
|
|
254
252
|
'Alan Rogers', # Random CC
|
|
255
253
|
'Andrew Friendly', # Presumably some relation of Kelly Friendly
|
|
256
254
|
'BS Stern', # A random fwd of email we have
|
|
@@ -322,11 +320,18 @@ class Email(Communication):
|
|
|
322
320
|
def __post_init__(self):
|
|
323
321
|
super().__post_init__()
|
|
324
322
|
|
|
325
|
-
|
|
326
|
-
self.
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
self.recipients
|
|
323
|
+
try:
|
|
324
|
+
if self.config and self.config.recipients:
|
|
325
|
+
self.recipients = cast(list[str | None], self.config.recipients)
|
|
326
|
+
else:
|
|
327
|
+
for recipient in self.header.recipients():
|
|
328
|
+
self.recipients.extend(self._get_names(recipient))
|
|
329
|
+
except Exception as e:
|
|
330
|
+
console.print_exception()
|
|
331
|
+
console.line(2)
|
|
332
|
+
logger.fatal(f"Failed on {self.file_id}")
|
|
333
|
+
console.line(2)
|
|
334
|
+
raise e
|
|
330
335
|
|
|
331
336
|
# Remove self CCs
|
|
332
337
|
recipients = [r for r in self.recipients if r != self.author or self.file_id in SELF_EMAILS_FILE_IDS]
|
|
@@ -21,14 +21,11 @@ class JsonFile(OtherFile):
|
|
|
21
21
|
if self.url_slug.endswith('.txt') or self.url_slug.endswith('.json'):
|
|
22
22
|
self.url_slug = Path(self.url_slug).stem
|
|
23
23
|
|
|
24
|
-
self._set_computed_fields(text=self.
|
|
24
|
+
self._set_computed_fields(text=self.json_str())
|
|
25
25
|
|
|
26
26
|
def category(self) -> str:
|
|
27
27
|
return JSON
|
|
28
28
|
|
|
29
|
-
def formatted_json(self) -> str:
|
|
30
|
-
return json.dumps(self.json_data(), indent=4)
|
|
31
|
-
|
|
32
29
|
def info_txt(self) -> Text | None:
|
|
33
30
|
return Text(f"JSON file, possibly iMessage or similar app metadata", style='white dim italic')
|
|
34
31
|
|
|
@@ -38,3 +35,6 @@ class JsonFile(OtherFile):
|
|
|
38
35
|
def json_data(self) -> object:
|
|
39
36
|
with open(self.file_path, encoding='utf-8-sig') as f:
|
|
40
37
|
return json.load(f)
|
|
38
|
+
|
|
39
|
+
def json_str(self) -> str:
|
|
40
|
+
return json.dumps(self.json_data(), indent=4)
|
|
@@ -15,6 +15,7 @@ from epstein_files.util.data import iso_timestamp, listify, sort_dict
|
|
|
15
15
|
from epstein_files.util.doc_cfg import Metadata, TextCfg
|
|
16
16
|
from epstein_files.util.highlighted_group import get_style_for_name
|
|
17
17
|
from epstein_files.util.logging import logger
|
|
18
|
+
from epstein_files.util.rich import build_table
|
|
18
19
|
|
|
19
20
|
CONFIRMED_MSG = 'Found confirmed counterparty'
|
|
20
21
|
GUESSED_MSG = 'This is probably a conversation with'
|
|
@@ -111,7 +112,7 @@ class MessengerLog(Communication):
|
|
|
111
112
|
@classmethod
|
|
112
113
|
def summary_table(cls, imessage_logs: list['MessengerLog']) -> Table:
|
|
113
114
|
"""Build a table summarizing the text messages in 'imessage_logs'."""
|
|
114
|
-
counts_table =
|
|
115
|
+
counts_table = build_table("Text Message Counts By Author")
|
|
115
116
|
counts_table.add_column(AUTHOR.title(), justify='left', style="steel_blue bold", width=30)
|
|
116
117
|
counts_table.add_column('Files', justify='right', style='white')
|
|
117
118
|
counts_table.add_column("Msgs", justify='right')
|
|
@@ -20,7 +20,7 @@ from epstein_files.util.data import escape_single_quotes, remove_timezone, uniqu
|
|
|
20
20
|
from epstein_files.util.file_helper import FILENAME_LENGTH
|
|
21
21
|
from epstein_files.util.env import args
|
|
22
22
|
from epstein_files.util.highlighted_group import get_style_for_category
|
|
23
|
-
from epstein_files.util.rich import QUESTION_MARK_TXT, highlighter
|
|
23
|
+
from epstein_files.util.rich import QUESTION_MARK_TXT, build_table, highlighter
|
|
24
24
|
from epstein_files.util.logging import logger
|
|
25
25
|
|
|
26
26
|
MAX_DAYS_SPANNED_TO_BE_VALID = 10
|
|
@@ -233,7 +233,7 @@ class OtherFile(Document):
|
|
|
233
233
|
@staticmethod
|
|
234
234
|
def build_table(docs: list['OtherFile']) -> Table:
|
|
235
235
|
"""Build a table of OtherFile documents."""
|
|
236
|
-
table =
|
|
236
|
+
table = build_table(None, show_lines=True)
|
|
237
237
|
table.add_column('File', justify='center', width=FILENAME_LENGTH)
|
|
238
238
|
table.add_column('Date', justify='center')
|
|
239
239
|
table.add_column('Size', justify='center')
|
|
@@ -19,7 +19,6 @@ from epstein_files.documents.emails.email_header import AUTHOR
|
|
|
19
19
|
from epstein_files.documents.json_file import JsonFile
|
|
20
20
|
from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
|
|
21
21
|
from epstein_files.documents.other_file import OtherFile
|
|
22
|
-
from epstein_files.util.constant.output_files import PICKLED_PATH
|
|
23
22
|
from epstein_files.util.constant.strings import *
|
|
24
23
|
from epstein_files.util.constant.urls import (EPSTEIN_MEDIA, EPSTEIN_WEB, JMAIL, epstein_media_person_url,
|
|
25
24
|
epsteinify_name_url, epstein_web_person_url, search_jmail_url, search_twitter_url)
|
|
@@ -29,15 +28,16 @@ from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
|
29
28
|
from epstein_files.util.env import args, logger
|
|
30
29
|
from epstein_files.util.file_helper import DOCS_DIR, file_size_str
|
|
31
30
|
from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
|
|
32
|
-
from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT,
|
|
33
|
-
link_text_obj, link_markup, print_author_header, print_centered,
|
|
34
|
-
print_section_header, vertically_pad)
|
|
31
|
+
from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, TABLE_BORDER_STYLE, add_cols_to_table,
|
|
32
|
+
build_table, console, highlighter, link_text_obj, link_markup, print_author_header, print_centered,
|
|
33
|
+
print_other_site_link, print_panel, print_section_header, vertically_pad)
|
|
35
34
|
from epstein_files.util.search_result import SearchResult
|
|
36
35
|
from epstein_files.util.timer import Timer
|
|
37
36
|
|
|
37
|
+
EXCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]
|
|
38
|
+
PICKLED_PATH = Path("the_epstein_files.pkl.gz")
|
|
38
39
|
DEVICE_SIGNATURE = 'Device Signature'
|
|
39
40
|
DEVICE_SIGNATURE_PADDING = (1, 0)
|
|
40
|
-
NOT_INCLUDED_EMAILERS = [e.lower() for e in (USELESS_EMAILERS + [JEFFREY_EPSTEIN])]
|
|
41
41
|
SLOW_FILE_SECONDS = 1.0
|
|
42
42
|
|
|
43
43
|
INVALID_FOR_EPSTEIN_WEB = JUNK_EMAILERS + KRASSNER_RECIPIENTS + [
|
|
@@ -94,23 +94,23 @@ class EpsteinFiles:
|
|
|
94
94
|
self._tally_email_data()
|
|
95
95
|
|
|
96
96
|
@classmethod
|
|
97
|
-
def get_files(cls, timer: Timer | None = None
|
|
97
|
+
def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
|
|
98
98
|
"""Alternate constructor that reads/writes a pickled version of the data ('timer' arg is for logging)."""
|
|
99
99
|
timer = timer or Timer()
|
|
100
100
|
|
|
101
|
-
if
|
|
101
|
+
if PICKLED_PATH.exists() and not args.overwrite_pickle:
|
|
102
102
|
with gzip.open(PICKLED_PATH, 'rb') as file:
|
|
103
103
|
epstein_files = pickle.load(file)
|
|
104
104
|
timer.print_at_checkpoint(f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})")
|
|
105
105
|
epstein_files.timer = timer
|
|
106
106
|
return epstein_files
|
|
107
107
|
|
|
108
|
+
logger.warning(f"Building new cache file, this will take a few minutes...")
|
|
108
109
|
epstein_files = EpsteinFiles(timer=timer)
|
|
109
110
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
logger.warning(f"Pickled data to '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})...")
|
|
111
|
+
with gzip.open(PICKLED_PATH, 'wb') as file:
|
|
112
|
+
pickle.dump(epstein_files, file)
|
|
113
|
+
logger.warning(f"Pickled data to '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})...")
|
|
114
114
|
|
|
115
115
|
timer.print_at_checkpoint(f'Processed {len(epstein_files.all_files):,} documents')
|
|
116
116
|
return epstein_files
|
|
@@ -119,9 +119,9 @@ class EpsteinFiles:
|
|
|
119
119
|
return self.imessage_logs + self.emails + self.other_files
|
|
120
120
|
|
|
121
121
|
def all_emailers(self, include_useless: bool = False) -> list[str | None]:
|
|
122
|
-
"""Returns all emailers except Epstein and
|
|
122
|
+
"""Returns all emailers except Epstein and EXCLUDED_EMAILERS, sorted from least frequent to most."""
|
|
123
123
|
names = [a for a in self.email_author_counts.keys()] + [r for r in self.email_recipient_counts.keys()]
|
|
124
|
-
names = names if include_useless else [e for e in names if e is None or e.lower() not in
|
|
124
|
+
names = names if include_useless else [e for e in names if e is None or e.lower() not in EXCLUDED_EMAILERS]
|
|
125
125
|
return sorted(list(set(names)), key=lambda e: self.email_author_counts[e] + self.email_recipient_counts[e])
|
|
126
126
|
|
|
127
127
|
def attributed_email_count(self) -> int:
|
|
@@ -200,10 +200,10 @@ class EpsteinFiles:
|
|
|
200
200
|
def json_metadata(self) -> str:
|
|
201
201
|
"""Create a JSON string containing metadata for all the files."""
|
|
202
202
|
metadata = {
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
203
|
+
Email.__name__: _sorted_metadata(self.emails),
|
|
204
|
+
JsonFile.__name__: _sorted_metadata(self.json_files),
|
|
205
|
+
MessengerLog.__name__: _sorted_metadata(self.imessage_logs),
|
|
206
|
+
OtherFile.__name__: _sorted_metadata(self.non_json_other_files()),
|
|
207
207
|
}
|
|
208
208
|
|
|
209
209
|
return json.dumps(metadata, indent=4, sort_keys=True)
|
|
@@ -212,7 +212,7 @@ class EpsteinFiles:
|
|
|
212
212
|
return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]
|
|
213
213
|
|
|
214
214
|
def print_files_summary(self) -> None:
|
|
215
|
-
table =
|
|
215
|
+
table = build_table('Summary of Document Types')
|
|
216
216
|
add_cols_to_table(table, ['File Type', 'Files', 'Author Known', 'Author Unknown', 'Duplicates'])
|
|
217
217
|
|
|
218
218
|
def add_row(label: str, docs: list):
|
|
@@ -268,12 +268,12 @@ class EpsteinFiles:
|
|
|
268
268
|
|
|
269
269
|
def print_email_device_info(self) -> None:
|
|
270
270
|
print_panel(f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown", padding=(4, 0, 0, 0), centered=True)
|
|
271
|
-
console.print(
|
|
272
|
-
console.print(
|
|
271
|
+
console.print(_build_signature_table(self.email_authors_to_device_signatures, (AUTHOR, DEVICE_SIGNATURE)))
|
|
272
|
+
console.print(_build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))
|
|
273
273
|
|
|
274
274
|
def print_emailer_counts_table(self) -> None:
|
|
275
275
|
footer = f"Identified authors of {self.attributed_email_count():,} emails out of {len(self.emails):,}."
|
|
276
|
-
counts_table =
|
|
276
|
+
counts_table = build_table("Email Counts", caption=footer)
|
|
277
277
|
add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_MEDIA, EPSTEIN_WEB, 'Twitter'])
|
|
278
278
|
|
|
279
279
|
emailer_counts = {
|
|
@@ -345,21 +345,6 @@ class EpsteinFiles:
|
|
|
345
345
|
self.email_device_signatures_to_authors[email.sent_from_device].add(email.author_or_unknown())
|
|
346
346
|
|
|
347
347
|
|
|
348
|
-
def build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
|
|
349
|
-
title = 'Signatures Used By Authors' if cols[0] == AUTHOR else 'Authors Seen Using Signatures'
|
|
350
|
-
table = Table(header_style="bold reverse", show_lines=True, title=title)
|
|
351
|
-
|
|
352
|
-
for i, col in enumerate(cols):
|
|
353
|
-
table.add_column(col.title() + ('s' if i == 1 else ''))
|
|
354
|
-
|
|
355
|
-
new_dict = dict_sets_to_lists(keyed_sets)
|
|
356
|
-
|
|
357
|
-
for k in sorted(new_dict.keys()):
|
|
358
|
-
table.add_row(highlighter(k or UNKNOWN), highlighter(join_char.join(sorted(new_dict[k]))))
|
|
359
|
-
|
|
360
|
-
return Padding(table, DEVICE_SIGNATURE_PADDING)
|
|
361
|
-
|
|
362
|
-
|
|
363
348
|
def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
|
|
364
349
|
counts: dict[str | None, int] = defaultdict(int)
|
|
365
350
|
|
|
@@ -372,12 +357,12 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
|
|
|
372
357
|
return counts
|
|
373
358
|
|
|
374
359
|
|
|
375
|
-
def document_cls(
|
|
376
|
-
search_area =
|
|
360
|
+
def document_cls(doc: Document) -> Type[Document]:
|
|
361
|
+
search_area = doc.text[0:5000] # Limit search area to avoid pointless scans of huge files
|
|
377
362
|
|
|
378
|
-
if
|
|
363
|
+
if doc.text[0] == '{':
|
|
379
364
|
return JsonFile
|
|
380
|
-
elif isinstance(
|
|
365
|
+
elif isinstance(doc.config, EmailCfg) or (DETECT_EMAIL_REGEX.match(search_area) and doc.config is None):
|
|
381
366
|
return Email
|
|
382
367
|
elif MSG_REGEX.search(search_area):
|
|
383
368
|
return MessengerLog
|
|
@@ -397,6 +382,21 @@ def is_ok_for_epstein_web(name: str | None) -> bool:
|
|
|
397
382
|
return True
|
|
398
383
|
|
|
399
384
|
|
|
385
|
+
def _build_signature_table(keyed_sets: dict[str, set[str]], cols: tuple[str, str], join_char: str = '\n') -> Padding:
|
|
386
|
+
title = 'Signatures Used By Authors' if cols[0] == AUTHOR else 'Authors Seen Using Signatures'
|
|
387
|
+
table = build_table(title, header_style="bold reverse", show_lines=True)
|
|
388
|
+
|
|
389
|
+
for i, col in enumerate(cols):
|
|
390
|
+
table.add_column(col.title() + ('s' if i == 1 else ''))
|
|
391
|
+
|
|
392
|
+
new_dict = dict_sets_to_lists(keyed_sets)
|
|
393
|
+
|
|
394
|
+
for k in sorted(new_dict.keys()):
|
|
395
|
+
table.add_row(highlighter(k or UNKNOWN), highlighter(join_char.join(sorted(new_dict[k]))))
|
|
396
|
+
|
|
397
|
+
return Padding(table, DEVICE_SIGNATURE_PADDING)
|
|
398
|
+
|
|
399
|
+
|
|
400
400
|
def _sorted_metadata(docs: Sequence[Document]) -> list[Metadata]:
|
|
401
401
|
docs_sorted_by_id = sorted(docs, key=lambda d: d.file_id)
|
|
402
402
|
return [json_safe(d.metadata()) for d in docs_sorted_by_id]
|