epstein-files 1.0.0__tar.gz → 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {epstein_files-1.0.0 → epstein_files-1.0.1}/PKG-INFO +3 -2
- {epstein_files-1.0.0 → epstein_files-1.0.1}/README.md +2 -1
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/__init__.py +59 -51
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/documents/communication.py +9 -9
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/documents/document.py +111 -87
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/documents/email.py +154 -85
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/documents/emails/email_header.py +7 -6
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/documents/imessage/text_message.py +3 -2
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/documents/json_file.py +17 -0
- epstein_files-1.0.1/epstein_files/documents/messenger_log.py +132 -0
- epstein_files-1.0.1/epstein_files/documents/other_file.py +265 -0
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/epstein_files.py +100 -143
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/util/constant/names.py +6 -0
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/util/constant/strings.py +27 -0
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/util/constant/urls.py +22 -9
- epstein_files-1.0.1/epstein_files/util/constants.py +1505 -0
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/util/data.py +14 -28
- epstein_files-1.0.0/epstein_files/util/file_cfg.py → epstein_files-1.0.1/epstein_files/util/doc_cfg.py +120 -34
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/util/env.py +16 -18
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/util/file_helper.py +56 -17
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/util/highlighted_group.py +227 -175
- epstein_files-1.0.1/epstein_files/util/logging.py +57 -0
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/util/rich.py +18 -13
- epstein_files-1.0.1/epstein_files/util/search_result.py +23 -0
- epstein_files-1.0.1/epstein_files/util/timer.py +24 -0
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/util/word_count.py +2 -1
- {epstein_files-1.0.0 → epstein_files-1.0.1}/pyproject.toml +1 -1
- epstein_files-1.0.0/epstein_files/documents/messenger_log.py +0 -73
- epstein_files-1.0.0/epstein_files/documents/other_file.py +0 -117
- epstein_files-1.0.0/epstein_files/util/constants.py +0 -1552
- epstein_files-1.0.0/epstein_files/util/search_result.py +0 -15
- {epstein_files-1.0.0 → epstein_files-1.0.1}/LICENSE +0 -0
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/util/constant/common_words.py +0 -0
- {epstein_files-1.0.0 → epstein_files-1.0.1}/epstein_files/util/constant/html.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: epstein-files
|
|
3
|
-
Version: 1.0.
|
|
3
|
+
Version: 1.0.1
|
|
4
4
|
Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
|
|
5
5
|
Author: Michel de Cryptadamus
|
|
6
6
|
Requires-Python: >=3.11,<4.0
|
|
@@ -20,7 +20,8 @@ Description-Content-Type: text/markdown
|
|
|
20
20
|
* The Epstein text messages (and some of the emails along with summary counts of sent emails to/from Epstein) generated by this code can be viewed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/).
|
|
21
21
|
* All of His Emails can be read at another page also generated by this code [here](https://michelcrypt4d4mus.github.io/epstein_emails_house_oversight/).
|
|
22
22
|
* Word counts for the emails and text messages are [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/epstein_emails_word_count.html).
|
|
23
|
-
*
|
|
23
|
+
* Metadata containing what I have figured out about who sent or received the communications in a given file (and a brief explanation of how I figured it out for each file) is deployed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/epstein_files_nov_2025_cryptadamus_metadata.json).
|
|
24
|
+
* Configuration variables assigning specific `HOUSE_OVERSIGHT_XXXXXX.txt` file IDs (the `XXXXXX` part) as being emails to or from particular people, based on various research and contributions, can be found in [constants.py](./epstein_files/util/constants.py). Everything in `constants.py` should also appear in the JSON metadata.
|
|
24
25
|
|
|
25
26
|
|
|
26
27
|
### Usage
|
|
@@ -4,7 +4,8 @@
|
|
|
4
4
|
* The Epstein text messages (and some of the emails along with summary counts of sent emails to/from Epstein) generated by this code can be viewed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/).
|
|
5
5
|
* All of His Emails can be read at another page also generated by this code [here](https://michelcrypt4d4mus.github.io/epstein_emails_house_oversight/).
|
|
6
6
|
* Word counts for the emails and text messages are [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/epstein_emails_word_count.html).
|
|
7
|
-
*
|
|
7
|
+
* Metadata containing what I have figured out about who sent or received the communications in a given file (and a brief explanation of how I figured it out for each file) is deployed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/epstein_files_nov_2025_cryptadamus_metadata.json).
|
|
8
|
+
* Configuration variables assigning specific `HOUSE_OVERSIGHT_XXXXXX.txt` file IDs (the `XXXXXX` part) as being emails to or from particular people, based on various research and contributions, can be found in [constants.py](./epstein_files/util/constants.py). Everything in `constants.py` should also appear in the JSON metadata.
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
### Usage
|
|
@@ -13,19 +13,22 @@ load_dotenv()
|
|
|
13
13
|
from rich.padding import Padding
|
|
14
14
|
|
|
15
15
|
from epstein_files.documents.email import Email
|
|
16
|
+
from epstein_files.documents.messenger_log import MessengerLog
|
|
16
17
|
from epstein_files.epstein_files import EpsteinFiles, count_by_month
|
|
17
18
|
from epstein_files.util.constant.html import *
|
|
18
19
|
from epstein_files.util.constant.names import *
|
|
19
20
|
from epstein_files.util.constant.strings import EMAIL_CLASS, MESSENGER_LOG_CLASS
|
|
20
|
-
from epstein_files.util.data import
|
|
21
|
-
from epstein_files.util.env import
|
|
22
|
-
from epstein_files.util.file_helper import GH_PAGES_HTML_PATH
|
|
21
|
+
from epstein_files.util.data import dict_sets_to_lists
|
|
22
|
+
from epstein_files.util.env import args, specified_names
|
|
23
|
+
from epstein_files.util.file_helper import GH_PAGES_HTML_PATH, JSON_METADATA_PATH, make_clean
|
|
24
|
+
from epstein_files.util.logging import logger
|
|
23
25
|
from epstein_files.util.rich import *
|
|
26
|
+
from epstein_files.util.timer import Timer
|
|
24
27
|
|
|
25
28
|
PRINT_COLOR_KEY_EVERY_N_EMAILS = 150
|
|
26
29
|
|
|
27
|
-
# Order matters
|
|
28
|
-
|
|
30
|
+
# Order matters. Default names to print emails for.
|
|
31
|
+
DEFAULT_EMAILERS = [
|
|
29
32
|
JEREMY_RUBIN,
|
|
30
33
|
AL_SECKEL,
|
|
31
34
|
JOI_ITO,
|
|
@@ -49,8 +52,9 @@ PEOPLE_WHOSE_EMAILS_SHOULD_BE_PRINTED: list[str | None] = [
|
|
|
49
52
|
None,
|
|
50
53
|
]
|
|
51
54
|
|
|
52
|
-
# Order matters
|
|
53
|
-
|
|
55
|
+
# Order matters. Default names to print tables w/email subject, timestamp, etc for.
|
|
56
|
+
# TODO: get rid of this
|
|
57
|
+
DEFAULT_EMAILER_TABLES: list[str | None] = [
|
|
54
58
|
GHISLAINE_MAXWELL,
|
|
55
59
|
LEON_BLACK,
|
|
56
60
|
LANDON_THOMAS,
|
|
@@ -64,42 +68,58 @@ PEOPLE_WHOSE_EMAILS_SHOULD_BE_TABLES: list[str | None] = [
|
|
|
64
68
|
TOM_PRITZKER,
|
|
65
69
|
]
|
|
66
70
|
|
|
71
|
+
if len(set(DEFAULT_EMAILERS).intersection(set(DEFAULT_EMAILER_TABLES))) > 0:
|
|
72
|
+
raise RuntimeError(f"Some names appear in both DEFAULT_EMAILERS and DEFAULT_EMAILER_TABLES")
|
|
73
|
+
|
|
67
74
|
|
|
68
75
|
def generate_html() -> None:
|
|
76
|
+
if args.make_clean:
|
|
77
|
+
make_clean()
|
|
78
|
+
exit()
|
|
79
|
+
|
|
69
80
|
timer = Timer()
|
|
70
81
|
epstein_files = EpsteinFiles.get_files(timer)
|
|
82
|
+
|
|
83
|
+
if args.json_metadata:
|
|
84
|
+
json_str = epstein_files.json_metadata()
|
|
85
|
+
|
|
86
|
+
if args.build:
|
|
87
|
+
with open(JSON_METADATA_PATH, 'w') as f:
|
|
88
|
+
f.write(json_str)
|
|
89
|
+
timer.print_at_checkpoint(f"Wrote {file_size_str(JSON_METADATA_PATH)} to '{JSON_METADATA_PATH}'")
|
|
90
|
+
else:
|
|
91
|
+
console.print_json(json_str, indent=4, sort_keys=True)
|
|
92
|
+
|
|
93
|
+
exit()
|
|
94
|
+
|
|
71
95
|
print_header(epstein_files)
|
|
72
96
|
|
|
73
97
|
if args.colors_only:
|
|
74
98
|
exit()
|
|
75
99
|
|
|
76
|
-
# Text messages section
|
|
77
100
|
if args.output_texts:
|
|
78
|
-
|
|
79
|
-
timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)
|
|
101
|
+
_print_text_messages(epstein_files)
|
|
102
|
+
timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)} text message logs')
|
|
80
103
|
|
|
81
|
-
# Emails section
|
|
82
104
|
if args.output_emails:
|
|
83
|
-
emails_printed =
|
|
105
|
+
emails_printed = _print_emails(epstein_files)
|
|
84
106
|
timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")
|
|
85
107
|
|
|
86
108
|
if args.output_other_files:
|
|
87
|
-
epstein_files.print_other_files_table()
|
|
88
|
-
timer.print_at_checkpoint(f"Printed {len(
|
|
89
|
-
else:
|
|
90
|
-
logger.warning(f"Skipping other files section...")
|
|
109
|
+
files_printed = epstein_files.print_other_files_table()
|
|
110
|
+
timer.print_at_checkpoint(f"Printed {len(files_printed)} other files")
|
|
91
111
|
|
|
92
112
|
# Save output
|
|
93
113
|
write_html(GH_PAGES_HTML_PATH)
|
|
94
|
-
logger.warning(f"Total time: {timer.
|
|
114
|
+
logger.warning(f"Total time: {timer.seconds_since_start_str()}")
|
|
95
115
|
|
|
96
116
|
# JSON stats (mostly used for building pytest checks)
|
|
97
117
|
if args.json_stats:
|
|
98
118
|
console.line(5)
|
|
99
|
-
|
|
119
|
+
_print_json_stats(epstein_files)
|
|
100
120
|
|
|
101
121
|
|
|
102
|
-
def
|
|
122
|
+
def _print_emails(epstein_files: EpsteinFiles) -> int:
|
|
103
123
|
"""Returns number of emails printed."""
|
|
104
124
|
print_section_header(('Selections from ' if not args.all_emails else '') + 'His Emails')
|
|
105
125
|
print_other_site_link(is_header=False)
|
|
@@ -109,7 +129,7 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
|
|
|
109
129
|
|
|
110
130
|
emailers_to_print: list[str | None]
|
|
111
131
|
emailer_tables: list[str | None] = []
|
|
112
|
-
|
|
132
|
+
already_printed_emails: list[Email] = []
|
|
113
133
|
num_emails_printed_since_last_color_key = 0
|
|
114
134
|
|
|
115
135
|
if args.all_emails:
|
|
@@ -117,26 +137,17 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
|
|
|
117
137
|
emailers_to_print = sorted(epstein_files.all_emailers(), key=lambda e: epstein_files.earliest_email_at(e))
|
|
118
138
|
print_numbered_list_of_emailers(emailers_to_print, epstein_files)
|
|
119
139
|
else:
|
|
120
|
-
if
|
|
121
|
-
emailers_to_print = specified_names
|
|
122
|
-
else:
|
|
123
|
-
emailers_to_print = PEOPLE_WHOSE_EMAILS_SHOULD_BE_PRINTED
|
|
124
|
-
|
|
140
|
+
emailers_to_print = specified_names if specified_names else DEFAULT_EMAILERS
|
|
125
141
|
console.print('Email conversations grouped by counterparty can be found in the order listed below.')
|
|
126
142
|
print_numbered_list_of_emailers(emailers_to_print)
|
|
127
143
|
console.print("\nAfter that there's tables linking to (but not displaying) all known emails for each of these people:")
|
|
128
144
|
|
|
129
145
|
if len(specified_names) > 0:
|
|
130
|
-
|
|
131
|
-
emailer_tables = sorted(epstein_files.all_emailers(), key=lambda e: epstein_files.earliest_email_at(e))
|
|
132
|
-
else:
|
|
133
|
-
emailer_tables = PEOPLE_WHOSE_EMAILS_SHOULD_BE_TABLES
|
|
134
|
-
|
|
135
|
-
print_numbered_list_of_emailers(emailer_tables)
|
|
146
|
+
print_numbered_list_of_emailers(DEFAULT_EMAILER_TABLES)
|
|
136
147
|
|
|
137
148
|
for author in emailers_to_print:
|
|
138
149
|
newly_printed_emails = epstein_files.print_emails_for(author)
|
|
139
|
-
|
|
150
|
+
already_printed_emails.extend(newly_printed_emails)
|
|
140
151
|
num_emails_printed_since_last_color_key += len(newly_printed_emails)
|
|
141
152
|
|
|
142
153
|
# Print color key every once in a while
|
|
@@ -144,36 +155,33 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
|
|
|
144
155
|
print_color_key()
|
|
145
156
|
num_emails_printed_since_last_color_key = 0
|
|
146
157
|
|
|
147
|
-
if
|
|
148
|
-
|
|
158
|
+
if not specified_names:
|
|
159
|
+
if not args.all_emails:
|
|
160
|
+
print_author_header(f"Email Tables for {len(emailer_tables)} Other People", 'white')
|
|
149
161
|
|
|
150
|
-
|
|
151
|
-
|
|
162
|
+
for name in DEFAULT_EMAILER_TABLES:
|
|
163
|
+
epstein_files.print_emails_table_for(name)
|
|
152
164
|
|
|
153
|
-
if len(specified_names) == 0:
|
|
154
165
|
epstein_files.print_email_device_info()
|
|
155
166
|
|
|
156
|
-
|
|
157
|
-
|
|
167
|
+
# Check that all emails were actually printed
|
|
158
168
|
if args.all_emails:
|
|
159
|
-
email_ids_that_were_printed = set([email.file_id for email in
|
|
160
|
-
logger.warning(f"Printed {len(
|
|
169
|
+
email_ids_that_were_printed = set([email.file_id for email in already_printed_emails])
|
|
170
|
+
logger.warning(f"Printed {len(already_printed_emails)} emails of {len(email_ids_that_were_printed)} unique file IDs.")
|
|
161
171
|
|
|
162
172
|
for email in epstein_files.emails:
|
|
163
173
|
if email.file_id not in email_ids_that_were_printed and not email.is_duplicate:
|
|
164
|
-
logger.warning(f"Failed to print {email.
|
|
174
|
+
logger.warning(f"Failed to print {email.summary()}")
|
|
165
175
|
|
|
166
|
-
|
|
176
|
+
logger.warning(f"Rewrote {len(Email.rewritten_header_ids)} headers of {len(epstein_files.emails)} emails")
|
|
177
|
+
return len(already_printed_emails)
|
|
167
178
|
|
|
168
179
|
|
|
169
|
-
def
|
|
180
|
+
def _print_text_messages(epstein_files: EpsteinFiles) -> None:
|
|
170
181
|
print_section_header('Text Messages')
|
|
171
182
|
print_centered("(conversations are sorted chronologically based on timestamp of first message)\n", style='gray30')
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
log_files = epstein_files.imessage_logs
|
|
175
|
-
else:
|
|
176
|
-
log_files = flatten([epstein_files.imessage_logs_for(name) for name in specified_names])
|
|
183
|
+
authors: list[str | None] = specified_names if specified_names else [JEFFREY_EPSTEIN]
|
|
184
|
+
log_files = epstein_files.imessage_logs_for(authors)
|
|
177
185
|
|
|
178
186
|
for log_file in log_files:
|
|
179
187
|
console.print(Padding(log_file))
|
|
@@ -182,9 +190,9 @@ def print_text_messages(epstein_files: EpsteinFiles) -> None:
|
|
|
182
190
|
epstein_files.print_imessage_summary()
|
|
183
191
|
|
|
184
192
|
|
|
185
|
-
def
|
|
193
|
+
def _print_json_stats(epstein_files: EpsteinFiles) -> None:
|
|
186
194
|
console.print(Panel('JSON Stats Dump', expand=True, style='reverse bold'), '\n')
|
|
187
|
-
print_json(f"{MESSENGER_LOG_CLASS} Sender Counts", epstein_files.
|
|
195
|
+
print_json(f"{MESSENGER_LOG_CLASS} Sender Counts", MessengerLog.count_authors(epstein_files.imessage_logs), skip_falsey=True)
|
|
188
196
|
print_json(f"{EMAIL_CLASS} Author Counts", epstein_files.email_author_counts, skip_falsey=True)
|
|
189
197
|
print_json(f"{EMAIL_CLASS} Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
|
|
190
198
|
print_json("Email signature_substitution_countss", epstein_files.email_signature_substitution_counts(), skip_falsey=True)
|
|
@@ -8,7 +8,7 @@ from rich.text import Text
|
|
|
8
8
|
from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, Document
|
|
9
9
|
from epstein_files.util.constant.names import UNKNOWN
|
|
10
10
|
from epstein_files.util.constants import FALLBACK_TIMESTAMP
|
|
11
|
-
from epstein_files.util.
|
|
11
|
+
from epstein_files.util.doc_cfg import CommunicationCfg
|
|
12
12
|
from epstein_files.util.highlighted_group import get_style_for_name
|
|
13
13
|
from epstein_files.util.rich import key_value_txt
|
|
14
14
|
|
|
@@ -20,7 +20,7 @@ class Communication(Document):
|
|
|
20
20
|
"""Superclass for Email and MessengerLog."""
|
|
21
21
|
author_style: str = 'white'
|
|
22
22
|
author_txt: Text = field(init=False)
|
|
23
|
-
config:
|
|
23
|
+
config: CommunicationCfg | None = None
|
|
24
24
|
timestamp: datetime = FALLBACK_TIMESTAMP # TODO this default sucks (though it never happens)
|
|
25
25
|
|
|
26
26
|
def __post_init__(self):
|
|
@@ -31,22 +31,22 @@ class Communication(Document):
|
|
|
31
31
|
def author_or_unknown(self) -> str:
|
|
32
32
|
return self.author or UNKNOWN
|
|
33
33
|
|
|
34
|
-
def
|
|
35
|
-
return self.
|
|
36
|
-
|
|
37
|
-
def is_attribution_uncertain(self) -> bool | None:
|
|
38
|
-
return self.config and self.config.is_attribution_uncertain
|
|
34
|
+
def is_attribution_uncertain(self) -> bool:
|
|
35
|
+
return bool(self.config and self.config.is_attribution_uncertain)
|
|
39
36
|
|
|
40
37
|
def raw_document_link_txt(self, _style: str = '', include_alt_link: bool = True) -> Text:
|
|
41
38
|
"""Overrides super() method to apply self.author_style."""
|
|
42
39
|
return super().raw_document_link_txt(self.author_style, include_alt_link=include_alt_link)
|
|
43
40
|
|
|
41
|
+
def summary(self) -> Text:
|
|
42
|
+
return self._summary().append(CLOSE_PROPERTIES_CHAR)
|
|
43
|
+
|
|
44
44
|
def timestamp_without_seconds(self) -> str:
|
|
45
45
|
return TIMESTAMP_SECONDS_REGEX.sub('', str(self.timestamp))
|
|
46
46
|
|
|
47
|
-
def
|
|
47
|
+
def _summary(self) -> Text:
|
|
48
48
|
"""One line summary mostly for logging."""
|
|
49
|
-
txt = super().
|
|
49
|
+
txt = super().summary().append(', ')
|
|
50
50
|
return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style)))
|
|
51
51
|
|
|
52
52
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import re
|
|
3
|
-
from dataclasses import dataclass, field
|
|
3
|
+
from dataclasses import asdict, dataclass, field
|
|
4
4
|
from datetime import datetime
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from subprocess import run
|
|
@@ -14,33 +14,28 @@ from rich.text import Text
|
|
|
14
14
|
from epstein_files.util.constant.names import *
|
|
15
15
|
from epstein_files.util.constant.strings import *
|
|
16
16
|
from epstein_files.util.constant.urls import *
|
|
17
|
-
from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
|
|
18
|
-
from epstein_files.util.
|
|
19
|
-
from epstein_files.util.
|
|
20
|
-
from epstein_files.util.env import args
|
|
21
|
-
from epstein_files.util.file_helper import DOCS_DIR, file_stem_for_id, extract_file_id,
|
|
22
|
-
|
|
17
|
+
from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
|
|
18
|
+
from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_nones
|
|
19
|
+
from epstein_files.util.doc_cfg import EmailCfg, DocCfg, Metadata, TextCfg
|
|
20
|
+
from epstein_files.util.env import args
|
|
21
|
+
from epstein_files.util.file_helper import (DOCS_DIR, file_stem_for_id, extract_file_id, file_size,
|
|
22
|
+
file_size_str, is_local_extract_file)
|
|
23
|
+
from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
|
|
24
|
+
from epstein_files.util.rich import SYMBOL_STYLE, console, highlighter, key_value_txt, link_text_obj
|
|
25
|
+
from epstein_files.util.search_result import MatchedLine
|
|
23
26
|
|
|
24
|
-
|
|
27
|
+
CLOSE_PROPERTIES_CHAR = ']'
|
|
25
28
|
HOUSE_OVERSIGHT = HOUSE_OVERSIGHT_PREFIX.replace('_', ' ').strip()
|
|
26
|
-
MIN_DOCUMENT_ID = 10477
|
|
27
29
|
INFO_INDENT = 2
|
|
28
30
|
INFO_PADDING = (0, 0, 0, INFO_INDENT)
|
|
31
|
+
MAX_TOP_LINES_LEN = 4000 # Only for logging
|
|
32
|
+
MIN_DOCUMENT_ID = 10477
|
|
33
|
+
LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
|
|
34
|
+
WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
|
|
29
35
|
|
|
30
|
-
CLOSE_PROPERTIES_CHAR = ']'
|
|
31
|
-
MAX_EXTRACTED_TIMESTAMPS = 6
|
|
32
36
|
MIN_TIMESTAMP = datetime(1991, 1, 1)
|
|
33
37
|
MID_TIMESTAMP = datetime(2007, 1, 1)
|
|
34
38
|
MAX_TIMESTAMP = datetime(2020, 1, 1)
|
|
35
|
-
VI_DAILY_NEWS_REGEX = re.compile(r'virgin\s*is[kl][ai]nds\s*daily\s*news', re.IGNORECASE)
|
|
36
|
-
|
|
37
|
-
DOC_TYPE_STYLES = {
|
|
38
|
-
DOCUMENT_CLASS: 'grey69',
|
|
39
|
-
EMAIL_CLASS: 'sea_green2',
|
|
40
|
-
JSON_FILE_CLASS: 'sandy_brown',
|
|
41
|
-
MESSENGER_LOG_CLASS: 'cyan',
|
|
42
|
-
OTHER_FILE_CLASS: 'grey69',
|
|
43
|
-
}
|
|
44
39
|
|
|
45
40
|
FILENAME_MATCH_STYLES = [
|
|
46
41
|
'dark_green',
|
|
@@ -48,6 +43,13 @@ FILENAME_MATCH_STYLES = [
|
|
|
48
43
|
'spring_green4',
|
|
49
44
|
]
|
|
50
45
|
|
|
46
|
+
METADATA_FIELDS = [
|
|
47
|
+
'author',
|
|
48
|
+
'file_id',
|
|
49
|
+
'num_lines',
|
|
50
|
+
'timestamp'
|
|
51
|
+
]
|
|
52
|
+
|
|
51
53
|
OCR_REPAIRS = {
|
|
52
54
|
re.compile(r'\.corn\b'): '.com',
|
|
53
55
|
re.compile('ln(adequate|dyke)'): r'In\1',
|
|
@@ -61,7 +63,7 @@ class Document:
|
|
|
61
63
|
file_path: Path
|
|
62
64
|
# Optional fields
|
|
63
65
|
author: str | None = None
|
|
64
|
-
config:
|
|
66
|
+
config: EmailCfg | DocCfg | TextCfg | None = None
|
|
65
67
|
file_id: str = field(init=False)
|
|
66
68
|
filename: str = field(init=False)
|
|
67
69
|
is_duplicate: bool = False
|
|
@@ -72,8 +74,8 @@ class Document:
|
|
|
72
74
|
timestamp: datetime | None = None
|
|
73
75
|
url_slug: str = field(init=False) # e.g. 'HOUSE_OVERSIGHT_123456'
|
|
74
76
|
|
|
75
|
-
# Class variable
|
|
76
|
-
|
|
77
|
+
# Class variable overridden in JsonFile
|
|
78
|
+
strip_whitespace: ClassVar[bool] = True
|
|
77
79
|
|
|
78
80
|
def __post_init__(self):
|
|
79
81
|
self.filename = self.file_path.name
|
|
@@ -82,12 +84,12 @@ class Document:
|
|
|
82
84
|
self.is_duplicate = bool(self.config.dupe_of_id) if self.config else False
|
|
83
85
|
|
|
84
86
|
if self.is_local_extract_file():
|
|
85
|
-
self.url_slug = file_stem_for_id(self.file_id)
|
|
87
|
+
self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
|
|
86
88
|
cfg_type = type(self.config).__name__ if self.config else None
|
|
87
89
|
|
|
88
90
|
# Coerce a DocCfg for court docs etc. to an EmailCfg for email files extracted from that document
|
|
89
|
-
if self.
|
|
90
|
-
self.config =
|
|
91
|
+
if self.class_name() == EMAIL_CLASS and self.config and cfg_type != EmailCfg.__name__:
|
|
92
|
+
self.config = EmailCfg.from_doc_cfg(self.config)
|
|
91
93
|
else:
|
|
92
94
|
self.url_slug = self.file_path.stem
|
|
93
95
|
|
|
@@ -96,41 +98,30 @@ class Document:
|
|
|
96
98
|
self._extract_author()
|
|
97
99
|
self.timestamp = self._extract_timestamp()
|
|
98
100
|
|
|
101
|
+
def class_name(self) -> str:
|
|
102
|
+
"""Annoying workaround for circular import issues and isinstance()."""
|
|
103
|
+
return str(type(self).__name__)
|
|
104
|
+
|
|
99
105
|
def configured_description(self) -> str | None:
|
|
100
|
-
|
|
106
|
+
"""Overloaded in OtherFile."""
|
|
107
|
+
if self.config and self.config.description:
|
|
108
|
+
return f"({self.config.description})"
|
|
101
109
|
|
|
102
110
|
def date_str(self) -> str | None:
|
|
103
111
|
return date_str(self.timestamp)
|
|
104
112
|
|
|
105
|
-
def description(self) -> Text:
|
|
106
|
-
"""Mostly for logging. Brackets are left open for subclasses to add stuff."""
|
|
107
|
-
txt = Text('').append(self.url_slug, style='magenta')
|
|
108
|
-
txt.append(f' {self.document_type()}', style=self.document_type_style())
|
|
109
|
-
|
|
110
|
-
if self.timestamp:
|
|
111
|
-
txt.append(' (', style=SYMBOL_STYLE)
|
|
112
|
-
txt.append(f"{iso_timestamp(self.timestamp)}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
|
|
113
|
-
|
|
114
|
-
txt.append(" [").append(key_value_txt('num_lines', Text(f"{self.num_lines}", style='cyan')))
|
|
115
|
-
txt.append(', ').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
|
|
116
|
-
return txt
|
|
117
|
-
|
|
118
113
|
def description_panel(self, include_hints: bool = False) -> Panel:
|
|
119
114
|
"""Panelized summary() plus optional hints(), used in search results."""
|
|
120
115
|
hints = [Text('', style='italic').append(h) for h in (self.hints() if include_hints else [])]
|
|
121
|
-
return Panel(Group(*([self.
|
|
122
|
-
|
|
123
|
-
def document_type(self) -> str:
|
|
124
|
-
"""Annoying workaround for circular import issues and isinstance()."""
|
|
125
|
-
return str(type(self).__name__)
|
|
116
|
+
return Panel(Group(*([self.summary()] + hints)), border_style=self.document_type_style(), expand=False)
|
|
126
117
|
|
|
127
118
|
def document_type_style(self) -> str:
|
|
128
|
-
return DOC_TYPE_STYLES[self.
|
|
119
|
+
return DOC_TYPE_STYLES[self.class_name()]
|
|
129
120
|
|
|
130
121
|
def duplicate_file_txt(self) -> Text:
|
|
131
122
|
"""If the file is a dupe make a nice message to explain what file it's a duplicate of."""
|
|
132
123
|
if not self.config or not self.config.dupe_of_id:
|
|
133
|
-
raise RuntimeError(f"duplicate_file_txt() called on {self.
|
|
124
|
+
raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")
|
|
134
125
|
|
|
135
126
|
txt = Text(f"Not showing ", style='white dim italic').append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
|
|
136
127
|
txt.append(f" because it's {self.config.duplicate_reason()} ")
|
|
@@ -154,6 +145,9 @@ class Document:
|
|
|
154
145
|
hints = [Padding(hint, INFO_PADDING) for hint in self.hints()]
|
|
155
146
|
return Group(*([panel] + hints))
|
|
156
147
|
|
|
148
|
+
def file_size(self) -> int:
|
|
149
|
+
return file_size(self.file_path)
|
|
150
|
+
|
|
157
151
|
def file_size_str(self) -> str:
|
|
158
152
|
return file_size_str(self.file_path)
|
|
159
153
|
|
|
@@ -162,16 +156,10 @@ class Document:
|
|
|
162
156
|
hints = listify(self.info_txt())
|
|
163
157
|
hint_msg = self.configured_description()
|
|
164
158
|
|
|
165
|
-
if self.document_type() == OTHER_FILE_CLASS:
|
|
166
|
-
if not hint_msg and VI_DAILY_NEWS_REGEX.search(self.text):
|
|
167
|
-
hint_msg = VI_DAILY_NEWS_ARTICLE
|
|
168
|
-
elif hint_msg:
|
|
169
|
-
hint_msg = f"({hint_msg})"
|
|
170
|
-
|
|
171
159
|
if hint_msg:
|
|
172
160
|
hints.append(highlighter(Text(hint_msg, style='white dim italic')))
|
|
173
161
|
|
|
174
|
-
return hints
|
|
162
|
+
return without_nones(hints)
|
|
175
163
|
|
|
176
164
|
def info_txt(self) -> Text | None:
|
|
177
165
|
"""Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
|
|
@@ -181,32 +169,42 @@ class Document:
|
|
|
181
169
|
"""True if file created by extracting text from a court doc (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
|
|
182
170
|
return is_local_extract_file(self.filename)
|
|
183
171
|
|
|
184
|
-
def lines_matching_txt(self, _pattern: re.Pattern | str) -> list[Text]:
|
|
185
|
-
"""Return lines matching a regex as colored list[Text]."""
|
|
186
|
-
pattern = patternize(_pattern)
|
|
187
|
-
matched_lines = [line for line in self.lines if pattern.search(line)]
|
|
188
|
-
|
|
189
|
-
if len(matched_lines) == 0:
|
|
190
|
-
return []
|
|
191
|
-
|
|
192
|
-
file_style = FILENAME_MATCH_STYLES[type(self).file_matching_idx % len(FILENAME_MATCH_STYLES)]
|
|
193
|
-
type(self).file_matching_idx += 1
|
|
194
|
-
|
|
195
|
-
return [
|
|
196
|
-
Text('').append(self.file_path.name, style=file_style).append(':').append(line)
|
|
197
|
-
for line in matched_lines
|
|
198
|
-
]
|
|
199
|
-
|
|
200
172
|
def log(self, msg: str, level: int = logging.WARNING):
|
|
201
|
-
"""Log with
|
|
202
|
-
logger.log(level, f"
|
|
173
|
+
"""Log with filename as a prefix."""
|
|
174
|
+
logger.log(level, f"{self.url_slug} {msg}")
|
|
203
175
|
|
|
204
176
|
def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
|
|
205
177
|
"""Log first 'n' lines of self.text at 'level'. 'msg' can be optionally provided."""
|
|
206
178
|
separator = '\n\n' if '\n' in msg else '. '
|
|
207
|
-
msg =
|
|
179
|
+
msg = (msg + separator) if msg else ''
|
|
180
|
+
msg = f"{self.filename}: {msg}First {n} lines:"
|
|
208
181
|
logger.log(level, f"{msg}\n\n{self.top_lines(n)}\n")
|
|
209
182
|
|
|
183
|
+
def matching_lines(self, _pattern: re.Pattern | str) -> list[MatchedLine]:
|
|
184
|
+
"""Return the lines matching a regex as a list of MatchedLine objects (each built from the line text and its index)."""
|
|
185
|
+
pattern = patternize(_pattern)
|
|
186
|
+
return [MatchedLine(line, i) for i, line in enumerate(self.lines) if pattern.search(line)]
|
|
187
|
+
|
|
188
|
+
def metadata(self) -> Metadata:
|
|
189
|
+
metadata = self.config.metadata() if self.config else {}
|
|
190
|
+
metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
|
|
191
|
+
metadata['bytes'] = self.file_size()
|
|
192
|
+
metadata['filename'] = f"{self.url_slug}.txt"
|
|
193
|
+
metadata['type'] = self.class_name()
|
|
194
|
+
|
|
195
|
+
if self.is_local_extract_file():
|
|
196
|
+
metadata['extracted_file'] = {
|
|
197
|
+
'explanation': 'This file was extracted from a court filing, not distributed directly. A copy can be found on github.',
|
|
198
|
+
'extracted_from_file': self.url_slug + '.txt',
|
|
199
|
+
'extracted_file_url': extracted_file_url(self.filename),
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
return metadata
|
|
203
|
+
|
|
204
|
+
def raw_text(self) -> str:
|
|
205
|
+
with open(self.file_path) as f:
|
|
206
|
+
return f.read()
|
|
207
|
+
|
|
210
208
|
def raw_document_link_txt(self, style: str = '', include_alt_link: bool = False) -> Text:
|
|
211
209
|
"""Returns colored links to epstein.media and epsteinweb in a Text object."""
|
|
212
210
|
txt = Text('', style='white' if include_alt_link else ARCHIVE_LINK_COLOR)
|
|
@@ -215,11 +213,13 @@ class Document:
|
|
|
215
213
|
txt.append(self.epstein_web_link(style=style))
|
|
216
214
|
|
|
217
215
|
if include_alt_link:
|
|
216
|
+
txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
|
|
218
217
|
txt.append(' (').append(self.epstein_media_link(style='white dim', link_txt=EPSTEIN_MEDIA)).append(')')
|
|
219
218
|
else:
|
|
220
219
|
txt.append(self.epstein_media_link(style=style))
|
|
221
220
|
|
|
222
221
|
if include_alt_link:
|
|
222
|
+
txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
|
|
223
223
|
txt.append(' (').append(self.epstein_web_link(style='white dim', link_txt=EPSTEIN_WEB)).append(')')
|
|
224
224
|
|
|
225
225
|
return txt
|
|
@@ -234,8 +234,32 @@ class Document:
|
|
|
234
234
|
|
|
235
235
|
return text
|
|
236
236
|
|
|
237
|
+
def sort_key(self) -> tuple[datetime, str, int]:
|
|
238
|
+
if self.config and self.config.dupe_of_id:
|
|
239
|
+
sort_id = self.config.dupe_of_id
|
|
240
|
+
dupe_idx = 1
|
|
241
|
+
else:
|
|
242
|
+
sort_id = self.file_id
|
|
243
|
+
dupe_idx = 0
|
|
244
|
+
|
|
245
|
+
return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
|
|
246
|
+
|
|
247
|
+
def summary(self) -> Text:
|
|
248
|
+
"""Summary of this file for logging. Brackets are left open for subclasses to add stuff."""
|
|
249
|
+
txt = Text('').append(self.class_name(), style=self.document_type_style())
|
|
250
|
+
txt.append(f" {self.url_slug}", style=FILENAME_STYLE)
|
|
251
|
+
|
|
252
|
+
if self.timestamp:
|
|
253
|
+
timestamp_str = iso_timestamp(self.timestamp).removesuffix(' 00:00:00')
|
|
254
|
+
txt.append(' (', style=SYMBOL_STYLE)
|
|
255
|
+
txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
|
|
256
|
+
|
|
257
|
+
txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
|
|
258
|
+
txt.append(", ").append(key_value_txt('lines', Text(f"{self.num_lines}", style='cyan')))
|
|
259
|
+
return txt
|
|
260
|
+
|
|
237
261
|
def top_lines(self, n: int = 10) -> str:
|
|
238
|
-
return '\n'.join(self.lines[0:n])
|
|
262
|
+
return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
|
|
239
263
|
|
|
240
264
|
def _border_style(self) -> str:
|
|
241
265
|
"""Should be overloaded in subclasses."""
|
|
@@ -250,21 +274,20 @@ class Document:
|
|
|
250
274
|
"""Should be implemented in subclasses."""
|
|
251
275
|
pass
|
|
252
276
|
|
|
253
|
-
def _load_file(self):
|
|
277
|
+
def _load_file(self) -> str:
|
|
254
278
|
"""Remove BOM and HOUSE OVERSIGHT lines, strip whitespace."""
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
lines = lines[1:] if (len(lines) > 1 and lines[0] == '>>') else lines
|
|
261
|
-
return collapse_newlines('\n'.join(lines))
|
|
279
|
+
text = self.raw_text()
|
|
280
|
+
text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text # remove BOM
|
|
281
|
+
text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
|
|
282
|
+
lines = [l.strip() for l in text.split('\n') if not l.startswith(HOUSE_OVERSIGHT)]
|
|
283
|
+
return collapse_newlines('\n'.join(lines))
|
|
262
284
|
|
|
263
285
|
def _repair(self) -> None:
|
|
264
|
-
"""Can optionally be overloaded in subclasses."""
|
|
286
|
+
"""Can optionally be overloaded in subclasses to further improve self.text."""
|
|
265
287
|
pass
|
|
266
288
|
|
|
267
289
|
def _set_computed_fields(self, lines: list[str] | None = None, text: str | None = None) -> None:
|
|
290
|
+
"""Sets all fields derived from self.text based on either 'lines' or 'text' arg."""
|
|
268
291
|
if (lines and text):
|
|
269
292
|
raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (got both)")
|
|
270
293
|
elif lines is not None:
|
|
@@ -275,7 +298,7 @@ class Document:
|
|
|
275
298
|
raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (neither was)")
|
|
276
299
|
|
|
277
300
|
self.length = len(self.text)
|
|
278
|
-
self.lines = [line.strip() for line in self.text.split('\n')]
|
|
301
|
+
self.lines = [line.strip() if self.strip_whitespace else line for line in self.text.split('\n')]
|
|
279
302
|
self.num_lines = len(self.lines)
|
|
280
303
|
|
|
281
304
|
def _write_clean_text(self, output_path: Path) -> None:
|
|
@@ -291,16 +314,17 @@ class Document:
|
|
|
291
314
|
|
|
292
315
|
logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
|
|
293
316
|
|
|
294
|
-
def __rich_console__(self,
|
|
317
|
+
def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
|
|
295
318
|
yield self.file_info_panel()
|
|
296
319
|
text_panel = Panel(highlighter(self.text), border_style=self._border_style(), expand=False)
|
|
297
320
|
yield Padding(text_panel, (0, 0, 1, INFO_INDENT))
|
|
298
321
|
|
|
299
322
|
def __str__(self) -> str:
|
|
300
|
-
return self.
|
|
323
|
+
return self.summary().plain
|
|
301
324
|
|
|
302
325
|
@staticmethod
|
|
303
326
|
def diff_files(files: list[str]) -> None:
|
|
327
|
+
"""Diff the contents of two Documents after all cleanup, BOM removal, etc."""
|
|
304
328
|
if len(files) != 2:
|
|
305
329
|
raise RuntimeError('Need 2 files')
|
|
306
330
|
elif files[0] == files[1]:
|
|
@@ -330,7 +354,7 @@ class Document:
|
|
|
330
354
|
|
|
331
355
|
@staticmethod
|
|
332
356
|
def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
|
|
333
|
-
return sorted(docs, key=lambda doc:
|
|
357
|
+
return sorted(docs, key=lambda doc: doc.sort_key())
|
|
334
358
|
|
|
335
359
|
@classmethod
|
|
336
360
|
def uniquify(cls, documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
|