epstein-files 1.0.13__tar.gz → 1.0.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {epstein_files-1.0.13 → epstein_files-1.0.14}/PKG-INFO +10 -3
- {epstein_files-1.0.13 → epstein_files-1.0.14}/README.md +9 -2
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/__init__.py +11 -6
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/communication.py +2 -2
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/document.py +52 -46
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/email.py +32 -29
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/imessage/text_message.py +4 -4
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/json_file.py +9 -3
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/messenger_log.py +20 -17
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/other_file.py +50 -71
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/epstein_files.py +89 -67
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/constant/names.py +1 -1
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/constant/strings.py +1 -1
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/constants.py +62 -44
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/data.py +2 -0
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/doc_cfg.py +7 -7
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/env.py +2 -5
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/highlighted_group.py +7 -15
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/output.py +15 -30
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/rich.py +29 -29
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/word_count.py +1 -1
- {epstein_files-1.0.13 → epstein_files-1.0.14}/pyproject.toml +1 -1
- {epstein_files-1.0.13 → epstein_files-1.0.14}/LICENSE +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/emails/email_header.py +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/constant/common_words.py +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/constant/html.py +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/constant/output_files.py +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/constant/urls.py +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/file_helper.py +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/logging.py +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/search_result.py +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/util/timer.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: epstein-files
|
|
3
|
-
Version: 1.0.13
|
|
3
|
+
Version: 1.0.14
|
|
4
4
|
Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
|
|
5
5
|
Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
|
|
6
6
|
License: GPL-3.0-or-later
|
|
@@ -43,11 +43,12 @@ Description-Content-Type: text/markdown
|
|
|
43
43
|
|
|
44
44
|
|
|
45
45
|
## Usage
|
|
46
|
-
|
|
47
46
|
#### Installation
|
|
48
47
|
1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8) (make sure you grab both the `001/` and `002/` folders).
|
|
49
48
|
1. Use `poetry install` for easiest time installing. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
|
|
50
49
|
|
|
50
|
+
|
|
51
|
+
#### Command Line Tools
|
|
51
52
|
You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
|
|
52
53
|
|
|
53
54
|
```bash
|
|
@@ -63,7 +64,7 @@ epstein_generate
|
|
|
63
64
|
# Search for a string:
|
|
64
65
|
epstein_search Bannon
|
|
65
66
|
# Or a regex:
|
|
66
|
-
epstein_search '\bSteve\s*Bannon\b'
|
|
67
|
+
epstein_search '\bSteve\s*Bannon|Jeffrey\s*Epstein\b'
|
|
67
68
|
|
|
68
69
|
# Show a file with color highlighting of keywords:
|
|
69
70
|
epstein_show 030999
|
|
@@ -82,6 +83,12 @@ epstein_diff 030999 020442
|
|
|
82
83
|
The first time you run anything it will take a few minutes to fix all the janky OCR text, attribute the redacted emails, etc. After that things will be quick.
|
|
83
84
|
Run `epstein_generate --help` for command line option assistance.
|
|
84
85
|
|
|
86
|
+
**Optional:** There are a handful of emails that I extracted from the legal filings they were contained in. If you want to include these files in your local analysis you'll need to copy those files from the repo into your local document directory. Something like:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
cp ./emails_extracted_from_legal_filings/*.txt "$EPSTEIN_DOCS_DIR"
|
|
90
|
+
```
|
|
91
|
+
|
|
85
92
|
|
|
86
93
|
#### As A Library
|
|
87
94
|
```python
|
|
@@ -10,11 +10,12 @@
|
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
## Usage
|
|
13
|
-
|
|
14
13
|
#### Installation
|
|
15
14
|
1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8) (make sure you grab both the `001/` and `002/` folders).
|
|
16
15
|
1. Use `poetry install` for easiest time installing. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
|
|
17
16
|
|
|
17
|
+
|
|
18
|
+
#### Command Line Tools
|
|
18
19
|
You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
|
|
19
20
|
|
|
20
21
|
```bash
|
|
@@ -30,7 +31,7 @@ epstein_generate
|
|
|
30
31
|
# Search for a string:
|
|
31
32
|
epstein_search Bannon
|
|
32
33
|
# Or a regex:
|
|
33
|
-
epstein_search '\bSteve\s*Bannon\b'
|
|
34
|
+
epstein_search '\bSteve\s*Bannon|Jeffrey\s*Epstein\b'
|
|
34
35
|
|
|
35
36
|
# Show a file with color highlighting of keywords:
|
|
36
37
|
epstein_show 030999
|
|
@@ -49,6 +50,12 @@ epstein_diff 030999 020442
|
|
|
49
50
|
The first time you run anything it will take a few minutes to fix all the janky OCR text, attribute the redacted emails, etc. After that things will be quick.
|
|
50
51
|
Run `epstein_generate --help` for command line option assistance.
|
|
51
52
|
|
|
53
|
+
**Optional:** There are a handful of emails that I extracted from the legal filings they were contained in. If you want to include these files in your local analysis you'll need to copy those files from the repo into your local document directory. Something like:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
cp ./emails_extracted_from_legal_filings/*.txt "$EPSTEIN_DOCS_DIR"
|
|
57
|
+
```
|
|
58
|
+
|
|
52
59
|
|
|
53
60
|
#### As A Library
|
|
54
61
|
```python
|
|
@@ -21,7 +21,7 @@ from epstein_files.util.env import args, specified_names
|
|
|
21
21
|
from epstein_files.util.file_helper import coerce_file_path, extract_file_id
|
|
22
22
|
from epstein_files.util.logging import logger
|
|
23
23
|
from epstein_files.util.output import (print_emails, print_json_files, print_json_stats,
|
|
24
|
-
|
|
24
|
+
write_json_metadata, write_urls)
|
|
25
25
|
from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
|
|
26
26
|
from epstein_files.util.timer import Timer
|
|
27
27
|
from epstein_files.util.word_count import write_word_counts_html
|
|
@@ -49,7 +49,7 @@ def generate_html() -> None:
|
|
|
49
49
|
exit()
|
|
50
50
|
|
|
51
51
|
if args.output_texts:
|
|
52
|
-
|
|
52
|
+
epstein_files.print_text_messages_section()
|
|
53
53
|
timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)} text message logs')
|
|
54
54
|
|
|
55
55
|
if args.output_emails:
|
|
@@ -57,8 +57,13 @@ def generate_html() -> None:
|
|
|
57
57
|
timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")
|
|
58
58
|
|
|
59
59
|
if args.output_other:
|
|
60
|
-
|
|
61
|
-
|
|
60
|
+
if args.uninteresting:
|
|
61
|
+
files = [f for f in epstein_files.other_files if not f.is_interesting()]
|
|
62
|
+
else:
|
|
63
|
+
files = [f for f in epstein_files.other_files if args.all_other_files or f.is_interesting()]
|
|
64
|
+
|
|
65
|
+
epstein_files.print_other_files_section(files)
|
|
66
|
+
timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")
|
|
62
67
|
|
|
63
68
|
# Save output
|
|
64
69
|
write_html(ALL_EMAILS_PATH if args.all_emails else TEXT_MSGS_HTML_PATH)
|
|
@@ -90,7 +95,7 @@ def epstein_search():
|
|
|
90
95
|
|
|
91
96
|
if args.whole_file:
|
|
92
97
|
if isinstance(search_result.document, Email):
|
|
93
|
-
search_result.document.
|
|
98
|
+
search_result.document._truncation_allowed = False
|
|
94
99
|
|
|
95
100
|
console.print(search_result.document)
|
|
96
101
|
else:
|
|
@@ -111,7 +116,7 @@ def epstein_show():
|
|
|
111
116
|
|
|
112
117
|
for doc in docs:
|
|
113
118
|
if isinstance(doc, Email):
|
|
114
|
-
doc.
|
|
119
|
+
doc._truncation_allowed = False
|
|
115
120
|
|
|
116
121
|
console.print('\n', doc, '\n')
|
|
117
122
|
|
|
@@ -34,9 +34,9 @@ class Communication(Document):
|
|
|
34
34
|
def is_attribution_uncertain(self) -> bool:
|
|
35
35
|
return bool(self.config and self.config.is_attribution_uncertain)
|
|
36
36
|
|
|
37
|
-
def
|
|
37
|
+
def external_links_txt(self, _style: str = '', include_alt_links: bool = True) -> Text:
|
|
38
38
|
"""Overrides super() method to apply self.author_style."""
|
|
39
|
-
return super().
|
|
39
|
+
return super().external_links_txt(self.author_style, include_alt_links=include_alt_links)
|
|
40
40
|
|
|
41
41
|
def summary(self) -> Text:
|
|
42
42
|
return self._summary().append(CLOSE_PROPERTIES_CHAR)
|
|
@@ -19,12 +19,12 @@ from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
|
|
|
19
19
|
from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time_from_timestamp_str, without_falsey
|
|
20
20
|
from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
|
|
21
21
|
from epstein_files.util.env import DOCS_DIR, args
|
|
22
|
-
from epstein_files.util.file_helper import
|
|
23
|
-
file_size_str, is_local_extract_file)
|
|
22
|
+
from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, is_local_extract_file
|
|
24
23
|
from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
|
|
25
|
-
from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, key_value_txt, link_text_obj
|
|
24
|
+
from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize
|
|
26
25
|
from epstein_files.util.search_result import MatchedLine
|
|
27
26
|
|
|
27
|
+
ALT_LINK_STYLE = 'white dim'
|
|
28
28
|
CLOSE_PROPERTIES_CHAR = ']'
|
|
29
29
|
HOUSE_OVERSIGHT = HOUSE_OVERSIGHT_PREFIX.replace('_', ' ').strip()
|
|
30
30
|
INFO_INDENT = 2
|
|
@@ -46,7 +46,6 @@ FILENAME_MATCH_STYLES = [
|
|
|
46
46
|
METADATA_FIELDS = [
|
|
47
47
|
'author',
|
|
48
48
|
'file_id',
|
|
49
|
-
'num_lines',
|
|
50
49
|
'timestamp'
|
|
51
50
|
]
|
|
52
51
|
|
|
@@ -68,7 +67,6 @@ class Document:
|
|
|
68
67
|
config (DocCfg): Information about this file
|
|
69
68
|
file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
|
|
70
69
|
filename (str): File's basename
|
|
71
|
-
length (int): Number of characters in the file after all the cleanup
|
|
72
70
|
lines (str): Number of lines in the file after all the cleanup
|
|
73
71
|
text (str): Contents of the file
|
|
74
72
|
timestamp (datetime | None): When the file was originally created
|
|
@@ -80,12 +78,10 @@ class Document:
|
|
|
80
78
|
config: EmailCfg | DocCfg | TextCfg | None = None
|
|
81
79
|
file_id: str = field(init=False)
|
|
82
80
|
filename: str = field(init=False)
|
|
83
|
-
|
|
84
|
-
lines: list[str] = field(init=False)
|
|
85
|
-
num_lines: int = field(init=False)
|
|
81
|
+
lines: list[str] = field(default_factory=list)
|
|
86
82
|
text: str = ''
|
|
87
83
|
timestamp: datetime | None = None
|
|
88
|
-
url_slug: str =
|
|
84
|
+
url_slug: str = ''
|
|
89
85
|
|
|
90
86
|
# Class variables
|
|
91
87
|
include_description_in_summary_panel: ClassVar[bool] = False
|
|
@@ -94,12 +90,13 @@ class Document:
|
|
|
94
90
|
def __post_init__(self):
|
|
95
91
|
self.filename = self.file_path.name
|
|
96
92
|
self.file_id = extract_file_id(self.filename)
|
|
93
|
+
# config and url_slug could have been pre-set in Email
|
|
97
94
|
self.config = self.config or deepcopy(ALL_FILE_CONFIGS.get(self.file_id))
|
|
95
|
+
self.url_slug = self.url_slug or self.filename.split('.')[0]
|
|
98
96
|
|
|
99
|
-
if
|
|
100
|
-
self.
|
|
97
|
+
if not self.text:
|
|
98
|
+
self._load_file()
|
|
101
99
|
|
|
102
|
-
self._set_computed_fields(text=self.text or self._load_file())
|
|
103
100
|
self._repair()
|
|
104
101
|
self._extract_author()
|
|
105
102
|
self.timestamp = self._extract_timestamp()
|
|
@@ -114,51 +111,49 @@ class Document:
|
|
|
114
111
|
|
|
115
112
|
def duplicate_file_txt(self) -> Text:
|
|
116
113
|
"""If the file is a dupe make a nice message to explain what file it's a duplicate of."""
|
|
117
|
-
if not self.
|
|
114
|
+
if not self.is_duplicate():
|
|
118
115
|
raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")
|
|
119
116
|
|
|
120
117
|
txt = Text(f"Not showing ", style=INFO_STYLE).append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
|
|
121
118
|
txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
|
|
122
|
-
return txt.append(epstein_media_doc_link_txt(self.config.
|
|
119
|
+
return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))
|
|
123
120
|
|
|
124
121
|
def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
125
|
-
return self.
|
|
122
|
+
return self.external_link(epsteinify_doc_url, style, link_txt)
|
|
126
123
|
|
|
127
124
|
def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
128
|
-
return self.
|
|
125
|
+
return self.external_link(epstein_media_doc_url, style, link_txt)
|
|
129
126
|
|
|
130
127
|
def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
131
|
-
return self.
|
|
128
|
+
return self.external_link(epstein_web_doc_url, style, link_txt)
|
|
132
129
|
|
|
133
130
|
def rollcall_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
134
|
-
return self.
|
|
131
|
+
return self.external_link(rollcall_doc_url, style, link_txt)
|
|
135
132
|
|
|
136
|
-
def
|
|
133
|
+
def external_link(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
137
134
|
return link_text_obj(fxn(self.url_slug), link_txt or self.file_path.stem, style)
|
|
138
135
|
|
|
139
|
-
def
|
|
140
|
-
"""Returns colored links to epstein.media and
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
if args.use_epstein_web:
|
|
144
|
-
txt.append(self.epstein_web_link(style=style))
|
|
145
|
-
alt_link = self.epstein_media_link(style='white dim', link_txt=EPSTEIN_MEDIA)
|
|
146
|
-
else:
|
|
147
|
-
txt.append(self.epstein_media_link(style=style))
|
|
148
|
-
alt_link = self.epstein_web_link(style='white dim', link_txt=EPSTEIN_WEB)
|
|
136
|
+
def external_links_txt(self, style: str = '', include_alt_links: bool = False) -> Text:
|
|
137
|
+
"""Returns colored links to epstein.media and alternates in a Text object."""
|
|
138
|
+
links = [self.epstein_media_link(style=style)]
|
|
149
139
|
|
|
150
140
|
if include_alt_links:
|
|
151
|
-
|
|
152
|
-
|
|
141
|
+
links.append(self.epsteinify_link(style=ALT_LINK_STYLE, link_txt=EPSTEINIFY))
|
|
142
|
+
links.append(self.epstein_web_link(style=ALT_LINK_STYLE, link_txt=EPSTEIN_WEB))
|
|
153
143
|
|
|
154
144
|
if self._class_name() == 'Email':
|
|
155
|
-
|
|
145
|
+
links.append(self.rollcall_link(style=ALT_LINK_STYLE, link_txt=ROLLCALL))
|
|
156
146
|
|
|
157
|
-
|
|
147
|
+
links = [links[0]] + [parenthesize(link) for link in links[1:]]
|
|
148
|
+
base_txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
|
|
149
|
+
return base_txt.append(join_texts(links))
|
|
150
|
+
|
|
151
|
+
def file_id_debug_info(self) -> str:
|
|
152
|
+
return ', '.join([f"{prop}={getattr(self, prop)}" for prop in ['file_id', 'filename', 'url_slug']])
|
|
158
153
|
|
|
159
154
|
def file_info_panel(self) -> Group:
|
|
160
155
|
"""Panel with filename linking to raw file plus any additional info about the file."""
|
|
161
|
-
panel = Panel(self.
|
|
156
|
+
panel = Panel(self.external_links_txt(include_alt_links=True), border_style=self._border_style(), expand=False)
|
|
162
157
|
padded_info = [Padding(sentence, INFO_PADDING) for sentence in self.info()]
|
|
163
158
|
return Group(*([panel] + padded_info))
|
|
164
159
|
|
|
@@ -180,12 +175,15 @@ class Document:
|
|
|
180
175
|
return None
|
|
181
176
|
|
|
182
177
|
def is_duplicate(self) -> bool:
|
|
183
|
-
return bool(self.config and self.config.
|
|
178
|
+
return bool(self.config and self.config.duplicate_of_id)
|
|
184
179
|
|
|
185
180
|
def is_local_extract_file(self) -> bool:
|
|
186
181
|
"""True if file created by extracting text from a court doc (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
|
|
187
182
|
return is_local_extract_file(self.filename)
|
|
188
183
|
|
|
184
|
+
def length(self) -> int:
|
|
185
|
+
return len(self.text)
|
|
186
|
+
|
|
189
187
|
def log(self, msg: str, level: int = logging.INFO):
|
|
190
188
|
"""Log with filename as a prefix."""
|
|
191
189
|
logger.log(level, f"{self.file_path.stem} {msg}")
|
|
@@ -206,17 +204,21 @@ class Document:
|
|
|
206
204
|
metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
|
|
207
205
|
metadata['bytes'] = self.file_size()
|
|
208
206
|
metadata['filename'] = f"{self.url_slug}.txt"
|
|
207
|
+
metadata['num_lines'] = self.num_lines()
|
|
209
208
|
metadata['type'] = self._class_name()
|
|
210
209
|
|
|
211
210
|
if self.is_local_extract_file():
|
|
212
211
|
metadata['extracted_file'] = {
|
|
213
|
-
'explanation': '
|
|
212
|
+
'explanation': 'manually extracted from one of the other files',
|
|
214
213
|
'extracted_from': self.url_slug + '.txt',
|
|
215
214
|
'url': extracted_file_url(self.filename),
|
|
216
215
|
}
|
|
217
216
|
|
|
218
217
|
return metadata
|
|
219
218
|
|
|
219
|
+
def num_lines(self) -> int:
|
|
220
|
+
return len(self.lines)
|
|
221
|
+
|
|
220
222
|
def raw_text(self) -> str:
|
|
221
223
|
with open(self.file_path) as f:
|
|
222
224
|
return f.read()
|
|
@@ -233,7 +235,7 @@ class Document:
|
|
|
233
235
|
|
|
234
236
|
def sort_key(self) -> tuple[datetime, str, int]:
|
|
235
237
|
if self.is_duplicate():
|
|
236
|
-
sort_id = self.config.
|
|
238
|
+
sort_id = self.config.duplicate_of_id
|
|
237
239
|
dupe_idx = 1
|
|
238
240
|
else:
|
|
239
241
|
sort_id = self.file_id
|
|
@@ -252,10 +254,10 @@ class Document:
|
|
|
252
254
|
txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
|
|
253
255
|
|
|
254
256
|
txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
|
|
255
|
-
txt.append(", ").append(key_value_txt('lines', self.num_lines))
|
|
257
|
+
txt.append(", ").append(key_value_txt('lines', self.num_lines()))
|
|
256
258
|
|
|
257
|
-
if self.config and self.config.
|
|
258
|
-
txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.
|
|
259
|
+
if self.config and self.config.duplicate_of_id:
|
|
260
|
+
txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='magenta')))
|
|
259
261
|
|
|
260
262
|
return txt
|
|
261
263
|
|
|
@@ -294,13 +296,19 @@ class Document:
|
|
|
294
296
|
"""Should be implemented in subclasses."""
|
|
295
297
|
pass
|
|
296
298
|
|
|
297
|
-
def _load_file(self) ->
|
|
299
|
+
def _load_file(self) -> None:
|
|
298
300
|
"""Remove BOM and HOUSE OVERSIGHT lines, strip whitespace."""
|
|
299
301
|
text = self.raw_text()
|
|
300
302
|
text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text # remove BOM
|
|
301
303
|
text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
|
|
302
|
-
|
|
303
|
-
|
|
304
|
+
|
|
305
|
+
lines = [
|
|
306
|
+
line.strip() if self.strip_whitespace else line for line in text.split('\n')
|
|
307
|
+
if not line.startswith(HOUSE_OVERSIGHT)
|
|
308
|
+
]
|
|
309
|
+
|
|
310
|
+
self.text = collapse_newlines('\n'.join(lines))
|
|
311
|
+
self.lines = self.text.split('\n')
|
|
304
312
|
|
|
305
313
|
def _repair(self) -> None:
|
|
306
314
|
"""Can optionally be overloaded in subclasses to further improve self.text."""
|
|
@@ -317,9 +325,7 @@ class Document:
|
|
|
317
325
|
else:
|
|
318
326
|
raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (neither was)")
|
|
319
327
|
|
|
320
|
-
self.length = len(self.text)
|
|
321
328
|
self.lines = [line.strip() if self.strip_whitespace else line for line in self.text.split('\n')]
|
|
322
|
-
self.num_lines = len(self.lines)
|
|
323
329
|
|
|
324
330
|
def _write_clean_text(self, output_path: Path) -> None:
|
|
325
331
|
"""Write self.text to 'output_path'. Used only for diffing files."""
|
|
@@ -332,7 +338,7 @@ class Document:
|
|
|
332
338
|
with open(output_path, 'w') as f:
|
|
333
339
|
f.write(self.text)
|
|
334
340
|
|
|
335
|
-
logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
|
|
341
|
+
logger.warning(f"Wrote {self.length()} chars of cleaned {self.filename} to {output_path}.")
|
|
336
342
|
|
|
337
343
|
def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
|
|
338
344
|
yield self.file_info_panel()
|
|
@@ -131,12 +131,12 @@ JUNK_EMAILERS = [
|
|
|
131
131
|
'editorialstaff@flipboard.com',
|
|
132
132
|
'How To Academy',
|
|
133
133
|
'Jokeland',
|
|
134
|
-
JP_MORGAN_USGIO,
|
|
135
134
|
]
|
|
136
135
|
|
|
137
136
|
MAILING_LISTS = [
|
|
138
137
|
INTELLIGENCE_SQUARED,
|
|
139
138
|
'middle.east.update@hotmail.com',
|
|
139
|
+
JP_MORGAN_USGIO,
|
|
140
140
|
]
|
|
141
141
|
|
|
142
142
|
TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
|
|
@@ -316,7 +316,7 @@ class Email(Communication):
|
|
|
316
316
|
recipients: list[str | None] = field(default_factory=list)
|
|
317
317
|
sent_from_device: str | None = None
|
|
318
318
|
signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
|
|
319
|
-
|
|
319
|
+
_truncation_allowed: bool = True # Hacky way to get __rich_console__() not to truncate in epstein_show script
|
|
320
320
|
|
|
321
321
|
# For logging how many headers we prettified while printing, kind of janky
|
|
322
322
|
rewritten_header_ids: ClassVar[set[str]] = set([])
|
|
@@ -337,10 +337,10 @@ class Email(Communication):
|
|
|
337
337
|
|
|
338
338
|
try:
|
|
339
339
|
if self.config and self.config.recipients:
|
|
340
|
-
self.recipients =
|
|
340
|
+
self.recipients = self.config.recipients
|
|
341
341
|
else:
|
|
342
342
|
for recipient in self.header.recipients():
|
|
343
|
-
self.recipients.extend(self._get_names(recipient))
|
|
343
|
+
self.recipients.extend(self._emailer_names(recipient))
|
|
344
344
|
except Exception as e:
|
|
345
345
|
console.print_exception()
|
|
346
346
|
console.line(2)
|
|
@@ -402,8 +402,8 @@ class Email(Communication):
|
|
|
402
402
|
return self.text
|
|
403
403
|
|
|
404
404
|
reply_text_match = REPLY_TEXT_REGEX.search(text)
|
|
405
|
-
|
|
406
|
-
|
|
405
|
+
self.log_top_lines(20, "Raw text:", logging.DEBUG)
|
|
406
|
+
self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
|
|
407
407
|
|
|
408
408
|
if reply_text_match:
|
|
409
409
|
actual_num_chars = len(reply_text_match.group(1))
|
|
@@ -439,12 +439,32 @@ class Email(Communication):
|
|
|
439
439
|
|
|
440
440
|
return style.replace('bold', '').strip()
|
|
441
441
|
|
|
442
|
+
def _emailer_names(self, emailer_str: str) -> list[str]:
|
|
443
|
+
"""Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
|
|
444
|
+
emailer_str = EmailHeader.cleanup_str(emailer_str)
|
|
445
|
+
|
|
446
|
+
if len(emailer_str) == 0:
|
|
447
|
+
return []
|
|
448
|
+
|
|
449
|
+
names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
|
|
450
|
+
|
|
451
|
+
if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
|
|
452
|
+
if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
|
|
453
|
+
logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
|
|
454
|
+
else:
|
|
455
|
+
logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
|
|
456
|
+
|
|
457
|
+
return names_found
|
|
458
|
+
|
|
459
|
+
names_found = names_found or [emailer_str]
|
|
460
|
+
return [_reverse_first_and_last_names(name) for name in names_found]
|
|
461
|
+
|
|
442
462
|
def _extract_author(self) -> None:
|
|
443
463
|
self._extract_header()
|
|
444
464
|
super()._extract_author()
|
|
445
465
|
|
|
446
466
|
if not self.author and self.header.author:
|
|
447
|
-
authors = self._get_names(self.header.author)
|
|
467
|
+
authors = self._emailer_names(self.header.author)
|
|
448
468
|
self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
|
|
449
469
|
|
|
450
470
|
def _extract_header(self) -> None:
|
|
@@ -494,26 +514,6 @@ class Email(Communication):
|
|
|
494
514
|
|
|
495
515
|
raise RuntimeError(f"No timestamp found in '{self.file_path.name}' top lines:\n{searchable_text}")
|
|
496
516
|
|
|
497
|
-
def _get_names(self, emailer_str: str) -> list[str]:
|
|
498
|
-
"""Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
|
|
499
|
-
emailer_str = EmailHeader.cleanup_str(emailer_str)
|
|
500
|
-
|
|
501
|
-
if len(emailer_str) == 0:
|
|
502
|
-
return []
|
|
503
|
-
|
|
504
|
-
names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
|
|
505
|
-
|
|
506
|
-
if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
|
|
507
|
-
if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
|
|
508
|
-
logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
|
|
509
|
-
else:
|
|
510
|
-
logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
|
|
511
|
-
|
|
512
|
-
return names_found
|
|
513
|
-
|
|
514
|
-
names_found = names_found or [emailer_str]
|
|
515
|
-
return [_reverse_first_and_last_names(name) for name in names_found]
|
|
516
|
-
|
|
517
517
|
def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
|
|
518
518
|
"""Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
|
|
519
519
|
for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
|
|
@@ -683,6 +683,9 @@ class Email(Communication):
|
|
|
683
683
|
if extracted_from_description:
|
|
684
684
|
extracted_description = f"{APPEARS_IN} {extracted_from_description}"
|
|
685
685
|
|
|
686
|
+
if isinstance(extracted_from_doc_cfg, EmailCfg):
|
|
687
|
+
extracted_description += ' email'
|
|
688
|
+
|
|
686
689
|
if self.config.description:
|
|
687
690
|
self.warn(f"Overwriting description '{self.config.description}' with extract description '{self.config.description}'")
|
|
688
691
|
|
|
@@ -708,10 +711,10 @@ class Email(Communication):
|
|
|
708
711
|
num_chars = quote_cutoff
|
|
709
712
|
|
|
710
713
|
# Truncate long emails but leave a note explaining what happened w/link to source document
|
|
711
|
-
if len(text) > num_chars and self.
|
|
714
|
+
if len(text) > num_chars and self._truncation_allowed:
|
|
712
715
|
text = text[0:num_chars]
|
|
713
716
|
doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
|
|
714
|
-
trim_note = f"<...trimmed to {num_chars} characters of {self.length}, read the rest at {doc_link_markup}...>"
|
|
717
|
+
trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
|
|
715
718
|
trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
|
|
716
719
|
|
|
717
720
|
# Rewrite broken headers where the values are on separate lines from the field names
|
{epstein_files-1.0.13 → epstein_files-1.0.14}/epstein_files/documents/imessage/text_message.py
RENAMED
|
@@ -5,6 +5,7 @@ from datetime import datetime
|
|
|
5
5
|
from rich.text import Text
|
|
6
6
|
|
|
7
7
|
from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN
|
|
8
|
+
from epstein_files.util.constant.strings import TIMESTAMP_DIM
|
|
8
9
|
from epstein_files.util.data import extract_last_name
|
|
9
10
|
from epstein_files.util.highlighted_group import get_style_for_name
|
|
10
11
|
from epstein_files.util.logging import logger
|
|
@@ -12,7 +13,6 @@ from epstein_files.util.rich import TEXT_LINK, highlighter
|
|
|
12
13
|
|
|
13
14
|
MSG_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
|
|
14
15
|
PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
|
|
15
|
-
TIMESTAMP_STYLE = 'turquoise4 dim'
|
|
16
16
|
|
|
17
17
|
DISPLAY_LAST_NAME_ONLY = [
|
|
18
18
|
JEFFREY_EPSTEIN,
|
|
@@ -29,7 +29,7 @@ TEXTER_MAPPING = {
|
|
|
29
29
|
class TextMessage:
|
|
30
30
|
"""Class representing a single iMessage text message."""
|
|
31
31
|
author: str | None
|
|
32
|
-
author_str: str
|
|
32
|
+
author_str: str = ''
|
|
33
33
|
id_confirmed: bool = False
|
|
34
34
|
text: str
|
|
35
35
|
timestamp_str: str
|
|
@@ -37,7 +37,7 @@ class TextMessage:
|
|
|
37
37
|
def __post_init__(self):
|
|
38
38
|
self.author = TEXTER_MAPPING.get(self.author or UNKNOWN, self.author)
|
|
39
39
|
|
|
40
|
-
if self.author
|
|
40
|
+
if not self.author:
|
|
41
41
|
self.author_str = UNKNOWN
|
|
42
42
|
elif self.author in DISPLAY_LAST_NAME_ONLY and not self.author_str:
|
|
43
43
|
self.author_str = extract_last_name(self.author)
|
|
@@ -77,5 +77,5 @@ class TextMessage:
|
|
|
77
77
|
def __rich__(self) -> Text:
|
|
78
78
|
author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
|
|
79
79
|
author_txt = Text(self.author_str, style=author_style)
|
|
80
|
-
timestamp_txt = Text(f"[{self.timestamp_str}]", style=
|
|
80
|
+
timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_DIM).append(' ')
|
|
81
81
|
return Text('').append(timestamp_txt).append(author_txt).append(': ', style='dim').append(self._message())
|
|
@@ -6,10 +6,12 @@ from typing import ClassVar
|
|
|
6
6
|
|
|
7
7
|
from rich.text import Text
|
|
8
8
|
|
|
9
|
-
from epstein_files.documents.other_file import OtherFile
|
|
9
|
+
from epstein_files.documents.other_file import Metadata, OtherFile
|
|
10
10
|
from epstein_files.util.constant.strings import JSON
|
|
11
11
|
from epstein_files.util.rich import INFO_STYLE
|
|
12
12
|
|
|
13
|
+
DESCRIPTION = "JSON data containing preview info for links sent in a messaging app like iMessage"
|
|
14
|
+
|
|
13
15
|
TEXT_FIELDS = [
|
|
14
16
|
'caption',
|
|
15
17
|
'standard',
|
|
@@ -23,7 +25,6 @@ TEXT_FIELDS = [
|
|
|
23
25
|
@dataclass
|
|
24
26
|
class JsonFile(OtherFile):
|
|
25
27
|
"""File containing JSON data."""
|
|
26
|
-
|
|
27
28
|
include_description_in_summary_panel: ClassVar[bool] = False
|
|
28
29
|
strip_whitespace: ClassVar[bool] = False
|
|
29
30
|
|
|
@@ -39,7 +40,7 @@ class JsonFile(OtherFile):
|
|
|
39
40
|
return JSON
|
|
40
41
|
|
|
41
42
|
def info_txt(self) -> Text | None:
|
|
42
|
-
return Text(
|
|
43
|
+
return Text(DESCRIPTION, style=INFO_STYLE)
|
|
43
44
|
|
|
44
45
|
def is_interesting(self):
|
|
45
46
|
return False
|
|
@@ -48,5 +49,10 @@ class JsonFile(OtherFile):
|
|
|
48
49
|
with open(self.file_path, encoding='utf-8-sig') as f:
|
|
49
50
|
return json.load(f)
|
|
50
51
|
|
|
52
|
+
def metadata(self) -> Metadata:
|
|
53
|
+
metadata = super().metadata()
|
|
54
|
+
metadata['description'] = DESCRIPTION
|
|
55
|
+
return metadata
|
|
56
|
+
|
|
51
57
|
def json_str(self) -> str:
|
|
52
58
|
return json.dumps(self.json_data(), indent=4)
|