epstein-files 1.0.13__tar.gz → 1.0.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {epstein_files-1.0.13 → epstein_files-1.0.15}/PKG-INFO +10 -3
- {epstein_files-1.0.13 → epstein_files-1.0.15}/README.md +9 -2
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/__init__.py +16 -11
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/communication.py +2 -2
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/document.py +59 -51
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/email.py +34 -30
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/imessage/text_message.py +4 -4
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/json_file.py +9 -3
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/messenger_log.py +29 -27
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/other_file.py +80 -100
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/epstein_files.py +50 -69
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/constant/names.py +3 -1
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/constant/strings.py +1 -3
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/constant/urls.py +1 -7
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/constants.py +126 -114
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/data.py +2 -0
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/doc_cfg.py +11 -10
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/env.py +12 -13
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/file_helper.py +8 -4
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/highlighted_group.py +8 -16
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/output.py +56 -36
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/rich.py +29 -29
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/word_count.py +7 -9
- {epstein_files-1.0.13 → epstein_files-1.0.15}/pyproject.toml +1 -1
- {epstein_files-1.0.13 → epstein_files-1.0.15}/LICENSE +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/emails/email_header.py +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/constant/common_words.py +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/constant/html.py +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/constant/output_files.py +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/logging.py +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/search_result.py +0 -0
- {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/timer.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: epstein-files
|
|
3
|
-
Version: 1.0.
|
|
3
|
+
Version: 1.0.15
|
|
4
4
|
Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
|
|
5
5
|
Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
|
|
6
6
|
License: GPL-3.0-or-later
|
|
@@ -43,11 +43,12 @@ Description-Content-Type: text/markdown
|
|
|
43
43
|
|
|
44
44
|
|
|
45
45
|
## Usage
|
|
46
|
-
|
|
47
46
|
#### Installation
|
|
48
47
|
1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8) (make sure you grab both the `001/` and `002/` folders).
|
|
49
48
|
1. Use `poetry install` for easiest time installing. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
|
|
50
49
|
|
|
50
|
+
|
|
51
|
+
#### Command Line Tools
|
|
51
52
|
You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
|
|
52
53
|
|
|
53
54
|
```bash
|
|
@@ -63,7 +64,7 @@ epstein_generate
|
|
|
63
64
|
# Search for a string:
|
|
64
65
|
epstein_search Bannon
|
|
65
66
|
# Or a regex:
|
|
66
|
-
epstein_search '\bSteve\s*Bannon\b'
|
|
67
|
+
epstein_search '\bSteve\s*Bannon|Jeffrey\s*Epstein\b'
|
|
67
68
|
|
|
68
69
|
# Show a file with color highlighting of keywords:
|
|
69
70
|
epstein_show 030999
|
|
@@ -82,6 +83,12 @@ epstein_diff 030999 020442
|
|
|
82
83
|
The first time you run anything it will take a few minutes to fix all the janky OCR text, attribute the redacted emails, etc. After that things will be quick.
|
|
83
84
|
Run `epstein_generate --help` for command line option assistance.
|
|
84
85
|
|
|
86
|
+
**Optional:** There are a handful of emails that I extracted from the legal filings they were contained in. If you want to include these files in your local analysis you'll need to copy those files from the repo into your local document directory. Something like:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
cp ./emails_extracted_from_legal_filings/*.txt "$EPSTEIN_DOCS_DIR"
|
|
90
|
+
```
|
|
91
|
+
|
|
85
92
|
|
|
86
93
|
#### As A Library
|
|
87
94
|
```python
|
|
@@ -10,11 +10,12 @@
|
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
## Usage
|
|
13
|
-
|
|
14
13
|
#### Installation
|
|
15
14
|
1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8) (make sure you grab both the `001/` and `002/` folders).
|
|
16
15
|
1. Use `poetry install` for easiest time installing. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
|
|
17
16
|
|
|
17
|
+
|
|
18
|
+
#### Command Line Tools
|
|
18
19
|
You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
|
|
19
20
|
|
|
20
21
|
```bash
|
|
@@ -30,7 +31,7 @@ epstein_generate
|
|
|
30
31
|
# Search for a string:
|
|
31
32
|
epstein_search Bannon
|
|
32
33
|
# Or a regex:
|
|
33
|
-
epstein_search '\bSteve\s*Bannon\b'
|
|
34
|
+
epstein_search '\bSteve\s*Bannon|Jeffrey\s*Epstein\b'
|
|
34
35
|
|
|
35
36
|
# Show a file with color highlighting of keywords:
|
|
36
37
|
epstein_show 030999
|
|
@@ -49,6 +50,12 @@ epstein_diff 030999 020442
|
|
|
49
50
|
The first time you run anything it will take a few minutes to fix all the janky OCR text, attribute the redacted emails, etc. After that things will be quick.
|
|
50
51
|
Run `epstein_generate --help` for command line option assistance.
|
|
51
52
|
|
|
53
|
+
**Optional:** There are a handful of emails that I extracted from the legal filings they were contained in. If you want to include these files in your local analysis you'll need to copy those files from the repo into your local document directory. Something like:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
cp ./emails_extracted_from_legal_filings/*.txt "$EPSTEIN_DOCS_DIR"
|
|
57
|
+
```
|
|
58
|
+
|
|
52
59
|
|
|
53
60
|
#### As A Library
|
|
54
61
|
```python
|
|
@@ -17,11 +17,11 @@ from epstein_files.epstein_files import EpsteinFiles, document_cls
|
|
|
17
17
|
from epstein_files.documents.document import INFO_PADDING, Document
|
|
18
18
|
from epstein_files.documents.email import Email
|
|
19
19
|
from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, TEXT_MSGS_HTML_PATH, make_clean
|
|
20
|
-
from epstein_files.util.env import args
|
|
20
|
+
from epstein_files.util.env import args
|
|
21
21
|
from epstein_files.util.file_helper import coerce_file_path, extract_file_id
|
|
22
22
|
from epstein_files.util.logging import logger
|
|
23
|
-
from epstein_files.util.output import (
|
|
24
|
-
|
|
23
|
+
from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
|
|
24
|
+
print_other_files_section, print_text_messages_section, write_json_metadata, write_urls)
|
|
25
25
|
from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
|
|
26
26
|
from epstein_files.util.timer import Timer
|
|
27
27
|
from epstein_files.util.word_count import write_word_counts_html
|
|
@@ -49,16 +49,21 @@ def generate_html() -> None:
|
|
|
49
49
|
exit()
|
|
50
50
|
|
|
51
51
|
if args.output_texts:
|
|
52
|
-
|
|
52
|
+
print_text_messages_section(epstein_files)
|
|
53
53
|
timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)} text message logs')
|
|
54
54
|
|
|
55
55
|
if args.output_emails:
|
|
56
|
-
|
|
57
|
-
timer.print_at_checkpoint(f"Printed {
|
|
56
|
+
emails_that_were_printed = print_emails_section(epstein_files)
|
|
57
|
+
timer.print_at_checkpoint(f"Printed {len(emails_that_were_printed):,} emails")
|
|
58
58
|
|
|
59
59
|
if args.output_other:
|
|
60
|
-
|
|
61
|
-
|
|
60
|
+
if args.uninteresting:
|
|
61
|
+
files = [f for f in epstein_files.other_files if not f.is_interesting()]
|
|
62
|
+
else:
|
|
63
|
+
files = [f for f in epstein_files.other_files if args.all_other_files or f.is_interesting()]
|
|
64
|
+
|
|
65
|
+
print_other_files_section(files, epstein_files)
|
|
66
|
+
timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")
|
|
62
67
|
|
|
63
68
|
# Save output
|
|
64
69
|
write_html(ALL_EMAILS_PATH if args.all_emails else TEXT_MSGS_HTML_PATH)
|
|
@@ -81,7 +86,7 @@ def epstein_search():
|
|
|
81
86
|
|
|
82
87
|
for search_term in args.positional_args:
|
|
83
88
|
temp_highlighter = build_highlighter(search_term)
|
|
84
|
-
search_results = epstein_files.docs_matching(search_term,
|
|
89
|
+
search_results = epstein_files.docs_matching(search_term, args.names)
|
|
85
90
|
console.line(2)
|
|
86
91
|
print_panel(f"Found {len(search_results)} documents matching '{search_term}'", padding=(0, 0, 0, 3))
|
|
87
92
|
|
|
@@ -90,7 +95,7 @@ def epstein_search():
|
|
|
90
95
|
|
|
91
96
|
if args.whole_file:
|
|
92
97
|
if isinstance(search_result.document, Email):
|
|
93
|
-
search_result.document.
|
|
98
|
+
search_result.document._truncation_allowed = False
|
|
94
99
|
|
|
95
100
|
console.print(search_result.document)
|
|
96
101
|
else:
|
|
@@ -111,7 +116,7 @@ def epstein_show():
|
|
|
111
116
|
|
|
112
117
|
for doc in docs:
|
|
113
118
|
if isinstance(doc, Email):
|
|
114
|
-
doc.
|
|
119
|
+
doc._truncation_allowed = False
|
|
115
120
|
|
|
116
121
|
console.print('\n', doc, '\n')
|
|
117
122
|
|
|
@@ -34,9 +34,9 @@ class Communication(Document):
|
|
|
34
34
|
def is_attribution_uncertain(self) -> bool:
|
|
35
35
|
return bool(self.config and self.config.is_attribution_uncertain)
|
|
36
36
|
|
|
37
|
-
def
|
|
37
|
+
def external_links_txt(self, _style: str = '', include_alt_links: bool = True) -> Text:
|
|
38
38
|
"""Overrides super() method to apply self.author_style."""
|
|
39
|
-
return super().
|
|
39
|
+
return super().external_links_txt(self.author_style, include_alt_links=include_alt_links)
|
|
40
40
|
|
|
41
41
|
def summary(self) -> Text:
|
|
42
42
|
return self._summary().append(CLOSE_PROPERTIES_CHAR)
|
|
@@ -19,12 +19,12 @@ from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
|
|
|
19
19
|
from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time_from_timestamp_str, without_falsey
|
|
20
20
|
from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
|
|
21
21
|
from epstein_files.util.env import DOCS_DIR, args
|
|
22
|
-
from epstein_files.util.file_helper import
|
|
23
|
-
file_size_str, is_local_extract_file)
|
|
22
|
+
from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, is_local_extract_file
|
|
24
23
|
from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
|
|
25
|
-
from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, key_value_txt, link_text_obj
|
|
24
|
+
from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize
|
|
26
25
|
from epstein_files.util.search_result import MatchedLine
|
|
27
26
|
|
|
27
|
+
ALT_LINK_STYLE = 'white dim'
|
|
28
28
|
CLOSE_PROPERTIES_CHAR = ']'
|
|
29
29
|
HOUSE_OVERSIGHT = HOUSE_OVERSIGHT_PREFIX.replace('_', ' ').strip()
|
|
30
30
|
INFO_INDENT = 2
|
|
@@ -46,7 +46,6 @@ FILENAME_MATCH_STYLES = [
|
|
|
46
46
|
METADATA_FIELDS = [
|
|
47
47
|
'author',
|
|
48
48
|
'file_id',
|
|
49
|
-
'num_lines',
|
|
50
49
|
'timestamp'
|
|
51
50
|
]
|
|
52
51
|
|
|
@@ -68,7 +67,6 @@ class Document:
|
|
|
68
67
|
config (DocCfg): Information about this fil
|
|
69
68
|
file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
|
|
70
69
|
filename (str): File's basename
|
|
71
|
-
length (int): Number of characters in the file after all the cleanup
|
|
72
70
|
lines (str): Number of lines in the file after all the cleanup
|
|
73
71
|
text (str): Contents of the file
|
|
74
72
|
timestamp (datetime | None): When the file was originally created
|
|
@@ -80,12 +78,10 @@ class Document:
|
|
|
80
78
|
config: EmailCfg | DocCfg | TextCfg | None = None
|
|
81
79
|
file_id: str = field(init=False)
|
|
82
80
|
filename: str = field(init=False)
|
|
83
|
-
|
|
84
|
-
lines: list[str] = field(init=False)
|
|
85
|
-
num_lines: int = field(init=False)
|
|
81
|
+
lines: list[str] = field(default_factory=list)
|
|
86
82
|
text: str = ''
|
|
87
83
|
timestamp: datetime | None = None
|
|
88
|
-
url_slug: str =
|
|
84
|
+
url_slug: str = ''
|
|
89
85
|
|
|
90
86
|
# Class variables
|
|
91
87
|
include_description_in_summary_panel: ClassVar[bool] = False
|
|
@@ -94,12 +90,13 @@ class Document:
|
|
|
94
90
|
def __post_init__(self):
|
|
95
91
|
self.filename = self.file_path.name
|
|
96
92
|
self.file_id = extract_file_id(self.filename)
|
|
93
|
+
# config and url_slug could have been pre-set in Email
|
|
97
94
|
self.config = self.config or deepcopy(ALL_FILE_CONFIGS.get(self.file_id))
|
|
95
|
+
self.url_slug = self.url_slug or self.filename.split('.')[0]
|
|
98
96
|
|
|
99
|
-
if
|
|
100
|
-
self.
|
|
97
|
+
if not self.text:
|
|
98
|
+
self._load_file()
|
|
101
99
|
|
|
102
|
-
self._set_computed_fields(text=self.text or self._load_file())
|
|
103
100
|
self._repair()
|
|
104
101
|
self._extract_author()
|
|
105
102
|
self.timestamp = self._extract_timestamp()
|
|
@@ -114,59 +111,57 @@ class Document:
|
|
|
114
111
|
|
|
115
112
|
def duplicate_file_txt(self) -> Text:
|
|
116
113
|
"""If the file is a dupe make a nice message to explain what file it's a duplicate of."""
|
|
117
|
-
if not self.
|
|
114
|
+
if not self.is_duplicate():
|
|
118
115
|
raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")
|
|
119
116
|
|
|
120
117
|
txt = Text(f"Not showing ", style=INFO_STYLE).append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
|
|
121
118
|
txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
|
|
122
|
-
return txt.append(epstein_media_doc_link_txt(self.config.
|
|
119
|
+
return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))
|
|
123
120
|
|
|
124
121
|
def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
125
|
-
return self.
|
|
122
|
+
return self.external_link(epsteinify_doc_url, style, link_txt)
|
|
126
123
|
|
|
127
124
|
def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
128
|
-
return self.
|
|
125
|
+
return self.external_link(epstein_media_doc_url, style, link_txt)
|
|
129
126
|
|
|
130
127
|
def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
131
|
-
return self.
|
|
128
|
+
return self.external_link(epstein_web_doc_url, style, link_txt)
|
|
132
129
|
|
|
133
130
|
def rollcall_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
134
|
-
return self.
|
|
131
|
+
return self.external_link(rollcall_doc_url, style, link_txt)
|
|
135
132
|
|
|
136
|
-
def
|
|
133
|
+
def external_link(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
137
134
|
return link_text_obj(fxn(self.url_slug), link_txt or self.file_path.stem, style)
|
|
138
135
|
|
|
139
|
-
def
|
|
140
|
-
"""Returns colored links to epstein.media and
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
if args.use_epstein_web:
|
|
144
|
-
txt.append(self.epstein_web_link(style=style))
|
|
145
|
-
alt_link = self.epstein_media_link(style='white dim', link_txt=EPSTEIN_MEDIA)
|
|
146
|
-
else:
|
|
147
|
-
txt.append(self.epstein_media_link(style=style))
|
|
148
|
-
alt_link = self.epstein_web_link(style='white dim', link_txt=EPSTEIN_WEB)
|
|
136
|
+
def external_links_txt(self, style: str = '', include_alt_links: bool = False) -> Text:
|
|
137
|
+
"""Returns colored links to epstein.media and alternates in a Text object."""
|
|
138
|
+
links = [self.epstein_media_link(style=style)]
|
|
149
139
|
|
|
150
140
|
if include_alt_links:
|
|
151
|
-
|
|
152
|
-
|
|
141
|
+
links.append(self.epsteinify_link(style=ALT_LINK_STYLE, link_txt=EPSTEINIFY))
|
|
142
|
+
links.append(self.epstein_web_link(style=ALT_LINK_STYLE, link_txt=EPSTEIN_WEB))
|
|
153
143
|
|
|
154
144
|
if self._class_name() == 'Email':
|
|
155
|
-
|
|
145
|
+
links.append(self.rollcall_link(style=ALT_LINK_STYLE, link_txt=ROLLCALL))
|
|
156
146
|
|
|
157
|
-
|
|
147
|
+
links = [links[0]] + [parenthesize(link) for link in links[1:]]
|
|
148
|
+
base_txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
|
|
149
|
+
return base_txt.append(join_texts(links))
|
|
150
|
+
|
|
151
|
+
def file_id_debug_info(self) -> str:
|
|
152
|
+
return ', '.join([f"{prop}={getattr(self, prop)}" for prop in ['file_id', 'filename', 'url_slug']])
|
|
158
153
|
|
|
159
154
|
def file_info_panel(self) -> Group:
|
|
160
155
|
"""Panel with filename linking to raw file plus any additional info about the file."""
|
|
161
|
-
panel = Panel(self.
|
|
156
|
+
panel = Panel(self.external_links_txt(include_alt_links=True), border_style=self._border_style(), expand=False)
|
|
162
157
|
padded_info = [Padding(sentence, INFO_PADDING) for sentence in self.info()]
|
|
163
158
|
return Group(*([panel] + padded_info))
|
|
164
159
|
|
|
165
160
|
def file_size(self) -> int:
|
|
166
161
|
return file_size(self.file_path)
|
|
167
162
|
|
|
168
|
-
def file_size_str(self) -> str:
|
|
169
|
-
return file_size_str(self.file_path)
|
|
163
|
+
def file_size_str(self, decimal_places: int | None = None) -> str:
|
|
164
|
+
return file_size_str(self.file_path, decimal_places)
|
|
170
165
|
|
|
171
166
|
def info(self) -> list[Text]:
|
|
172
167
|
"""0 to 2 sentences containing the info_txt() as well as any configured description."""
|
|
@@ -176,16 +171,19 @@ class Document:
|
|
|
176
171
|
])
|
|
177
172
|
|
|
178
173
|
def info_txt(self) -> Text | None:
|
|
179
|
-
"""Secondary info about this file (recipients,
|
|
174
|
+
"""Secondary info about this file (description recipients, etc). Overload in subclasses."""
|
|
180
175
|
return None
|
|
181
176
|
|
|
182
177
|
def is_duplicate(self) -> bool:
|
|
183
|
-
return bool(self.config and self.config.
|
|
178
|
+
return bool(self.config and self.config.duplicate_of_id)
|
|
184
179
|
|
|
185
180
|
def is_local_extract_file(self) -> bool:
|
|
186
|
-
"""True if
|
|
181
|
+
"""True if extracted from other file (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
|
|
187
182
|
return is_local_extract_file(self.filename)
|
|
188
183
|
|
|
184
|
+
def length(self) -> int:
|
|
185
|
+
return len(self.text)
|
|
186
|
+
|
|
189
187
|
def log(self, msg: str, level: int = logging.INFO):
|
|
190
188
|
"""Log with filename as a prefix."""
|
|
191
189
|
logger.log(level, f"{self.file_path.stem} {msg}")
|
|
@@ -206,17 +204,21 @@ class Document:
|
|
|
206
204
|
metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
|
|
207
205
|
metadata['bytes'] = self.file_size()
|
|
208
206
|
metadata['filename'] = f"{self.url_slug}.txt"
|
|
207
|
+
metadata['num_lines'] = self.num_lines()
|
|
209
208
|
metadata['type'] = self._class_name()
|
|
210
209
|
|
|
211
210
|
if self.is_local_extract_file():
|
|
212
211
|
metadata['extracted_file'] = {
|
|
213
|
-
'explanation': '
|
|
212
|
+
'explanation': 'manually extracted from one of the other files',
|
|
214
213
|
'extracted_from': self.url_slug + '.txt',
|
|
215
214
|
'url': extracted_file_url(self.filename),
|
|
216
215
|
}
|
|
217
216
|
|
|
218
217
|
return metadata
|
|
219
218
|
|
|
219
|
+
def num_lines(self) -> int:
|
|
220
|
+
return len(self.lines)
|
|
221
|
+
|
|
220
222
|
def raw_text(self) -> str:
|
|
221
223
|
with open(self.file_path) as f:
|
|
222
224
|
return f.read()
|
|
@@ -232,8 +234,9 @@ class Document:
|
|
|
232
234
|
return text
|
|
233
235
|
|
|
234
236
|
def sort_key(self) -> tuple[datetime, str, int]:
|
|
237
|
+
"""Sort by timestamp, file_id, then whether or not it's a duplicate file."""
|
|
235
238
|
if self.is_duplicate():
|
|
236
|
-
sort_id = self.config.
|
|
239
|
+
sort_id = self.config.duplicate_of_id
|
|
237
240
|
dupe_idx = 1
|
|
238
241
|
else:
|
|
239
242
|
sort_id = self.file_id
|
|
@@ -251,11 +254,11 @@ class Document:
|
|
|
251
254
|
txt.append(' (', style=SYMBOL_STYLE)
|
|
252
255
|
txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
|
|
253
256
|
|
|
254
|
-
txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
|
|
255
|
-
txt.append(", ").append(key_value_txt('lines', self.num_lines))
|
|
257
|
+
txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(0), style='aquamarine1')))
|
|
258
|
+
txt.append(", ").append(key_value_txt('lines', self.num_lines()))
|
|
256
259
|
|
|
257
|
-
if self.config and self.config.
|
|
258
|
-
txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.
|
|
260
|
+
if self.config and self.config.duplicate_of_id:
|
|
261
|
+
txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='magenta')))
|
|
259
262
|
|
|
260
263
|
return txt
|
|
261
264
|
|
|
@@ -269,6 +272,7 @@ class Document:
|
|
|
269
272
|
return Panel(Group(*sentences), border_style=self._class_style(), expand=False)
|
|
270
273
|
|
|
271
274
|
def top_lines(self, n: int = 10) -> str:
|
|
275
|
+
"""First n lines."""
|
|
272
276
|
return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
|
|
273
277
|
|
|
274
278
|
def warn(self, msg: str) -> None:
|
|
@@ -294,13 +298,19 @@ class Document:
|
|
|
294
298
|
"""Should be implemented in subclasses."""
|
|
295
299
|
pass
|
|
296
300
|
|
|
297
|
-
def _load_file(self) ->
|
|
301
|
+
def _load_file(self) -> None:
|
|
298
302
|
"""Remove BOM and HOUSE OVERSIGHT lines, strip whitespace."""
|
|
299
303
|
text = self.raw_text()
|
|
300
304
|
text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text # remove BOM
|
|
301
305
|
text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
|
|
302
|
-
|
|
303
|
-
|
|
306
|
+
|
|
307
|
+
lines = [
|
|
308
|
+
line.strip() if self.strip_whitespace else line for line in text.split('\n')
|
|
309
|
+
if not line.startswith(HOUSE_OVERSIGHT)
|
|
310
|
+
]
|
|
311
|
+
|
|
312
|
+
self.text = collapse_newlines('\n'.join(lines))
|
|
313
|
+
self.lines = self.text.split('\n')
|
|
304
314
|
|
|
305
315
|
def _repair(self) -> None:
|
|
306
316
|
"""Can optionally be overloaded in subclasses to further improve self.text."""
|
|
@@ -317,9 +327,7 @@ class Document:
|
|
|
317
327
|
else:
|
|
318
328
|
raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (neither was)")
|
|
319
329
|
|
|
320
|
-
self.length = len(self.text)
|
|
321
330
|
self.lines = [line.strip() if self.strip_whitespace else line for line in self.text.split('\n')]
|
|
322
|
-
self.num_lines = len(self.lines)
|
|
323
331
|
|
|
324
332
|
def _write_clean_text(self, output_path: Path) -> None:
|
|
325
333
|
"""Write self.text to 'output_path'. Used only for diffing files."""
|
|
@@ -332,7 +340,7 @@ class Document:
|
|
|
332
340
|
with open(output_path, 'w') as f:
|
|
333
341
|
f.write(self.text)
|
|
334
342
|
|
|
335
|
-
logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
|
|
343
|
+
logger.warning(f"Wrote {self.length()} chars of cleaned {self.filename} to {output_path}.")
|
|
336
344
|
|
|
337
345
|
def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
|
|
338
346
|
yield self.file_info_panel()
|
|
@@ -17,7 +17,7 @@ from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, INFO_INDENT
|
|
|
17
17
|
from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAIL_SIMPLE_HEADER_REGEX,
|
|
18
18
|
EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, TIME_REGEX, EmailHeader)
|
|
19
19
|
from epstein_files.util.constant.names import *
|
|
20
|
-
from epstein_files.util.constant.strings import REDACTED
|
|
20
|
+
from epstein_files.util.constant.strings import REDACTED
|
|
21
21
|
from epstein_files.util.constants import *
|
|
22
22
|
from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
|
|
23
23
|
flatten, remove_timezone, uniquify)
|
|
@@ -41,6 +41,7 @@ LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
|
|
|
41
41
|
|
|
42
42
|
SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
|
|
43
43
|
REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
|
|
44
|
+
URL_SIGNIFIERS = ['gclid', 'htm', 'ref=', 'utm']
|
|
44
45
|
APPEARS_IN = 'Appears in'
|
|
45
46
|
MAX_CHARS_TO_PRINT = 4000
|
|
46
47
|
MAX_NUM_HEADER_LINES = 14
|
|
@@ -131,12 +132,12 @@ JUNK_EMAILERS = [
|
|
|
131
132
|
'editorialstaff@flipboard.com',
|
|
132
133
|
'How To Academy',
|
|
133
134
|
'Jokeland',
|
|
134
|
-
JP_MORGAN_USGIO,
|
|
135
135
|
]
|
|
136
136
|
|
|
137
137
|
MAILING_LISTS = [
|
|
138
138
|
INTELLIGENCE_SQUARED,
|
|
139
139
|
'middle.east.update@hotmail.com',
|
|
140
|
+
JP_MORGAN_USGIO,
|
|
140
141
|
]
|
|
141
142
|
|
|
142
143
|
TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
|
|
@@ -316,7 +317,7 @@ class Email(Communication):
|
|
|
316
317
|
recipients: list[str | None] = field(default_factory=list)
|
|
317
318
|
sent_from_device: str | None = None
|
|
318
319
|
signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
|
|
319
|
-
|
|
320
|
+
_truncation_allowed: bool = True # Hacky way to get __rich_console__() not to truncate in epstein_show script
|
|
320
321
|
|
|
321
322
|
# For logging how many headers we prettified while printing, kind of janky
|
|
322
323
|
rewritten_header_ids: ClassVar[set[str]] = set([])
|
|
@@ -337,10 +338,10 @@ class Email(Communication):
|
|
|
337
338
|
|
|
338
339
|
try:
|
|
339
340
|
if self.config and self.config.recipients:
|
|
340
|
-
self.recipients =
|
|
341
|
+
self.recipients = self.config.recipients
|
|
341
342
|
else:
|
|
342
343
|
for recipient in self.header.recipients():
|
|
343
|
-
self.recipients.extend(self.
|
|
344
|
+
self.recipients.extend(self._emailer_names(recipient))
|
|
344
345
|
except Exception as e:
|
|
345
346
|
console.print_exception()
|
|
346
347
|
console.line(2)
|
|
@@ -402,8 +403,8 @@ class Email(Communication):
|
|
|
402
403
|
return self.text
|
|
403
404
|
|
|
404
405
|
reply_text_match = REPLY_TEXT_REGEX.search(text)
|
|
405
|
-
|
|
406
|
-
|
|
406
|
+
self.log_top_lines(20, "Raw text:", logging.DEBUG)
|
|
407
|
+
self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
|
|
407
408
|
|
|
408
409
|
if reply_text_match:
|
|
409
410
|
actual_num_chars = len(reply_text_match.group(1))
|
|
@@ -439,12 +440,32 @@ class Email(Communication):
|
|
|
439
440
|
|
|
440
441
|
return style.replace('bold', '').strip()
|
|
441
442
|
|
|
443
|
+
def _emailer_names(self, emailer_str: str) -> list[str]:
|
|
444
|
+
"""Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
|
|
445
|
+
emailer_str = EmailHeader.cleanup_str(emailer_str)
|
|
446
|
+
|
|
447
|
+
if len(emailer_str) == 0:
|
|
448
|
+
return []
|
|
449
|
+
|
|
450
|
+
names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
|
|
451
|
+
|
|
452
|
+
if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
|
|
453
|
+
if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
|
|
454
|
+
logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
|
|
455
|
+
else:
|
|
456
|
+
logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
|
|
457
|
+
|
|
458
|
+
return names_found
|
|
459
|
+
|
|
460
|
+
names_found = names_found or [emailer_str]
|
|
461
|
+
return [_reverse_first_and_last_names(name) for name in names_found]
|
|
462
|
+
|
|
442
463
|
def _extract_author(self) -> None:
|
|
443
464
|
self._extract_header()
|
|
444
465
|
super()._extract_author()
|
|
445
466
|
|
|
446
467
|
if not self.author and self.header.author:
|
|
447
|
-
authors = self.
|
|
468
|
+
authors = self._emailer_names(self.header.author)
|
|
448
469
|
self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
|
|
449
470
|
|
|
450
471
|
def _extract_header(self) -> None:
|
|
@@ -494,26 +515,6 @@ class Email(Communication):
|
|
|
494
515
|
|
|
495
516
|
raise RuntimeError(f"No timestamp found in '{self.file_path.name}' top lines:\n{searchable_text}")
|
|
496
517
|
|
|
497
|
-
def _get_names(self, emailer_str: str) -> list[str]:
|
|
498
|
-
"""Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
|
|
499
|
-
emailer_str = EmailHeader.cleanup_str(emailer_str)
|
|
500
|
-
|
|
501
|
-
if len(emailer_str) == 0:
|
|
502
|
-
return []
|
|
503
|
-
|
|
504
|
-
names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
|
|
505
|
-
|
|
506
|
-
if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
|
|
507
|
-
if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
|
|
508
|
-
logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
|
|
509
|
-
else:
|
|
510
|
-
logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
|
|
511
|
-
|
|
512
|
-
return names_found
|
|
513
|
-
|
|
514
|
-
names_found = names_found or [emailer_str]
|
|
515
|
-
return [_reverse_first_and_last_names(name) for name in names_found]
|
|
516
|
-
|
|
517
518
|
def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
|
|
518
519
|
"""Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
|
|
519
520
|
for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
|
|
@@ -683,6 +684,9 @@ class Email(Communication):
|
|
|
683
684
|
if extracted_from_description:
|
|
684
685
|
extracted_description = f"{APPEARS_IN} {extracted_from_description}"
|
|
685
686
|
|
|
687
|
+
if isinstance(extracted_from_doc_cfg, EmailCfg):
|
|
688
|
+
extracted_description += ' email'
|
|
689
|
+
|
|
686
690
|
if self.config.description:
|
|
687
691
|
self.warn(f"Overwriting description '{self.config.description}' with extract description '{self.config.description}'")
|
|
688
692
|
|
|
@@ -708,10 +712,10 @@ class Email(Communication):
|
|
|
708
712
|
num_chars = quote_cutoff
|
|
709
713
|
|
|
710
714
|
# Truncate long emails but leave a note explaining what happened w/link to source document
|
|
711
|
-
if len(text) > num_chars and self.
|
|
715
|
+
if len(text) > num_chars and self._truncation_allowed:
|
|
712
716
|
text = text[0:num_chars]
|
|
713
717
|
doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
|
|
714
|
-
trim_note = f"<...trimmed to {num_chars} characters of {self.length}, read the rest at {doc_link_markup}...>"
|
|
718
|
+
trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
|
|
715
719
|
trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
|
|
716
720
|
|
|
717
721
|
# Rewrite broken headers where the values are on separate lines from the field names
|
{epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/imessage/text_message.py
RENAMED
|
@@ -5,6 +5,7 @@ from datetime import datetime
|
|
|
5
5
|
from rich.text import Text
|
|
6
6
|
|
|
7
7
|
from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN
|
|
8
|
+
from epstein_files.util.constant.strings import TIMESTAMP_DIM
|
|
8
9
|
from epstein_files.util.data import extract_last_name
|
|
9
10
|
from epstein_files.util.highlighted_group import get_style_for_name
|
|
10
11
|
from epstein_files.util.logging import logger
|
|
@@ -12,7 +13,6 @@ from epstein_files.util.rich import TEXT_LINK, highlighter
|
|
|
12
13
|
|
|
13
14
|
MSG_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
|
|
14
15
|
PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
|
|
15
|
-
TIMESTAMP_STYLE = 'turquoise4 dim'
|
|
16
16
|
|
|
17
17
|
DISPLAY_LAST_NAME_ONLY = [
|
|
18
18
|
JEFFREY_EPSTEIN,
|
|
@@ -29,7 +29,7 @@ TEXTER_MAPPING = {
|
|
|
29
29
|
class TextMessage:
|
|
30
30
|
"""Class representing a single iMessage text message."""
|
|
31
31
|
author: str | None
|
|
32
|
-
author_str: str
|
|
32
|
+
author_str: str = ''
|
|
33
33
|
id_confirmed: bool = False
|
|
34
34
|
text: str
|
|
35
35
|
timestamp_str: str
|
|
@@ -37,7 +37,7 @@ class TextMessage:
|
|
|
37
37
|
def __post_init__(self):
|
|
38
38
|
self.author = TEXTER_MAPPING.get(self.author or UNKNOWN, self.author)
|
|
39
39
|
|
|
40
|
-
if self.author
|
|
40
|
+
if not self.author:
|
|
41
41
|
self.author_str = UNKNOWN
|
|
42
42
|
elif self.author in DISPLAY_LAST_NAME_ONLY and not self.author_str:
|
|
43
43
|
self.author_str = extract_last_name(self.author)
|
|
@@ -77,5 +77,5 @@ class TextMessage:
|
|
|
77
77
|
def __rich__(self) -> Text:
|
|
78
78
|
author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
|
|
79
79
|
author_txt = Text(self.author_str, style=author_style)
|
|
80
|
-
timestamp_txt = Text(f"[{self.timestamp_str}]", style=
|
|
80
|
+
timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_DIM).append(' ')
|
|
81
81
|
return Text('').append(timestamp_txt).append(author_txt).append(': ', style='dim').append(self._message())
|
|
@@ -6,10 +6,12 @@ from typing import ClassVar
|
|
|
6
6
|
|
|
7
7
|
from rich.text import Text
|
|
8
8
|
|
|
9
|
-
from epstein_files.documents.other_file import OtherFile
|
|
9
|
+
from epstein_files.documents.other_file import Metadata, OtherFile
|
|
10
10
|
from epstein_files.util.constant.strings import JSON
|
|
11
11
|
from epstein_files.util.rich import INFO_STYLE
|
|
12
12
|
|
|
13
|
+
DESCRIPTION = "JSON data containing preview info for links sent in a messaging app like iMessage"
|
|
14
|
+
|
|
13
15
|
TEXT_FIELDS = [
|
|
14
16
|
'caption',
|
|
15
17
|
'standard',
|
|
@@ -23,7 +25,6 @@ TEXT_FIELDS = [
|
|
|
23
25
|
@dataclass
|
|
24
26
|
class JsonFile(OtherFile):
|
|
25
27
|
"""File containing JSON data."""
|
|
26
|
-
|
|
27
28
|
include_description_in_summary_panel: ClassVar[bool] = False
|
|
28
29
|
strip_whitespace: ClassVar[bool] = False
|
|
29
30
|
|
|
@@ -39,7 +40,7 @@ class JsonFile(OtherFile):
|
|
|
39
40
|
return JSON
|
|
40
41
|
|
|
41
42
|
def info_txt(self) -> Text | None:
|
|
42
|
-
return Text(
|
|
43
|
+
return Text(DESCRIPTION, style=INFO_STYLE)
|
|
43
44
|
|
|
44
45
|
def is_interesting(self):
|
|
45
46
|
return False
|
|
@@ -48,5 +49,10 @@ class JsonFile(OtherFile):
|
|
|
48
49
|
with open(self.file_path, encoding='utf-8-sig') as f:
|
|
49
50
|
return json.load(f)
|
|
50
51
|
|
|
52
|
+
def metadata(self) -> Metadata:
|
|
53
|
+
metadata = super().metadata()
|
|
54
|
+
metadata['description'] = DESCRIPTION
|
|
55
|
+
return metadata
|
|
56
|
+
|
|
51
57
|
def json_str(self) -> str:
|
|
52
58
|
return json.dumps(self.json_data(), indent=4)
|