epstein-files 1.0.10__py3-none-any.whl → 1.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +7 -9
- epstein_files/documents/communication.py +2 -2
- epstein_files/documents/document.py +94 -81
- epstein_files/documents/email.py +47 -5
- epstein_files/documents/imessage/text_message.py +4 -13
- epstein_files/documents/json_file.py +13 -1
- epstein_files/documents/messenger_log.py +32 -19
- epstein_files/documents/other_file.py +67 -44
- epstein_files/epstein_files.py +22 -15
- epstein_files/util/constant/names.py +11 -10
- epstein_files/util/constant/strings.py +2 -1
- epstein_files/util/constants.py +98 -88
- epstein_files/util/data.py +1 -1
- epstein_files/util/doc_cfg.py +32 -62
- epstein_files/util/env.py +29 -17
- epstein_files/util/file_helper.py +12 -29
- epstein_files/util/highlighted_group.py +34 -17
- epstein_files/util/logging.py +1 -7
- epstein_files/util/output.py +13 -8
- epstein_files/util/rich.py +15 -10
- epstein_files/util/word_count.py +65 -5
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/METADATA +1 -1
- epstein_files-1.0.12.dist-info/RECORD +33 -0
- epstein_files/count_words.py +0 -72
- epstein_files-1.0.10.dist-info/RECORD +0 -34
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/WHEEL +0 -0
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/entry_points.txt +0 -0
epstein_files/__init__.py
CHANGED
|
@@ -1,9 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python
|
|
2
2
|
"""
|
|
3
3
|
Reformat Epstein text message files for readability and count email senders.
|
|
4
|
-
For use with iMessage log files from https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_
|
|
5
4
|
|
|
6
|
-
Install: 'poetry install'
|
|
7
5
|
Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT epstein_generate'
|
|
8
6
|
"""
|
|
9
7
|
from sys import exit
|
|
@@ -15,7 +13,6 @@ from rich.padding import Padding
|
|
|
15
13
|
from rich.panel import Panel
|
|
16
14
|
from rich.text import Text
|
|
17
15
|
|
|
18
|
-
from epstein_files.count_words import write_word_counts_html
|
|
19
16
|
from epstein_files.epstein_files import EpsteinFiles, document_cls
|
|
20
17
|
from epstein_files.documents.document import INFO_PADDING, Document
|
|
21
18
|
from epstein_files.documents.email import Email
|
|
@@ -23,10 +20,11 @@ from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, TEXT_MSGS_
|
|
|
23
20
|
from epstein_files.util.env import args, specified_names
|
|
24
21
|
from epstein_files.util.file_helper import coerce_file_path, extract_file_id
|
|
25
22
|
from epstein_files.util.logging import logger
|
|
26
|
-
from epstein_files.util.output import (print_emails, print_json_files,
|
|
27
|
-
print_text_messages, write_urls)
|
|
23
|
+
from epstein_files.util.output import (print_emails, print_json_files, print_json_stats,
|
|
24
|
+
print_text_messages, write_json_metadata, write_urls)
|
|
28
25
|
from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
|
|
29
26
|
from epstein_files.util.timer import Timer
|
|
27
|
+
from epstein_files.util.word_count import write_word_counts_html
|
|
30
28
|
|
|
31
29
|
|
|
32
30
|
def generate_html() -> None:
|
|
@@ -39,9 +37,9 @@ def generate_html() -> None:
|
|
|
39
37
|
epstein_files = EpsteinFiles.get_files(timer)
|
|
40
38
|
|
|
41
39
|
if args.json_metadata:
|
|
42
|
-
|
|
40
|
+
write_json_metadata(epstein_files)
|
|
43
41
|
exit()
|
|
44
|
-
elif args.
|
|
42
|
+
elif args.json_files:
|
|
45
43
|
print_json_files(epstein_files)
|
|
46
44
|
exit()
|
|
47
45
|
|
|
@@ -58,7 +56,7 @@ def generate_html() -> None:
|
|
|
58
56
|
emails_printed = print_emails(epstein_files)
|
|
59
57
|
timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")
|
|
60
58
|
|
|
61
|
-
if args.
|
|
59
|
+
if args.output_other:
|
|
62
60
|
files_printed = epstein_files.print_other_files_table()
|
|
63
61
|
timer.print_at_checkpoint(f"Printed {len(files_printed)} other files")
|
|
64
62
|
|
|
@@ -96,7 +94,7 @@ def epstein_search():
|
|
|
96
94
|
|
|
97
95
|
console.print(search_result.document)
|
|
98
96
|
else:
|
|
99
|
-
console.print(search_result.document.
|
|
97
|
+
console.print(search_result.document.summary_panel())
|
|
100
98
|
|
|
101
99
|
for matching_line in search_result.lines:
|
|
102
100
|
line_txt = matching_line.__rich__()
|
|
@@ -34,9 +34,9 @@ class Communication(Document):
|
|
|
34
34
|
def is_attribution_uncertain(self) -> bool:
|
|
35
35
|
return bool(self.config and self.config.is_attribution_uncertain)
|
|
36
36
|
|
|
37
|
-
def
|
|
37
|
+
def external_links(self, _style: str = '', include_alt_link: bool = True) -> Text:
|
|
38
38
|
"""Overrides super() method to apply self.author_style."""
|
|
39
|
-
return super().
|
|
39
|
+
return super().external_links(self.author_style, include_alt_link=include_alt_link)
|
|
40
40
|
|
|
41
41
|
def summary(self) -> Text:
|
|
42
42
|
return self._summary().append(CLOSE_PROPERTIES_CHAR)
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import re
|
|
3
|
+
from copy import deepcopy
|
|
3
4
|
from dataclasses import asdict, dataclass, field
|
|
4
5
|
from datetime import datetime
|
|
5
6
|
from pathlib import Path
|
|
@@ -15,13 +16,13 @@ from epstein_files.util.constant.names import *
|
|
|
15
16
|
from epstein_files.util.constant.strings import *
|
|
16
17
|
from epstein_files.util.constant.urls import *
|
|
17
18
|
from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
|
|
18
|
-
from epstein_files.util.data import collapse_newlines, date_str,
|
|
19
|
-
from epstein_files.util.doc_cfg import EmailCfg, DocCfg, Metadata, TextCfg
|
|
20
|
-
from epstein_files.util.env import args
|
|
21
|
-
from epstein_files.util.file_helper import (
|
|
19
|
+
from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_time_from_timestamp_str, without_falsey
|
|
20
|
+
from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
|
|
21
|
+
from epstein_files.util.env import DOCS_DIR, args
|
|
22
|
+
from epstein_files.util.file_helper import (file_stem_for_id, extract_file_id, file_size,
|
|
22
23
|
file_size_str, is_local_extract_file)
|
|
23
24
|
from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
|
|
24
|
-
from epstein_files.util.rich import SYMBOL_STYLE, console, highlighter, key_value_txt, link_text_obj
|
|
25
|
+
from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, key_value_txt, link_text_obj
|
|
25
26
|
from epstein_files.util.search_result import MatchedLine
|
|
26
27
|
|
|
27
28
|
CLOSE_PROPERTIES_CHAR = ']'
|
|
@@ -30,7 +31,6 @@ INFO_INDENT = 2
|
|
|
30
31
|
INFO_PADDING = (0, 0, 0, INFO_INDENT)
|
|
31
32
|
MAX_TOP_LINES_LEN = 4000 # Only for logging
|
|
32
33
|
MIN_DOCUMENT_ID = 10477
|
|
33
|
-
LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
|
|
34
34
|
WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
|
|
35
35
|
|
|
36
36
|
MIN_TIMESTAMP = datetime(1991, 1, 1)
|
|
@@ -59,14 +59,27 @@ OCR_REPAIRS = {
|
|
|
59
59
|
|
|
60
60
|
@dataclass
|
|
61
61
|
class Document:
|
|
62
|
-
"""
|
|
62
|
+
"""
|
|
63
|
+
Base class for all Epstein Files documents.
|
|
64
|
+
|
|
65
|
+
Attributes:
|
|
66
|
+
file_path (Path): Local path to file
|
|
67
|
+
author (str | None): Who is responsible for the text in the file
|
|
68
|
+
config (DocCfg): Information about this file
|
|
69
|
+
file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
|
|
70
|
+
filename (str): File's basename
|
|
71
|
+
length (int): Number of characters in the file after all the cleanup
|
|
72
|
+
lines (list[str]): Lines of the file after all the cleanup
|
|
73
|
+
text (str): Contents of the file
|
|
74
|
+
timestamp (datetime | None): When the file was originally created
|
|
75
|
+
url_slug (str): Version of the filename that works in links to epsteinify etc.
|
|
76
|
+
"""
|
|
63
77
|
file_path: Path
|
|
64
78
|
# Optional fields
|
|
65
79
|
author: str | None = None
|
|
66
80
|
config: EmailCfg | DocCfg | TextCfg | None = None
|
|
67
81
|
file_id: str = field(init=False)
|
|
68
82
|
filename: str = field(init=False)
|
|
69
|
-
is_duplicate: bool = False
|
|
70
83
|
length: int = field(init=False)
|
|
71
84
|
lines: list[str] = field(init=False)
|
|
72
85
|
num_lines: int = field(init=False)
|
|
@@ -74,22 +87,16 @@ class Document:
|
|
|
74
87
|
timestamp: datetime | None = None
|
|
75
88
|
url_slug: str = field(init=False) # e.g. 'HOUSE_OVERSIGHT_123456'
|
|
76
89
|
|
|
77
|
-
# Class
|
|
78
|
-
|
|
90
|
+
# Class variables
|
|
91
|
+
include_description_in_summary_panel: ClassVar[bool] = False
|
|
92
|
+
strip_whitespace: ClassVar[bool] = True # Overridden in JsonFile
|
|
79
93
|
|
|
80
94
|
def __post_init__(self):
|
|
81
95
|
self.filename = self.file_path.name
|
|
82
96
|
self.file_id = extract_file_id(self.filename)
|
|
83
|
-
self.config = ALL_FILE_CONFIGS.get(self.file_id)
|
|
84
|
-
self.is_duplicate = bool(self.config.dupe_of_id) if self.config else False
|
|
97
|
+
self.config = self.config or deepcopy(ALL_FILE_CONFIGS.get(self.file_id))
|
|
85
98
|
|
|
86
|
-
if self
|
|
87
|
-
self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
|
|
88
|
-
|
|
89
|
-
# Coerce FileConfig for court docs etc. to MessageCfg for email files extracted from that document
|
|
90
|
-
if self.class_name() == EMAIL_CLASS and self.config and not isinstance(self.config, EmailCfg):
|
|
91
|
-
self.config = EmailCfg.from_doc_cfg(self.config)
|
|
92
|
-
else:
|
|
99
|
+
if 'url_slug' not in vars(self):
|
|
93
100
|
self.url_slug = self.file_path.stem
|
|
94
101
|
|
|
95
102
|
self._set_computed_fields(text=self.text or self._load_file())
|
|
@@ -97,11 +104,7 @@ class Document:
|
|
|
97
104
|
self._extract_author()
|
|
98
105
|
self.timestamp = self._extract_timestamp()
|
|
99
106
|
|
|
100
|
-
def
|
|
101
|
-
"""Annoying workaround for circular import issues and isinstance()."""
|
|
102
|
-
return str(type(self).__name__)
|
|
103
|
-
|
|
104
|
-
def configured_description(self) -> str | None:
|
|
107
|
+
def config_description(self) -> str | None:
|
|
105
108
|
"""Overloaded in OtherFile."""
|
|
106
109
|
if self.config and self.config.description:
|
|
107
110
|
return f"({self.config.description})"
|
|
@@ -109,40 +112,51 @@ class Document:
|
|
|
109
112
|
def date_str(self) -> str | None:
|
|
110
113
|
return date_str(self.timestamp)
|
|
111
114
|
|
|
112
|
-
def description_panel(self, include_hints: bool = False) -> Panel:
|
|
113
|
-
"""Panelized description() with info_txt(), used in search results."""
|
|
114
|
-
hints = [Text('', style='italic').append(h) for h in (self.hints() if include_hints else [])]
|
|
115
|
-
return Panel(Group(*([self.summary()] + hints)), border_style=self.document_type_style(), expand=False)
|
|
116
|
-
|
|
117
|
-
def document_type_style(self) -> str:
|
|
118
|
-
return DOC_TYPE_STYLES[self.class_name()]
|
|
119
|
-
|
|
120
115
|
def duplicate_file_txt(self) -> Text:
|
|
121
116
|
"""If the file is a dupe make a nice message to explain what file it's a duplicate of."""
|
|
122
|
-
if not self.config or not self.config.dupe_of_id:
|
|
117
|
+
if not self.config or not self.config.dupe_of_id or self.config.dupe_type is None:
|
|
123
118
|
raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")
|
|
124
119
|
|
|
125
|
-
txt = Text(f"Not showing ", style=
|
|
126
|
-
txt.append(f" because it's {self.config.
|
|
120
|
+
txt = Text(f"Not showing ", style=INFO_STYLE).append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
|
|
121
|
+
txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
|
|
127
122
|
return txt.append(epstein_media_doc_link_txt(self.config.dupe_of_id, style='royal_blue1'))
|
|
128
123
|
|
|
129
124
|
def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
130
125
|
"""Create a Text obj link to this document on epsteinify.com."""
|
|
131
|
-
return link_text_obj(epsteinify_doc_url(self.url_slug), link_txt or self.
|
|
126
|
+
return link_text_obj(epsteinify_doc_url(self.url_slug), link_txt or self.file_path.stem, style)
|
|
132
127
|
|
|
133
128
|
def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
134
129
|
"""Create a Text obj link to this document on epstein.media."""
|
|
135
|
-
return link_text_obj(epstein_media_doc_url(self.url_slug), link_txt or self.
|
|
130
|
+
return link_text_obj(epstein_media_doc_url(self.url_slug), link_txt or self.file_path.stem, style)
|
|
136
131
|
|
|
137
132
|
def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
138
133
|
"""Create a Text obj link to this document on EpsteinWeb."""
|
|
139
|
-
return link_text_obj(epstein_web_doc_url(self.url_slug), link_txt or self.
|
|
134
|
+
return link_text_obj(epstein_web_doc_url(self.url_slug), link_txt or self.file_path.stem, style)
|
|
135
|
+
|
|
136
|
+
def external_links(self, style: str = '', include_alt_link: bool = False) -> Text:
|
|
137
|
+
"""Returns colored links to epstein.media and and epsteinweb in a Text object."""
|
|
138
|
+
txt = Text('', style='white' if include_alt_link else ARCHIVE_LINK_COLOR)
|
|
139
|
+
|
|
140
|
+
if args.use_epstein_web:
|
|
141
|
+
txt.append(self.epstein_web_link(style=style))
|
|
142
|
+
|
|
143
|
+
if include_alt_link:
|
|
144
|
+
txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
|
|
145
|
+
txt.append(' (').append(self.epstein_media_link(style='white dim', link_txt=EPSTEIN_MEDIA)).append(')')
|
|
146
|
+
else:
|
|
147
|
+
txt.append(self.epstein_media_link(style=style))
|
|
148
|
+
|
|
149
|
+
if include_alt_link:
|
|
150
|
+
txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
|
|
151
|
+
txt.append(' (').append(self.epstein_web_link(style='white dim', link_txt=EPSTEIN_WEB)).append(')')
|
|
152
|
+
|
|
153
|
+
return txt
|
|
140
154
|
|
|
141
155
|
def file_info_panel(self) -> Group:
|
|
142
|
-
"""Panel with filename linking to raw file plus any
|
|
143
|
-
panel = Panel(self.
|
|
144
|
-
|
|
145
|
-
return Group(*([panel] +
|
|
156
|
+
"""Panel with filename linking to raw file plus any additional info about the file."""
|
|
157
|
+
panel = Panel(self.external_links(include_alt_link=True), border_style=self._border_style(), expand=False)
|
|
158
|
+
padded_info = [Padding(sentence, INFO_PADDING) for sentence in self.info()]
|
|
159
|
+
return Group(*([panel] + padded_info))
|
|
146
160
|
|
|
147
161
|
def file_size(self) -> int:
|
|
148
162
|
return file_size(self.file_path)
|
|
@@ -150,34 +164,33 @@ class Document:
|
|
|
150
164
|
def file_size_str(self) -> str:
|
|
151
165
|
return file_size_str(self.file_path)
|
|
152
166
|
|
|
153
|
-
def
|
|
154
|
-
"""
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
hints.append(highlighter(Text(hint_msg, style='white dim italic')))
|
|
160
|
-
|
|
161
|
-
return without_falsey(hints)
|
|
167
|
+
def info(self) -> list[Text]:
|
|
168
|
+
"""0 to 2 sentences containing the info_txt() as well as any configured description."""
|
|
169
|
+
return without_falsey([
|
|
170
|
+
self.info_txt(),
|
|
171
|
+
highlighter(Text(self.config_description(), style=INFO_STYLE)) if self.config_description() else None
|
|
172
|
+
])
|
|
162
173
|
|
|
163
174
|
def info_txt(self) -> Text | None:
|
|
164
175
|
"""Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
|
|
165
176
|
return None
|
|
166
177
|
|
|
178
|
+
def is_duplicate(self) -> bool:
|
|
179
|
+
return bool(self.config and self.config.dupe_of_id)
|
|
180
|
+
|
|
167
181
|
def is_local_extract_file(self) -> bool:
|
|
168
182
|
"""True if file created by extracting text from a court doc (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
|
|
169
183
|
return is_local_extract_file(self.filename)
|
|
170
184
|
|
|
171
|
-
def log(self, msg: str, level: int = logging.
|
|
185
|
+
def log(self, msg: str, level: int = logging.INFO):
|
|
172
186
|
"""Log with filename as a prefix."""
|
|
173
|
-
logger.log(level, f"{self.
|
|
187
|
+
logger.log(level, f"{self.file_path.stem} {msg}")
|
|
174
188
|
|
|
175
189
|
def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
|
|
176
190
|
"""Log first 'n' lines of self.text at 'level'. 'msg' can be optionally provided."""
|
|
177
191
|
separator = '\n\n' if '\n' in msg else '. '
|
|
178
192
|
msg = (msg + separator) if msg else ''
|
|
179
|
-
|
|
180
|
-
logger.log(level, f"{msg}\n\n{self.top_lines(n)}\n")
|
|
193
|
+
self.log(f"{msg}First {n} lines:\n\n{self.top_lines(n)}\n", level)
|
|
181
194
|
|
|
182
195
|
def matching_lines(self, _pattern: re.Pattern | str) -> list[MatchedLine]:
|
|
183
196
|
"""Return lines matching a regex as colored list[Text]."""
|
|
@@ -189,13 +202,13 @@ class Document:
|
|
|
189
202
|
metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
|
|
190
203
|
metadata['bytes'] = self.file_size()
|
|
191
204
|
metadata['filename'] = f"{self.url_slug}.txt"
|
|
192
|
-
metadata['type'] = self.
|
|
205
|
+
metadata['type'] = self._class_name()
|
|
193
206
|
|
|
194
207
|
if self.is_local_extract_file():
|
|
195
208
|
metadata['extracted_file'] = {
|
|
196
|
-
'explanation': '
|
|
197
|
-
'
|
|
198
|
-
'
|
|
209
|
+
'explanation': 'Manually extracted from one of the court filings.',
|
|
210
|
+
'extracted_from': self.url_slug + '.txt',
|
|
211
|
+
'url': extracted_file_url(self.filename),
|
|
199
212
|
}
|
|
200
213
|
|
|
201
214
|
return metadata
|
|
@@ -204,25 +217,6 @@ class Document:
|
|
|
204
217
|
with open(self.file_path) as f:
|
|
205
218
|
return f.read()
|
|
206
219
|
|
|
207
|
-
def raw_document_link_txt(self, style: str = '', include_alt_link: bool = False) -> Text:
|
|
208
|
-
"""Returns colored links to epstein.media and and epsteinweb in a Text object."""
|
|
209
|
-
txt = Text('', style='white' if include_alt_link else ARCHIVE_LINK_COLOR)
|
|
210
|
-
|
|
211
|
-
if args.use_epstein_web_links:
|
|
212
|
-
txt.append(self.epstein_web_link(style=style))
|
|
213
|
-
|
|
214
|
-
if include_alt_link:
|
|
215
|
-
txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
|
|
216
|
-
txt.append(' (').append(self.epstein_media_link(style='white dim', link_txt=EPSTEIN_MEDIA)).append(')')
|
|
217
|
-
else:
|
|
218
|
-
txt.append(self.epstein_media_link(style=style))
|
|
219
|
-
|
|
220
|
-
if include_alt_link:
|
|
221
|
-
txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
|
|
222
|
-
txt.append(' (').append(self.epstein_web_link(style='white dim', link_txt=EPSTEIN_WEB)).append(')')
|
|
223
|
-
|
|
224
|
-
return txt
|
|
225
|
-
|
|
226
220
|
def repair_ocr_text(self, repairs: dict[str | re.Pattern, str], text: str) -> str:
|
|
227
221
|
"""Apply a dict of repairs (key is pattern or string, value is replacement string) to text."""
|
|
228
222
|
for k, v in repairs.items():
|
|
@@ -234,7 +228,7 @@ class Document:
|
|
|
234
228
|
return text
|
|
235
229
|
|
|
236
230
|
def sort_key(self) -> tuple[datetime, str, int]:
|
|
237
|
-
if self.
|
|
231
|
+
if self.is_duplicate():
|
|
238
232
|
sort_id = self.config.dupe_of_id
|
|
239
233
|
dupe_idx = 1
|
|
240
234
|
else:
|
|
@@ -245,11 +239,11 @@ class Document:
|
|
|
245
239
|
|
|
246
240
|
def summary(self) -> Text:
|
|
247
241
|
"""Summary of this file for logging. Brackets are left open for subclasses to add stuff."""
|
|
248
|
-
txt = Text('').append(self.
|
|
242
|
+
txt = Text('').append(self._class_name(), style=self._class_style())
|
|
249
243
|
txt.append(f" {self.url_slug}", style=FILENAME_STYLE)
|
|
250
244
|
|
|
251
245
|
if self.timestamp:
|
|
252
|
-
timestamp_str =
|
|
246
|
+
timestamp_str = remove_time_from_timestamp_str(self.timestamp)
|
|
253
247
|
txt.append(' (', style=SYMBOL_STYLE)
|
|
254
248
|
txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
|
|
255
249
|
|
|
@@ -261,13 +255,32 @@ class Document:
|
|
|
261
255
|
|
|
262
256
|
return txt
|
|
263
257
|
|
|
258
|
+
def summary_panel(self) -> Panel:
|
|
259
|
+
"""Panelized description() with info_txt(), used in search results."""
|
|
260
|
+
sentences = [self.summary()]
|
|
261
|
+
|
|
262
|
+
if self.include_description_in_summary_panel:
|
|
263
|
+
sentences += [Text('', style='italic').append(h) for h in self.info()]
|
|
264
|
+
|
|
265
|
+
return Panel(Group(*sentences), border_style=self._class_style(), expand=False)
|
|
266
|
+
|
|
264
267
|
def top_lines(self, n: int = 10) -> str:
|
|
265
268
|
return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
|
|
266
269
|
|
|
270
|
+
def warn(self, msg: str) -> None:
|
|
271
|
+
self.log(msg, level=logging.WARNING)
|
|
272
|
+
|
|
267
273
|
def _border_style(self) -> str:
|
|
268
274
|
"""Should be overloaded in subclasses."""
|
|
269
275
|
return 'white'
|
|
270
276
|
|
|
277
|
+
def _class_name(self) -> str:
|
|
278
|
+
"""Annoying workaround for circular import issues and isinstance()."""
|
|
279
|
+
return str(type(self).__name__)
|
|
280
|
+
|
|
281
|
+
def _class_style(self) -> str:
|
|
282
|
+
return DOC_TYPE_STYLES[self._class_name()]
|
|
283
|
+
|
|
271
284
|
def _extract_author(self) -> None:
|
|
272
285
|
"""Get author from config. Extended in Email subclass to also check headers."""
|
|
273
286
|
if self.config and self.config.author:
|
epstein_files/documents/email.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import re
|
|
3
|
+
from copy import deepcopy
|
|
3
4
|
from dataclasses import asdict, dataclass, field
|
|
4
5
|
from datetime import datetime
|
|
5
6
|
from typing import ClassVar, cast
|
|
@@ -21,6 +22,7 @@ from epstein_files.util.constants import *
|
|
|
21
22
|
from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
|
|
22
23
|
flatten, remove_timezone, uniquify)
|
|
23
24
|
from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
25
|
+
from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
|
|
24
26
|
from epstein_files.util.highlighted_group import get_style_for_name
|
|
25
27
|
from epstein_files.util.logging import logger
|
|
26
28
|
from epstein_files.util.rich import *
|
|
@@ -35,9 +37,11 @@ REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGN
|
|
|
35
37
|
BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
|
|
36
38
|
DATE_HEADER_REGEX = re.compile(r'(?:Date|Sent):? +(?!by|from|to|via)([^\n]{6,})\n')
|
|
37
39
|
TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")
|
|
40
|
+
LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
|
|
38
41
|
|
|
39
42
|
SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
|
|
40
43
|
REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
|
|
44
|
+
APPEARS_IN = 'Appears in'
|
|
41
45
|
MAX_CHARS_TO_PRINT = 4000
|
|
42
46
|
MAX_NUM_HEADER_LINES = 14
|
|
43
47
|
MAX_QUOTED_REPLIES = 2
|
|
@@ -248,6 +252,7 @@ KRASSNER_RECIPIENTS = uniquify(flatten([ALL_FILE_CONFIGS[id].recipients for id i
|
|
|
248
252
|
|
|
249
253
|
# No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
|
|
250
254
|
USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIPIENTS + [
|
|
255
|
+
'Alan Dlugash', # CCed with Richard Kahn
|
|
251
256
|
'Alan Rogers', # Random CC
|
|
252
257
|
'Andrew Friendly', # Presumably some relation of Kelly Friendly
|
|
253
258
|
'BS Stern', # A random fwd of email we have
|
|
@@ -264,6 +269,8 @@ USELESS_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + KRASSNER_RECIP
|
|
|
264
269
|
'Lyn Fontanilla', # Random CC
|
|
265
270
|
'Mark Albert', # Random CC
|
|
266
271
|
'Matthew Schafer', # Random CC
|
|
272
|
+
MICHAEL_BUCHHOLTZ, # Terry Kafka CC
|
|
273
|
+
'Nancy Dahl', # covered by Lawrence Krauss (her husband)
|
|
267
274
|
'Michael Simmons', # Random CC
|
|
268
275
|
'Nancy Portland', # Lawrence Krauss CC
|
|
269
276
|
'Oliver Goodenough', # Robert Trivers CC
|
|
@@ -318,6 +325,17 @@ class Email(Communication):
|
|
|
318
325
|
rewritten_header_ids: ClassVar[set[str]] = set([])
|
|
319
326
|
|
|
320
327
|
def __post_init__(self):
|
|
328
|
+
self.filename = self.file_path.name
|
|
329
|
+
self.file_id = extract_file_id(self.filename)
|
|
330
|
+
|
|
331
|
+
# Special handling for copying properties out of the config for the document this one was extracted from
|
|
332
|
+
if self.is_local_extract_file():
|
|
333
|
+
self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
|
|
334
|
+
extracted_from_doc_id = self.url_slug.split('_')[-1]
|
|
335
|
+
|
|
336
|
+
if extracted_from_doc_id in ALL_FILE_CONFIGS:
|
|
337
|
+
self._set_config_for_extracted_file(ALL_FILE_CONFIGS[extracted_from_doc_id])
|
|
338
|
+
|
|
321
339
|
super().__post_init__()
|
|
322
340
|
|
|
323
341
|
try:
|
|
@@ -570,7 +588,7 @@ class Email(Communication):
|
|
|
570
588
|
self._merge_lines(3) # Merge 4th and 5th rows
|
|
571
589
|
elif self.file_id in '026609 029402 032405 022695'.split():
|
|
572
590
|
self._merge_lines(4) # Merge 5th and 6th rows
|
|
573
|
-
elif self.file_id in ['019407', '031980', '030384', '033144', '030999', '033575', '029835', '030381']:
|
|
591
|
+
elif self.file_id in ['019407', '031980', '030384', '033144', '030999', '033575', '029835', '030381', '033357']:
|
|
574
592
|
self._merge_lines(2, 4)
|
|
575
593
|
elif self.file_id in ['029154', '029163']:
|
|
576
594
|
self._merge_lines(2, 5)
|
|
@@ -591,6 +609,10 @@ class Email(Communication):
|
|
|
591
609
|
self._merge_lines(7, 9)
|
|
592
610
|
elif self.file_id == '030299':
|
|
593
611
|
self._merge_lines(7, 10)
|
|
612
|
+
elif self.file_id == '014860':
|
|
613
|
+
self._merge_lines(3)
|
|
614
|
+
self._merge_lines(4)
|
|
615
|
+
self._merge_lines(4)
|
|
594
616
|
elif self.file_id == '029977':
|
|
595
617
|
self._set_computed_fields(text=self.text.replace('Sent 9/28/2012 2:41:02 PM', 'Sent: 9/28/2012 2:41:02 PM'))
|
|
596
618
|
|
|
@@ -606,9 +628,8 @@ class Email(Communication):
|
|
|
606
628
|
self._remove_line(3)
|
|
607
629
|
|
|
608
630
|
if old_text != self.text:
|
|
609
|
-
self.log(f"Modified text, old:\n\n" + '\n'.join(old_text.split('\n')[0:12]) + '\n'
|
|
610
|
-
self.log_top_lines(12, 'Result of modifications'
|
|
611
|
-
self.log('', logging.INFO)
|
|
631
|
+
self.log(f"Modified text, old:\n\n" + '\n'.join(old_text.split('\n')[0:12]) + '\n')
|
|
632
|
+
self.log_top_lines(12, 'Result of modifications')
|
|
612
633
|
|
|
613
634
|
lines = self.repair_ocr_text(OCR_REPAIRS, self.text).split('\n')
|
|
614
635
|
new_lines = []
|
|
@@ -646,6 +667,27 @@ class Email(Communication):
|
|
|
646
667
|
sent_from = sent_from_match.group(0)
|
|
647
668
|
return 'S' + sent_from[1:] if sent_from.startswith('sent') else sent_from
|
|
648
669
|
|
|
670
|
+
def _set_config_for_extracted_file(self, extracted_from_doc_cfg: DocCfg) -> None:
|
|
671
|
+
"""Copy info from original config for file this document was extracted from."""
|
|
672
|
+
if self.file_id in ALL_FILE_CONFIGS:
|
|
673
|
+
self.config = cast(EmailCfg, deepcopy(ALL_FILE_CONFIGS[self.file_id]))
|
|
674
|
+
self.warn(f"Merging existing config for {self.file_id} with config for file this document was extracted from")
|
|
675
|
+
else:
|
|
676
|
+
self.config = EmailCfg(id=self.file_id)
|
|
677
|
+
|
|
678
|
+
extracted_from_description = extracted_from_doc_cfg.complete_description()
|
|
679
|
+
|
|
680
|
+
if extracted_from_description:
|
|
681
|
+
extracted_description = f"{APPEARS_IN} {extracted_from_description}"
|
|
682
|
+
|
|
683
|
+
if self.config.description:
|
|
684
|
+
self.warn(f"Overwriting description '{self.config.description}' with extract description '{self.config.description}'")
|
|
685
|
+
|
|
686
|
+
self.config.description = extracted_description
|
|
687
|
+
|
|
688
|
+
self.config.is_interesting = self.config.is_interesting or extracted_from_doc_cfg.is_interesting
|
|
689
|
+
self.warn(f"Constructed synthetic config: {self.config}")
|
|
690
|
+
|
|
649
691
|
def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
|
|
650
692
|
logger.debug(f"Printing '{self.filename}'...")
|
|
651
693
|
yield self.file_info_panel()
|
|
@@ -697,7 +739,7 @@ class Email(Communication):
|
|
|
697
739
|
yield Padding(email_txt_panel, (0, 0, 1, INFO_INDENT))
|
|
698
740
|
|
|
699
741
|
if should_rewrite_header:
|
|
700
|
-
self.log_top_lines(self.header.num_header_rows + 4, f'Original header:'
|
|
742
|
+
self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')
|
|
701
743
|
|
|
702
744
|
@staticmethod
|
|
703
745
|
def build_table(emails: list['Email'], _author: str | None) -> Table:
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import re
|
|
2
|
-
from dataclasses import dataclass
|
|
2
|
+
from dataclasses import dataclass
|
|
3
3
|
from datetime import datetime
|
|
4
4
|
|
|
5
5
|
from rich.text import Text
|
|
6
6
|
|
|
7
|
-
from epstein_files.util.constant.names import JEFFREY_EPSTEIN,
|
|
7
|
+
from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN
|
|
8
8
|
from epstein_files.util.data import extract_last_name
|
|
9
9
|
from epstein_files.util.highlighted_group import get_style_for_name
|
|
10
10
|
from epstein_files.util.logging import logger
|
|
@@ -19,15 +19,6 @@ DISPLAY_LAST_NAME_ONLY = [
|
|
|
19
19
|
STEVE_BANNON,
|
|
20
20
|
]
|
|
21
21
|
|
|
22
|
-
PHONE_NUMBER_MAPPING = {
|
|
23
|
-
'+19174393646': ANTHONY_SCARAMUCCI,
|
|
24
|
-
'+13109906526': STEVE_BANNON,
|
|
25
|
-
'+16463880059': EVA,
|
|
26
|
-
'+13108737937': CELINA_DUBIN,
|
|
27
|
-
'+13108802851': STEVE_BANNON,
|
|
28
|
-
|
|
29
|
-
}
|
|
30
|
-
|
|
31
22
|
TEXTER_MAPPING = {
|
|
32
23
|
'e:': JEFFREY_EPSTEIN,
|
|
33
24
|
'e:jeeitunes@gmail.com': JEFFREY_EPSTEIN,
|
|
@@ -48,13 +39,13 @@ class TextMessage:
|
|
|
48
39
|
|
|
49
40
|
if self.author is None:
|
|
50
41
|
self.author_str = UNKNOWN
|
|
51
|
-
elif self.author in DISPLAY_LAST_NAME_ONLY:
|
|
42
|
+
elif self.author in DISPLAY_LAST_NAME_ONLY and not self.author_str:
|
|
52
43
|
self.author_str = extract_last_name(self.author)
|
|
53
44
|
else:
|
|
54
45
|
self.author_str = self.author_str or self.author
|
|
55
46
|
|
|
56
47
|
if not self.id_confirmed and self.author is not None and self.author != JEFFREY_EPSTEIN:
|
|
57
|
-
self.author_str
|
|
48
|
+
self.author_str += ' (?)'
|
|
58
49
|
|
|
59
50
|
def timestamp(self) -> datetime:
|
|
60
51
|
return datetime.strptime(self.timestamp_str, MSG_DATE_FORMAT)
|
|
@@ -8,11 +8,23 @@ from rich.text import Text
|
|
|
8
8
|
|
|
9
9
|
from epstein_files.documents.other_file import OtherFile
|
|
10
10
|
from epstein_files.util.constant.strings import JSON
|
|
11
|
+
from epstein_files.util.rich import INFO_STYLE
|
|
12
|
+
|
|
13
|
+
TEXT_FIELDS = [
|
|
14
|
+
'caption',
|
|
15
|
+
'standard',
|
|
16
|
+
'subtitle',
|
|
17
|
+
'text',
|
|
18
|
+
'title',
|
|
19
|
+
'to',
|
|
20
|
+
]
|
|
11
21
|
|
|
12
22
|
|
|
13
23
|
@dataclass
|
|
14
24
|
class JsonFile(OtherFile):
|
|
15
25
|
"""File containing JSON data."""
|
|
26
|
+
|
|
27
|
+
include_description_in_summary_panel: ClassVar[bool] = False
|
|
16
28
|
strip_whitespace: ClassVar[bool] = False
|
|
17
29
|
|
|
18
30
|
def __post_init__(self):
|
|
@@ -27,7 +39,7 @@ class JsonFile(OtherFile):
|
|
|
27
39
|
return JSON
|
|
28
40
|
|
|
29
41
|
def info_txt(self) -> Text | None:
|
|
30
|
-
return Text(f"JSON file,
|
|
42
|
+
return Text(f"JSON file, contains preview data for links sent a messaging app", style=INFO_STYLE)
|
|
31
43
|
|
|
32
44
|
def is_interesting(self):
|
|
33
45
|
return False
|