epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- epstein_files/__init__.py +55 -23
- epstein_files/documents/communication.py +9 -5
- epstein_files/documents/document.py +231 -135
- epstein_files/documents/doj_file.py +242 -0
- epstein_files/documents/doj_files/full_text.py +166 -0
- epstein_files/documents/email.py +289 -232
- epstein_files/documents/emails/email_header.py +35 -16
- epstein_files/documents/emails/emailers.py +223 -0
- epstein_files/documents/imessage/text_message.py +2 -3
- epstein_files/documents/json_file.py +18 -14
- epstein_files/documents/messenger_log.py +23 -39
- epstein_files/documents/other_file.py +54 -48
- epstein_files/epstein_files.py +65 -29
- epstein_files/person.py +151 -94
- epstein_files/util/constant/names.py +37 -10
- epstein_files/util/constant/output_files.py +2 -0
- epstein_files/util/constant/strings.py +14 -7
- epstein_files/util/constant/urls.py +17 -0
- epstein_files/util/constants.py +556 -391
- epstein_files/util/data.py +2 -0
- epstein_files/util/doc_cfg.py +44 -33
- epstein_files/util/env.py +34 -19
- epstein_files/util/file_helper.py +30 -6
- epstein_files/util/helpers/debugging_helper.py +13 -0
- epstein_files/util/helpers/env_helpers.py +21 -0
- epstein_files/util/highlighted_group.py +121 -37
- epstein_files/util/layout/left_bar_panel.py +26 -0
- epstein_files/util/logging.py +28 -13
- epstein_files/util/output.py +49 -40
- epstein_files/util/rich.py +30 -3
- epstein_files/util/word_count.py +7 -7
- {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/METADATA +16 -3
- epstein_files-1.5.0.dist-info/RECORD +40 -0
- {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +1 -1
- epstein_files-1.2.5.dist-info/RECORD +0 -34
- {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
- {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
|
@@ -30,7 +30,6 @@ MAX_DAYS_SPANNED_TO_BE_VALID = 10
|
|
|
30
30
|
MAX_EXTRACTED_TIMESTAMPS = 100
|
|
31
31
|
MIN_TIMESTAMP = datetime(2000, 1, 1)
|
|
32
32
|
MID_TIMESTAMP = datetime(2007, 1, 1)
|
|
33
|
-
MAX_TIMESTAMP = datetime(2022, 12, 31)
|
|
34
33
|
PREVIEW_CHARS = int(580 * (1 if args.all_other_files else 1.5))
|
|
35
34
|
LOG_INDENT = '\n '
|
|
36
35
|
TIMESTAMP_LOG_INDENT = f'{LOG_INDENT} '
|
|
@@ -93,40 +92,28 @@ class OtherFile(Document):
|
|
|
93
92
|
"""
|
|
94
93
|
was_timestamp_extracted: bool = False
|
|
95
94
|
include_description_in_summary_panel: ClassVar[bool] = True # Class var for logging output
|
|
95
|
+
max_timestamp: ClassVar[datetime] = datetime(2022, 12, 31) # Overloaded in DojFile
|
|
96
96
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
if self.config
|
|
101
|
-
self.
|
|
102
|
-
self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
|
|
97
|
+
@property
|
|
98
|
+
def config_description(self) -> str | None:
|
|
99
|
+
"""Overloads superclass property."""
|
|
100
|
+
if self.config and self.config.description:
|
|
101
|
+
return self.config.complete_description
|
|
103
102
|
|
|
103
|
+
@property
|
|
104
104
|
def category(self) -> str | None:
|
|
105
105
|
return self.config and self.config.category
|
|
106
106
|
|
|
107
|
+
@property
|
|
107
108
|
def category_txt(self) -> Text | None:
|
|
108
|
-
return styled_category(self.category
|
|
109
|
-
|
|
110
|
-
def config_description(self) -> str | None:
|
|
111
|
-
"""Overloads superclass method."""
|
|
112
|
-
if self.config is not None:
|
|
113
|
-
return self.config.complete_description()
|
|
114
|
-
|
|
115
|
-
def highlighted_preview_text(self) -> Text:
|
|
116
|
-
try:
|
|
117
|
-
return highlighter(escape(self.preview_text()))
|
|
118
|
-
except Exception as e:
|
|
119
|
-
logger.error(f"Failed to apply markup in string '{escape_single_quotes(self.preview_text())}'\n"
|
|
120
|
-
f"Original string: '{escape_single_quotes(self.preview_text())}'\n"
|
|
121
|
-
f"File: '{self.filename}'\n")
|
|
122
|
-
|
|
123
|
-
return Text(escape(self.preview_text()))
|
|
109
|
+
return styled_category(self.category)
|
|
124
110
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
111
|
+
@property
|
|
112
|
+
def is_interesting(self) -> bool:
|
|
113
|
+
"""Overloaded. False for lame prefixes, duplicates, and other boring files."""
|
|
114
|
+
info_sentences = self.info
|
|
128
115
|
|
|
129
|
-
if self.is_duplicate
|
|
116
|
+
if self.is_duplicate:
|
|
130
117
|
return False
|
|
131
118
|
elif len(info_sentences) == 0:
|
|
132
119
|
return True
|
|
@@ -135,9 +122,9 @@ class OtherFile(Document):
|
|
|
135
122
|
return self.config.is_interesting
|
|
136
123
|
elif self.config.author in INTERESTING_AUTHORS:
|
|
137
124
|
return True
|
|
138
|
-
elif self.category
|
|
125
|
+
elif self.category == FINANCE and self.author is not None:
|
|
139
126
|
return False
|
|
140
|
-
elif self.category
|
|
127
|
+
elif self.category in UNINTERESTING_CATEGORIES:
|
|
141
128
|
return False
|
|
142
129
|
|
|
143
130
|
for prefix in UNINTERESTING_PREFIXES:
|
|
@@ -146,15 +133,33 @@ class OtherFile(Document):
|
|
|
146
133
|
|
|
147
134
|
return True
|
|
148
135
|
|
|
136
|
+
@property
|
|
149
137
|
def metadata(self) -> Metadata:
|
|
150
|
-
metadata = super().metadata
|
|
151
|
-
metadata['is_interesting'] = self.is_interesting
|
|
138
|
+
metadata = super().metadata
|
|
139
|
+
metadata['is_interesting'] = self.is_interesting
|
|
152
140
|
|
|
153
141
|
if self.was_timestamp_extracted:
|
|
154
142
|
metadata['was_timestamp_extracted'] = self.was_timestamp_extracted
|
|
155
143
|
|
|
156
144
|
return metadata
|
|
157
145
|
|
|
146
|
+
def __post_init__(self):
|
|
147
|
+
super().__post_init__()
|
|
148
|
+
|
|
149
|
+
if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
|
|
150
|
+
self.log(f"Creating synthetic config for VI Daily News article...")
|
|
151
|
+
self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
|
|
152
|
+
|
|
153
|
+
def highlighted_preview_text(self) -> Text:
|
|
154
|
+
try:
|
|
155
|
+
return highlighter(escape(self.preview_text()))
|
|
156
|
+
except Exception as e:
|
|
157
|
+
logger.error(f"Failed to apply markup in string '{escape_single_quotes(self.preview_text())}'\n"
|
|
158
|
+
f"Original string: '{escape_single_quotes(self.preview_text())}'\n"
|
|
159
|
+
f"File: '{self.filename}'\n")
|
|
160
|
+
|
|
161
|
+
return Text(escape(self.preview_text()))
|
|
162
|
+
|
|
158
163
|
def preview_text(self) -> str:
|
|
159
164
|
return WHITESPACE_REGEX.sub(' ', self.text)[0:PREVIEW_CHARS]
|
|
160
165
|
|
|
@@ -164,9 +169,7 @@ class OtherFile(Document):
|
|
|
164
169
|
|
|
165
170
|
def _extract_timestamp(self) -> datetime | None:
|
|
166
171
|
"""Return configured timestamp or value extracted by scanning text with datefinder."""
|
|
167
|
-
if self.config and self.
|
|
168
|
-
return self.config.timestamp
|
|
169
|
-
elif self.config and any([s in (self.config_description() or '') for s in SKIP_TIMESTAMP_EXTRACT]):
|
|
172
|
+
if self.config and any([s in (self.config_description or '') for s in SKIP_TIMESTAMP_EXTRACT]):
|
|
170
173
|
return None
|
|
171
174
|
|
|
172
175
|
timestamps: list[datetime] = []
|
|
@@ -175,10 +178,11 @@ class OtherFile(Document):
|
|
|
175
178
|
warnings.filterwarnings("ignore", module="dateutil")
|
|
176
179
|
|
|
177
180
|
try:
|
|
178
|
-
|
|
181
|
+
# TODO: datefinder.find_dates() cannot find 08/29/2019 style e.g. in EFTA00005783 :(
|
|
182
|
+
for timestamp in datefinder.find_dates(self.text, strict=False):
|
|
179
183
|
timestamp = remove_timezone(timestamp)
|
|
180
184
|
|
|
181
|
-
if MIN_TIMESTAMP < timestamp <
|
|
185
|
+
if MIN_TIMESTAMP < timestamp < self.max_timestamp:
|
|
182
186
|
timestamps.append(timestamp)
|
|
183
187
|
|
|
184
188
|
if len(timestamps) >= MAX_EXTRACTED_TIMESTAMPS:
|
|
@@ -187,7 +191,7 @@ class OtherFile(Document):
|
|
|
187
191
|
self.warn(f"Error while iterating through datefinder.find_dates(): {e}")
|
|
188
192
|
|
|
189
193
|
if len(timestamps) == 0:
|
|
190
|
-
if not (self.is_duplicate
|
|
194
|
+
if not (self.is_duplicate or VAST_HOUSE in self.text):
|
|
191
195
|
self.log_top_lines(15, msg=f"No timestamps found")
|
|
192
196
|
|
|
193
197
|
return None
|
|
@@ -210,9 +214,10 @@ class OtherFile(Document):
|
|
|
210
214
|
self.log_top_lines(15, msg=timestamps_log_msg, level=logging.DEBUG)
|
|
211
215
|
|
|
212
216
|
@classmethod
|
|
213
|
-
def files_preview_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
|
|
217
|
+
def files_preview_table(cls, files: Sequence['OtherFile'], title_pfx: str = '', title: str = '') -> Table:
|
|
214
218
|
"""Build a table of OtherFile documents."""
|
|
215
|
-
|
|
219
|
+
title = title or f'{title_pfx}Other Files Details in Chronological Order'
|
|
220
|
+
table = build_table(title, show_lines=True, title_justify='left' if title else 'center')
|
|
216
221
|
table.add_column('File', justify='center', width=FILENAME_LENGTH)
|
|
217
222
|
table.add_column('Date', justify='center')
|
|
218
223
|
table.add_column('Size', justify='right', style='dim')
|
|
@@ -221,21 +226,21 @@ class OtherFile(Document):
|
|
|
221
226
|
|
|
222
227
|
for file in files:
|
|
223
228
|
link_and_info = [file.external_links_txt()]
|
|
224
|
-
date_str = file.date_str
|
|
229
|
+
date_str = file.date_str
|
|
225
230
|
|
|
226
|
-
if file.is_duplicate
|
|
227
|
-
preview_text = file.duplicate_file_txt
|
|
231
|
+
if file.is_duplicate:
|
|
232
|
+
preview_text = file.duplicate_file_txt
|
|
228
233
|
row_style = ' dim'
|
|
229
234
|
else:
|
|
230
|
-
link_and_info += file.info
|
|
235
|
+
link_and_info += file.info
|
|
231
236
|
preview_text = file.highlighted_preview_text()
|
|
232
237
|
row_style = ''
|
|
233
238
|
|
|
234
239
|
table.add_row(
|
|
235
240
|
Group(*link_and_info),
|
|
236
241
|
Text(date_str, style=TIMESTAMP_STYLE) if date_str else QUESTION_MARKS_TXT,
|
|
237
|
-
file.file_size_str
|
|
238
|
-
file.category_txt
|
|
242
|
+
file.file_size_str,
|
|
243
|
+
file.category_txt,
|
|
239
244
|
preview_text,
|
|
240
245
|
style=row_style
|
|
241
246
|
)
|
|
@@ -244,12 +249,13 @@ class OtherFile(Document):
|
|
|
244
249
|
|
|
245
250
|
@classmethod
|
|
246
251
|
def summary_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
|
|
247
|
-
|
|
248
|
-
categories =
|
|
252
|
+
"""Table showing file count by category."""
|
|
253
|
+
categories = uniquify([f.category for f in files])
|
|
254
|
+
categories = sorted(categories, key=lambda c: -len([f for f in files if f.category == c]))
|
|
249
255
|
table = cls.file_info_table(f'{title_pfx}Other Files Summary', 'Category')
|
|
250
256
|
|
|
251
257
|
for category in categories:
|
|
252
|
-
category_files = [f for f in files if f.category
|
|
258
|
+
category_files = [f for f in files if f.category == category]
|
|
253
259
|
table.add_row(styled_category(category), *cls.files_info_row(category_files))
|
|
254
260
|
|
|
255
261
|
table.columns = table.columns[:-2] + [table.columns[-1]] # Remove unknown author col
|
epstein_files/epstein_files.py
CHANGED
|
@@ -12,7 +12,8 @@ from typing import Sequence, Type, cast
|
|
|
12
12
|
from rich.table import Table
|
|
13
13
|
|
|
14
14
|
from epstein_files.documents.document import Document
|
|
15
|
-
from epstein_files.documents.
|
|
15
|
+
from epstein_files.documents.doj_file import DojFile
|
|
16
|
+
from epstein_files.documents.email import Email
|
|
16
17
|
from epstein_files.documents.json_file import JsonFile
|
|
17
18
|
from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
|
|
18
19
|
from epstein_files.documents.other_file import OtherFile
|
|
@@ -21,7 +22,7 @@ from epstein_files.util.constant.strings import *
|
|
|
21
22
|
from epstein_files.util.constants import *
|
|
22
23
|
from epstein_files.util.data import flatten, json_safe, listify, uniquify
|
|
23
24
|
from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
24
|
-
from epstein_files.util.env import DOCS_DIR, args, logger
|
|
25
|
+
from epstein_files.util.env import DOCS_DIR, DOJ_PDFS_20260130_DIR, args, logger
|
|
25
26
|
from epstein_files.util.file_helper import file_size_str
|
|
26
27
|
from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames
|
|
27
28
|
from epstein_files.util.search_result import SearchResult
|
|
@@ -49,14 +50,28 @@ class EpsteinFiles:
|
|
|
49
50
|
imessage_logs: list[MessengerLog] = field(default_factory=list)
|
|
50
51
|
json_files: list[JsonFile] = field(default_factory=list)
|
|
51
52
|
other_files: list[OtherFile] = field(default_factory=list)
|
|
53
|
+
doj_files: list[DojFile] = field(default_factory=list)
|
|
52
54
|
timer: Timer = field(default_factory=lambda: Timer())
|
|
53
55
|
uninteresting_ccs: list[Name] = field(default_factory=list)
|
|
54
56
|
|
|
57
|
+
@property
|
|
58
|
+
def all_documents(self) -> Sequence[Document]:
|
|
59
|
+
return self.imessage_logs + self.emails + self.other_files + self.doj_files
|
|
60
|
+
|
|
61
|
+
@property
|
|
62
|
+
def all_doj_files(self) -> Sequence[DojFile | Email]:
|
|
63
|
+
"""All files with the filename EFTAXXXXXX."""
|
|
64
|
+
return [doc for doc in self.all_documents if doc.is_doj_file]
|
|
65
|
+
|
|
55
66
|
def __post_init__(self):
|
|
56
67
|
"""Iterate through files and build appropriate objects."""
|
|
57
68
|
self.all_files = sorted([f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')])
|
|
58
|
-
|
|
59
|
-
|
|
69
|
+
|
|
70
|
+
if DOJ_PDFS_20260130_DIR:
|
|
71
|
+
self.all_files += sorted([f for f in DOJ_PDFS_20260130_DIR.glob('**/*.txt')])
|
|
72
|
+
|
|
73
|
+
docs = []
|
|
74
|
+
file_type_count = defaultdict(int) # Hack used by --skip-other-files option to get a few files parsed before skipping the rest
|
|
60
75
|
|
|
61
76
|
# Read through and classify all the files
|
|
62
77
|
for file_arg in self.all_files:
|
|
@@ -64,26 +79,28 @@ class EpsteinFiles:
|
|
|
64
79
|
document = Document(file_arg)
|
|
65
80
|
cls = document_cls(document)
|
|
66
81
|
|
|
67
|
-
if document.length
|
|
82
|
+
if document.length == 0:
|
|
68
83
|
logger.warning(f"Skipping empty file: {document}]")
|
|
69
84
|
continue
|
|
70
85
|
elif args.skip_other_files and cls == OtherFile and file_type_count[cls.__name__] > 1:
|
|
71
86
|
document.log(f"Skipping OtherFile...")
|
|
72
87
|
continue
|
|
73
88
|
|
|
74
|
-
|
|
75
|
-
logger.info(str(
|
|
89
|
+
docs.append(cls(file_arg, lines=document.lines, text=document.text).printable_document())
|
|
90
|
+
logger.info(str(docs[-1]))
|
|
76
91
|
file_type_count[cls.__name__] += 1
|
|
77
92
|
|
|
78
93
|
if doc_timer.seconds_since_start() > SLOW_FILE_SECONDS:
|
|
79
|
-
doc_timer.print_at_checkpoint(f"Slow file: {
|
|
94
|
+
doc_timer.print_at_checkpoint(f"Slow file: {docs[-1]} processed")
|
|
80
95
|
|
|
81
|
-
self.
|
|
82
|
-
self.
|
|
83
|
-
self.
|
|
84
|
-
self.json_files = [
|
|
96
|
+
self.doj_files = Document.sort_by_timestamp([d for d in docs if isinstance(d, DojFile)])
|
|
97
|
+
self.emails = Document.sort_by_timestamp([d for d in docs if isinstance(d, Email)])
|
|
98
|
+
self.imessage_logs = Document.sort_by_timestamp([d for d in docs if isinstance(d, MessengerLog)])
|
|
99
|
+
self.json_files = Document.sort_by_timestamp([d for d in docs if isinstance(d, JsonFile)])
|
|
100
|
+
self.other_files = Document.sort_by_timestamp([d for d in docs if isinstance(d, OtherFile) and not isinstance(d, DojFile)])
|
|
85
101
|
self._set_uninteresting_ccs()
|
|
86
102
|
self._copy_duplicate_email_properties()
|
|
103
|
+
self._find_email_attachments_and_set_is_first_for_user()
|
|
87
104
|
|
|
88
105
|
@classmethod
|
|
89
106
|
def get_files(cls, timer: Timer | None = None) -> 'EpsteinFiles':
|
|
@@ -110,19 +127,19 @@ class EpsteinFiles:
|
|
|
110
127
|
timer.print_at_checkpoint(f'Processed {len(epstein_files.all_files):,} documents')
|
|
111
128
|
return epstein_files
|
|
112
129
|
|
|
113
|
-
def all_documents(self) -> Sequence[Document]:
|
|
114
|
-
return self.imessage_logs + self.emails + self.other_files
|
|
115
|
-
|
|
116
130
|
def docs_matching(self, pattern: re.Pattern | str, names: list[Name] | None = None) -> list[SearchResult]:
|
|
117
131
|
"""Find documents whose text matches a pattern (file_type and names args limit the documents searched)."""
|
|
118
132
|
results: list[SearchResult] = []
|
|
119
133
|
|
|
120
|
-
for doc in self.all_documents
|
|
134
|
+
for doc in self.all_documents:
|
|
121
135
|
if names and doc.author not in names:
|
|
122
136
|
continue
|
|
123
137
|
|
|
124
138
|
lines = doc.matching_lines(pattern)
|
|
125
139
|
|
|
140
|
+
if args.min_line_length:
|
|
141
|
+
lines = [line for line in lines if len(line.line) > args.min_line_length]
|
|
142
|
+
|
|
126
143
|
if len(lines) > 0:
|
|
127
144
|
results.append(SearchResult(doc, lines))
|
|
128
145
|
|
|
@@ -136,15 +153,15 @@ class EpsteinFiles:
|
|
|
136
153
|
|
|
137
154
|
def email_author_counts(self) -> dict[Name, int]:
|
|
138
155
|
return {
|
|
139
|
-
person.name: len(person.unique_emails_by
|
|
140
|
-
for person in self.emailers() if len(person.unique_emails_by
|
|
156
|
+
person.name: len(person.unique_emails_by)
|
|
157
|
+
for person in self.emailers() if len(person.unique_emails_by) > 0
|
|
141
158
|
}
|
|
142
159
|
|
|
143
160
|
def email_authors_to_device_signatures(self) -> dict[str, set[str]]:
|
|
144
161
|
signatures = defaultdict(set)
|
|
145
162
|
|
|
146
163
|
for email in [e for e in self.non_duplicate_emails() if e.sent_from_device]:
|
|
147
|
-
signatures[email.author_or_unknown
|
|
164
|
+
signatures[email.author_or_unknown].add(email.sent_from_device)
|
|
148
165
|
|
|
149
166
|
return signatures
|
|
150
167
|
|
|
@@ -152,14 +169,14 @@ class EpsteinFiles:
|
|
|
152
169
|
signatures = defaultdict(set)
|
|
153
170
|
|
|
154
171
|
for email in [e for e in self.non_duplicate_emails() if e.sent_from_device]:
|
|
155
|
-
signatures[email.sent_from_device].add(email.author_or_unknown
|
|
172
|
+
signatures[email.sent_from_device].add(email.author_or_unknown)
|
|
156
173
|
|
|
157
174
|
return signatures
|
|
158
175
|
|
|
159
176
|
def email_recipient_counts(self) -> dict[Name, int]:
|
|
160
177
|
return {
|
|
161
|
-
person.name: len(person.unique_emails_to
|
|
162
|
-
for person in self.emailers() if len(person.unique_emails_to
|
|
178
|
+
person.name: len(person.unique_emails_to)
|
|
179
|
+
for person in self.emailers() if len(person.unique_emails_to) > 0
|
|
163
180
|
}
|
|
164
181
|
|
|
165
182
|
def email_signature_substitution_counts(self) -> dict[str, int]:
|
|
@@ -208,7 +225,7 @@ class EpsteinFiles:
|
|
|
208
225
|
|
|
209
226
|
def for_ids(self, file_ids: str | list[str]) -> list[Document]:
|
|
210
227
|
file_ids = listify(file_ids)
|
|
211
|
-
docs = [doc for doc in self.all_documents
|
|
228
|
+
docs = [doc for doc in (list(self.all_documents) + self.doj_files) if doc.file_id in file_ids]
|
|
212
229
|
|
|
213
230
|
if len(docs) != len(file_ids):
|
|
214
231
|
logger.warning(f"{len(file_ids)} file IDs provided but only {len(docs)} Epstein files found!")
|
|
@@ -251,7 +268,7 @@ class EpsteinFiles:
|
|
|
251
268
|
name=name,
|
|
252
269
|
emails=self.emails_for(name),
|
|
253
270
|
imessage_logs=self.imessage_logs_for(name),
|
|
254
|
-
|
|
271
|
+
is_uninteresting=name in self.uninteresting_emailers(),
|
|
255
272
|
other_files=[f for f in self.other_files if name and name == f.author]
|
|
256
273
|
)
|
|
257
274
|
for name in names
|
|
@@ -276,13 +293,30 @@ class EpsteinFiles:
|
|
|
276
293
|
|
|
277
294
|
return self._uninteresting_emailers
|
|
278
295
|
|
|
296
|
+
def _find_email_attachments_and_set_is_first_for_user(self) -> None:
|
|
297
|
+
for other_file in self.other_files:
|
|
298
|
+
if other_file.config and other_file.config.attached_to_email_id:
|
|
299
|
+
email = self.email_for_id(other_file.config.attached_to_email_id)
|
|
300
|
+
email.attached_docs.append(other_file)
|
|
301
|
+
|
|
302
|
+
if other_file.timestamp \
|
|
303
|
+
and other_file.timestamp != email.timestamp \
|
|
304
|
+
and not other_file.config_timestamp:
|
|
305
|
+
other_file.warn(f"Overwriting '{other_file.timestamp}' with {email}'s timestamp {email.timestamp}")
|
|
306
|
+
|
|
307
|
+
other_file.timestamp = email.timestamp
|
|
308
|
+
|
|
309
|
+
for emailer in self.emailers():
|
|
310
|
+
first_email = emailer.emails[0]
|
|
311
|
+
first_email._is_first_for_user = True
|
|
312
|
+
|
|
279
313
|
def _copy_duplicate_email_properties(self) -> None:
|
|
280
314
|
"""Ensure dupe emails have the properties of the emails they duplicate to capture any repairs, config etc."""
|
|
281
315
|
for email in self.emails:
|
|
282
|
-
if not email.is_duplicate
|
|
316
|
+
if not email.is_duplicate:
|
|
283
317
|
continue
|
|
284
318
|
|
|
285
|
-
original = self.email_for_id(email.duplicate_of_id
|
|
319
|
+
original = self.email_for_id(email.duplicate_of_id)
|
|
286
320
|
|
|
287
321
|
for field_name in DUPLICATE_PROPS_TO_COPY:
|
|
288
322
|
original_prop = getattr(original, field_name)
|
|
@@ -321,11 +355,13 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
|
|
|
321
355
|
def document_cls(doc: Document) -> Type[Document]:
|
|
322
356
|
search_area = doc.text[0:5000] # Limit search area to avoid pointless scans of huge files
|
|
323
357
|
|
|
324
|
-
if doc.length
|
|
358
|
+
if doc.length == 0:
|
|
325
359
|
return Document
|
|
360
|
+
elif doc.is_doj_file:
|
|
361
|
+
return DojFile
|
|
326
362
|
if doc.text[0] == '{':
|
|
327
363
|
return JsonFile
|
|
328
|
-
elif
|
|
364
|
+
elif Document.is_email(doc): # TODO: right now we setup the DojFile which makes an Email obj only later at print time
|
|
329
365
|
return Email
|
|
330
366
|
elif MSG_REGEX.search(search_area):
|
|
331
367
|
return MessengerLog
|
|
@@ -334,4 +370,4 @@ def document_cls(doc: Document) -> Type[Document]:
|
|
|
334
370
|
|
|
335
371
|
|
|
336
372
|
def _sorted_metadata(docs: Sequence[Document]) -> list[Metadata]:
|
|
337
|
-
return [json_safe(d.metadata
|
|
373
|
+
return [json_safe(d.metadata) for d in Document.sort_by_id(docs)]
|