epstein-files 1.0.10__py3-none-any.whl → 1.0.12__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- epstein_files/__init__.py +7 -9
- epstein_files/documents/communication.py +2 -2
- epstein_files/documents/document.py +94 -81
- epstein_files/documents/email.py +47 -5
- epstein_files/documents/imessage/text_message.py +4 -13
- epstein_files/documents/json_file.py +13 -1
- epstein_files/documents/messenger_log.py +32 -19
- epstein_files/documents/other_file.py +67 -44
- epstein_files/epstein_files.py +22 -15
- epstein_files/util/constant/names.py +11 -10
- epstein_files/util/constant/strings.py +2 -1
- epstein_files/util/constants.py +98 -88
- epstein_files/util/data.py +1 -1
- epstein_files/util/doc_cfg.py +32 -62
- epstein_files/util/env.py +29 -17
- epstein_files/util/file_helper.py +12 -29
- epstein_files/util/highlighted_group.py +34 -17
- epstein_files/util/logging.py +1 -7
- epstein_files/util/output.py +13 -8
- epstein_files/util/rich.py +15 -10
- epstein_files/util/word_count.py +65 -5
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/METADATA +1 -1
- epstein_files-1.0.12.dist-info/RECORD +33 -0
- epstein_files/count_words.py +0 -72
- epstein_files-1.0.10.dist-info/RECORD +0 -34
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/WHEEL +0 -0
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/entry_points.txt +0 -0
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
import re
|
|
2
3
|
from collections import defaultdict
|
|
3
4
|
from dataclasses import dataclass, field
|
|
@@ -15,7 +16,7 @@ from epstein_files.util.data import iso_timestamp, listify, sort_dict
|
|
|
15
16
|
from epstein_files.util.doc_cfg import Metadata, TextCfg
|
|
16
17
|
from epstein_files.util.highlighted_group import get_style_for_name
|
|
17
18
|
from epstein_files.util.logging import logger
|
|
18
|
-
from epstein_files.util.rich import build_table
|
|
19
|
+
from epstein_files.util.rich import build_table, highlighter
|
|
19
20
|
|
|
20
21
|
CONFIRMED_MSG = 'Found confirmed counterparty'
|
|
21
22
|
GUESSED_MSG = 'This is probably a conversation with'
|
|
@@ -27,7 +28,12 @@ REDACTED_AUTHOR_REGEX = re.compile(r"^([-+•_1MENO.=F]+|[4Ide])$")
|
|
|
27
28
|
class MessengerLog(Communication):
|
|
28
29
|
"""Class representing one iMessage log file (one conversation between Epstein and some counterparty)."""
|
|
29
30
|
config: TextCfg | None = None
|
|
30
|
-
|
|
31
|
+
messages: list[TextMessage] = field(default_factory=list)
|
|
32
|
+
phone_number: str | None = None
|
|
33
|
+
|
|
34
|
+
def __post_init__(self):
|
|
35
|
+
super().__post_init__()
|
|
36
|
+
self.messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
|
|
31
37
|
|
|
32
38
|
def first_message_at(self, name: str | None) -> datetime:
|
|
33
39
|
return self.messages_by(name)[0].timestamp()
|
|
@@ -36,27 +42,29 @@ class MessengerLog(Communication):
|
|
|
36
42
|
if self.author is None:
|
|
37
43
|
return None
|
|
38
44
|
|
|
39
|
-
|
|
40
|
-
author_txt = Text(self.
|
|
41
|
-
|
|
45
|
+
info_msg = GUESSED_MSG if self.is_attribution_uncertain() else CONFIRMED_MSG
|
|
46
|
+
author_txt = Text(self.author, style=self.author_style + ' bold')
|
|
47
|
+
txt = Text(f"({info_msg} ", style='dim').append(author_txt)
|
|
42
48
|
|
|
43
|
-
|
|
44
|
-
|
|
49
|
+
if self.phone_number:
|
|
50
|
+
txt.append(f" using the phone number {self.phone_number}")
|
|
45
51
|
|
|
46
|
-
|
|
47
|
-
"""Lazily evaluated accessor for self._messages."""
|
|
48
|
-
if not self._messages:
|
|
49
|
-
self._messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
|
|
52
|
+
return highlighter(txt.append(')'))
|
|
50
53
|
|
|
51
|
-
|
|
54
|
+
def last_message_at(self, name: str | None) -> datetime:
|
|
55
|
+
return self.messages_by(name)[-1].timestamp()
|
|
52
56
|
|
|
53
57
|
def messages_by(self, name: str | None) -> list[TextMessage]:
|
|
54
58
|
"""Return all messages by 'name'."""
|
|
55
|
-
return [m for m in self.messages
|
|
59
|
+
return [m for m in self.messages if m.author == name]
|
|
56
60
|
|
|
57
61
|
def metadata(self) -> Metadata:
|
|
58
62
|
metadata = super().metadata()
|
|
59
|
-
metadata.update({'num_messages': len(self.messages
|
|
63
|
+
metadata.update({'num_messages': len(self.messages)})
|
|
64
|
+
|
|
65
|
+
if self.phone_number:
|
|
66
|
+
metadata['phone_number'] = self.phone_number
|
|
67
|
+
|
|
60
68
|
return metadata
|
|
61
69
|
|
|
62
70
|
def _border_style(self) -> str:
|
|
@@ -65,11 +73,16 @@ class MessengerLog(Communication):
|
|
|
65
73
|
def _build_message(self, match: re.Match) -> TextMessage:
|
|
66
74
|
"""Turn a regex match into a TextMessage."""
|
|
67
75
|
author_str = REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip())
|
|
76
|
+
is_phone_number = author_str.startswith('+')
|
|
77
|
+
|
|
78
|
+
if is_phone_number:
|
|
79
|
+
logger.info(f"{self.summary()} Found phone number: {author_str}")
|
|
80
|
+
self.phone_number = author_str
|
|
68
81
|
|
|
69
|
-
# If the Sender: is redacted that means it's from self.author
|
|
82
|
+
# If the Sender: is redacted or if it's an unredacted phone number that means it's from self.author
|
|
70
83
|
return TextMessage(
|
|
71
|
-
author=self.author if (
|
|
72
|
-
author_str=author_str if
|
|
84
|
+
author=self.author if (is_phone_number or not author_str) else author_str,
|
|
85
|
+
author_str=author_str if is_phone_number else None, # Preserve phone numbers
|
|
73
86
|
id_confirmed=not self.is_attribution_uncertain(),
|
|
74
87
|
text=match.group(4).strip(),
|
|
75
88
|
timestamp_str=match.group(2).strip(),
|
|
@@ -90,7 +103,7 @@ class MessengerLog(Communication):
|
|
|
90
103
|
yield self.file_info_panel()
|
|
91
104
|
yield Text('')
|
|
92
105
|
|
|
93
|
-
for message in self.messages
|
|
106
|
+
for message in self.messages:
|
|
94
107
|
yield message
|
|
95
108
|
|
|
96
109
|
@classmethod
|
|
@@ -99,7 +112,7 @@ class MessengerLog(Communication):
|
|
|
99
112
|
sender_counts: dict[str | None, int] = defaultdict(int)
|
|
100
113
|
|
|
101
114
|
for message_log in imessage_logs:
|
|
102
|
-
for message in message_log.messages
|
|
115
|
+
for message in message_log.messages:
|
|
103
116
|
sender_counts[message.author] += 1
|
|
104
117
|
|
|
105
118
|
return sender_counts
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import logging
|
|
3
3
|
import warnings
|
|
4
|
+
from collections import defaultdict
|
|
4
5
|
from dataclasses import asdict, dataclass
|
|
5
6
|
from datetime import datetime
|
|
7
|
+
from typing import ClassVar, Sequence
|
|
6
8
|
|
|
7
9
|
import datefinder
|
|
8
10
|
import dateutil
|
|
@@ -16,11 +18,11 @@ from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, WHITESPACE_R
|
|
|
16
18
|
from epstein_files.util.constant.strings import *
|
|
17
19
|
from epstein_files.util.constants import *
|
|
18
20
|
from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg, Metadata
|
|
19
|
-
from epstein_files.util.data import escape_single_quotes, remove_timezone, uniquify
|
|
20
|
-
from epstein_files.util.file_helper import FILENAME_LENGTH
|
|
21
|
+
from epstein_files.util.data import escape_single_quotes, remove_timezone, sort_dict, uniquify
|
|
22
|
+
from epstein_files.util.file_helper import FILENAME_LENGTH, file_size_to_str
|
|
21
23
|
from epstein_files.util.env import args
|
|
22
|
-
from epstein_files.util.highlighted_group import
|
|
23
|
-
from epstein_files.util.rich import QUESTION_MARK_TXT, build_table, highlighter
|
|
24
|
+
from epstein_files.util.highlighted_group import styled_category
|
|
25
|
+
from epstein_files.util.rich import QUESTION_MARK_TXT, add_cols_to_table, build_table, highlighter
|
|
24
26
|
from epstein_files.util.logging import logger
|
|
25
27
|
|
|
26
28
|
MAX_DAYS_SPANNED_TO_BE_VALID = 10
|
|
@@ -38,14 +40,11 @@ UNINTERESTING_CATEGORES = [
|
|
|
38
40
|
ARTS,
|
|
39
41
|
BOOK,
|
|
40
42
|
JUNK,
|
|
43
|
+
SKYPE_LOG,
|
|
41
44
|
SPEECH,
|
|
42
45
|
]
|
|
43
46
|
|
|
44
|
-
|
|
45
|
-
'031794',
|
|
46
|
-
]
|
|
47
|
-
|
|
48
|
-
# OtherFiles whose description/hints match these prefixes are not displayed unless --all-other-files is used
|
|
47
|
+
# OtherFiles whose descriptions/info match these prefixes are not displayed unless --all-other-files is used
|
|
49
48
|
UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
|
|
50
49
|
'article about',
|
|
51
50
|
ARTICLE_DRAFT,
|
|
@@ -60,7 +59,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
|
|
|
60
59
|
CVRA,
|
|
61
60
|
DAILY_MAIL,
|
|
62
61
|
DAILY_TELEGRAPH,
|
|
63
|
-
|
|
62
|
+
CVRA_LEXIS_SEARCH[0:-12], # Because date at end :(
|
|
64
63
|
DERSH_GIUFFRE_TWEET,
|
|
65
64
|
'Financial Times',
|
|
66
65
|
'Forbes',
|
|
@@ -78,8 +77,10 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
|
|
|
78
77
|
LA_TIMES,
|
|
79
78
|
'Litigation Daily',
|
|
80
79
|
LAWRENCE_KRAUSS,
|
|
80
|
+
LAWRENCE_KRAUSS_ASU_ORIGINS,
|
|
81
81
|
'MarketWatch',
|
|
82
82
|
MARTIN_NOWAK,
|
|
83
|
+
'Morning News',
|
|
83
84
|
NOBEL_CHARITABLE_TRUST,
|
|
84
85
|
'Nautilus',
|
|
85
86
|
'New Yorker',
|
|
@@ -106,7 +107,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
|
|
|
106
107
|
TEXT_OF_US_LAW,
|
|
107
108
|
TRANSLATION,
|
|
108
109
|
TWEET,
|
|
109
|
-
|
|
110
|
+
REAL_DEAL_ARTICLE,
|
|
110
111
|
TRUMP_DISCLOSURES,
|
|
111
112
|
UBS_CIO_REPORT,
|
|
112
113
|
UN_GENERAL_ASSEMBLY,
|
|
@@ -122,24 +123,25 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
|
|
|
122
123
|
class OtherFile(Document):
|
|
123
124
|
"""File that is not an email, an iMessage log, or JSON data."""
|
|
124
125
|
|
|
126
|
+
include_description_in_summary_panel: ClassVar[bool] = True
|
|
127
|
+
|
|
125
128
|
def __post_init__(self):
|
|
126
129
|
super().__post_init__()
|
|
127
130
|
|
|
128
131
|
if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
|
|
129
|
-
self.log(f"Creating synthetic config for VI Daily News article..."
|
|
132
|
+
self.log(f"Creating synthetic config for VI Daily News article...")
|
|
130
133
|
self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
|
|
131
134
|
|
|
132
135
|
def category(self) -> str | None:
|
|
133
136
|
return self.config and self.config.category
|
|
134
137
|
|
|
135
|
-
def
|
|
138
|
+
def category_txt(self) -> Text | None:
|
|
139
|
+
return styled_category(self.category() or UNKNOWN)
|
|
140
|
+
|
|
141
|
+
def config_description(self) -> str | None:
|
|
136
142
|
"""Overloads superclass method."""
|
|
137
143
|
if self.config is not None:
|
|
138
|
-
return self.config.
|
|
139
|
-
|
|
140
|
-
def description_panel(self, include_hints=True) -> Panel:
|
|
141
|
-
"""Panelized description() with info_txt(), used in search results."""
|
|
142
|
-
return super().description_panel(include_hints=include_hints)
|
|
144
|
+
return self.config.complete_description()
|
|
143
145
|
|
|
144
146
|
def highlighted_preview_text(self) -> Text:
|
|
145
147
|
try:
|
|
@@ -153,13 +155,11 @@ class OtherFile(Document):
|
|
|
153
155
|
|
|
154
156
|
def is_interesting(self):
|
|
155
157
|
"""False for lame prefixes, duplicates, and other boring files."""
|
|
156
|
-
|
|
158
|
+
info_sentences = self.info()
|
|
157
159
|
|
|
158
|
-
if self.is_duplicate:
|
|
160
|
+
if self.is_duplicate():
|
|
159
161
|
return False
|
|
160
|
-
elif
|
|
161
|
-
return False
|
|
162
|
-
elif len(hints) == 0:
|
|
162
|
+
elif len(info_sentences) == 0:
|
|
163
163
|
return True
|
|
164
164
|
elif self.config:
|
|
165
165
|
if self.config.is_interesting:
|
|
@@ -170,7 +170,7 @@ class OtherFile(Document):
|
|
|
170
170
|
return False
|
|
171
171
|
|
|
172
172
|
for prefix in UNINTERESTING_PREFIXES:
|
|
173
|
-
if
|
|
173
|
+
if info_sentences[0].plain.startswith(prefix):
|
|
174
174
|
return False
|
|
175
175
|
|
|
176
176
|
return True
|
|
@@ -195,7 +195,6 @@ class OtherFile(Document):
|
|
|
195
195
|
timestamps: list[datetime] = []
|
|
196
196
|
|
|
197
197
|
with warnings.catch_warnings():
|
|
198
|
-
warnings.filterwarnings("ignore", module="datefinder")
|
|
199
198
|
warnings.filterwarnings("ignore", module="dateutil")
|
|
200
199
|
|
|
201
200
|
try:
|
|
@@ -208,11 +207,11 @@ class OtherFile(Document):
|
|
|
208
207
|
if len(timestamps) >= MAX_EXTRACTED_TIMESTAMPS:
|
|
209
208
|
break
|
|
210
209
|
except ValueError as e:
|
|
211
|
-
|
|
210
|
+
self.log(f"Error while iterating through datefinder.find_dates(): {e}", logging.WARNING)
|
|
212
211
|
|
|
213
212
|
if len(timestamps) == 0:
|
|
214
|
-
if not self.is_duplicate
|
|
215
|
-
self.log_top_lines(15, msg=f"No timestamps found"
|
|
213
|
+
if not (self.is_duplicate() or VAST_HOUSE in self.text):
|
|
214
|
+
self.log_top_lines(15, msg=f"No timestamps found")
|
|
216
215
|
|
|
217
216
|
return None
|
|
218
217
|
elif len(timestamps) == 1:
|
|
@@ -231,7 +230,7 @@ class OtherFile(Document):
|
|
|
231
230
|
self.log_top_lines(15, msg=timestamps_log_msg, level=logging.DEBUG)
|
|
232
231
|
|
|
233
232
|
@staticmethod
|
|
234
|
-
def build_table(
|
|
233
|
+
def build_table(files: Sequence['OtherFile']) -> Table:
|
|
235
234
|
"""Build a table of OtherFile documents."""
|
|
236
235
|
table = build_table(None, show_lines=True)
|
|
237
236
|
table.add_column('File', justify='center', width=FILENAME_LENGTH)
|
|
@@ -240,31 +239,55 @@ class OtherFile(Document):
|
|
|
240
239
|
table.add_column('Type', justify='center')
|
|
241
240
|
table.add_column(FIRST_FEW_LINES, justify='left', style='pale_turquoise4')
|
|
242
241
|
|
|
243
|
-
for
|
|
244
|
-
link_and_info = [
|
|
245
|
-
|
|
246
|
-
date_str = doc.date_str()
|
|
242
|
+
for file in files:
|
|
243
|
+
link_and_info = [file.external_links()]
|
|
244
|
+
date_str = file.date_str()
|
|
247
245
|
|
|
248
|
-
if
|
|
249
|
-
preview_text =
|
|
246
|
+
if file.is_duplicate():
|
|
247
|
+
preview_text = file.duplicate_file_txt()
|
|
250
248
|
row_style = ' dim'
|
|
251
249
|
else:
|
|
252
|
-
link_and_info +=
|
|
253
|
-
preview_text =
|
|
250
|
+
link_and_info += file.info()
|
|
251
|
+
preview_text = file.highlighted_preview_text()
|
|
254
252
|
row_style = ''
|
|
255
253
|
|
|
256
|
-
if category:
|
|
257
|
-
category_txt = Text(category, get_style_for_category(category) or 'wheat4')
|
|
258
|
-
else:
|
|
259
|
-
category_txt = Text('')
|
|
260
|
-
|
|
261
254
|
table.add_row(
|
|
262
255
|
Group(*link_and_info),
|
|
263
256
|
Text(date_str, style=TIMESTAMP_DIM) if date_str else QUESTION_MARK_TXT,
|
|
264
|
-
|
|
265
|
-
category_txt,
|
|
257
|
+
file.file_size_str(),
|
|
258
|
+
file.category_txt(),
|
|
266
259
|
preview_text,
|
|
267
260
|
style=row_style
|
|
268
261
|
)
|
|
269
262
|
|
|
270
263
|
return table
|
|
264
|
+
|
|
265
|
+
@staticmethod
|
|
266
|
+
def count_by_category_table(files: Sequence['OtherFile']) -> Table:
|
|
267
|
+
counts = defaultdict(int)
|
|
268
|
+
category_bytes = defaultdict(int)
|
|
269
|
+
|
|
270
|
+
for file in files:
|
|
271
|
+
if file.category() is None:
|
|
272
|
+
logger.warning(f"file {file.file_id} has no category")
|
|
273
|
+
|
|
274
|
+
counts[file.category()] += 1
|
|
275
|
+
category_bytes[file.category()] += file.length
|
|
276
|
+
|
|
277
|
+
table = build_table('Other Files Summary')
|
|
278
|
+
add_cols_to_table(table, ['Category', 'Count', 'Has Author', 'No Author', 'Size'])
|
|
279
|
+
table.columns[-1].style = 'dim'
|
|
280
|
+
|
|
281
|
+
for (category, count) in sort_dict(counts):
|
|
282
|
+
category_files = [f for f in files if f.category() == category]
|
|
283
|
+
known_author_count = Document.known_author_count(category_files)
|
|
284
|
+
|
|
285
|
+
table.add_row(
|
|
286
|
+
styled_category(category or UNKNOWN),
|
|
287
|
+
str(count),
|
|
288
|
+
str(known_author_count),
|
|
289
|
+
str(count - known_author_count),
|
|
290
|
+
file_size_to_str(category_bytes[category]),
|
|
291
|
+
)
|
|
292
|
+
|
|
293
|
+
return table
|
epstein_files/epstein_files.py
CHANGED
|
@@ -23,12 +23,12 @@ from epstein_files.util.constant.strings import *
|
|
|
23
23
|
from epstein_files.util.constant.urls import (EPSTEIN_MEDIA, EPSTEIN_WEB, JMAIL, epstein_media_person_url,
|
|
24
24
|
epsteinify_name_url, epstein_web_person_url, search_jmail_url, search_twitter_url)
|
|
25
25
|
from epstein_files.util.constants import *
|
|
26
|
-
from epstein_files.util.data import dict_sets_to_lists, json_safe, sort_dict
|
|
26
|
+
from epstein_files.util.data import dict_sets_to_lists, json_safe, listify, sort_dict
|
|
27
27
|
from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
28
|
-
from epstein_files.util.env import args, logger
|
|
29
|
-
from epstein_files.util.file_helper import
|
|
28
|
+
from epstein_files.util.env import DOCS_DIR, args, logger
|
|
29
|
+
from epstein_files.util.file_helper import file_size_str
|
|
30
30
|
from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
|
|
31
|
-
from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT,
|
|
31
|
+
from epstein_files.util.rich import (DEFAULT_NAME_STYLE, NA_TXT, add_cols_to_table,
|
|
32
32
|
build_table, console, highlighter, link_text_obj, link_markup, print_author_header, print_centered,
|
|
33
33
|
print_other_site_link, print_panel, print_section_header, vertically_pad)
|
|
34
34
|
from epstein_files.util.search_result import SearchResult
|
|
@@ -66,7 +66,7 @@ class EpsteinFiles:
|
|
|
66
66
|
|
|
67
67
|
def __post_init__(self):
|
|
68
68
|
"""Iterate through files and build appropriate objects."""
|
|
69
|
-
self.all_files = [f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')]
|
|
69
|
+
self.all_files = sorted([f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')])
|
|
70
70
|
documents = []
|
|
71
71
|
file_type_count = defaultdict(int)
|
|
72
72
|
|
|
@@ -74,12 +74,15 @@ class EpsteinFiles:
|
|
|
74
74
|
for file_arg in self.all_files:
|
|
75
75
|
doc_timer = Timer(decimals=4)
|
|
76
76
|
document = Document(file_arg)
|
|
77
|
+
cls = document_cls(document)
|
|
77
78
|
|
|
78
79
|
if document.length == 0:
|
|
79
80
|
logger.warning(f"Skipping empty file: {document}]")
|
|
80
81
|
continue
|
|
82
|
+
elif args.skip_other_files and cls == OtherFile and file_type_count[cls.__name__] > 1:
|
|
83
|
+
logger.warning(f"Skipping {document.filename}...")
|
|
84
|
+
continue
|
|
81
85
|
|
|
82
|
-
cls = document_cls(document)
|
|
83
86
|
documents.append(cls(file_arg, text=document.text))
|
|
84
87
|
logger.info(str(documents[-1]))
|
|
85
88
|
file_type_count[cls.__name__] += 1
|
|
@@ -186,7 +189,8 @@ class EpsteinFiles:
|
|
|
186
189
|
else:
|
|
187
190
|
return [e for e in self.emails if author in e.recipients]
|
|
188
191
|
|
|
189
|
-
def get_documents_by_id(self, file_ids: list[str]) -> list[Document]:
|
|
192
|
+
def get_documents_by_id(self, file_ids: str | list[str]) -> list[Document]:
|
|
193
|
+
file_ids = listify(file_ids)
|
|
190
194
|
docs = [doc for doc in self.all_documents() if doc.file_id in file_ids]
|
|
191
195
|
|
|
192
196
|
if len(docs) != len(file_ids):
|
|
@@ -223,7 +227,7 @@ class EpsteinFiles:
|
|
|
223
227
|
f"{len(docs):,}",
|
|
224
228
|
f"{known:,}" if known is not None else NA_TXT,
|
|
225
229
|
f"{len(docs) - known:,}" if known is not None else NA_TXT,
|
|
226
|
-
f"{len([d for d in docs if d.is_duplicate])}",
|
|
230
|
+
f"{len([d for d in docs if d.is_duplicate()])}",
|
|
227
231
|
)
|
|
228
232
|
|
|
229
233
|
add_row('iMessage Logs', self.imessage_logs)
|
|
@@ -237,7 +241,7 @@ class EpsteinFiles:
|
|
|
237
241
|
"""Print complete emails to or from a particular 'author'. Returns the Emails that were printed."""
|
|
238
242
|
conversation_length = self.email_conversation_length_in_days(_author)
|
|
239
243
|
emails = self.emails_for(_author)
|
|
240
|
-
unique_emails = [email for email in emails if not email.is_duplicate]
|
|
244
|
+
unique_emails = [email for email in emails if not email.is_duplicate()]
|
|
241
245
|
author = _author or UNKNOWN
|
|
242
246
|
|
|
243
247
|
print_author_header(
|
|
@@ -250,7 +254,7 @@ class EpsteinFiles:
|
|
|
250
254
|
last_printed_email_was_duplicate = False
|
|
251
255
|
|
|
252
256
|
for email in emails:
|
|
253
|
-
if email.is_duplicate:
|
|
257
|
+
if email.is_duplicate():
|
|
254
258
|
console.print(Padding(email.duplicate_file_txt().append('...'), (0, 0, 0, 4)))
|
|
255
259
|
last_printed_email_was_duplicate = True
|
|
256
260
|
else:
|
|
@@ -263,7 +267,7 @@ class EpsteinFiles:
|
|
|
263
267
|
return emails
|
|
264
268
|
|
|
265
269
|
def print_emails_table_for(self, author: str | None) -> None:
|
|
266
|
-
emails = [email for email in self.emails_for(author) if not email.is_duplicate] # Remove dupes
|
|
270
|
+
emails = [email for email in self.emails_for(author) if not email.is_duplicate()] # Remove dupes
|
|
267
271
|
console.print(Align.center(Email.build_table(emails, author)), '\n')
|
|
268
272
|
|
|
269
273
|
def print_email_device_info(self) -> None:
|
|
@@ -272,7 +276,7 @@ class EpsteinFiles:
|
|
|
272
276
|
console.print(_build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))
|
|
273
277
|
|
|
274
278
|
def print_emailer_counts_table(self) -> None:
|
|
275
|
-
footer = f"Identified authors of {self.attributed_email_count():,}
|
|
279
|
+
footer = f"Identified authors of {self.attributed_email_count():,} out of {len(self.emails):,} emails ."
|
|
276
280
|
counts_table = build_table("Email Counts", caption=footer)
|
|
277
281
|
add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_MEDIA, EPSTEIN_WEB, 'Twitter'])
|
|
278
282
|
|
|
@@ -303,7 +307,7 @@ class EpsteinFiles:
|
|
|
303
307
|
text_summary_msg = f"\nDeanonymized {Document.known_author_count(self.imessage_logs)} of "
|
|
304
308
|
text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files):,} files."
|
|
305
309
|
console.print(text_summary_msg)
|
|
306
|
-
imessage_msg_count = sum([len(log.messages
|
|
310
|
+
imessage_msg_count = sum([len(log.messages) for log in self.imessage_logs])
|
|
307
311
|
console.print(f"Found {imessage_msg_count} text messages in {len(self.imessage_logs)} iMessage log files.")
|
|
308
312
|
|
|
309
313
|
def print_other_files_table(self) -> list[OtherFile]:
|
|
@@ -318,17 +322,18 @@ class EpsteinFiles:
|
|
|
318
322
|
console.line(2)
|
|
319
323
|
|
|
320
324
|
console.print(OtherFile.build_table(interesting_files))
|
|
325
|
+
console.print(Padding(OtherFile.count_by_category_table(interesting_files), (2, 0, 2, 2)))
|
|
321
326
|
skipped_file_count = len(self.other_files) - len(interesting_files)
|
|
322
327
|
|
|
323
328
|
if skipped_file_count > 0:
|
|
324
|
-
logger.warning(f"Skipped {skipped_file_count} uninteresting files...")
|
|
329
|
+
logger.warning(f"Skipped {skipped_file_count} uninteresting other files...")
|
|
325
330
|
|
|
326
331
|
return interesting_files
|
|
327
332
|
|
|
328
333
|
def _tally_email_data(self) -> None:
|
|
329
334
|
"""Tally up summary info about Email objects."""
|
|
330
335
|
for email in self.emails:
|
|
331
|
-
if email.is_duplicate:
|
|
336
|
+
if email.is_duplicate():
|
|
332
337
|
continue
|
|
333
338
|
|
|
334
339
|
self.email_author_counts[email.author] += 1
|
|
@@ -360,6 +365,8 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
|
|
|
360
365
|
def document_cls(doc: Document) -> Type[Document]:
|
|
361
366
|
search_area = doc.text[0:5000] # Limit search area to avoid pointless scans of huge files
|
|
362
367
|
|
|
368
|
+
if doc.length == 0:
|
|
369
|
+
return Document
|
|
363
370
|
if doc.text[0] == '{':
|
|
364
371
|
return JsonFile
|
|
365
372
|
elif isinstance(doc.config, EmailCfg) or (DETECT_EMAIL_REGEX.match(search_area) and doc.config is None):
|
|
@@ -143,7 +143,7 @@ REID_HOFFMAN = 'Reid Hoffman'
|
|
|
143
143
|
REID_WEINGARTEN = 'Reid Weingarten'
|
|
144
144
|
RENATA_BOLOTOVA = 'Renata Bolotova'
|
|
145
145
|
RICHARD_KAHN = 'Richard Kahn'
|
|
146
|
-
|
|
146
|
+
ROBERT_D_CRITTON_JR = 'Robert D. Critton Jr.'
|
|
147
147
|
ROBERT_LAWRENCE_KUHN = 'Robert Lawrence Kuhn'
|
|
148
148
|
ROBERT_TRIVERS = 'Robert Trivers'
|
|
149
149
|
ROGER_SCHANK = 'Roger Schank'
|
|
@@ -178,6 +178,7 @@ JARED_KUSHNER = 'Jared Kushner'
|
|
|
178
178
|
JULIE_K_BROWN = 'Julie K. Brown'
|
|
179
179
|
KARIM_SADJADPOUR = 'KARIM SADJADPOUR'.title()
|
|
180
180
|
MICHAEL_J_BOCCIO = 'Michael J. Boccio'
|
|
181
|
+
NERIO_ALESSANDRI = 'Nerio Alessandri (Founder and Chairman of Technogym S.p.A. Italy)'
|
|
181
182
|
PAUL_G_CASSELL = 'Paul G. Cassell'
|
|
182
183
|
RUDY_GIULIANI = 'Rudy Giuliani'
|
|
183
184
|
TULSI_GABBARD = 'Tulsi Gabbard'
|
|
@@ -226,22 +227,22 @@ NAMES_TO_NOT_HIGHLIGHT: list[str] = [name.lower() for name in [
|
|
|
226
227
|
# Names to color white in the word counts
|
|
227
228
|
OTHER_NAMES = NAMES_TO_NOT_HIGHLIGHT + """
|
|
228
229
|
aaron albert alberto alec alexandra alice anderson andre ann anna anne ariana arthur
|
|
229
|
-
baldwin barack ben benjamin berger bert binant bob bonner boyden bradley brady branson bruno bryant burton
|
|
230
|
+
baldwin barack ben benjamin berger bert binant bob bonner boyden bradley brady branson bright bruno bryant burton
|
|
230
231
|
chapman charles charlie christopher clint cohen colin collins conway
|
|
231
|
-
davis dean debra deborah dennis diana diane diaz dickinson dixon dominique don dylan
|
|
232
|
+
danny davis dean debra deborah dennis diana diane diaz dickinson dixon dominique don dylan
|
|
232
233
|
edmond elizabeth emily entwistle erik evelyn
|
|
233
|
-
ferguson flachsbart francis franco frank
|
|
234
|
+
ferguson flachsbart francis franco frank frost
|
|
234
235
|
gardner gary geoff geoffrey gilbert gloria goldberg gonzalez gould graham greene guarino gwyneth
|
|
235
|
-
hancock harold harrison harry helen hirsch hofstadter horowitz hussein
|
|
236
|
-
isaac isaacson
|
|
236
|
+
hancock harold harrison harry hay helen hirsch hofstadter horowitz hussein
|
|
237
|
+
ian isaac isaacson
|
|
237
238
|
jamie jane janet jason jen jim joe johnson jones josh julie justin
|
|
238
239
|
karl kate kathy kelly kim kruger kyle
|
|
239
|
-
leo leonard lenny leslie lieberman louis lynch lynn
|
|
240
|
+
laurie leo leonard lenny leslie lieberman louis lynch lynn
|
|
240
241
|
marcus marianne matt matthew melissa michele michelle moore moscowitz
|
|
241
|
-
nicole nussbaum
|
|
242
|
+
nancy nicole nussbaum
|
|
242
243
|
paulson philippe
|
|
243
|
-
rafael ray richardson rob robin ron rudolph ryan
|
|
244
|
-
sara sarah seligman serge sergey silverman sloman smith snowden sorkin steele stevie stewart
|
|
244
|
+
rafael ray richard richardson rob robin ron rubin rudolph ryan
|
|
245
|
+
sara sarah sean seligman serge sergey silverman sloman smith snowden sorkin steele stevie stewart
|
|
245
246
|
ted theresa thompson tiffany timothy tony
|
|
246
247
|
valeria
|
|
247
248
|
walter warren weinstein weiss william
|
|
@@ -20,7 +20,7 @@ POLITICS = 'politics'
|
|
|
20
20
|
PROPERTY = 'property'
|
|
21
21
|
PUBLICIST = 'publicist'
|
|
22
22
|
REPUTATION = 'reputation'
|
|
23
|
-
SKYPE_LOG= '
|
|
23
|
+
SKYPE_LOG = 'Skype log'
|
|
24
24
|
SOCIAL = 'social'
|
|
25
25
|
SPEECH = 'speech'
|
|
26
26
|
|
|
@@ -39,6 +39,7 @@ MIAMI_HERALD = 'Miami Herald'
|
|
|
39
39
|
NYT = "New York Times"
|
|
40
40
|
PALM_BEACH_DAILY_NEWS = f'{PALM_BEACH} Daily News'
|
|
41
41
|
PALM_BEACH_POST = f'{PALM_BEACH} Post'
|
|
42
|
+
SHIMON_POST = 'The Shimon Post'
|
|
42
43
|
THE_REAL_DEAL = 'The Real Deal'
|
|
43
44
|
WAPO = 'WaPo'
|
|
44
45
|
VI_DAILY_NEWS = f'{VIRGIN_ISLANDS} Daily News'
|