epstein-files 1.0.13__py3-none-any.whl → 1.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +16 -11
- epstein_files/documents/communication.py +2 -2
- epstein_files/documents/document.py +59 -51
- epstein_files/documents/email.py +34 -30
- epstein_files/documents/imessage/text_message.py +4 -4
- epstein_files/documents/json_file.py +9 -3
- epstein_files/documents/messenger_log.py +29 -27
- epstein_files/documents/other_file.py +80 -100
- epstein_files/epstein_files.py +50 -69
- epstein_files/util/constant/names.py +3 -1
- epstein_files/util/constant/strings.py +1 -3
- epstein_files/util/constant/urls.py +1 -7
- epstein_files/util/constants.py +126 -114
- epstein_files/util/data.py +2 -0
- epstein_files/util/doc_cfg.py +11 -10
- epstein_files/util/env.py +12 -13
- epstein_files/util/file_helper.py +8 -4
- epstein_files/util/highlighted_group.py +8 -16
- epstein_files/util/output.py +56 -36
- epstein_files/util/rich.py +29 -29
- epstein_files/util/word_count.py +7 -9
- {epstein_files-1.0.13.dist-info → epstein_files-1.0.15.dist-info}/METADATA +10 -3
- epstein_files-1.0.15.dist-info/RECORD +33 -0
- epstein_files-1.0.13.dist-info/RECORD +0 -33
- {epstein_files-1.0.13.dist-info → epstein_files-1.0.15.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.13.dist-info → epstein_files-1.0.15.dist-info}/WHEEL +0 -0
- {epstein_files-1.0.13.dist-info → epstein_files-1.0.15.dist-info}/entry_points.txt +0 -0
|
@@ -9,17 +9,17 @@ from rich.table import Table
|
|
|
9
9
|
from rich.text import Text
|
|
10
10
|
|
|
11
11
|
from epstein_files.documents.communication import Communication
|
|
12
|
-
from epstein_files.documents.imessage.text_message import
|
|
12
|
+
from epstein_files.documents.imessage.text_message import TextMessage
|
|
13
13
|
from epstein_files.util.constant.names import JEFFREY_EPSTEIN, UNKNOWN
|
|
14
|
-
from epstein_files.util.constant.strings import AUTHOR
|
|
15
|
-
from epstein_files.util.data import iso_timestamp, listify, sort_dict
|
|
14
|
+
from epstein_files.util.constant.strings import AUTHOR, TIMESTAMP_STYLE
|
|
15
|
+
from epstein_files.util.data import days_between, days_between_str, iso_timestamp, listify, sort_dict
|
|
16
16
|
from epstein_files.util.doc_cfg import Metadata, TextCfg
|
|
17
17
|
from epstein_files.util.highlighted_group import get_style_for_name
|
|
18
18
|
from epstein_files.util.logging import logger
|
|
19
19
|
from epstein_files.util.rich import LAST_TIMESTAMP_STYLE, build_table, highlighter
|
|
20
20
|
|
|
21
|
-
CONFIRMED_MSG = '
|
|
22
|
-
GUESSED_MSG = '
|
|
21
|
+
CONFIRMED_MSG = 'with confirmed counterparty'
|
|
22
|
+
GUESSED_MSG = 'and is probably with'
|
|
23
23
|
MSG_REGEX = re.compile(r'Sender:(.*?)\nTime:(.*? (AM|PM)).*?Message:(.*?)\s*?((?=(\nSender)|\Z))', re.DOTALL)
|
|
24
24
|
REDACTED_AUTHOR_REGEX = re.compile(r"^([-+•_1MENO.=F]+|[4Ide])$")
|
|
25
25
|
|
|
@@ -39,17 +39,20 @@ class MessengerLog(Communication):
|
|
|
39
39
|
return self.messages_by(name)[0].timestamp()
|
|
40
40
|
|
|
41
41
|
def info_txt(self) -> Text | None:
|
|
42
|
-
|
|
43
|
-
|
|
42
|
+
num_days_str = days_between_str(self.timestamp, self.messages[-1].timestamp())
|
|
43
|
+
txt = Text(f"(Covers {num_days_str} starting ", style='dim')
|
|
44
|
+
txt.append(self.date_str(), style=TIMESTAMP_STYLE).append(' ')
|
|
44
45
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
46
|
+
if not self.author:
|
|
47
|
+
txt.append('with unknown counterparty')
|
|
48
|
+
else:
|
|
49
|
+
txt.append(GUESSED_MSG if self.is_attribution_uncertain() else CONFIRMED_MSG).append(' ')
|
|
50
|
+
txt.append(Text(self.author, style=self.author_style + ' bold'))
|
|
48
51
|
|
|
49
52
|
if self.phone_number:
|
|
50
|
-
txt.append(f" using the phone number {self.phone_number}")
|
|
53
|
+
txt.append(highlighter(f" using the phone number {self.phone_number}"))
|
|
51
54
|
|
|
52
|
-
return
|
|
55
|
+
return txt.append(')')
|
|
53
56
|
|
|
54
57
|
def last_message_at(self, name: str | None) -> datetime:
|
|
55
58
|
return self.messages_by(name)[-1].timestamp()
|
|
@@ -82,7 +85,7 @@ class MessengerLog(Communication):
|
|
|
82
85
|
# If the Sender: is redacted or if it's an unredacted phone number that means it's from self.author
|
|
83
86
|
return TextMessage(
|
|
84
87
|
author=self.author if (is_phone_number or not author_str) else author_str,
|
|
85
|
-
author_str=author_str if is_phone_number else
|
|
88
|
+
author_str=author_str if is_phone_number else '', # Preserve phone numbers
|
|
86
89
|
id_confirmed=not self.is_attribution_uncertain(),
|
|
87
90
|
text=match.group(4).strip(),
|
|
88
91
|
timestamp_str=match.group(2).strip(),
|
|
@@ -90,12 +93,12 @@ class MessengerLog(Communication):
|
|
|
90
93
|
|
|
91
94
|
def _extract_timestamp(self) -> datetime:
|
|
92
95
|
for match in MSG_REGEX.finditer(self.text):
|
|
93
|
-
|
|
96
|
+
message = self._build_message(match)
|
|
94
97
|
|
|
95
98
|
try:
|
|
96
|
-
return
|
|
99
|
+
return message.timestamp()
|
|
97
100
|
except ValueError as e:
|
|
98
|
-
logger.info(f"Failed to parse '{timestamp_str}' to datetime! Using next match. Error: {e}'")
|
|
101
|
+
logger.info(f"Failed to parse '{message.timestamp_str}' to datetime! Using next match. Error: {e}'")
|
|
99
102
|
|
|
100
103
|
raise RuntimeError(f"{self}: No timestamp found!")
|
|
101
104
|
|
|
@@ -118,23 +121,22 @@ class MessengerLog(Communication):
|
|
|
118
121
|
return sender_counts
|
|
119
122
|
|
|
120
123
|
@classmethod
|
|
121
|
-
def
|
|
122
|
-
authors = listify(author)
|
|
123
|
-
return logs if JEFFREY_EPSTEIN in authors else [log for log in logs if log.author in authors]
|
|
124
|
-
|
|
125
|
-
@classmethod
|
|
126
|
-
def summary_table(cls, imessage_logs: list['MessengerLog']) -> Table:
|
|
124
|
+
def summary_table(cls, log_files: list['MessengerLog']) -> Table:
|
|
127
125
|
"""Build a table summarizing the text messages in 'imessage_logs'."""
|
|
128
|
-
|
|
129
|
-
|
|
126
|
+
author_counts = cls.count_authors(log_files)
|
|
127
|
+
msg_count = sum([len(log.messages) for log in log_files])
|
|
128
|
+
|
|
129
|
+
footer = f"Deanonymized {msg_count - author_counts[None]:,} of {msg_count:,} text messages in"
|
|
130
|
+
counts_table = build_table("Text Message Counts By Author", caption=f"{footer} {len(log_files)} files")
|
|
131
|
+
counts_table.add_column(AUTHOR.title(), justify='left', width=30)
|
|
130
132
|
counts_table.add_column('Files', justify='right', style='white')
|
|
131
133
|
counts_table.add_column("Msgs", justify='right')
|
|
132
134
|
counts_table.add_column('First Sent At', justify='center', highlight=True, width=21)
|
|
133
135
|
counts_table.add_column('Last Sent At', justify='center', style=LAST_TIMESTAMP_STYLE, width=21)
|
|
134
136
|
counts_table.add_column('Days', justify='right', style='dim')
|
|
135
137
|
|
|
136
|
-
for name, count in sort_dict(
|
|
137
|
-
logs =
|
|
138
|
+
for name, count in sort_dict(author_counts):
|
|
139
|
+
logs = log_files if name == JEFFREY_EPSTEIN else [log for log in log_files if log.author == name]
|
|
138
140
|
first_at = logs[0].first_message_at(name)
|
|
139
141
|
last_at = logs[-1].first_message_at(name)
|
|
140
142
|
|
|
@@ -144,7 +146,7 @@ class MessengerLog(Communication):
|
|
|
144
146
|
f"{count:,}",
|
|
145
147
|
iso_timestamp(first_at),
|
|
146
148
|
iso_timestamp(last_at),
|
|
147
|
-
str((
|
|
149
|
+
str(days_between(first_at, last_at)),
|
|
148
150
|
)
|
|
149
151
|
|
|
150
152
|
return counts_table
|
|
@@ -17,14 +17,15 @@ from rich.text import Text
|
|
|
17
17
|
from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, WHITESPACE_REGEX, Document
|
|
18
18
|
from epstein_files.util.constant.strings import *
|
|
19
19
|
from epstein_files.util.constants import *
|
|
20
|
-
from epstein_files.util.doc_cfg import
|
|
21
|
-
from epstein_files.util.data import escape_single_quotes, remove_timezone, sort_dict, uniquify
|
|
20
|
+
from epstein_files.util.doc_cfg import DocCfg, Metadata
|
|
21
|
+
from epstein_files.util.data import days_between, escape_single_quotes, remove_timezone, sort_dict, uniquify
|
|
22
22
|
from epstein_files.util.file_helper import FILENAME_LENGTH, file_size_to_str
|
|
23
23
|
from epstein_files.util.env import args
|
|
24
24
|
from epstein_files.util.highlighted_group import styled_category
|
|
25
|
-
from epstein_files.util.rich import QUESTION_MARK_TXT,
|
|
25
|
+
from epstein_files.util.rich import QUESTION_MARK_TXT, build_table, highlighter
|
|
26
26
|
from epstein_files.util.logging import logger
|
|
27
27
|
|
|
28
|
+
FIRST_FEW_LINES = 'First Few Lines'
|
|
28
29
|
MAX_DAYS_SPANNED_TO_BE_VALID = 10
|
|
29
30
|
MAX_EXTRACTED_TIMESTAMPS = 100
|
|
30
31
|
MIN_TIMESTAMP = datetime(2000, 1, 1)
|
|
@@ -36,94 +37,62 @@ TIMESTAMP_LOG_INDENT = f'{LOG_INDENT} '
|
|
|
36
37
|
VAST_HOUSE = 'vast house' # Michael Wolff article draft about Epstein indicator
|
|
37
38
|
VI_DAILY_NEWS_REGEX = re.compile(r'virgin\s*is[kl][ai]nds\s*daily\s*news', re.IGNORECASE)
|
|
38
39
|
|
|
39
|
-
|
|
40
|
+
SKIP_TIMESTAMP_EXTRACT = [
|
|
41
|
+
PALM_BEACH_TSV,
|
|
42
|
+
PALM_BEACH_PROPERTY_INFO,
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
UNINTERESTING_CATEGORIES = [
|
|
46
|
+
ACADEMIA,
|
|
47
|
+
ARTICLE,
|
|
40
48
|
ARTS,
|
|
41
49
|
BOOK,
|
|
50
|
+
CONFERENCE,
|
|
42
51
|
JUNK,
|
|
52
|
+
POLITICS,
|
|
43
53
|
SKYPE_LOG,
|
|
44
|
-
SPEECH,
|
|
45
54
|
]
|
|
46
55
|
|
|
47
56
|
# OtherFiles whose descriptions/info match these prefixes are not displayed unless --all-other-files is used
|
|
48
|
-
UNINTERESTING_PREFIXES =
|
|
57
|
+
UNINTERESTING_PREFIXES = [
|
|
49
58
|
'article about',
|
|
50
|
-
ARTICLE_DRAFT,
|
|
51
|
-
'Aviation International',
|
|
52
|
-
BBC,
|
|
53
|
-
BLOOMBERG,
|
|
54
|
-
'Boston Globe',
|
|
55
59
|
BROCKMAN_INC,
|
|
56
|
-
CHINA_DAILY,
|
|
57
|
-
CNN,
|
|
58
|
-
'completely redacted',
|
|
59
60
|
CVRA,
|
|
60
|
-
DAILY_MAIL,
|
|
61
|
-
DAILY_TELEGRAPH,
|
|
62
|
-
CVRA_LEXIS_SEARCH[0:-12], # Because date at end :(
|
|
63
61
|
DERSH_GIUFFRE_TWEET,
|
|
64
|
-
'Financial Times',
|
|
65
|
-
'Forbes',
|
|
66
|
-
'Frontlines',
|
|
67
|
-
'Future Science',
|
|
68
|
-
'Globe and Mail',
|
|
69
62
|
GORDON_GETTY,
|
|
70
63
|
f"{HARVARD} Econ",
|
|
71
64
|
HARVARD_POETRY,
|
|
72
|
-
'Inference',
|
|
73
65
|
JASTA,
|
|
74
|
-
|
|
75
|
-
JOHN_BOLTON_PRESS_CLIPPING,
|
|
76
|
-
'Journal of Criminal',
|
|
77
|
-
LA_TIMES,
|
|
78
|
-
'Litigation Daily',
|
|
79
|
-
LAWRENCE_KRAUSS,
|
|
80
|
-
LAWRENCE_KRAUSS_ASU_ORIGINS,
|
|
81
|
-
'MarketWatch',
|
|
82
|
-
MARTIN_NOWAK,
|
|
83
|
-
'Morning News',
|
|
66
|
+
LEXIS_NEXIS,
|
|
84
67
|
NOBEL_CHARITABLE_TRUST,
|
|
85
|
-
'Nautilus',
|
|
86
|
-
'New Yorker',
|
|
87
|
-
NYT,
|
|
88
68
|
PALM_BEACH_CODE_ENFORCEMENT,
|
|
89
|
-
PALM_BEACH_DAILY_NEWS,
|
|
90
|
-
PALM_BEACH_POST,
|
|
91
69
|
PALM_BEACH_TSV,
|
|
92
70
|
PALM_BEACH_WATER_COMMITTEE,
|
|
93
|
-
PAUL_KRASSNER,
|
|
94
|
-
PEGGY_SIEGAL,
|
|
95
|
-
'Politifact',
|
|
96
|
-
'Rafanelli',
|
|
97
|
-
ROBERT_LAWRENCE_KUHN,
|
|
98
|
-
ROBERT_TRIVERS,
|
|
99
|
-
'SCMP',
|
|
100
|
-
'SciencExpress',
|
|
101
|
-
'Scowcroft',
|
|
102
|
-
SHIMON_POST_ARTICLE,
|
|
103
|
-
SINGLE_PAGE,
|
|
104
|
-
STACEY_PLASKETT,
|
|
105
|
-
'Tatler',
|
|
106
|
-
TERJE_ROD_LARSEN,
|
|
107
|
-
TEXT_OF_US_LAW,
|
|
108
|
-
TRANSLATION,
|
|
109
71
|
TWEET,
|
|
110
|
-
REAL_DEAL_ARTICLE,
|
|
111
|
-
TRUMP_DISCLOSURES,
|
|
112
|
-
UBS_CIO_REPORT,
|
|
113
72
|
UN_GENERAL_ASSEMBLY,
|
|
114
|
-
'U.S. News',
|
|
115
73
|
'US Office',
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
74
|
+
]
|
|
75
|
+
|
|
76
|
+
INTERESTING_AUTHORS = [
|
|
77
|
+
EDWARD_JAY_EPSTEIN,
|
|
78
|
+
EHUD_BARAK,
|
|
79
|
+
JOI_ITO,
|
|
80
|
+
NOAM_CHOMSKY,
|
|
81
|
+
MICHAEL_WOLFF,
|
|
82
|
+
SVETLANA_POZHIDAEVA,
|
|
119
83
|
]
|
|
120
84
|
|
|
121
85
|
|
|
122
86
|
@dataclass
|
|
123
87
|
class OtherFile(Document):
|
|
124
|
-
"""
|
|
88
|
+
"""
|
|
89
|
+
File that is not an email, an iMessage log, or JSON data.
|
|
125
90
|
|
|
126
|
-
|
|
91
|
+
Attributes:
|
|
92
|
+
was_timestamp_extracted (bool): True if the timestamp was programmatically extracted (and could be wrong)
|
|
93
|
+
"""
|
|
94
|
+
was_timestamp_extracted: bool = False
|
|
95
|
+
include_description_in_summary_panel: ClassVar[bool] = True # Class var for logging output
|
|
127
96
|
|
|
128
97
|
def __post_init__(self):
|
|
129
98
|
super().__post_init__()
|
|
@@ -162,11 +131,13 @@ class OtherFile(Document):
|
|
|
162
131
|
elif len(info_sentences) == 0:
|
|
163
132
|
return True
|
|
164
133
|
elif self.config:
|
|
165
|
-
if self.config.is_interesting:
|
|
134
|
+
if self.config.is_interesting is not None:
|
|
135
|
+
return self.config.is_interesting
|
|
136
|
+
elif self.config.author in INTERESTING_AUTHORS:
|
|
166
137
|
return True
|
|
167
138
|
elif self.category() == FINANCE and self.author is not None:
|
|
168
139
|
return False
|
|
169
|
-
elif self.category() in
|
|
140
|
+
elif self.category() in UNINTERESTING_CATEGORIES:
|
|
170
141
|
return False
|
|
171
142
|
|
|
172
143
|
for prefix in UNINTERESTING_PREFIXES:
|
|
@@ -178,6 +149,10 @@ class OtherFile(Document):
|
|
|
178
149
|
def metadata(self) -> Metadata:
|
|
179
150
|
metadata = super().metadata()
|
|
180
151
|
metadata['is_interesting'] = self.is_interesting()
|
|
152
|
+
|
|
153
|
+
if self.was_timestamp_extracted:
|
|
154
|
+
metadata['was_timestamp_extracted'] = self.was_timestamp_extracted
|
|
155
|
+
|
|
181
156
|
return metadata
|
|
182
157
|
|
|
183
158
|
def preview_text(self) -> str:
|
|
@@ -191,6 +166,8 @@ class OtherFile(Document):
|
|
|
191
166
|
"""Return configured timestamp or value extracted by scanning text with datefinder."""
|
|
192
167
|
if self.config and self.config.timestamp:
|
|
193
168
|
return self.config.timestamp
|
|
169
|
+
elif self.config and any([s in (self.config_description() or '') for s in SKIP_TIMESTAMP_EXTRACT]):
|
|
170
|
+
return None
|
|
194
171
|
|
|
195
172
|
timestamps: list[datetime] = []
|
|
196
173
|
|
|
@@ -214,7 +191,10 @@ class OtherFile(Document):
|
|
|
214
191
|
self.log_top_lines(15, msg=f"No timestamps found")
|
|
215
192
|
|
|
216
193
|
return None
|
|
217
|
-
|
|
194
|
+
|
|
195
|
+
self.was_timestamp_extracted = True
|
|
196
|
+
|
|
197
|
+
if len(timestamps) == 1:
|
|
218
198
|
return timestamps[0]
|
|
219
199
|
else:
|
|
220
200
|
timestamps = sorted(uniquify(timestamps), reverse=True)
|
|
@@ -222,7 +202,7 @@ class OtherFile(Document):
|
|
|
222
202
|
return timestamps[0] # Most recent timestamp appearing in text is usually the closest
|
|
223
203
|
|
|
224
204
|
def _log_extracted_timestamps_info(self, timestamps: list[datetime]) -> None:
|
|
225
|
-
num_days_spanned = (timestamps[
|
|
205
|
+
num_days_spanned = days_between(timestamps[-1], timestamps[0])
|
|
226
206
|
timestamps_log_msg = f"Extracted {len(timestamps)} timestamps spanning {num_days_spanned} days{TIMESTAMP_LOG_INDENT}"
|
|
227
207
|
timestamps_log_msg += TIMESTAMP_LOG_INDENT.join([str(dt) for dt in timestamps])
|
|
228
208
|
|
|
@@ -230,9 +210,39 @@ class OtherFile(Document):
|
|
|
230
210
|
self.log_top_lines(15, msg=timestamps_log_msg, level=logging.DEBUG)
|
|
231
211
|
|
|
232
212
|
@staticmethod
|
|
233
|
-
def
|
|
213
|
+
def count_by_category_table(files: Sequence['OtherFile']) -> Table:
|
|
214
|
+
counts = defaultdict(int)
|
|
215
|
+
category_bytes = defaultdict(int)
|
|
216
|
+
|
|
217
|
+
for file in files:
|
|
218
|
+
if file.category() is None:
|
|
219
|
+
logger.warning(f"file {file.file_id} has no category")
|
|
220
|
+
|
|
221
|
+
counts[file.category()] += 1
|
|
222
|
+
category_bytes[file.category()] += file.file_size()
|
|
223
|
+
|
|
224
|
+
table = build_table('Other Files Summary', ['Category', 'Count', 'Has Author', 'No Author', 'Size'])
|
|
225
|
+
table.columns[0].min_width = 14
|
|
226
|
+
table.columns[-1].style = 'dim'
|
|
227
|
+
|
|
228
|
+
for (category, count) in sort_dict(counts):
|
|
229
|
+
category_files = [f for f in files if f.category() == category]
|
|
230
|
+
known_author_count = Document.known_author_count(category_files)
|
|
231
|
+
|
|
232
|
+
table.add_row(
|
|
233
|
+
styled_category(category or UNKNOWN),
|
|
234
|
+
str(count),
|
|
235
|
+
str(known_author_count),
|
|
236
|
+
str(count - known_author_count),
|
|
237
|
+
file_size_to_str(category_bytes[category]),
|
|
238
|
+
)
|
|
239
|
+
|
|
240
|
+
return table
|
|
241
|
+
|
|
242
|
+
@staticmethod
|
|
243
|
+
def files_preview_table(files: Sequence['OtherFile']) -> Table:
|
|
234
244
|
"""Build a table of OtherFile documents."""
|
|
235
|
-
table = build_table(
|
|
245
|
+
table = build_table('Other Files Details', show_lines=True)
|
|
236
246
|
table.add_column('File', justify='center', width=FILENAME_LENGTH)
|
|
237
247
|
table.add_column('Date', justify='center')
|
|
238
248
|
table.add_column('Size', justify='center')
|
|
@@ -240,7 +250,7 @@ class OtherFile(Document):
|
|
|
240
250
|
table.add_column(FIRST_FEW_LINES, justify='left', style='pale_turquoise4')
|
|
241
251
|
|
|
242
252
|
for file in files:
|
|
243
|
-
link_and_info = [file.
|
|
253
|
+
link_and_info = [file.external_links_txt()]
|
|
244
254
|
date_str = file.date_str()
|
|
245
255
|
|
|
246
256
|
if file.is_duplicate():
|
|
@@ -261,33 +271,3 @@ class OtherFile(Document):
|
|
|
261
271
|
)
|
|
262
272
|
|
|
263
273
|
return table
|
|
264
|
-
|
|
265
|
-
@staticmethod
|
|
266
|
-
def count_by_category_table(files: Sequence['OtherFile']) -> Table:
|
|
267
|
-
counts = defaultdict(int)
|
|
268
|
-
category_bytes = defaultdict(int)
|
|
269
|
-
|
|
270
|
-
for file in files:
|
|
271
|
-
if file.category() is None:
|
|
272
|
-
logger.warning(f"file {file.file_id} has no category")
|
|
273
|
-
|
|
274
|
-
counts[file.category()] += 1
|
|
275
|
-
category_bytes[file.category()] += file.length
|
|
276
|
-
|
|
277
|
-
table = build_table('Other Files Summary')
|
|
278
|
-
add_cols_to_table(table, ['Category', 'Count', 'Has Author', 'No Author', 'Size'])
|
|
279
|
-
table.columns[-1].style = 'dim'
|
|
280
|
-
|
|
281
|
-
for (category, count) in sort_dict(counts):
|
|
282
|
-
category_files = [f for f in files if f.category() == category]
|
|
283
|
-
known_author_count = Document.known_author_count(category_files)
|
|
284
|
-
|
|
285
|
-
table.add_row(
|
|
286
|
-
styled_category(category or UNKNOWN),
|
|
287
|
-
str(count),
|
|
288
|
-
str(known_author_count),
|
|
289
|
-
str(count - known_author_count),
|
|
290
|
-
file_size_to_str(category_bytes[category]),
|
|
291
|
-
)
|
|
292
|
-
|
|
293
|
-
return table
|
epstein_files/epstein_files.py
CHANGED
|
@@ -23,14 +23,14 @@ from epstein_files.util.constant.strings import *
|
|
|
23
23
|
from epstein_files.util.constant.urls import (EPSTEIN_MEDIA, EPSTEIN_WEB, JMAIL, epstein_media_person_url,
|
|
24
24
|
epsteinify_name_url, epstein_web_person_url, search_jmail_url, search_twitter_url)
|
|
25
25
|
from epstein_files.util.constants import *
|
|
26
|
-
from epstein_files.util.data import
|
|
26
|
+
from epstein_files.util.data import days_between, dict_sets_to_lists, json_safe, listify, sort_dict
|
|
27
27
|
from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
28
28
|
from epstein_files.util.env import DOCS_DIR, args, logger
|
|
29
29
|
from epstein_files.util.file_helper import file_size_str
|
|
30
|
-
from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
|
|
30
|
+
from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames, get_info_for_name, get_style_for_name
|
|
31
31
|
from epstein_files.util.rich import (DEFAULT_NAME_STYLE, LAST_TIMESTAMP_STYLE, NA_TXT, add_cols_to_table,
|
|
32
|
-
build_table, console, highlighter, link_text_obj, link_markup, print_author_header, print_centered,
|
|
33
|
-
|
|
32
|
+
print_all_files_page_link, build_table, console, highlighter, link_text_obj, link_markup, print_author_header, print_centered,
|
|
33
|
+
print_panel, print_section_header, vertically_pad)
|
|
34
34
|
from epstein_files.util.search_result import SearchResult
|
|
35
35
|
from epstein_files.util.timer import Timer
|
|
36
36
|
|
|
@@ -72,18 +72,18 @@ class EpsteinFiles:
|
|
|
72
72
|
|
|
73
73
|
# Read through and classify all the files
|
|
74
74
|
for file_arg in self.all_files:
|
|
75
|
-
doc_timer = Timer(decimals=
|
|
75
|
+
doc_timer = Timer(decimals=2)
|
|
76
76
|
document = Document(file_arg)
|
|
77
77
|
cls = document_cls(document)
|
|
78
78
|
|
|
79
|
-
if document.length == 0:
|
|
79
|
+
if document.length() == 0:
|
|
80
80
|
logger.warning(f"Skipping empty file: {document}]")
|
|
81
81
|
continue
|
|
82
82
|
elif args.skip_other_files and cls == OtherFile and file_type_count[cls.__name__] > 1:
|
|
83
|
-
|
|
83
|
+
document.log(f"Skipping OtherFile...")
|
|
84
84
|
continue
|
|
85
85
|
|
|
86
|
-
documents.append(cls(file_arg, text=document.text))
|
|
86
|
+
documents.append(cls(file_arg, lines=document.lines, text=document.text))
|
|
87
87
|
logger.info(str(documents[-1]))
|
|
88
88
|
file_type_count[cls.__name__] += 1
|
|
89
89
|
|
|
@@ -104,16 +104,20 @@ class EpsteinFiles:
|
|
|
104
104
|
if PICKLED_PATH.exists() and not args.overwrite_pickle:
|
|
105
105
|
with gzip.open(PICKLED_PATH, 'rb') as file:
|
|
106
106
|
epstein_files = pickle.load(file)
|
|
107
|
-
timer.print_at_checkpoint(f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})")
|
|
108
107
|
epstein_files.timer = timer
|
|
108
|
+
timer_msg = f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}'"
|
|
109
|
+
epstein_files.timer.print_at_checkpoint(f"{timer_msg} ({file_size_str(PICKLED_PATH)})")
|
|
109
110
|
return epstein_files
|
|
110
111
|
|
|
111
112
|
logger.warning(f"Building new cache file, this will take a few minutes...")
|
|
112
113
|
epstein_files = EpsteinFiles(timer=timer)
|
|
113
114
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
115
|
+
if args.skip_other_files:
|
|
116
|
+
logger.warning(f"Not writing pickled data because --skip-other-files")
|
|
117
|
+
else:
|
|
118
|
+
with gzip.open(PICKLED_PATH, 'wb') as file:
|
|
119
|
+
pickle.dump(epstein_files, file)
|
|
120
|
+
logger.warning(f"Pickled data to '{PICKLED_PATH}' ({file_size_str(PICKLED_PATH)})...")
|
|
117
121
|
|
|
118
122
|
timer.print_at_checkpoint(f'Processed {len(epstein_files.all_files):,} documents')
|
|
119
123
|
return epstein_files
|
|
@@ -127,9 +131,6 @@ class EpsteinFiles:
|
|
|
127
131
|
names = names if include_useless else [e for e in names if e is None or e.lower() not in EXCLUDED_EMAILERS]
|
|
128
132
|
return sorted(list(set(names)), key=lambda e: self.email_author_counts[e] + self.email_recipient_counts[e])
|
|
129
133
|
|
|
130
|
-
def attributed_email_count(self) -> int:
|
|
131
|
-
return sum([i for author, i in self.email_author_counts.items() if author != UNKNOWN])
|
|
132
|
-
|
|
133
134
|
def docs_matching(
|
|
134
135
|
self,
|
|
135
136
|
pattern: re.Pattern | str,
|
|
@@ -156,7 +157,7 @@ class EpsteinFiles:
|
|
|
156
157
|
return self.emails_for(author)[-1].timestamp
|
|
157
158
|
|
|
158
159
|
def email_conversation_length_in_days(self, author: str | None) -> int:
|
|
159
|
-
return (self.
|
|
160
|
+
return days_between(self.earliest_email_at(author), self.last_email_at(author))
|
|
160
161
|
|
|
161
162
|
def email_signature_substitution_counts(self) -> dict[str, int]:
|
|
162
163
|
"""Return the number of times an email signature was replaced with "<...snipped...>" for each author."""
|
|
@@ -172,7 +173,7 @@ class EpsteinFiles:
|
|
|
172
173
|
return sorted(list(self.unknown_recipient_email_ids))
|
|
173
174
|
|
|
174
175
|
def emails_by(self, author: str | None) -> list[Email]:
|
|
175
|
-
return [e for e in self.emails if e.author == author]
|
|
176
|
+
return Document.sort_by_timestamp([e for e in self.emails if e.author == author])
|
|
176
177
|
|
|
177
178
|
def emails_for(self, author: str | None) -> list[Email]:
|
|
178
179
|
"""Returns emails to or from a given 'author' sorted chronologically."""
|
|
@@ -185,9 +186,11 @@ class EpsteinFiles:
|
|
|
185
186
|
|
|
186
187
|
def emails_to(self, author: str | None) -> list[Email]:
|
|
187
188
|
if author is None:
|
|
188
|
-
|
|
189
|
+
emails = [e for e in self.emails if len(e.recipients) == 0 or None in e.recipients]
|
|
189
190
|
else:
|
|
190
|
-
|
|
191
|
+
emails = [e for e in self.emails if author in e.recipients]
|
|
192
|
+
|
|
193
|
+
return Document.sort_by_timestamp(emails)
|
|
191
194
|
|
|
192
195
|
def get_documents_by_id(self, file_ids: str | list[str]) -> list[Document]:
|
|
193
196
|
file_ids = listify(file_ids)
|
|
@@ -198,20 +201,29 @@ class EpsteinFiles:
|
|
|
198
201
|
|
|
199
202
|
return docs
|
|
200
203
|
|
|
201
|
-
def imessage_logs_for(self, author: str | None | list[str | None]) -> Sequence[MessengerLog]:
|
|
202
|
-
return MessengerLog.logs_for(author, self.imessage_logs)
|
|
203
|
-
|
|
204
204
|
def json_metadata(self) -> str:
|
|
205
205
|
"""Create a JSON string containing metadata for all the files."""
|
|
206
206
|
metadata = {
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
207
|
+
'files': {
|
|
208
|
+
Email.__name__: _sorted_metadata(self.emails),
|
|
209
|
+
JsonFile.__name__: _sorted_metadata(self.json_files),
|
|
210
|
+
MessengerLog.__name__: _sorted_metadata(self.imessage_logs),
|
|
211
|
+
OtherFile.__name__: _sorted_metadata(self.non_json_other_files()),
|
|
212
|
+
},
|
|
213
|
+
'people': {
|
|
214
|
+
name: highlighted_group.get_info(name)
|
|
215
|
+
for highlighted_group in HIGHLIGHTED_NAMES
|
|
216
|
+
if isinstance(highlighted_group, HighlightedNames)
|
|
217
|
+
for name, description in highlighted_group.emailers.items()
|
|
218
|
+
if description
|
|
219
|
+
}
|
|
211
220
|
}
|
|
212
221
|
|
|
213
222
|
return json.dumps(metadata, indent=4, sort_keys=True)
|
|
214
223
|
|
|
224
|
+
def non_duplicate_emails(self) -> list[Email]:
|
|
225
|
+
return [email for email in self.emails if not email.is_duplicate()]
|
|
226
|
+
|
|
215
227
|
def non_json_other_files(self) -> list[OtherFile]:
|
|
216
228
|
return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]
|
|
217
229
|
|
|
@@ -230,8 +242,8 @@ class EpsteinFiles:
|
|
|
230
242
|
f"{len([d for d in docs if d.is_duplicate()])}",
|
|
231
243
|
)
|
|
232
244
|
|
|
233
|
-
add_row('iMessage Logs', self.imessage_logs)
|
|
234
245
|
add_row('Emails', self.emails)
|
|
246
|
+
add_row('iMessage Logs', self.imessage_logs)
|
|
235
247
|
add_row('JSON Data', self.json_files)
|
|
236
248
|
add_row('Other', self.non_json_other_files())
|
|
237
249
|
console.print(Align.center(table))
|
|
@@ -271,12 +283,13 @@ class EpsteinFiles:
|
|
|
271
283
|
console.print(Align.center(Email.build_table(emails, author)), '\n')
|
|
272
284
|
|
|
273
285
|
def print_email_device_info(self) -> None:
|
|
274
|
-
print_panel(f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown", padding=(
|
|
286
|
+
print_panel(f"Email [italic]Sent from \\[DEVICE][/italic] Signature Breakdown", padding=(2, 0, 0, 0), centered=True)
|
|
275
287
|
console.print(_build_signature_table(self.email_authors_to_device_signatures, (AUTHOR, DEVICE_SIGNATURE)))
|
|
276
288
|
console.print(_build_signature_table(self.email_device_signatures_to_authors, (DEVICE_SIGNATURE, AUTHOR), ', '))
|
|
277
289
|
|
|
278
|
-
def
|
|
279
|
-
|
|
290
|
+
def table_of_emailers(self) -> Table:
|
|
291
|
+
attributed_emails = [e for e in self.non_duplicate_emails() if e.author]
|
|
292
|
+
footer = f"Identified authors of {len(attributed_emails):,} out of {len(self.non_duplicate_emails()):,} emails."
|
|
280
293
|
counts_table = build_table("Email Counts", caption=footer)
|
|
281
294
|
|
|
282
295
|
add_cols_to_table(counts_table, [
|
|
@@ -308,49 +321,17 @@ class EpsteinFiles:
|
|
|
308
321
|
str(self.email_recipient_counts[name]),
|
|
309
322
|
emails[0].timestamp_without_seconds(),
|
|
310
323
|
emails[-1].timestamp_without_seconds(),
|
|
311
|
-
|
|
312
|
-
'' if
|
|
313
|
-
'' if
|
|
314
|
-
|
|
324
|
+
link_text_obj(search_jmail_url(name), JMAIL) if name else '',
|
|
325
|
+
link_text_obj(epstein_media_person_url(name), 'eMedia') if is_ok_for_epstein_web(name) else '',
|
|
326
|
+
link_text_obj(epstein_web_person_url(name), 'eWeb') if is_ok_for_epstein_web(name) else '',
|
|
327
|
+
link_text_obj(search_twitter_url(name), 'search X') if name else '',
|
|
315
328
|
)
|
|
316
329
|
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
def print_imessage_summary(self) -> None:
|
|
320
|
-
"""Print summary table and stats for text messages."""
|
|
321
|
-
console.print(MessengerLog.summary_table(self.imessage_logs))
|
|
322
|
-
text_summary_msg = f"\nDeanonymized {Document.known_author_count(self.imessage_logs)} of "
|
|
323
|
-
text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files):,} files."
|
|
324
|
-
console.print(text_summary_msg)
|
|
325
|
-
imessage_msg_count = sum([len(log.messages) for log in self.imessage_logs])
|
|
326
|
-
console.print(f"Found {imessage_msg_count} text messages in {len(self.imessage_logs)} iMessage log files.")
|
|
327
|
-
|
|
328
|
-
def print_other_files_table(self) -> list[OtherFile]:
|
|
329
|
-
"""Returns the OtherFile objects that were interesting enough to print."""
|
|
330
|
-
interesting_files = [doc for doc in self.other_files if args.all_other_files or doc.is_interesting()]
|
|
331
|
-
header_pfx = '' if args.all_other_files else 'Selected '
|
|
332
|
-
print_section_header(f"{FIRST_FEW_LINES} of {len(interesting_files)} {header_pfx}Files That Are Neither Emails Nor Text Msgs")
|
|
333
|
-
|
|
334
|
-
if not args.all_other_files:
|
|
335
|
-
print_centered(f"(the other site is uncurated and has all {len(self.other_files)} unclassifiable files and {len(self.emails):,} emails)", style='dim')
|
|
336
|
-
print_other_site_link(False)
|
|
337
|
-
console.line(2)
|
|
338
|
-
|
|
339
|
-
console.print(OtherFile.build_table(interesting_files))
|
|
340
|
-
console.print(Padding(OtherFile.count_by_category_table(interesting_files), (2, 0, 2, 2)))
|
|
341
|
-
skipped_file_count = len(self.other_files) - len(interesting_files)
|
|
342
|
-
|
|
343
|
-
if skipped_file_count > 0:
|
|
344
|
-
logger.warning(f"Skipped {skipped_file_count} uninteresting other files...")
|
|
345
|
-
|
|
346
|
-
return interesting_files
|
|
330
|
+
return counts_table
|
|
347
331
|
|
|
348
332
|
def _tally_email_data(self) -> None:
|
|
349
333
|
"""Tally up summary info about Email objects."""
|
|
350
|
-
for email in self.
|
|
351
|
-
if email.is_duplicate():
|
|
352
|
-
continue
|
|
353
|
-
|
|
334
|
+
for email in self.non_duplicate_emails():
|
|
354
335
|
self.email_author_counts[email.author] += 1
|
|
355
336
|
|
|
356
337
|
if len(email.recipients) == 0:
|
|
@@ -380,7 +361,7 @@ def count_by_month(docs: Sequence[Document]) -> dict[str | None, int]:
|
|
|
380
361
|
def document_cls(doc: Document) -> Type[Document]:
|
|
381
362
|
search_area = doc.text[0:5000] # Limit search area to avoid pointless scans of huge files
|
|
382
363
|
|
|
383
|
-
if doc.length == 0:
|
|
364
|
+
if doc.length() == 0:
|
|
384
365
|
return Document
|
|
385
366
|
if doc.text[0] == '{':
|
|
386
367
|
return JsonFile
|
|
@@ -187,9 +187,11 @@ VIRGINIA_GIUFFRE = 'Virginia Giuffre'
|
|
|
187
187
|
|
|
188
188
|
# Organizations
|
|
189
189
|
BOFA = 'BofA'
|
|
190
|
+
BOFA_MERRILL = f'{BOFA} / Merrill Lynch'
|
|
190
191
|
CNN = 'CNN'
|
|
191
192
|
DEUTSCHE_BANK = 'Deutsche Bank'
|
|
192
193
|
ELECTRON_CAPITAL_PARTNERS = 'Electron Capital Partners'
|
|
194
|
+
EPSTEIN_FOUNDATION = 'Jeffrey Epstein VI Foundation'
|
|
193
195
|
GOLDMAN_SACHS = 'Goldman Sachs'
|
|
194
196
|
GOLDMAN_INVESTMENT_MGMT = f'{GOLDMAN_SACHS} Investment Management Division'
|
|
195
197
|
HARVARD = 'Harvard'
|
|
@@ -238,7 +240,7 @@ OTHER_NAMES = NAMES_TO_NOT_HIGHLIGHT + """
|
|
|
238
240
|
ian isaac isaacson
|
|
239
241
|
james jamie jane janet jason jen jim joe johnson jones josh julie justin
|
|
240
242
|
karl kate kathy kelly kim kruger kyle
|
|
241
|
-
laurie leo leonard lenny leslie lieberman louis lynch lynn
|
|
243
|
+
laurie lawrence leo leonard lenny leslie lieberman louis lynch lynn
|
|
242
244
|
marcus marianne matt matthew melissa michele michelle moore moscowitz
|
|
243
245
|
nancy nicole nussbaum
|
|
244
246
|
owen
|