epstein-files 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- epstein_files/__init__.py +4 -1
- epstein_files/documents/document.py +7 -2
- epstein_files/documents/email.py +33 -13
- epstein_files/documents/imessage/text_message.py +11 -15
- epstein_files/documents/messenger_log.py +15 -11
- epstein_files/documents/other_file.py +13 -8
- epstein_files/epstein_files.py +21 -15
- epstein_files/util/constant/names.py +19 -23
- epstein_files/util/constant/strings.py +8 -2
- epstein_files/util/constant/urls.py +1 -0
- epstein_files/util/constants.py +194 -116
- epstein_files/util/data.py +1 -1
- epstein_files/util/doc_cfg.py +5 -4
- epstein_files/util/env.py +3 -2
- epstein_files/util/highlighted_group.py +30 -25
- epstein_files/util/logging.py +1 -0
- epstein_files/util/output.py +8 -9
- epstein_files/util/rich.py +6 -1
- {epstein_files-1.0.2.dist-info → epstein_files-1.0.4.dist-info}/METADATA +18 -8
- epstein_files-1.0.4.dist-info/RECORD +33 -0
- epstein_files-1.0.2.dist-info/RECORD +0 -33
- {epstein_files-1.0.2.dist-info → epstein_files-1.0.4.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.2.dist-info → epstein_files-1.0.4.dist-info}/WHEEL +0 -0
- {epstein_files-1.0.2.dist-info → epstein_files-1.0.4.dist-info}/entry_points.txt +0 -0
epstein_files/__init__.py
CHANGED
|
@@ -4,7 +4,7 @@ Reformat Epstein text message files for readability and count email senders.
|
|
|
4
4
|
For use with iMessage log files from https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_
|
|
5
5
|
|
|
6
6
|
Install: 'poetry install'
|
|
7
|
-
Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT
|
|
7
|
+
Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT epstein_generate'
|
|
8
8
|
"""
|
|
9
9
|
from sys import exit
|
|
10
10
|
|
|
@@ -87,6 +87,9 @@ def epstein_search():
|
|
|
87
87
|
console.line()
|
|
88
88
|
|
|
89
89
|
if args.whole_file:
|
|
90
|
+
if isinstance(search_result.document, Email):
|
|
91
|
+
search_result.document.truncation_allowed = False
|
|
92
|
+
|
|
90
93
|
console.print(search_result.document)
|
|
91
94
|
else:
|
|
92
95
|
console.print(search_result.document.description_panel())
|
|
@@ -15,7 +15,7 @@ from epstein_files.util.constant.names import *
|
|
|
15
15
|
from epstein_files.util.constant.strings import *
|
|
16
16
|
from epstein_files.util.constant.urls import *
|
|
17
17
|
from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
|
|
18
|
-
from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize,
|
|
18
|
+
from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_falsey
|
|
19
19
|
from epstein_files.util.doc_cfg import EmailCfg, DocCfg, Metadata, TextCfg
|
|
20
20
|
from epstein_files.util.env import args
|
|
21
21
|
from epstein_files.util.file_helper import (DOCS_DIR, file_stem_for_id, extract_file_id, file_size,
|
|
@@ -159,7 +159,7 @@ class Document:
|
|
|
159
159
|
if hint_msg:
|
|
160
160
|
hints.append(highlighter(Text(hint_msg, style='white dim italic')))
|
|
161
161
|
|
|
162
|
-
return
|
|
162
|
+
return without_falsey(hints)
|
|
163
163
|
|
|
164
164
|
def info_txt(self) -> Text | None:
|
|
165
165
|
"""Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
|
|
@@ -356,6 +356,11 @@ class Document:
|
|
|
356
356
|
for f in tmpfiles:
|
|
357
357
|
f.unlink()
|
|
358
358
|
|
|
359
|
+
@staticmethod
|
|
360
|
+
def known_author_count(docs: Sequence['Document']) -> int:
|
|
361
|
+
"""Count of how many Document objects have an author attribution."""
|
|
362
|
+
return len([doc for doc in docs if doc.author])
|
|
363
|
+
|
|
359
364
|
@staticmethod
|
|
360
365
|
def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
|
|
361
366
|
return sorted(docs, key=lambda doc: doc.sort_key())
|
epstein_files/documents/email.py
CHANGED
|
@@ -30,7 +30,6 @@ BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communicati
|
|
|
30
30
|
DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
|
|
31
31
|
LINK_LINE_REGEX = re.compile(f"^(> )?htt")
|
|
32
32
|
QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
|
|
33
|
-
REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + ['********************************']
|
|
34
33
|
REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
|
|
35
34
|
|
|
36
35
|
BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
|
|
@@ -39,10 +38,16 @@ TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")
|
|
|
39
38
|
|
|
40
39
|
SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
|
|
41
40
|
REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
|
|
41
|
+
IS_JUNK_MAIL = 'is_junk_mail'
|
|
42
42
|
MAX_CHARS_TO_PRINT = 4000
|
|
43
43
|
MAX_NUM_HEADER_LINES = 14
|
|
44
44
|
MAX_QUOTED_REPLIES = 2
|
|
45
45
|
|
|
46
|
+
REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
|
|
47
|
+
'********************************',
|
|
48
|
+
'Begin forwarded message',
|
|
49
|
+
]
|
|
50
|
+
|
|
46
51
|
OCR_REPAIRS: dict[str | re.Pattern, str] = {
|
|
47
52
|
re.compile(r'grnail\.com'): 'gmail.com',
|
|
48
53
|
re.compile(r"^(From|To)(: )?[_1.]{5,}", re.MULTILINE): rf"\1: {REDACTED}", # Redacted email addresses
|
|
@@ -119,6 +124,7 @@ EMAIL_SIGNATURE_REGEXES = {
|
|
|
119
124
|
# Invalid for links to EpsteinWeb
|
|
120
125
|
JUNK_EMAILERS = [
|
|
121
126
|
'asmallworld@travel.asmallworld.net',
|
|
127
|
+
"digest-noreply@quora.com",
|
|
122
128
|
'editorialstaff@flipboard.com',
|
|
123
129
|
'How To Academy',
|
|
124
130
|
'Jokeland',
|
|
@@ -126,9 +132,13 @@ JUNK_EMAILERS = [
|
|
|
126
132
|
'Saved by Internet Explorer 11',
|
|
127
133
|
]
|
|
128
134
|
|
|
129
|
-
|
|
130
|
-
|
|
135
|
+
MAILING_LISTS = [
|
|
136
|
+
INTELLIGENCE_SQUARED,
|
|
131
137
|
'middle.east.update@hotmail.com',
|
|
138
|
+
]
|
|
139
|
+
|
|
140
|
+
TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
|
|
141
|
+
'Alan S Halperin',
|
|
132
142
|
'Mitchell Bard',
|
|
133
143
|
'Skip Rimer',
|
|
134
144
|
]
|
|
@@ -281,7 +291,7 @@ SELF_EMAILS_FILE_IDS = [
|
|
|
281
291
|
]
|
|
282
292
|
|
|
283
293
|
METADATA_FIELDS = [
|
|
284
|
-
|
|
294
|
+
IS_JUNK_MAIL,
|
|
285
295
|
'recipients',
|
|
286
296
|
'sent_from_device',
|
|
287
297
|
]
|
|
@@ -294,7 +304,6 @@ class Email(Communication):
|
|
|
294
304
|
actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
|
|
295
305
|
config (EmailCfg | None) - manual config for this email (if it exists)
|
|
296
306
|
header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
|
|
297
|
-
is_junk_mail (bool) - True if this is junk mail
|
|
298
307
|
recipients (list[str | None]) - who this email was sent to
|
|
299
308
|
sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
|
|
300
309
|
signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
|
|
@@ -302,17 +311,16 @@ class Email(Communication):
|
|
|
302
311
|
actual_text: str = field(init=False)
|
|
303
312
|
config: EmailCfg | None = None
|
|
304
313
|
header: EmailHeader = field(init=False)
|
|
305
|
-
is_junk_mail: bool = False
|
|
306
314
|
recipients: list[str | None] = field(default_factory=list)
|
|
307
315
|
sent_from_device: str | None = None
|
|
308
316
|
signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
|
|
317
|
+
truncation_allowed: bool = True
|
|
309
318
|
|
|
310
319
|
# For logging how many headers we prettified while printing, kind of janky
|
|
311
320
|
rewritten_header_ids: ClassVar[set[str]] = set([])
|
|
312
321
|
|
|
313
322
|
def __post_init__(self):
|
|
314
323
|
super().__post_init__()
|
|
315
|
-
self.is_junk_mail = self.author in JUNK_EMAILERS
|
|
316
324
|
|
|
317
325
|
if self.config and self.config.recipients:
|
|
318
326
|
self.recipients = cast(list[str | None], self.config.recipients)
|
|
@@ -331,9 +339,17 @@ class Email(Communication):
|
|
|
331
339
|
txt = Text("OCR text of email from ", style='grey46').append(self.author_txt).append(' to ')
|
|
332
340
|
return txt.append(self._recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))
|
|
333
341
|
|
|
342
|
+
def is_fwded_article(self) -> bool:
|
|
343
|
+
return bool(self.config and self.config.is_fwded_article)
|
|
344
|
+
|
|
345
|
+
def is_junk_mail(self) -> bool:
|
|
346
|
+
return self.author in JUNK_EMAILERS or self.author in MAILING_LISTS
|
|
347
|
+
|
|
334
348
|
def metadata(self) -> Metadata:
|
|
349
|
+
local_metadata = asdict(self)
|
|
350
|
+
local_metadata[IS_JUNK_MAIL] = self.is_junk_mail()
|
|
335
351
|
metadata = super().metadata()
|
|
336
|
-
metadata.update({k: v for k, v in
|
|
352
|
+
metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
|
|
337
353
|
return metadata
|
|
338
354
|
|
|
339
355
|
def subject(self) -> str:
|
|
@@ -352,17 +368,18 @@ class Email(Communication):
|
|
|
352
368
|
"""The text that comes before likely quoted replies and forwards etc."""
|
|
353
369
|
if self.config and self.config.actual_text is not None:
|
|
354
370
|
return self.config.actual_text
|
|
371
|
+
|
|
372
|
+
text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
|
|
373
|
+
|
|
374
|
+
if self.config and self.config.fwded_text_after:
|
|
375
|
+
return text.split(self.config.fwded_text_after)[0].strip()
|
|
355
376
|
elif self.header.num_header_rows == 0:
|
|
356
377
|
return self.text
|
|
357
378
|
|
|
358
|
-
text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
|
|
359
379
|
reply_text_match = REPLY_TEXT_REGEX.search(text)
|
|
360
380
|
# logger.info(f"Raw text:\n" + self.top_lines(20) + '\n\n')
|
|
361
381
|
# logger.info(f"With header removed:\n" + text[0:500] + '\n\n')
|
|
362
382
|
|
|
363
|
-
if self.file_id in ['024624']: # This email starts with "On September 14th"
|
|
364
|
-
return text.split('On Tue, May 14')[0].strip()
|
|
365
|
-
|
|
366
383
|
if reply_text_match:
|
|
367
384
|
actual_num_chars = len(reply_text_match.group(1))
|
|
368
385
|
actual_text_pct = f"{(100 * float(actual_num_chars) / len(text)):.1f}%"
|
|
@@ -555,6 +572,9 @@ class Email(Communication):
|
|
|
555
572
|
self._merge_lines(3, 5)
|
|
556
573
|
elif self.file_id == '028931':
|
|
557
574
|
self._merge_lines(3, 6)
|
|
575
|
+
elif self.file_id == '013415':
|
|
576
|
+
for _i in range(2):
|
|
577
|
+
self._merge_lines(4)
|
|
558
578
|
elif self.file_id in ['033568']:
|
|
559
579
|
for _i in range(5):
|
|
560
580
|
self._merge_lines(5)
|
|
@@ -637,7 +657,7 @@ class Email(Communication):
|
|
|
637
657
|
num_chars = quote_cutoff
|
|
638
658
|
|
|
639
659
|
# Truncate long emails but leave a note explaining what happened w/link to source document
|
|
640
|
-
if len(text) > num_chars:
|
|
660
|
+
if len(text) > num_chars and self.truncation_allowed:
|
|
641
661
|
text = text[0:num_chars]
|
|
642
662
|
doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
|
|
643
663
|
trim_note = f"<...trimmed to {num_chars} characters of {self.length}, read the rest at {doc_link_markup}...>"
|
|
@@ -4,7 +4,7 @@ from datetime import datetime
|
|
|
4
4
|
|
|
5
5
|
from rich.text import Text
|
|
6
6
|
|
|
7
|
-
from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, STEVE_BANNON, UNKNOWN
|
|
7
|
+
from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, CELINA_DUBIN, EVA, STEVE_BANNON, UNKNOWN
|
|
8
8
|
from epstein_files.util.data import extract_last_name
|
|
9
9
|
from epstein_files.util.highlighted_group import get_style_for_name
|
|
10
10
|
from epstein_files.util.logging import logger
|
|
@@ -19,17 +19,18 @@ DISPLAY_LAST_NAME_ONLY = [
|
|
|
19
19
|
STEVE_BANNON,
|
|
20
20
|
]
|
|
21
21
|
|
|
22
|
-
|
|
23
|
-
'+
|
|
24
|
-
'+
|
|
25
|
-
'+
|
|
26
|
-
|
|
22
|
+
PHONE_NUMBER_MAPPING = {
|
|
23
|
+
'+19174393646': ANTHONY_SCARAMUCCI,
|
|
24
|
+
'+13109906526': STEVE_BANNON,
|
|
25
|
+
'+16463880059': EVA,
|
|
26
|
+
'+13108737937': CELINA_DUBIN,
|
|
27
|
+
'+13108802851': STEVE_BANNON,
|
|
28
|
+
|
|
29
|
+
}
|
|
27
30
|
|
|
28
31
|
TEXTER_MAPPING = {
|
|
29
32
|
'e:': JEFFREY_EPSTEIN,
|
|
30
33
|
'e:jeeitunes@gmail.com': JEFFREY_EPSTEIN,
|
|
31
|
-
'+19174393646': ANTHONY_SCARAMUCCI,
|
|
32
|
-
'+13109906526': STEVE_BANNON,
|
|
33
34
|
}
|
|
34
35
|
|
|
35
36
|
|
|
@@ -37,7 +38,7 @@ TEXTER_MAPPING = {
|
|
|
37
38
|
class TextMessage:
|
|
38
39
|
"""Class representing a single iMessage text message."""
|
|
39
40
|
author: str | None
|
|
40
|
-
author_str: str =
|
|
41
|
+
author_str: str | None = None
|
|
41
42
|
id_confirmed: bool = False
|
|
42
43
|
text: str
|
|
43
44
|
timestamp_str: str
|
|
@@ -47,14 +48,10 @@ class TextMessage:
|
|
|
47
48
|
|
|
48
49
|
if self.author is None:
|
|
49
50
|
self.author_str = UNKNOWN
|
|
50
|
-
elif self.author in UNKNOWN_TEXTERS:
|
|
51
|
-
logger.warning(f"Bad text from '{self.author}': \"{self.text}\"")
|
|
52
|
-
self.author_str = self.author
|
|
53
|
-
self.author = None # TODO: this shouldn't be happening; we still know the author...
|
|
54
51
|
elif self.author in DISPLAY_LAST_NAME_ONLY:
|
|
55
52
|
self.author_str = extract_last_name(self.author)
|
|
56
53
|
else:
|
|
57
|
-
self.author_str = self.author
|
|
54
|
+
self.author_str = self.author_str or self.author
|
|
58
55
|
|
|
59
56
|
if not self.id_confirmed and self.author is not None and self.author != JEFFREY_EPSTEIN:
|
|
60
57
|
self.author_str = self.author + ' (?)'
|
|
@@ -87,7 +84,6 @@ class TextMessage:
|
|
|
87
84
|
return msg_txt
|
|
88
85
|
|
|
89
86
|
def __rich__(self) -> Text:
|
|
90
|
-
# TODO: Workaround for phone numbers that sucks
|
|
91
87
|
author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
|
|
92
88
|
author_txt = Text(self.author_str, style=author_style)
|
|
93
89
|
timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_STYLE).append(' ')
|
|
@@ -44,17 +44,8 @@ class MessengerLog(Communication):
|
|
|
44
44
|
|
|
45
45
|
def messages(self) -> list[TextMessage]:
|
|
46
46
|
"""Lazily evaluated accessor for self._messages."""
|
|
47
|
-
if
|
|
48
|
-
self._messages = [
|
|
49
|
-
TextMessage(
|
|
50
|
-
# If the Sender: is redacted that means it's from self.author
|
|
51
|
-
author=REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip()) or self.author,
|
|
52
|
-
id_confirmed=not self.is_attribution_uncertain(),
|
|
53
|
-
text=match.group(4).strip(),
|
|
54
|
-
timestamp_str=match.group(2).strip(),
|
|
55
|
-
)
|
|
56
|
-
for match in MSG_REGEX.finditer(self.text)
|
|
57
|
-
]
|
|
47
|
+
if not self._messages:
|
|
48
|
+
self._messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
|
|
58
49
|
|
|
59
50
|
return self._messages
|
|
60
51
|
|
|
@@ -70,6 +61,19 @@ class MessengerLog(Communication):
|
|
|
70
61
|
def _border_style(self) -> str:
|
|
71
62
|
return self.author_style
|
|
72
63
|
|
|
64
|
+
def _build_message(self, match: re.Match) -> TextMessage:
|
|
65
|
+
"""Turn a regex match into a TextMessage."""
|
|
66
|
+
author_str = REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip())
|
|
67
|
+
|
|
68
|
+
# If the Sender: is redacted that means it's from self.author
|
|
69
|
+
return TextMessage(
|
|
70
|
+
author=self.author if (author_str.startswith('+') or not author_str) else author_str,
|
|
71
|
+
author_str=author_str if author_str.startswith('+') else None, # Preserve phone numbers
|
|
72
|
+
id_confirmed=not self.is_attribution_uncertain(),
|
|
73
|
+
text=match.group(4).strip(),
|
|
74
|
+
timestamp_str=match.group(2).strip(),
|
|
75
|
+
)
|
|
76
|
+
|
|
73
77
|
def _extract_timestamp(self) -> datetime:
|
|
74
78
|
for match in MSG_REGEX.finditer(self.text):
|
|
75
79
|
timestamp_str = match.group(2).strip()
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import logging
|
|
3
3
|
import warnings
|
|
4
|
-
from dataclasses import dataclass
|
|
4
|
+
from dataclasses import asdict, dataclass
|
|
5
5
|
from datetime import datetime
|
|
6
6
|
|
|
7
7
|
import datefinder
|
|
@@ -15,7 +15,7 @@ from rich.text import Text
|
|
|
15
15
|
from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, WHITESPACE_REGEX, Document
|
|
16
16
|
from epstein_files.util.constant.strings import *
|
|
17
17
|
from epstein_files.util.constants import *
|
|
18
|
-
from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg
|
|
18
|
+
from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg, Metadata
|
|
19
19
|
from epstein_files.util.data import escape_single_quotes, remove_timezone, uniquify
|
|
20
20
|
from epstein_files.util.file_helper import FILENAME_LENGTH
|
|
21
21
|
from epstein_files.util.env import args
|
|
@@ -83,11 +83,10 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
|
|
|
83
83
|
NOBEL_CHARITABLE_TRUST,
|
|
84
84
|
'Nautilus',
|
|
85
85
|
'New Yorker',
|
|
86
|
-
|
|
87
|
-
NYT_COLUMN,
|
|
86
|
+
NYT,
|
|
88
87
|
PALM_BEACH_CODE_ENFORCEMENT,
|
|
89
|
-
|
|
90
|
-
|
|
88
|
+
PALM_BEACH_DAILY_NEWS,
|
|
89
|
+
PALM_BEACH_POST,
|
|
91
90
|
PALM_BEACH_TSV,
|
|
92
91
|
PALM_BEACH_WATER_COMMITTEE,
|
|
93
92
|
PAUL_KRASSNER,
|
|
@@ -102,6 +101,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
|
|
|
102
101
|
SHIMON_POST_ARTICLE,
|
|
103
102
|
SINGLE_PAGE,
|
|
104
103
|
STACEY_PLASKETT,
|
|
104
|
+
'Tatler',
|
|
105
105
|
TERJE_ROD_LARSEN,
|
|
106
106
|
TEXT_OF_US_LAW,
|
|
107
107
|
TRANSLATION,
|
|
@@ -113,7 +113,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
|
|
|
113
113
|
'U.S. News',
|
|
114
114
|
'US Office',
|
|
115
115
|
'Vanity Fair',
|
|
116
|
-
|
|
116
|
+
VI_DAILY_NEWS,
|
|
117
117
|
WAPO,
|
|
118
118
|
]
|
|
119
119
|
|
|
@@ -127,7 +127,7 @@ class OtherFile(Document):
|
|
|
127
127
|
|
|
128
128
|
if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
|
|
129
129
|
self.log(f"Creating synthetic config for VI Daily News article...", logging.INFO)
|
|
130
|
-
self.config = DocCfg(id=self.file_id,
|
|
130
|
+
self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
|
|
131
131
|
|
|
132
132
|
def category(self) -> str | None:
|
|
133
133
|
return self.config and self.config.category
|
|
@@ -175,6 +175,11 @@ class OtherFile(Document):
|
|
|
175
175
|
|
|
176
176
|
return True
|
|
177
177
|
|
|
178
|
+
def metadata(self) -> Metadata:
|
|
179
|
+
metadata = super().metadata()
|
|
180
|
+
metadata['is_interesting'] = self.is_interesting()
|
|
181
|
+
return metadata
|
|
182
|
+
|
|
178
183
|
def preview_text(self) -> str:
|
|
179
184
|
return WHITESPACE_REGEX.sub(' ', self.text)[0:PREVIEW_CHARS]
|
|
180
185
|
|
epstein_files/epstein_files.py
CHANGED
|
@@ -21,11 +21,11 @@ from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
|
|
|
21
21
|
from epstein_files.documents.other_file import OtherFile
|
|
22
22
|
from epstein_files.util.constant.output_files import PICKLED_PATH
|
|
23
23
|
from epstein_files.util.constant.strings import *
|
|
24
|
-
from epstein_files.util.constant.urls import (EPSTEIN_WEB, JMAIL,
|
|
25
|
-
search_jmail_url, search_twitter_url)
|
|
24
|
+
from epstein_files.util.constant.urls import (EPSTEIN_MEDIA, EPSTEIN_WEB, JMAIL, epstein_media_person_url,
|
|
25
|
+
epsteinify_name_url, epstein_web_person_url, search_jmail_url, search_twitter_url)
|
|
26
26
|
from epstein_files.util.constants import *
|
|
27
27
|
from epstein_files.util.data import dict_sets_to_lists, json_safe, sort_dict
|
|
28
|
-
from epstein_files.util.doc_cfg import EmailCfg
|
|
28
|
+
from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
29
29
|
from epstein_files.util.env import args, logger
|
|
30
30
|
from epstein_files.util.file_helper import DOCS_DIR, file_size_str
|
|
31
31
|
from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
|
|
@@ -68,6 +68,7 @@ class EpsteinFiles:
|
|
|
68
68
|
"""Iterate through files and build appropriate objects."""
|
|
69
69
|
self.all_files = [f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')]
|
|
70
70
|
documents = []
|
|
71
|
+
file_type_count = defaultdict(int)
|
|
71
72
|
|
|
72
73
|
# Read through and classify all the files
|
|
73
74
|
for file_arg in self.all_files:
|
|
@@ -75,12 +76,13 @@ class EpsteinFiles:
|
|
|
75
76
|
document = Document(file_arg)
|
|
76
77
|
|
|
77
78
|
if document.length == 0:
|
|
78
|
-
logger.warning(f"Skipping empty file: {document}")
|
|
79
|
+
logger.warning(f"Skipping empty file: {document}]")
|
|
79
80
|
continue
|
|
80
81
|
|
|
81
82
|
cls = document_cls(document)
|
|
82
83
|
documents.append(cls(file_arg, text=document.text))
|
|
83
84
|
logger.info(str(documents[-1]))
|
|
85
|
+
file_type_count[cls.__name__] += 1
|
|
84
86
|
|
|
85
87
|
if doc_timer.seconds_since_start() > SLOW_FILE_SECONDS:
|
|
86
88
|
doc_timer.print_at_checkpoint(f"Slow file: {documents[-1]} processed")
|
|
@@ -195,15 +197,13 @@ class EpsteinFiles:
|
|
|
195
197
|
def imessage_logs_for(self, author: str | None | list[str | None]) -> Sequence[MessengerLog]:
|
|
196
198
|
return MessengerLog.logs_for(author, self.imessage_logs)
|
|
197
199
|
|
|
198
|
-
def identified_imessage_log_count(self) -> int:
|
|
199
|
-
return len([log for log in self.imessage_logs if log.author])
|
|
200
|
-
|
|
201
200
|
def json_metadata(self) -> str:
|
|
201
|
+
"""Create a JSON string containing metadata for all the files."""
|
|
202
202
|
metadata = {
|
|
203
|
-
EMAIL_CLASS:
|
|
204
|
-
JSON_FILE_CLASS:
|
|
205
|
-
MESSENGER_LOG_CLASS:
|
|
206
|
-
OTHER_FILE_CLASS:
|
|
203
|
+
EMAIL_CLASS: _sorted_metadata(self.emails),
|
|
204
|
+
JSON_FILE_CLASS: _sorted_metadata(self.json_files),
|
|
205
|
+
MESSENGER_LOG_CLASS: _sorted_metadata(self.imessage_logs),
|
|
206
|
+
OTHER_FILE_CLASS: _sorted_metadata(self.non_json_other_files()),
|
|
207
207
|
}
|
|
208
208
|
|
|
209
209
|
return json.dumps(metadata, indent=4, sort_keys=True)
|
|
@@ -216,7 +216,7 @@ class EpsteinFiles:
|
|
|
216
216
|
add_cols_to_table(table, ['File Type', 'Files', 'Author Known', 'Author Unknown', 'Duplicates'])
|
|
217
217
|
|
|
218
218
|
def add_row(label: str, docs: list):
|
|
219
|
-
known = None if isinstance(docs[0], JsonFile) else
|
|
219
|
+
known = None if isinstance(docs[0], JsonFile) else Document.known_author_count(docs)
|
|
220
220
|
|
|
221
221
|
table.add_row(
|
|
222
222
|
label,
|
|
@@ -274,7 +274,7 @@ class EpsteinFiles:
|
|
|
274
274
|
def print_emailer_counts_table(self) -> None:
|
|
275
275
|
footer = f"Identified authors of {self.attributed_email_count():,} emails out of {len(self.emails):,}."
|
|
276
276
|
counts_table = Table(title=f"Email Counts", caption=footer, header_style="bold")
|
|
277
|
-
add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_WEB, 'Twitter'])
|
|
277
|
+
add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_MEDIA, EPSTEIN_WEB, 'Twitter'])
|
|
278
278
|
|
|
279
279
|
emailer_counts = {
|
|
280
280
|
emailer: self.email_author_counts[emailer] + self.email_recipient_counts[emailer]
|
|
@@ -290,7 +290,8 @@ class EpsteinFiles:
|
|
|
290
290
|
str(self.email_author_counts[p]),
|
|
291
291
|
str(self.email_recipient_counts[p]),
|
|
292
292
|
'' if p is None else link_text_obj(search_jmail_url(p), JMAIL),
|
|
293
|
-
'' if not is_ok_for_epstein_web(p) else link_text_obj(
|
|
293
|
+
'' if not is_ok_for_epstein_web(p) else link_text_obj(epstein_media_person_url(p), EPSTEIN_MEDIA),
|
|
294
|
+
'' if not is_ok_for_epstein_web(p) else link_text_obj(epstein_web_person_url(p), EPSTEIN_WEB),
|
|
294
295
|
'' if p is None else link_text_obj(search_twitter_url(p), 'search X'),
|
|
295
296
|
)
|
|
296
297
|
|
|
@@ -299,7 +300,7 @@ class EpsteinFiles:
|
|
|
299
300
|
def print_imessage_summary(self) -> None:
|
|
300
301
|
"""Print summary table and stats for text messages."""
|
|
301
302
|
console.print(MessengerLog.summary_table(self.imessage_logs))
|
|
302
|
-
text_summary_msg = f"\nDeanonymized {self.
|
|
303
|
+
text_summary_msg = f"\nDeanonymized {Document.known_author_count(self.imessage_logs)} of "
|
|
303
304
|
text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files):,} files."
|
|
304
305
|
console.print(text_summary_msg)
|
|
305
306
|
imessage_msg_count = sum([len(log.messages()) for log in self.imessage_logs])
|
|
@@ -394,3 +395,8 @@ def is_ok_for_epstein_web(name: str | None) -> bool:
|
|
|
394
395
|
return False
|
|
395
396
|
|
|
396
397
|
return True
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def _sorted_metadata(docs: Sequence[Document]) -> list[Metadata]:
|
|
401
|
+
docs_sorted_by_id = sorted(docs, key=lambda d: d.file_id)
|
|
402
|
+
return [json_safe(d.metadata()) for d in docs_sorted_by_id]
|
|
@@ -198,14 +198,10 @@ OSBORNE_LLP = f"{IAN_OSBORNE} & Partners LLP" # Ian Osborne's PR firm
|
|
|
198
198
|
TRUMP_ORG = 'Trump Organization'
|
|
199
199
|
UBS = 'UBS'
|
|
200
200
|
|
|
201
|
-
# Locations
|
|
202
|
-
PALM_BEACH = 'Palm Beach'
|
|
203
|
-
VIRGIN_ISLANDS = 'Virgin Islands'
|
|
204
|
-
|
|
205
201
|
# First and last names that should be made part of a highlighting regex for emailers
|
|
206
202
|
NAMES_TO_NOT_HIGHLIGHT: list[str] = [name.lower() for name in [
|
|
207
|
-
'Al', 'Alfredo', 'Allen', 'Alex', 'Alexander', 'Amanda', 'Andres', 'Andrew',
|
|
208
|
-
'Bard', 'Barry', 'Bill', 'Black', 'Brad', 'Bruce',
|
|
203
|
+
'Al', 'Alan', 'Alfredo', 'Allen', 'Alex', 'Alexander', 'Amanda', 'Andres', 'Andrew',
|
|
204
|
+
'Bard', 'Barry', 'Bill', 'Black', 'Boris', 'Brad', 'Bruce',
|
|
209
205
|
'Carolyn', 'Chris', 'Christina',
|
|
210
206
|
'Dan', 'Daniel', 'Danny', 'Darren', 'Dave', 'David',
|
|
211
207
|
'Ed', 'Edward', 'Edwards', 'Epstein', 'Eric', 'Erika', 'Etienne',
|
|
@@ -215,10 +211,10 @@ NAMES_TO_NOT_HIGHLIGHT: list[str] = [name.lower() for name in [
|
|
|
215
211
|
'Ian',
|
|
216
212
|
'Jack', 'James', 'Jay', 'Jean', 'Jeff', 'Jeffrey', 'Jennifer', 'Jeremy', 'jessica', 'Joel', 'John', 'Jon', 'Jonathan', 'Joseph', 'Jr',
|
|
217
213
|
'Kahn', 'Katherine', 'Ken', 'Kevin',
|
|
218
|
-
'Leon', 'Lesley', 'Linda', 'Link', 'Lisa',
|
|
214
|
+
'Larry', 'Leon', 'Lesley', 'Linda', 'Link', 'Lisa',
|
|
219
215
|
'Mann', 'Marc', 'Marie', 'Mark', 'Martin', 'Melanie', 'Michael', 'Mike', 'Miller', 'Mitchell', 'Miles', 'Morris', 'Moskowitz',
|
|
220
216
|
'Nancy', 'Neal', 'New',
|
|
221
|
-
'Paul', 'Paula', 'Pen', 'Peter', 'Philip',
|
|
217
|
+
'Paul', 'Paula', 'Pen', 'Peter', 'Philip', 'Prince',
|
|
222
218
|
'Randall', 'Reid', 'Richard', 'Robert', 'Rodriguez', 'Roger', 'Rosenberg', 'Ross', 'Roth', 'Rubin',
|
|
223
219
|
'Scott', 'Sean', 'Stanley', 'Stern', 'Stephen', 'Steve', 'Steven', 'Stone', 'Susan',
|
|
224
220
|
'The', 'Thomas', 'Tim', 'Tom', 'Tyler',
|
|
@@ -228,25 +224,25 @@ NAMES_TO_NOT_HIGHLIGHT: list[str] = [name.lower() for name in [
|
|
|
228
224
|
]]
|
|
229
225
|
|
|
230
226
|
# Names to color white in the word counts
|
|
231
|
-
OTHER_NAMES = """
|
|
232
|
-
aaron albert alberto alec
|
|
233
|
-
baldwin barack
|
|
234
|
-
chapman charles charlie
|
|
235
|
-
|
|
236
|
-
|
|
227
|
+
OTHER_NAMES = NAMES_TO_NOT_HIGHLIGHT + """
|
|
228
|
+
aaron albert alberto alec alexandra alice anderson andre ann anna anne ariana arthur
|
|
229
|
+
baldwin barack ben benjamin berger bert binant bob bonner boyden bradley brady branson bruno bryant burton
|
|
230
|
+
chapman charles charlie christopher clint cohen colin collins conway
|
|
231
|
+
davis dean debra deborah dennis diana diane diaz dickinson dixon dominique don dylan
|
|
232
|
+
edmond elizabeth emily entwistle erik evelyn
|
|
237
233
|
ferguson flachsbart francis franco frank
|
|
238
|
-
gardner gary geoff geoffrey
|
|
234
|
+
gardner gary geoff geoffrey gilbert gloria goldberg gonzalez gould graham greene guarino gwyneth
|
|
239
235
|
hancock harold harrison harry helen hirsch hofstadter horowitz hussein
|
|
240
236
|
isaac isaacson
|
|
241
|
-
jamie
|
|
242
|
-
kate kathy kelly
|
|
243
|
-
leonard lenny lieberman louis lynch lynn
|
|
244
|
-
marcus marianne matt matthew melissa michele michelle
|
|
237
|
+
jamie jane janet jason jen jim joe johnson jones josh julie justin
|
|
238
|
+
kate kathy kelly kim kruger kyle
|
|
239
|
+
leo leonard lenny leslie lieberman louis lynch lynn
|
|
240
|
+
marcus marianne matt matthew melissa michele michelle moore moscowitz
|
|
245
241
|
nicole nussbaum
|
|
246
|
-
|
|
247
|
-
rafael ray richardson rob robin
|
|
248
|
-
sara sarah seligman serge sergey silverman sloman smith snowden sorkin
|
|
249
|
-
ted theresa thompson tiffany
|
|
242
|
+
paulson philippe
|
|
243
|
+
rafael ray richardson rob robin ron rudolph ryan
|
|
244
|
+
sara sarah seligman serge sergey silverman sloman smith snowden sorkin steele stevie stewart
|
|
245
|
+
ted theresa thompson tiffany timothy
|
|
250
246
|
valeria
|
|
251
247
|
walter warren weinstein weiss william
|
|
252
248
|
zach zack
|
|
@@ -30,6 +30,10 @@ REPUTATION = 'reputation'
|
|
|
30
30
|
SOCIAL = 'social'
|
|
31
31
|
SPEECH = 'speech'
|
|
32
32
|
|
|
33
|
+
# Locations
|
|
34
|
+
PALM_BEACH = 'Palm Beach'
|
|
35
|
+
VIRGIN_ISLANDS = 'Virgin Islands'
|
|
36
|
+
|
|
33
37
|
# Publications
|
|
34
38
|
BBC = 'BBC'
|
|
35
39
|
BLOOMBERG = 'Bloomberg'
|
|
@@ -38,10 +42,12 @@ DAILY_MAIL = 'Daily Mail'
|
|
|
38
42
|
DAILY_TELEGRAPH = "Daily Telegraph"
|
|
39
43
|
LA_TIMES = 'LA Times'
|
|
40
44
|
MIAMI_HERALD = 'Miami Herald'
|
|
41
|
-
|
|
42
|
-
|
|
45
|
+
NYT = "New York Times"
|
|
46
|
+
PALM_BEACH_DAILY_NEWS = f'{PALM_BEACH} Daily News'
|
|
47
|
+
PALM_BEACH_POST = f'{PALM_BEACH} Post'
|
|
43
48
|
THE_REAL_DEAL = 'The Real Deal'
|
|
44
49
|
WAPO = 'WaPo'
|
|
50
|
+
VI_DAILY_NEWS = f'{VIRGIN_ISLANDS} Daily News'
|
|
45
51
|
|
|
46
52
|
# Site types
|
|
47
53
|
EMAIL = 'email'
|
|
@@ -71,6 +71,7 @@ epsteinify_name_url = lambda name: f"{EPSTEINIFY_URL}/?name={urllib.parse.quote(
|
|
|
71
71
|
epstein_media_doc_url = lambda file_stem: build_doc_url(DOC_LINK_BASE_URLS[EPSTEIN_MEDIA], file_stem, True)
|
|
72
72
|
epstein_media_doc_link_markup = lambda filename_or_id, style = TEXT_LINK: external_doc_link_markup(EPSTEIN_MEDIA, filename_or_id, style)
|
|
73
73
|
epstein_media_doc_link_txt = lambda filename_or_id, style = TEXT_LINK: Text.from_markup(epstein_media_doc_link_markup(filename_or_id, style))
|
|
74
|
+
epstein_media_person_url = lambda person: f"{EPSTEIN_MEDIA_URL}/people/{parameterize(person)}"
|
|
74
75
|
|
|
75
76
|
epstein_web_doc_url = lambda file_stem: f"{DOC_LINK_BASE_URLS[EPSTEIN_WEB]}/{file_stem}.jpg"
|
|
76
77
|
epstein_web_person_url = lambda person: f"{EPSTEIN_WEB_URL}/{parameterize(person)}"
|