epstein-files 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +11 -24
- epstein_files/documents/communication.py +0 -3
- epstein_files/documents/document.py +61 -18
- epstein_files/documents/email.py +11 -5
- epstein_files/documents/emails/email_header.py +10 -2
- epstein_files/documents/imessage/text_message.py +3 -2
- epstein_files/documents/other_file.py +16 -34
- epstein_files/epstein_files.py +23 -33
- epstein_files/person.py +39 -65
- epstein_files/util/constant/names.py +13 -6
- epstein_files/util/constant/strings.py +0 -1
- epstein_files/util/constant/urls.py +1 -0
- epstein_files/util/constants.py +3 -1
- epstein_files/util/data.py +1 -1
- epstein_files/util/doc_cfg.py +3 -3
- epstein_files/util/env.py +4 -4
- epstein_files/util/highlighted_group.py +112 -94
- epstein_files/util/logging.py +1 -1
- epstein_files/util/output.py +36 -12
- epstein_files/util/rich.py +14 -14
- epstein_files/util/timer.py +14 -0
- {epstein_files-1.2.0.dist-info → epstein_files-1.2.1.dist-info}/METADATA +5 -2
- epstein_files-1.2.1.dist-info/RECORD +34 -0
- epstein_files-1.2.0.dist-info/RECORD +0 -34
- {epstein_files-1.2.0.dist-info → epstein_files-1.2.1.dist-info}/LICENSE +0 -0
- {epstein_files-1.2.0.dist-info → epstein_files-1.2.1.dist-info}/WHEEL +0 -0
- {epstein_files-1.2.0.dist-info → epstein_files-1.2.1.dist-info}/entry_points.txt +0 -0
epstein_files/__init__.py
CHANGED
|
@@ -21,7 +21,7 @@ from epstein_files.util.env import args
|
|
|
21
21
|
from epstein_files.util.file_helper import coerce_file_path, extract_file_id
|
|
22
22
|
from epstein_files.util.logging import exit_with_error, logger
|
|
23
23
|
from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
|
|
24
|
-
print_other_files_section, print_text_messages_section, print_email_timeline,
|
|
24
|
+
print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
|
|
25
25
|
print_json_metadata, write_urls)
|
|
26
26
|
from epstein_files.util.rich import (build_highlighter, console, print_color_key, print_title_page_header,
|
|
27
27
|
print_title_page_tables, print_subtitle_panel, write_html)
|
|
@@ -38,15 +38,15 @@ def generate_html() -> None:
|
|
|
38
38
|
timer = Timer()
|
|
39
39
|
epstein_files = EpsteinFiles.get_files(timer)
|
|
40
40
|
|
|
41
|
-
if args.
|
|
41
|
+
if args.emailers_info:
|
|
42
|
+
print_emailers_info(epstein_files)
|
|
43
|
+
exit()
|
|
44
|
+
elif args.json_metadata:
|
|
42
45
|
print_json_metadata(epstein_files)
|
|
43
46
|
exit()
|
|
44
47
|
elif args.json_files:
|
|
45
48
|
print_json_files(epstein_files)
|
|
46
49
|
exit()
|
|
47
|
-
elif args.emailers_info_png:
|
|
48
|
-
print_emailers_info_png(epstein_files)
|
|
49
|
-
exit()
|
|
50
50
|
|
|
51
51
|
print_title_page_header()
|
|
52
52
|
|
|
@@ -59,25 +59,19 @@ def generate_html() -> None:
|
|
|
59
59
|
exit()
|
|
60
60
|
|
|
61
61
|
if args.output_texts:
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
timer.print_at_checkpoint(f'Printed {len(imessage_logs)} text message log files')
|
|
62
|
+
printed_logs = print_text_messages_section(epstein_files)
|
|
63
|
+
timer.log_section_complete('MessengerLog', epstein_files.imessage_logs, printed_logs)
|
|
65
64
|
|
|
66
65
|
if args.output_emails:
|
|
67
|
-
|
|
68
|
-
timer.
|
|
66
|
+
printed_emails = print_emails_section(epstein_files)
|
|
67
|
+
timer.log_section_complete('Email', epstein_files.emails, printed_emails)
|
|
69
68
|
elif args.email_timeline:
|
|
70
69
|
print_email_timeline(epstein_files)
|
|
71
70
|
timer.print_at_checkpoint(f"Printed chronological emails table")
|
|
72
71
|
|
|
73
72
|
if args.output_other:
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
else:
|
|
77
|
-
files = [f for f in epstein_files.other_files if args.all_other_files or f.is_interesting()]
|
|
78
|
-
|
|
79
|
-
print_other_files_section(files, epstein_files)
|
|
80
|
-
timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")
|
|
73
|
+
printed_files = print_other_files_section(epstein_files)
|
|
74
|
+
timer.log_section_complete('OtherFile', epstein_files.other_files, printed_files)
|
|
81
75
|
|
|
82
76
|
write_html(args.build)
|
|
83
77
|
logger.warning(f"Total time: {timer.seconds_since_start_str()}")
|
|
@@ -94,7 +88,6 @@ def epstein_diff():
|
|
|
94
88
|
|
|
95
89
|
def epstein_search():
|
|
96
90
|
"""Search the cleaned up text of the files."""
|
|
97
|
-
_assert_positional_args()
|
|
98
91
|
epstein_files = EpsteinFiles.get_files()
|
|
99
92
|
|
|
100
93
|
for search_term in args.positional_args:
|
|
@@ -117,7 +110,6 @@ def epstein_search():
|
|
|
117
110
|
|
|
118
111
|
def epstein_show():
|
|
119
112
|
"""Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
|
|
120
|
-
_assert_positional_args()
|
|
121
113
|
raw_docs: list[Document] = []
|
|
122
114
|
console.line()
|
|
123
115
|
|
|
@@ -142,8 +134,3 @@ def epstein_show():
|
|
|
142
134
|
|
|
143
135
|
def epstein_word_count() -> None:
|
|
144
136
|
write_word_counts_html()
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
def _assert_positional_args():
|
|
148
|
-
if not args.positional_args:
|
|
149
|
-
exit_with_error(f"No positional args provided!\n")
|
|
@@ -34,9 +34,6 @@ class Communication(Document):
|
|
|
34
34
|
"""Overrides super() method to apply self.author_style."""
|
|
35
35
|
return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
|
|
36
36
|
|
|
37
|
-
def is_attribution_uncertain(self) -> bool:
|
|
38
|
-
return bool(self.config and self.config.is_attribution_uncertain)
|
|
39
|
-
|
|
40
37
|
def summary(self) -> Text:
|
|
41
38
|
return self._summary().append(CLOSE_PROPERTIES_CHAR)
|
|
42
39
|
|
|
@@ -11,17 +11,19 @@ from rich.console import Console, ConsoleOptions, Group, RenderResult
|
|
|
11
11
|
from rich.padding import Padding
|
|
12
12
|
from rich.panel import Panel
|
|
13
13
|
from rich.text import Text
|
|
14
|
+
from rich.table import Table
|
|
14
15
|
|
|
15
16
|
from epstein_files.util.constant.names import *
|
|
16
17
|
from epstein_files.util.constant.strings import *
|
|
17
18
|
from epstein_files.util.constant.urls import *
|
|
18
19
|
from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
|
|
19
|
-
from epstein_files.util.data import collapse_newlines, date_str, patternize,
|
|
20
|
+
from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time, without_falsey
|
|
20
21
|
from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
|
|
21
22
|
from epstein_files.util.env import DOCS_DIR, args
|
|
22
|
-
from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, is_local_extract_file
|
|
23
|
+
from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, file_size_to_str, is_local_extract_file
|
|
23
24
|
from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
|
|
24
|
-
from epstein_files.util.rich import INFO_STYLE,
|
|
25
|
+
from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table, console,
|
|
26
|
+
highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
|
|
25
27
|
from epstein_files.util.search_result import MatchedLine
|
|
26
28
|
|
|
27
29
|
ALT_LINK_STYLE = 'white dim'
|
|
@@ -55,6 +57,14 @@ OCR_REPAIRS = {
|
|
|
55
57
|
'Nil Priell': 'Nili Priell',
|
|
56
58
|
}
|
|
57
59
|
|
|
60
|
+
SUMMARY_TABLE_COLS: list[str | dict] = [
|
|
61
|
+
'Count',
|
|
62
|
+
{'name': 'Has Author', 'style': 'honeydew2'},
|
|
63
|
+
{'name': 'No Author', 'style': 'wheat4'},
|
|
64
|
+
{'name': 'Uncertain Author', 'style': 'royal_blue1 dim'},
|
|
65
|
+
{'name': 'Size', 'justify': 'right', 'style': 'dim'},
|
|
66
|
+
]
|
|
67
|
+
|
|
58
68
|
|
|
59
69
|
@dataclass
|
|
60
70
|
class Document:
|
|
@@ -181,6 +191,9 @@ class Document:
|
|
|
181
191
|
"""Secondary info about this file (description recipients, etc). Overload in subclasses."""
|
|
182
192
|
return None
|
|
183
193
|
|
|
194
|
+
def is_attribution_uncertain(self) -> bool:
|
|
195
|
+
return bool(self.config and self.config.is_attribution_uncertain)
|
|
196
|
+
|
|
184
197
|
def is_duplicate(self) -> bool:
|
|
185
198
|
return bool(self.duplicate_of_id())
|
|
186
199
|
|
|
@@ -240,17 +253,6 @@ class Document:
|
|
|
240
253
|
|
|
241
254
|
return text
|
|
242
255
|
|
|
243
|
-
def sort_key(self) -> tuple[datetime, str, int]:
|
|
244
|
-
"""Sort by timestamp, file_id, then whether or not it's a duplicate file."""
|
|
245
|
-
if self.is_duplicate():
|
|
246
|
-
sort_id = self.config.duplicate_of_id
|
|
247
|
-
dupe_idx = 1
|
|
248
|
-
else:
|
|
249
|
-
sort_id = self.file_id
|
|
250
|
-
dupe_idx = 0
|
|
251
|
-
|
|
252
|
-
return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
|
|
253
|
-
|
|
254
256
|
def source_file_id(self) -> str:
|
|
255
257
|
"""Strip off the _1, _2, etc. suffixes for extracted documents."""
|
|
256
258
|
return self.file_id[0:6]
|
|
@@ -261,7 +263,7 @@ class Document:
|
|
|
261
263
|
txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)
|
|
262
264
|
|
|
263
265
|
if self.timestamp:
|
|
264
|
-
timestamp_str =
|
|
266
|
+
timestamp_str = remove_zero_time(self.timestamp).replace('T', ' ')
|
|
265
267
|
txt.append(' (', style=SYMBOL_STYLE)
|
|
266
268
|
txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
|
|
267
269
|
|
|
@@ -282,6 +284,17 @@ class Document:
|
|
|
282
284
|
|
|
283
285
|
return Panel(Group(*sentences), border_style=self._class_style(), expand=False)
|
|
284
286
|
|
|
287
|
+
def timestamp_sort_key(self) -> tuple[datetime, str, int]:
|
|
288
|
+
"""Sort by timestamp, file_id, then whether or not it's a duplicate file."""
|
|
289
|
+
if self.is_duplicate():
|
|
290
|
+
sort_id = self.config.duplicate_of_id
|
|
291
|
+
dupe_idx = 1
|
|
292
|
+
else:
|
|
293
|
+
sort_id = self.file_id
|
|
294
|
+
dupe_idx = 0
|
|
295
|
+
|
|
296
|
+
return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
|
|
297
|
+
|
|
285
298
|
def top_lines(self, n: int = 10) -> str:
|
|
286
299
|
"""First n lines."""
|
|
287
300
|
return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
|
|
@@ -361,6 +374,32 @@ class Document:
|
|
|
361
374
|
def __str__(self) -> str:
|
|
362
375
|
return self.summary().plain
|
|
363
376
|
|
|
377
|
+
@classmethod
|
|
378
|
+
def file_info_table(cls, title: str, first_col_name: str) -> Table:
|
|
379
|
+
"""Empty table with appropriate cols for summarizing groups of files."""
|
|
380
|
+
table = build_table(title)
|
|
381
|
+
cols = [{'name': first_col_name, 'min_width': 14}] + SUMMARY_TABLE_COLS
|
|
382
|
+
add_cols_to_table(table, cols, 'right')
|
|
383
|
+
return table
|
|
384
|
+
|
|
385
|
+
@classmethod
|
|
386
|
+
def files_info(cls, files: Sequence['Document'], is_author_na: bool = False) -> dict[str, str | Text]:
|
|
387
|
+
"""Summary info about a group of files."""
|
|
388
|
+
file_count = len(files)
|
|
389
|
+
author_count = cls.known_author_count(files)
|
|
390
|
+
|
|
391
|
+
return {
|
|
392
|
+
'count': str(file_count),
|
|
393
|
+
'author_count': NA_TXT if is_author_na else str(author_count),
|
|
394
|
+
'no_author_count': NA_TXT if is_author_na else str(file_count - author_count),
|
|
395
|
+
'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain()])),
|
|
396
|
+
'bytes': file_size_to_str(sum([f.file_size() for f in files])),
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
@classmethod
|
|
400
|
+
def files_info_row(cls, files: Sequence['Document'], author_na: bool = False) -> Sequence[str | Text]:
|
|
401
|
+
return [v for v in cls.files_info(files, author_na).values()]
|
|
402
|
+
|
|
364
403
|
@staticmethod
|
|
365
404
|
def diff_files(files: list[str]) -> None:
|
|
366
405
|
"""Diff the contents of two Documents after all cleanup, BOM removal, etc."""
|
|
@@ -398,14 +437,18 @@ class Document:
|
|
|
398
437
|
|
|
399
438
|
@staticmethod
|
|
400
439
|
def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
|
|
401
|
-
return sorted(docs, key=lambda doc: doc.
|
|
440
|
+
return sorted(docs, key=lambda doc: doc.timestamp_sort_key())
|
|
402
441
|
|
|
403
|
-
@
|
|
404
|
-
def uniquify(
|
|
442
|
+
@staticmethod
|
|
443
|
+
def uniquify(documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
|
|
405
444
|
"""Uniquify by file_id."""
|
|
406
445
|
id_map = {doc.file_id: doc for doc in documents}
|
|
407
446
|
return [doc for doc in id_map.values()]
|
|
408
447
|
|
|
448
|
+
@staticmethod
|
|
449
|
+
def without_dupes(docs: Sequence['DocumentType']) -> list['DocumentType']:
|
|
450
|
+
return [doc for doc in docs if not doc.is_duplicate()]
|
|
451
|
+
|
|
409
452
|
|
|
410
453
|
DocumentType = TypeVar('DocumentType', bound=Document)
|
|
411
454
|
|
epstein_files/documents/email.py
CHANGED
|
@@ -32,7 +32,7 @@ BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE
|
|
|
32
32
|
BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
|
|
33
33
|
DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
|
|
34
34
|
LINK_LINE_REGEX = re.compile(f"^(> )?htt")
|
|
35
|
-
QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote
|
|
35
|
+
QUOTED_REPLY_LINE_REGEX = re.compile(r'(\nFrom:(.*)|wrote:)\n', re.IGNORECASE)
|
|
36
36
|
REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
|
|
37
37
|
|
|
38
38
|
BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
|
|
@@ -114,7 +114,7 @@ EMAIL_SIGNATURE_REGEXES = {
|
|
|
114
114
|
DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
|
|
115
115
|
DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
|
|
116
116
|
JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
|
|
117
|
-
JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*", re.IGNORECASE),
|
|
117
|
+
JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*(\nTel:.*)?(\nEmail:.*)?", re.IGNORECASE),
|
|
118
118
|
KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
|
|
119
119
|
LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
|
|
120
120
|
LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
|
|
@@ -436,9 +436,9 @@ class Email(Communication):
|
|
|
436
436
|
elif self.header.num_header_rows == 0:
|
|
437
437
|
return self.text
|
|
438
438
|
|
|
439
|
-
reply_text_match = REPLY_TEXT_REGEX.search(text)
|
|
440
439
|
self.log_top_lines(20, "Raw text:", logging.DEBUG)
|
|
441
440
|
self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
|
|
441
|
+
reply_text_match = REPLY_TEXT_REGEX.search(text)
|
|
442
442
|
|
|
443
443
|
if reply_text_match:
|
|
444
444
|
actual_num_chars = len(reply_text_match.group(1))
|
|
@@ -550,9 +550,15 @@ class Email(Communication):
|
|
|
550
550
|
|
|
551
551
|
def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
|
|
552
552
|
"""Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
|
|
553
|
-
|
|
553
|
+
if text is None:
|
|
554
|
+
header_offset = len(self.header.header_chars)
|
|
555
|
+
text = self.text[header_offset:]
|
|
556
|
+
else:
|
|
557
|
+
header_offset = 0
|
|
558
|
+
|
|
559
|
+
for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text)):
|
|
554
560
|
if i >= n:
|
|
555
|
-
return match.end() - 1
|
|
561
|
+
return match.end() + header_offset - 1
|
|
556
562
|
|
|
557
563
|
def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
|
|
558
564
|
"""Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
|
|
@@ -9,7 +9,6 @@ from epstein_files.util.logging import logger
|
|
|
9
9
|
from epstein_files.util.rich import UNKNOWN
|
|
10
10
|
|
|
11
11
|
FIELD_NAMES = ['Date', 'From', 'Sent', 'Subject']
|
|
12
|
-
NON_HEADER_FIELDS = ['field_names', 'num_header_rows', 'was_initially_empty']
|
|
13
12
|
ON_BEHALF_OF = 'on behalf of'
|
|
14
13
|
TO_FIELDS = ['bcc', 'cc', 'to']
|
|
15
14
|
EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
|
|
@@ -28,10 +27,18 @@ CONFIGURED_ACTUAL_TEXTS = [
|
|
|
28
27
|
if isinstance(cfg, EmailCfg) and cfg.actual_text is not None
|
|
29
28
|
]
|
|
30
29
|
|
|
30
|
+
NON_HEADER_FIELDS = [
|
|
31
|
+
'field_names',
|
|
32
|
+
'header_chars',
|
|
33
|
+
'num_header_rows',
|
|
34
|
+
'was_initially_empty',
|
|
35
|
+
]
|
|
36
|
+
|
|
31
37
|
|
|
32
38
|
@dataclass(kw_only=True)
|
|
33
39
|
class EmailHeader:
|
|
34
40
|
field_names: list[str] # Order is same as the order header fields appear in the email file text
|
|
41
|
+
header_chars: str = ''
|
|
35
42
|
num_header_rows: int = field(init=False)
|
|
36
43
|
was_initially_empty: bool = False
|
|
37
44
|
|
|
@@ -101,6 +108,7 @@ class EmailHeader:
|
|
|
101
108
|
setattr(self, field_name, value)
|
|
102
109
|
|
|
103
110
|
self.num_header_rows = len(self.field_names) + num_headers
|
|
111
|
+
self.header_chars = '\n'.join(email_lines[0:self.num_header_rows])
|
|
104
112
|
log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
|
|
105
113
|
logger.debug(f"{log_msg}{self}\n\nTop lines:\n\n%s", '\n'.join(email_lines[0:(num_headers + 1) * 2]))
|
|
106
114
|
|
|
@@ -163,7 +171,7 @@ class EmailHeader:
|
|
|
163
171
|
if should_log_header:
|
|
164
172
|
logger.debug(f"Header being parsed was this:\n\n{header}\n")
|
|
165
173
|
|
|
166
|
-
return
|
|
174
|
+
return cls(field_names=field_names, header_chars=header, **kw_args)
|
|
167
175
|
|
|
168
176
|
@staticmethod
|
|
169
177
|
def cleanup_str(_str: str) -> str:
|
|
@@ -4,7 +4,7 @@ from datetime import datetime
|
|
|
4
4
|
|
|
5
5
|
from rich.text import Text
|
|
6
6
|
|
|
7
|
-
from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN, Name, extract_last_name
|
|
7
|
+
from epstein_files.util.constant.names import ANTHONY_SCARAMUCCI, JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN, Name, extract_last_name
|
|
8
8
|
from epstein_files.util.constant.strings import TIMESTAMP_DIM
|
|
9
9
|
from epstein_files.util.data import iso_timestamp
|
|
10
10
|
from epstein_files.util.highlighted_group import get_style_for_name
|
|
@@ -17,6 +17,7 @@ PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
|
|
|
17
17
|
UNCERTAIN_SUFFIX = ' (?)'
|
|
18
18
|
|
|
19
19
|
DISPLAY_LAST_NAME_ONLY = [
|
|
20
|
+
ANTHONY_SCARAMUCCI,
|
|
20
21
|
JEFFREY_EPSTEIN,
|
|
21
22
|
STEVE_BANNON,
|
|
22
23
|
]
|
|
@@ -59,7 +60,7 @@ class TextMessage:
|
|
|
59
60
|
try:
|
|
60
61
|
timestamp_str = iso_timestamp(self.parse_timestamp())
|
|
61
62
|
except Exception as e:
|
|
62
|
-
logger.
|
|
63
|
+
logger.info(f"Failed to parse timestamp for {self}")
|
|
63
64
|
timestamp_str = self.timestamp_str
|
|
64
65
|
|
|
65
66
|
return Text(f"[{timestamp_str}]", style=TIMESTAMP_DIM)
|
|
@@ -22,7 +22,7 @@ from epstein_files.util.data import days_between, escape_single_quotes, remove_t
|
|
|
22
22
|
from epstein_files.util.file_helper import FILENAME_LENGTH, file_size_to_str
|
|
23
23
|
from epstein_files.util.env import args
|
|
24
24
|
from epstein_files.util.highlighted_group import QUESTION_MARKS_TXT, styled_category
|
|
25
|
-
from epstein_files.util.rich import build_table, highlighter
|
|
25
|
+
from epstein_files.util.rich import add_cols_to_table, build_table, highlighter
|
|
26
26
|
from epstein_files.util.logging import logger
|
|
27
27
|
|
|
28
28
|
FIRST_FEW_LINES = 'First Few Lines'
|
|
@@ -209,39 +209,8 @@ class OtherFile(Document):
|
|
|
209
209
|
if num_days_spanned > MAX_DAYS_SPANNED_TO_BE_VALID and VAST_HOUSE not in self.text:
|
|
210
210
|
self.log_top_lines(15, msg=timestamps_log_msg, level=logging.DEBUG)
|
|
211
211
|
|
|
212
|
-
@
|
|
213
|
-
def
|
|
214
|
-
counts = defaultdict(int)
|
|
215
|
-
category_bytes = defaultdict(int)
|
|
216
|
-
|
|
217
|
-
for file in files:
|
|
218
|
-
if file.category() is None:
|
|
219
|
-
logger.warning(f"file {file.file_id} has no category")
|
|
220
|
-
|
|
221
|
-
counts[file.category()] += 1
|
|
222
|
-
category_bytes[file.category()] += file.file_size()
|
|
223
|
-
|
|
224
|
-
table = build_table(f'{title_pfx}Other Files Summary', ['Category', 'Count', 'Has Author', 'No Author', 'Size'])
|
|
225
|
-
table.columns[-1].justify = 'right'
|
|
226
|
-
table.columns[0].min_width = 14
|
|
227
|
-
table.columns[-1].style = 'dim'
|
|
228
|
-
|
|
229
|
-
for (category, count) in sort_dict(counts):
|
|
230
|
-
category_files = [f for f in files if f.category() == category]
|
|
231
|
-
known_author_count = Document.known_author_count(category_files)
|
|
232
|
-
|
|
233
|
-
table.add_row(
|
|
234
|
-
styled_category(category),
|
|
235
|
-
str(count),
|
|
236
|
-
str(known_author_count),
|
|
237
|
-
str(count - known_author_count),
|
|
238
|
-
file_size_to_str(category_bytes[category]),
|
|
239
|
-
)
|
|
240
|
-
|
|
241
|
-
return table
|
|
242
|
-
|
|
243
|
-
@staticmethod
|
|
244
|
-
def files_preview_table(files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
|
|
212
|
+
@classmethod
|
|
213
|
+
def files_preview_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
|
|
245
214
|
"""Build a table of OtherFile documents."""
|
|
246
215
|
table = build_table(f'{title_pfx}Other Files Details in Chronological Order', show_lines=True)
|
|
247
216
|
table.add_column('File', justify='center', width=FILENAME_LENGTH)
|
|
@@ -272,3 +241,16 @@ class OtherFile(Document):
|
|
|
272
241
|
)
|
|
273
242
|
|
|
274
243
|
return table
|
|
244
|
+
|
|
245
|
+
@classmethod
|
|
246
|
+
def summary_table(cls, files: Sequence['OtherFile'], title_pfx: str = '') -> Table:
|
|
247
|
+
categories = uniquify([f.category() for f in files])
|
|
248
|
+
categories = sorted(categories, key=lambda c: -len([f for f in files if f.category() == c]))
|
|
249
|
+
table = cls.file_info_table(f'{title_pfx}Other Files Summary', 'Category')
|
|
250
|
+
|
|
251
|
+
for category in categories:
|
|
252
|
+
category_files = [f for f in files if f.category() == category]
|
|
253
|
+
table.add_row(styled_category(category), *cls.files_info_row(category_files))
|
|
254
|
+
|
|
255
|
+
table.columns = table.columns[:-2] + [table.columns[-1]] # Remove the 'Uncertain Author' col
|
|
256
|
+
return table
|
epstein_files/epstein_files.py
CHANGED
|
@@ -9,6 +9,8 @@ from datetime import datetime
|
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from typing import Sequence, Type, cast
|
|
11
11
|
|
|
12
|
+
from rich.table import Table
|
|
13
|
+
|
|
12
14
|
from epstein_files.documents.document import Document
|
|
13
15
|
from epstein_files.documents.email import DETECT_EMAIL_REGEX, Email
|
|
14
16
|
from epstein_files.documents.json_file import JsonFile
|
|
@@ -22,7 +24,6 @@ from epstein_files.util.doc_cfg import EmailCfg, Metadata
|
|
|
22
24
|
from epstein_files.util.env import DOCS_DIR, args, logger
|
|
23
25
|
from epstein_files.util.file_helper import file_size_str
|
|
24
26
|
from epstein_files.util.highlighted_group import HIGHLIGHTED_NAMES, HighlightedNames
|
|
25
|
-
from epstein_files.util.rich import NA_TXT, add_cols_to_table, build_table, console, print_centered
|
|
26
27
|
from epstein_files.util.search_result import SearchResult
|
|
27
28
|
from epstein_files.util.timer import Timer
|
|
28
29
|
|
|
@@ -31,9 +32,13 @@ PICKLED_PATH = Path("the_epstein_files.pkl.gz")
|
|
|
31
32
|
SLOW_FILE_SECONDS = 1.0
|
|
32
33
|
|
|
33
34
|
EMAILS_WITH_UNINTERESTING_CCS = [
|
|
34
|
-
'025329',
|
|
35
|
-
'024923',
|
|
36
|
-
'033568',
|
|
35
|
+
'025329', # Krassner
|
|
36
|
+
'024923', # Krassner
|
|
37
|
+
'033568', # Krassner
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
EMAILS_WITH_UNINTERESTING_BCCS = [
|
|
41
|
+
'014797_1', # Ross Gow
|
|
37
42
|
]
|
|
38
43
|
|
|
39
44
|
|
|
@@ -45,7 +50,7 @@ class EpsteinFiles:
|
|
|
45
50
|
json_files: list[JsonFile] = field(default_factory=list)
|
|
46
51
|
other_files: list[OtherFile] = field(default_factory=list)
|
|
47
52
|
timer: Timer = field(default_factory=lambda: Timer())
|
|
48
|
-
uninteresting_ccs: list[Name] = field(
|
|
53
|
+
uninteresting_ccs: list[Name] = field(default_factory=list)
|
|
49
54
|
|
|
50
55
|
def __post_init__(self):
|
|
51
56
|
"""Iterate through files and build appropriate objects."""
|
|
@@ -88,13 +93,12 @@ class EpsteinFiles:
|
|
|
88
93
|
if PICKLED_PATH.exists() and not args.overwrite_pickle and not args.skip_other_files:
|
|
89
94
|
with gzip.open(PICKLED_PATH, 'rb') as file:
|
|
90
95
|
epstein_files = pickle.load(file)
|
|
91
|
-
epstein_files.timer = timer
|
|
92
96
|
timer_msg = f"Loaded {len(epstein_files.all_files):,} documents from '{PICKLED_PATH}'"
|
|
93
|
-
|
|
97
|
+
timer.print_at_checkpoint(f"{timer_msg} ({file_size_str(PICKLED_PATH)})")
|
|
94
98
|
return epstein_files
|
|
95
99
|
|
|
96
100
|
logger.warning(f"Building new cache file, this will take a few minutes...")
|
|
97
|
-
epstein_files = EpsteinFiles(
|
|
101
|
+
epstein_files = EpsteinFiles()
|
|
98
102
|
|
|
99
103
|
if args.skip_other_files:
|
|
100
104
|
logger.warning(f"Not writing pickled data because --skip-other-files")
|
|
@@ -235,7 +239,7 @@ class EpsteinFiles:
|
|
|
235
239
|
return json.dumps(metadata, indent=4, sort_keys=True)
|
|
236
240
|
|
|
237
241
|
def non_duplicate_emails(self) -> list[Email]:
|
|
238
|
-
return
|
|
242
|
+
return Document.without_dupes(self.emails)
|
|
239
243
|
|
|
240
244
|
def non_json_other_files(self) -> list[OtherFile]:
|
|
241
245
|
return [doc for doc in self.other_files if not isinstance(doc, JsonFile)]
|
|
@@ -253,34 +257,20 @@ class EpsteinFiles:
|
|
|
253
257
|
for name in names
|
|
254
258
|
]
|
|
255
259
|
|
|
256
|
-
def
|
|
257
|
-
table =
|
|
258
|
-
|
|
259
|
-
table.
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
table.add_row(
|
|
265
|
-
label,
|
|
266
|
-
f"{len(docs):,}",
|
|
267
|
-
f"{known:,}" if known is not None else NA_TXT,
|
|
268
|
-
f"{len(docs) - known:,}" if known is not None else NA_TXT,
|
|
269
|
-
f"{len([d for d in docs if d.is_duplicate()])}",
|
|
270
|
-
)
|
|
271
|
-
|
|
272
|
-
add_row('Emails', self.emails)
|
|
273
|
-
add_row('iMessage Logs', self.imessage_logs)
|
|
274
|
-
add_row('JSON Data', self.json_files)
|
|
275
|
-
add_row('Other', self.non_json_other_files())
|
|
276
|
-
print_centered(table)
|
|
277
|
-
console.line()
|
|
260
|
+
def overview_table(self) -> Table:
|
|
261
|
+
table = Document.file_info_table('Files Overview', 'File Type')
|
|
262
|
+
table.add_row('Emails', *Document.files_info_row(self.emails))
|
|
263
|
+
table.add_row('iMessage Logs', *Document.files_info_row(self.imessage_logs))
|
|
264
|
+
table.add_row('JSON Data', *Document.files_info_row(self.json_files, True))
|
|
265
|
+
table.add_row('Other', *Document.files_info_row(self.non_json_other_files()))
|
|
266
|
+
return table
|
|
278
267
|
|
|
279
268
|
def unknown_recipient_ids(self) -> list[str]:
|
|
280
269
|
"""IDs of emails whose recipient is not known."""
|
|
281
270
|
return sorted([e.file_id for e in self.emails if None in e.recipients or not e.recipients])
|
|
282
271
|
|
|
283
272
|
def uninteresting_emailers(self) -> list[Name]:
|
|
273
|
+
"""Emailers whom we don't want to print a separate section for because they're just CCed."""
|
|
284
274
|
if '_uninteresting_emailers' not in vars(self):
|
|
285
275
|
self._uninteresting_emailers = sorted(uniquify(UNINTERESTING_EMAILERS + self.uninteresting_ccs))
|
|
286
276
|
|
|
@@ -306,8 +296,8 @@ class EpsteinFiles:
|
|
|
306
296
|
self.emails = Document.sort_by_timestamp(self.emails)
|
|
307
297
|
|
|
308
298
|
def _set_uninteresting_ccs(self) -> None:
|
|
309
|
-
|
|
310
|
-
|
|
299
|
+
for id in EMAILS_WITH_UNINTERESTING_BCCS:
|
|
300
|
+
self.uninteresting_ccs += copy(cast(list[Name], self.email_for_id(id).header.bcc))
|
|
311
301
|
|
|
312
302
|
for id in EMAILS_WITH_UNINTERESTING_CCS:
|
|
313
303
|
self.uninteresting_ccs += self.email_for_id(id).recipients
|