epstein-files 1.2.0__py3-none-any.whl → 1.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +42 -30
- epstein_files/documents/communication.py +0 -3
- epstein_files/documents/document.py +66 -19
- epstein_files/documents/email.py +203 -208
- epstein_files/documents/emails/email_header.py +10 -2
- epstein_files/documents/imessage/text_message.py +3 -2
- epstein_files/documents/other_file.py +16 -34
- epstein_files/epstein_files.py +24 -35
- epstein_files/person.py +67 -73
- epstein_files/util/constant/names.py +21 -12
- epstein_files/util/constant/output_files.py +8 -5
- epstein_files/util/constant/strings.py +2 -2
- epstein_files/util/constant/urls.py +14 -2
- epstein_files/util/constants.py +38 -12
- epstein_files/util/data.py +2 -1
- epstein_files/util/doc_cfg.py +3 -3
- epstein_files/util/env.py +10 -7
- epstein_files/util/highlighted_group.py +366 -202
- epstein_files/util/logging.py +1 -1
- epstein_files/util/output.py +54 -21
- epstein_files/util/rich.py +21 -16
- epstein_files/util/timer.py +14 -0
- epstein_files/util/word_count.py +1 -1
- {epstein_files-1.2.0.dist-info → epstein_files-1.2.5.dist-info}/METADATA +5 -2
- epstein_files-1.2.5.dist-info/RECORD +34 -0
- epstein_files-1.2.0.dist-info/RECORD +0 -34
- {epstein_files-1.2.0.dist-info → epstein_files-1.2.5.dist-info}/LICENSE +0 -0
- {epstein_files-1.2.0.dist-info → epstein_files-1.2.5.dist-info}/WHEEL +0 -0
- {epstein_files-1.2.0.dist-info → epstein_files-1.2.5.dist-info}/entry_points.txt +0 -0
epstein_files/__init__.py
CHANGED
|
@@ -16,14 +16,18 @@ from rich.text import Text
|
|
|
16
16
|
from epstein_files.epstein_files import EpsteinFiles, document_cls
|
|
17
17
|
from epstein_files.documents.document import INFO_PADDING, Document
|
|
18
18
|
from epstein_files.documents.email import Email
|
|
19
|
+
from epstein_files.documents.messenger_log import MessengerLog
|
|
20
|
+
from epstein_files.documents.other_file import OtherFile
|
|
19
21
|
from epstein_files.util.constant.output_files import make_clean
|
|
22
|
+
from epstein_files.util.constant.strings import ID_REGEX
|
|
23
|
+
from epstein_files.util.data import flatten
|
|
20
24
|
from epstein_files.util.env import args
|
|
21
25
|
from epstein_files.util.file_helper import coerce_file_path, extract_file_id
|
|
22
26
|
from epstein_files.util.logging import exit_with_error, logger
|
|
23
27
|
from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
|
|
24
|
-
print_other_files_section, print_text_messages_section, print_email_timeline,
|
|
28
|
+
print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
|
|
25
29
|
print_json_metadata, write_urls)
|
|
26
|
-
from epstein_files.util.rich import (build_highlighter, console, print_color_key, print_title_page_header,
|
|
30
|
+
from epstein_files.util.rich import (build_highlighter, console, highlighter, print_color_key, print_title_page_header,
|
|
27
31
|
print_title_page_tables, print_subtitle_panel, write_html)
|
|
28
32
|
from epstein_files.util.timer import Timer
|
|
29
33
|
from epstein_files.util.word_count import write_word_counts_html
|
|
@@ -38,15 +42,15 @@ def generate_html() -> None:
|
|
|
38
42
|
timer = Timer()
|
|
39
43
|
epstein_files = EpsteinFiles.get_files(timer)
|
|
40
44
|
|
|
41
|
-
if args.json_metadata:
|
|
45
|
+
if args.emailers_info:
|
|
46
|
+
print_emailers_info(epstein_files)
|
|
47
|
+
exit()
|
|
48
|
+
elif args.json_metadata:
|
|
42
49
|
print_json_metadata(epstein_files)
|
|
43
50
|
exit()
|
|
44
51
|
elif args.json_files:
|
|
45
52
|
print_json_files(epstein_files)
|
|
46
53
|
exit()
|
|
47
|
-
elif args.emailers_info_png:
|
|
48
|
-
print_emailers_info_png(epstein_files)
|
|
49
|
-
exit()
|
|
50
54
|
|
|
51
55
|
print_title_page_header()
|
|
52
56
|
|
|
@@ -59,29 +63,26 @@ def generate_html() -> None:
|
|
|
59
63
|
exit()
|
|
60
64
|
|
|
61
65
|
if args.output_texts:
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
timer.print_at_checkpoint(f'Printed {len(imessage_logs)} text message log files')
|
|
66
|
+
printed_logs = print_text_messages_section(epstein_files)
|
|
67
|
+
timer.log_section_complete('MessengerLog', epstein_files.imessage_logs, printed_logs)
|
|
65
68
|
|
|
66
69
|
if args.output_emails:
|
|
67
|
-
|
|
68
|
-
timer.
|
|
70
|
+
printed_emails = print_emails_section(epstein_files)
|
|
71
|
+
timer.log_section_complete('Email', epstein_files.emails, printed_emails)
|
|
69
72
|
elif args.email_timeline:
|
|
70
73
|
print_email_timeline(epstein_files)
|
|
71
74
|
timer.print_at_checkpoint(f"Printed chronological emails table")
|
|
72
75
|
|
|
73
76
|
if args.output_other:
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
else:
|
|
77
|
-
files = [f for f in epstein_files.other_files if args.all_other_files or f.is_interesting()]
|
|
78
|
-
|
|
79
|
-
print_other_files_section(files, epstein_files)
|
|
80
|
-
timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")
|
|
77
|
+
printed_files = print_other_files_section(epstein_files)
|
|
78
|
+
timer.log_section_complete('OtherFile', epstein_files.other_files, printed_files)
|
|
81
79
|
|
|
82
80
|
write_html(args.build)
|
|
83
81
|
logger.warning(f"Total time: {timer.seconds_since_start_str()}")
|
|
84
82
|
|
|
83
|
+
if args.debug:
|
|
84
|
+
highlighter.print_highlight_counts(console)
|
|
85
|
+
|
|
85
86
|
# JSON stats (mostly used for building pytest checks)
|
|
86
87
|
if args.json_stats:
|
|
87
88
|
print_json_stats(epstein_files)
|
|
@@ -94,36 +95,52 @@ def epstein_diff():
|
|
|
94
95
|
|
|
95
96
|
def epstein_search():
|
|
96
97
|
"""Search the cleaned up text of the files."""
|
|
97
|
-
_assert_positional_args()
|
|
98
98
|
epstein_files = EpsteinFiles.get_files()
|
|
99
99
|
|
|
100
|
+
if ID_REGEX.match(args.positional_args[0]):
|
|
101
|
+
logger.warning(f"'{args.positional_args[0]}' seems to be an ID, running epstein_show instead...")
|
|
102
|
+
epstein_show()
|
|
103
|
+
return
|
|
104
|
+
|
|
100
105
|
for search_term in args.positional_args:
|
|
101
106
|
temp_highlighter = build_highlighter(search_term)
|
|
102
107
|
search_results = epstein_files.docs_matching(search_term, args.names)
|
|
103
108
|
print_subtitle_panel(f"Found {len(search_results)} documents matching '{search_term}'")
|
|
104
109
|
|
|
105
110
|
for search_result in search_results:
|
|
106
|
-
|
|
111
|
+
document = search_result.document
|
|
112
|
+
|
|
113
|
+
if (isinstance(document, Email) and not args.output_emails) \
|
|
114
|
+
or (isinstance(document, OtherFile) and not args.output_other) \
|
|
115
|
+
or (isinstance(document, MessengerLog) and not args.output_texts):
|
|
116
|
+
document.warn(f"{type(document).__name__} Skipping search result...")
|
|
117
|
+
continue
|
|
107
118
|
|
|
108
119
|
if args.whole_file:
|
|
109
|
-
console.print(
|
|
120
|
+
console.print(document)
|
|
110
121
|
else:
|
|
111
|
-
console.print(
|
|
122
|
+
console.print(document.summary_panel())
|
|
112
123
|
|
|
113
124
|
for matching_line in search_result.lines:
|
|
114
125
|
line_txt = matching_line.__rich__()
|
|
115
126
|
console.print(Padding(temp_highlighter(line_txt), INFO_PADDING), style='gray37')
|
|
116
127
|
|
|
128
|
+
console.line()
|
|
129
|
+
|
|
117
130
|
|
|
118
131
|
def epstein_show():
|
|
119
132
|
"""Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
|
|
120
|
-
_assert_positional_args()
|
|
121
133
|
raw_docs: list[Document] = []
|
|
122
134
|
console.line()
|
|
123
135
|
|
|
124
136
|
try:
|
|
125
|
-
|
|
126
|
-
|
|
137
|
+
if args.names:
|
|
138
|
+
people = EpsteinFiles.get_files().person_objs(args.names)
|
|
139
|
+
raw_docs = [doc for doc in flatten([p.emails for p in people])]
|
|
140
|
+
else:
|
|
141
|
+
ids = [extract_file_id(arg) for arg in args.positional_args]
|
|
142
|
+
raw_docs = [Document(coerce_file_path(id)) for id in ids]
|
|
143
|
+
|
|
127
144
|
docs = Document.sort_by_timestamp([document_cls(doc)(doc.file_path) for doc in raw_docs])
|
|
128
145
|
except Exception as e:
|
|
129
146
|
exit_with_error(str(e))
|
|
@@ -142,8 +159,3 @@ def epstein_show():
|
|
|
142
159
|
|
|
143
160
|
def epstein_word_count() -> None:
|
|
144
161
|
write_word_counts_html()
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
def _assert_positional_args():
|
|
148
|
-
if not args.positional_args:
|
|
149
|
-
exit_with_error(f"No positional args provided!\n")
|
|
@@ -34,9 +34,6 @@ class Communication(Document):
|
|
|
34
34
|
"""Overrides super() method to apply self.author_style."""
|
|
35
35
|
return super().external_links_txt(self.author_style(), include_alt_links=include_alt_links)
|
|
36
36
|
|
|
37
|
-
def is_attribution_uncertain(self) -> bool:
|
|
38
|
-
return bool(self.config and self.config.is_attribution_uncertain)
|
|
39
|
-
|
|
40
37
|
def summary(self) -> Text:
|
|
41
38
|
return self._summary().append(CLOSE_PROPERTIES_CHAR)
|
|
42
39
|
|
|
@@ -11,17 +11,19 @@ from rich.console import Console, ConsoleOptions, Group, RenderResult
|
|
|
11
11
|
from rich.padding import Padding
|
|
12
12
|
from rich.panel import Panel
|
|
13
13
|
from rich.text import Text
|
|
14
|
+
from rich.table import Table
|
|
14
15
|
|
|
15
16
|
from epstein_files.util.constant.names import *
|
|
16
17
|
from epstein_files.util.constant.strings import *
|
|
17
18
|
from epstein_files.util.constant.urls import *
|
|
18
19
|
from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
|
|
19
|
-
from epstein_files.util.data import collapse_newlines, date_str, patternize,
|
|
20
|
+
from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time, without_falsey
|
|
20
21
|
from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
|
|
21
22
|
from epstein_files.util.env import DOCS_DIR, args
|
|
22
|
-
from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, is_local_extract_file
|
|
23
|
+
from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, file_size_to_str, is_local_extract_file
|
|
23
24
|
from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
|
|
24
|
-
from epstein_files.util.rich import INFO_STYLE,
|
|
25
|
+
from epstein_files.util.rich import (INFO_STYLE, NA_TXT, SYMBOL_STYLE, add_cols_to_table, build_table, console,
|
|
26
|
+
highlighter, join_texts, key_value_txt, link_text_obj, parenthesize)
|
|
25
27
|
from epstein_files.util.search_result import MatchedLine
|
|
26
28
|
|
|
27
29
|
ALT_LINK_STYLE = 'white dim'
|
|
@@ -55,6 +57,14 @@ OCR_REPAIRS = {
|
|
|
55
57
|
'Nil Priell': 'Nili Priell',
|
|
56
58
|
}
|
|
57
59
|
|
|
60
|
+
SUMMARY_TABLE_COLS: list[str | dict] = [
|
|
61
|
+
'Count',
|
|
62
|
+
{'name': 'Has Author', 'style': 'honeydew2'},
|
|
63
|
+
{'name': 'No Author', 'style': 'wheat4'},
|
|
64
|
+
{'name': 'Uncertain Author', 'style': 'royal_blue1 dim'},
|
|
65
|
+
{'name': 'Size', 'justify': 'right', 'style': 'dim'},
|
|
66
|
+
]
|
|
67
|
+
|
|
58
68
|
|
|
59
69
|
@dataclass
|
|
60
70
|
class Document:
|
|
@@ -181,6 +191,9 @@ class Document:
|
|
|
181
191
|
"""Secondary info about this file (description recipients, etc). Overload in subclasses."""
|
|
182
192
|
return None
|
|
183
193
|
|
|
194
|
+
def is_attribution_uncertain(self) -> bool:
|
|
195
|
+
return bool(self.config and self.config.is_attribution_uncertain)
|
|
196
|
+
|
|
184
197
|
def is_duplicate(self) -> bool:
|
|
185
198
|
return bool(self.duplicate_of_id())
|
|
186
199
|
|
|
@@ -240,17 +253,6 @@ class Document:
|
|
|
240
253
|
|
|
241
254
|
return text
|
|
242
255
|
|
|
243
|
-
def sort_key(self) -> tuple[datetime, str, int]:
|
|
244
|
-
"""Sort by timestamp, file_id, then whether or not it's a duplicate file."""
|
|
245
|
-
if self.is_duplicate():
|
|
246
|
-
sort_id = self.config.duplicate_of_id
|
|
247
|
-
dupe_idx = 1
|
|
248
|
-
else:
|
|
249
|
-
sort_id = self.file_id
|
|
250
|
-
dupe_idx = 0
|
|
251
|
-
|
|
252
|
-
return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
|
|
253
|
-
|
|
254
256
|
def source_file_id(self) -> str:
|
|
255
257
|
"""Strip off the _1, _2, etc. suffixes for extracted documents."""
|
|
256
258
|
return self.file_id[0:6]
|
|
@@ -261,7 +263,7 @@ class Document:
|
|
|
261
263
|
txt.append(f" {self.file_path.stem}", style=FILENAME_STYLE)
|
|
262
264
|
|
|
263
265
|
if self.timestamp:
|
|
264
|
-
timestamp_str =
|
|
266
|
+
timestamp_str = remove_zero_time(self.timestamp).replace('T', ' ')
|
|
265
267
|
txt.append(' (', style=SYMBOL_STYLE)
|
|
266
268
|
txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
|
|
267
269
|
|
|
@@ -269,7 +271,7 @@ class Document:
|
|
|
269
271
|
txt.append(", ").append(key_value_txt('lines', self.num_lines()))
|
|
270
272
|
|
|
271
273
|
if self.config and self.config.duplicate_of_id:
|
|
272
|
-
txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='
|
|
274
|
+
txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='cyan dim')))
|
|
273
275
|
|
|
274
276
|
return txt
|
|
275
277
|
|
|
@@ -282,6 +284,17 @@ class Document:
|
|
|
282
284
|
|
|
283
285
|
return Panel(Group(*sentences), border_style=self._class_style(), expand=False)
|
|
284
286
|
|
|
287
|
+
def timestamp_sort_key(self) -> tuple[datetime, str, int]:
|
|
288
|
+
"""Sort by timestamp, file_id, then whether or not it's a duplicate file."""
|
|
289
|
+
if self.is_duplicate():
|
|
290
|
+
sort_id = self.config.duplicate_of_id
|
|
291
|
+
dupe_idx = 1
|
|
292
|
+
else:
|
|
293
|
+
sort_id = self.file_id
|
|
294
|
+
dupe_idx = 0
|
|
295
|
+
|
|
296
|
+
return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
|
|
297
|
+
|
|
285
298
|
def top_lines(self, n: int = 10) -> str:
|
|
286
299
|
"""First n lines."""
|
|
287
300
|
return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
|
|
@@ -361,6 +374,32 @@ class Document:
|
|
|
361
374
|
def __str__(self) -> str:
|
|
362
375
|
return self.summary().plain
|
|
363
376
|
|
|
377
|
+
@classmethod
|
|
378
|
+
def file_info_table(cls, title: str, first_col_name: str) -> Table:
|
|
379
|
+
"""Empty table with appropriate cols for summarizing groups of files."""
|
|
380
|
+
table = build_table(title)
|
|
381
|
+
cols = [{'name': first_col_name, 'min_width': 14}] + SUMMARY_TABLE_COLS
|
|
382
|
+
add_cols_to_table(table, cols, 'right')
|
|
383
|
+
return table
|
|
384
|
+
|
|
385
|
+
@classmethod
|
|
386
|
+
def files_info(cls, files: Sequence['Document'], is_author_na: bool = False) -> dict[str, str | Text]:
|
|
387
|
+
"""Summary info about a group of files."""
|
|
388
|
+
file_count = len(files)
|
|
389
|
+
author_count = cls.known_author_count(files)
|
|
390
|
+
|
|
391
|
+
return {
|
|
392
|
+
'count': str(file_count),
|
|
393
|
+
'author_count': NA_TXT if is_author_na else str(author_count),
|
|
394
|
+
'no_author_count': NA_TXT if is_author_na else str(file_count - author_count),
|
|
395
|
+
'uncertain_author_count': NA_TXT if is_author_na else str(len([f for f in files if f.is_attribution_uncertain()])),
|
|
396
|
+
'bytes': file_size_to_str(sum([f.file_size() for f in files])),
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
@classmethod
|
|
400
|
+
def files_info_row(cls, files: Sequence['Document'], author_na: bool = False) -> Sequence[str | Text]:
|
|
401
|
+
return [v for v in cls.files_info(files, author_na).values()]
|
|
402
|
+
|
|
364
403
|
@staticmethod
|
|
365
404
|
def diff_files(files: list[str]) -> None:
|
|
366
405
|
"""Diff the contents of two Documents after all cleanup, BOM removal, etc."""
|
|
@@ -396,16 +435,24 @@ class Document:
|
|
|
396
435
|
"""Count of how many Document objects have an author attribution."""
|
|
397
436
|
return len([doc for doc in docs if doc.author])
|
|
398
437
|
|
|
438
|
+
@staticmethod
|
|
439
|
+
def sort_by_id(docs: Sequence['DocumentType']) -> list['DocumentType']:
|
|
440
|
+
return sorted(docs, key=lambda d: d.file_id)
|
|
441
|
+
|
|
399
442
|
@staticmethod
|
|
400
443
|
def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
|
|
401
|
-
return sorted(docs, key=lambda doc: doc.sort_key())
|
|
444
|
+
return sorted(docs, key=lambda doc: doc.timestamp_sort_key())
|
|
402
445
|
|
|
403
|
-
@
|
|
404
|
-
def uniquify(
|
|
446
|
+
@staticmethod
|
|
447
|
+
def uniquify(documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
|
|
405
448
|
"""Uniquify by file_id."""
|
|
406
449
|
id_map = {doc.file_id: doc for doc in documents}
|
|
407
450
|
return [doc for doc in id_map.values()]
|
|
408
451
|
|
|
452
|
+
@staticmethod
|
|
453
|
+
def without_dupes(docs: Sequence['DocumentType']) -> list['DocumentType']:
|
|
454
|
+
return [doc for doc in docs if not doc.is_duplicate()]
|
|
455
|
+
|
|
409
456
|
|
|
410
457
|
DocumentType = TypeVar('DocumentType', bound=Document)
|
|
411
458
|
|