epstein-files 1.2.1-py3-none-any.whl → 1.4.1-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +55 -11
- epstein_files/documents/document.py +13 -2
- epstein_files/documents/email.py +329 -258
- epstein_files/documents/emails/email_header.py +17 -8
- epstein_files/documents/other_file.py +8 -6
- epstein_files/epstein_files.py +18 -4
- epstein_files/person.py +65 -20
- epstein_files/util/constant/names.py +18 -12
- epstein_files/util/constant/output_files.py +8 -5
- epstein_files/util/constant/strings.py +4 -2
- epstein_files/util/constant/urls.py +13 -2
- epstein_files/util/constants.py +486 -224
- epstein_files/util/data.py +1 -0
- epstein_files/util/doc_cfg.py +33 -27
- epstein_files/util/env.py +18 -8
- epstein_files/util/file_helper.py +2 -0
- epstein_files/util/highlighted_group.py +321 -132
- epstein_files/util/output.py +19 -24
- epstein_files/util/rich.py +9 -3
- epstein_files/util/word_count.py +2 -2
- {epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/METADATA +3 -3
- epstein_files-1.4.1.dist-info/RECORD +34 -0
- {epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/entry_points.txt +1 -1
- epstein_files-1.2.1.dist-info/RECORD +0 -34
- {epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/LICENSE +0 -0
- {epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/WHEEL +0 -0
epstein_files/__init__.py
CHANGED
```diff
@@ -16,15 +16,19 @@ from rich.text import Text
 from epstein_files.epstein_files import EpsteinFiles, document_cls
 from epstein_files.documents.document import INFO_PADDING, Document
 from epstein_files.documents.email import Email
+from epstein_files.documents.messenger_log import MessengerLog
+from epstein_files.documents.other_file import OtherFile
 from epstein_files.util.constant.output_files import make_clean
+from epstein_files.util.constant.strings import ID_REGEX
+from epstein_files.util.data import flatten
 from epstein_files.util.env import args
 from epstein_files.util.file_helper import coerce_file_path, extract_file_id
 from epstein_files.util.logging import exit_with_error, logger
 from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
     print_other_files_section, print_text_messages_section, print_email_timeline, print_emailers_info,
     print_json_metadata, write_urls)
-from epstein_files.util.rich import (build_highlighter, console, print_color_key,
-    print_title_page_tables, print_subtitle_panel, write_html)
+from epstein_files.util.rich import (build_highlighter, console, highlighter, print_color_key, print_json,
+    print_title_page_header, print_title_page_tables, print_subtitle_panel, write_html)
 from epstein_files.util.timer import Timer
 from epstein_files.util.word_count import write_word_counts_html
 
@@ -76,6 +80,9 @@ def generate_html() -> None:
     write_html(args.build)
     logger.warning(f"Total time: {timer.seconds_since_start_str()}")
 
+    if args.debug:
+        highlighter.print_highlight_counts(console)
+
     # JSON stats (mostly used for building pytest checks)
     if args.json_stats:
         print_json_stats(epstein_files)
@@ -86,27 +93,54 @@ def epstein_diff():
     Document.diff_files(args.positional_args)
 
 
-def
+def epstein_grep():
     """Search the cleaned up text of the files."""
     epstein_files = EpsteinFiles.get_files()
 
+    if ID_REGEX.match(args.positional_args[0]):
+        logger.warning(f"'{args.positional_args[0]}' seems to be an ID, running epstein_show instead...")
+        epstein_show()
+        return
+
     for search_term in args.positional_args:
         temp_highlighter = build_highlighter(search_term)
         search_results = epstein_files.docs_matching(search_term, args.names)
         print_subtitle_panel(f"Found {len(search_results)} documents matching '{search_term}'")
+        last_document = None
 
         for search_result in search_results:
-
-
-
-
+            doc = search_result.document
+            lines = search_result.lines
+
+            if (isinstance(doc, Email) and not args.output_emails) \
+                    or (isinstance(doc, OtherFile) and not args.output_other) \
+                    or (isinstance(doc, MessengerLog) and not args.output_texts):
+                doc.log(f"{type(doc).__name__} Skipping search result...")
+                continue
+            elif isinstance(doc, Email) and args.email_body:
+                lines = [l for l in search_result.lines if l.line_number > doc.header.num_header_rows]
+
+                if not lines:
+                    doc.log(f"None of the matches for '{search_term}' seem to be in the body of the email")
+                    continue
+
+            if doc.is_duplicate():
+                if last_document and not last_document.is_duplicate():
+                    console.line()
+
+                last_document = doc
+                console.print(doc.duplicate_file_txt())
+            elif args.whole_file:
+                console.print(doc)
             else:
-                console.print(
+                console.print(doc.summary_panel())
 
-            for matching_line in
+            for matching_line in lines:
                 line_txt = matching_line.__rich__()
                 console.print(Padding(temp_highlighter(line_txt), INFO_PADDING), style='gray37')
 
+        console.line()
+
 
 def epstein_show():
     """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
```
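The new `epstein_grep` entry point short-circuits to `epstein_show` when its first positional argument looks like a file ID rather than a search term. `ID_REGEX` is imported from `epstein_files/util/constant/strings.py` and its actual pattern is not visible in this diff; the sketch below uses a hypothetical HOUSE_OVERSIGHT-style pattern purely to illustrate the dispatch check.

```python
import re

# Hypothetical stand-in: the real ID_REGEX is defined in
# epstein_files/util/constant/strings.py and is not shown in this diff.
ID_REGEX = re.compile(r"HOUSE_OVERSIGHT_\d+")

def route(first_arg: str) -> str:
    """Mirrors the dispatch check added at the top of epstein_grep()."""
    return "epstein_show" if ID_REGEX.match(first_arg) else "epstein_grep"

print(route("HOUSE_OVERSIGHT_012345"))  # -> epstein_show
print(route("flight logs"))             # -> epstein_grep
```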
```diff
@@ -114,8 +148,13 @@ def epstein_show():
     console.line()
 
     try:
-
-
+        if args.names:
+            people = EpsteinFiles.get_files().person_objs(args.names)
+            raw_docs = [doc for doc in flatten([p.emails for p in people])]
+        else:
+            ids = [extract_file_id(arg.strip().strip('_')) for arg in args.positional_args]
+            raw_docs = [Document(coerce_file_path(id)) for id in ids]
+
         docs = Document.sort_by_timestamp([document_cls(doc)(doc.file_path) for doc in raw_docs])
     except Exception as e:
         exit_with_error(str(e))
```
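`epstein_show` can now be driven by `--names`, collapsing each matched person's email list with `flatten` from `epstein_files/util/data.py`. The implementation of `flatten` is not part of this diff; the stand-in below only illustrates the one-level flattening the call site needs.

```python
from typing import Iterable, TypeVar

T = TypeVar("T")

def flatten(nested: Iterable[Iterable[T]]) -> list[T]:
    """Illustrative one-level flatten: [[a, b], [c]] -> [a, b, c]."""
    return [item for sublist in nested for item in sublist]

print(flatten([["email_1", "email_2"], ["email_3"]]))  # ['email_1', 'email_2', 'email_3']
```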
```diff
@@ -130,6 +169,11 @@ def epstein_show():
         if isinstance(doc, Email):
             console.print(Panel(Text("actual_text: ").append(doc.summary()), expand=False, style=doc._border_style()))
             console.print(escape(doc._actual_text()), '\n')
+            metadata = doc.metadata()
+            metadata['is_fwded_article'] = doc.is_fwded_article()
+            metadata['is_word_count_worthy'] = doc.is_word_count_worthy()
+            metadata['_is_first_for_user'] = doc._is_first_for_user
+            print_json(f"{doc.file_id} Metadata", metadata)
 
 
 def epstein_word_count() -> None:
```
epstein_files/documents/document.py
CHANGED
```diff
@@ -197,6 +197,9 @@ class Document:
     def is_duplicate(self) -> bool:
         return bool(self.duplicate_of_id())
 
+    def is_interesting(self) -> bool:
+        return bool(self.config and self.config.is_interesting)
+
     def is_local_extract_file(self) -> bool:
         """True if extracted from other file (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
         return is_local_extract_file(self.filename)
```
```diff
@@ -267,11 +270,11 @@ class Document:
         txt.append(' (', style=SYMBOL_STYLE)
         txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
 
-        txt.append(' [').append(key_value_txt('size', Text(self.
+        txt.append(' [').append(key_value_txt('size', Text(str(self.length()), style='aquamarine1')))
         txt.append(", ").append(key_value_txt('lines', self.num_lines()))
 
         if self.config and self.config.duplicate_of_id:
-            txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='
+            txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='cyan dim')))
 
         return txt
 
```
```diff
@@ -435,6 +438,14 @@ class Document:
         """Count of how many Document objects have an author attribution."""
         return len([doc for doc in docs if doc.author])
 
+    @staticmethod
+    def sort_by_id(docs: Sequence['DocumentType']) -> list['DocumentType']:
+        return sorted(docs, key=lambda d: d.file_id)
+
+    @staticmethod
+    def sort_by_length(docs: Sequence['DocumentType']) -> list['DocumentType']:
+        return sorted(docs, key=lambda d: d.file_size(), reverse=True)
+
     @staticmethod
     def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
         return sorted(docs, key=lambda doc: doc.timestamp_sort_key())
```
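The two new static sorters follow the same pattern as the existing `sort_by_timestamp`: a plain `sorted()` call keyed on a document attribute. A minimal sketch of how they behave, using a hypothetical stand-in class instead of the real `Document` hierarchy:

```python
from dataclasses import dataclass

@dataclass
class FakeDoc:
    """Hypothetical stand-in for DocumentType; only the attributes the sorters use."""
    file_id: str
    size: int

    def file_size(self) -> int:
        return self.size

docs = [FakeDoc("HOUSE_OVERSIGHT_000001", 40), FakeDoc("HOUSE_OVERSIGHT_000002", 900)]

by_id = sorted(docs, key=lambda d: d.file_id)                        # same key as sort_by_id
by_length = sorted(docs, key=lambda d: d.file_size(), reverse=True)  # same key as sort_by_length

print([d.file_id for d in by_id])      # ['HOUSE_OVERSIGHT_000001', 'HOUSE_OVERSIGHT_000002']
print([d.file_id for d in by_length])  # ['HOUSE_OVERSIGHT_000002', 'HOUSE_OVERSIGHT_000001']
```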