epstein-files 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +59 -51
- epstein_files/documents/communication.py +9 -9
- epstein_files/documents/document.py +111 -87
- epstein_files/documents/email.py +154 -85
- epstein_files/documents/emails/email_header.py +7 -6
- epstein_files/documents/imessage/text_message.py +3 -2
- epstein_files/documents/json_file.py +17 -0
- epstein_files/documents/messenger_log.py +62 -3
- epstein_files/documents/other_file.py +165 -17
- epstein_files/epstein_files.py +100 -143
- epstein_files/util/constant/names.py +6 -0
- epstein_files/util/constant/strings.py +27 -0
- epstein_files/util/constant/urls.py +22 -9
- epstein_files/util/constants.py +968 -1015
- epstein_files/util/data.py +14 -28
- epstein_files/util/{file_cfg.py → doc_cfg.py} +120 -34
- epstein_files/util/env.py +16 -18
- epstein_files/util/file_helper.py +56 -17
- epstein_files/util/highlighted_group.py +227 -175
- epstein_files/util/logging.py +57 -0
- epstein_files/util/rich.py +18 -13
- epstein_files/util/search_result.py +14 -6
- epstein_files/util/timer.py +24 -0
- epstein_files/util/word_count.py +2 -1
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.1.dist-info}/METADATA +3 -2
- epstein_files-1.0.1.dist-info/RECORD +30 -0
- epstein_files-1.0.0.dist-info/RECORD +0 -28
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.1.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.1.dist-info}/WHEEL +0 -0
epstein_files/util/data.py
CHANGED
@@ -13,7 +13,8 @@ from dateutil.parser import parse
 from rich.text import Text
 
 from epstein_files.util.constant import names
-from epstein_files.util.env import args
+from epstein_files.util.env import args
+from epstein_files.util.logging import logger
 
 T = TypeVar('T')
 
@@ -26,14 +27,6 @@ PACIFIC_TZ = tz.gettz("America/Los_Angeles")
 TIMEZONE_INFO = {"PST": PACIFIC_TZ, "PDT": PACIFIC_TZ}  # Suppresses annoying warnings from parse() calls
 
 
-def collapse_newlines(text: str) -> str:
-    return MULTINEWLINE_REGEX.sub('\n\n', text)
-
-
-def date_str(timestamp: datetime | None) -> str | None:
-    return timestamp.isoformat()[0:10] if timestamp else None
-
-
 def dict_sets_to_lists(d: dict[str, set]) -> dict[str, list]:
     return {k: sorted(list(v)) for k, v in d.items()}
 
@@ -70,14 +63,19 @@ def flatten(_list: list[list[T]]) -> list[T]:
     return list(itertools.chain.from_iterable(_list))
 
 
-def …
-    return …
+def json_safe(d: dict) -> dict:
+    return {
+        'None' if k is None else k: v.isoformat() if isinstance(v, datetime) else v
+        for k,v in d.items()
+    }
 
 
-def listify(listlike…
+def listify(listlike) -> list:
     """Create a list of 'listlike'. Returns empty list if 'listlike' is None or empty string."""
     if isinstance(listlike, list):
         return listlike
+    elif listlike is None:
+        return [None]
     elif listlike:
         return [listlike]
     else:
@@ -110,22 +108,10 @@ def sort_dict(d: dict[str | None, int] | dict[str, int]) -> list[tuple[str | Non…
     return sorted(d.items(), key=sort_key)
 
 
-
-
-    started_at: float = field(default_factory=lambda: time.perf_counter())
-    checkpoint_at: float = field(default_factory=lambda: time.perf_counter())
-
-    def print_at_checkpoint(self, msg: str) -> None:
-        logger.warning(f"{msg} in {self.seconds_since_checkpoint()}")
-        self.checkpoint_at = time.perf_counter()
-
-    def seconds_since_checkpoint(self) -> str:
-        return f"{(time.perf_counter() - self.checkpoint_at):.2f} seconds"
-
-    def seconds_since_start(self) -> str:
-        return f"{(time.perf_counter() - self.started_at):.2f} seconds"
-
-
+collapse_newlines = lambda text: MULTINEWLINE_REGEX.sub('\n\n', text)
+date_str = lambda dt: dt.isoformat()[0:10] if dt else None
 escape_double_quotes = lambda text: text.replace('"', r'\"')
 escape_single_quotes = lambda text: text.replace("'", r"\'")
+iso_timestamp = lambda dt: dt.isoformat().replace('T', ' ')
 uniquify = lambda _list: list(set(_list))
+without_nones = lambda _list: [e for e in _list if e]
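
For orientation, a hedged sketch (not part of the diff) of how the reworked data.py helpers behave, inferred from the hunks above; the import path is assumed:

from datetime import datetime
from epstein_files.util.data import date_str, json_safe, listify, without_nones  # path assumed

# json_safe() makes a dict JSON-serializable: a None key becomes the string 'None'
# and datetime values become ISO-8601 strings.
json_safe({None: datetime(2019, 7, 6), 'count': 3})  # => {'None': '2019-07-06T00:00:00', 'count': 3}

listify('bob')                   # => ['bob']
listify(None)                    # => [None]  (new branch in 1.0.1)
date_str(datetime(2019, 7, 6))   # => '2019-07-06'
without_nones(['a', None, 'b'])  # => ['a', 'b']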
epstein_files/util/{file_cfg.py → doc_cfg.py}
CHANGED

@@ -6,22 +6,27 @@ from typing import Generator, Literal
 
 from dateutil.parser import parse
 
-from epstein_files.util.constant.names import …
-from epstein_files.util.constant.strings import …
+from epstein_files.util.constant.names import *
+from epstein_files.util.constant.strings import *
+from epstein_files.util.data import without_nones
 
-DuplicateType = Literal['…
+DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
+Metadata = dict[str, bool | datetime | int | str | list[str | None] | dict[str, bool | str]]
 
+# Misc
+CONSTANTIZE_NAMES = False  # A flag set to True that causes repr() of these classes to return strings of usable code
 INDENT = ' '
 INDENT_NEWLINE = f'\n{INDENT}'
 INDENTED_JOIN = f',{INDENT_NEWLINE}'
-
-
+MAX_LINE_LENGTH = 150
+REPUTATION_MGMT = f'{REPUTATION} management'
+SAME = 'same'
 
-…
-    'earlier': 'earlier draft of',
+DUPE_TYPE_STRS: dict[DuplicateType, str] = {
+    'earlier': 'an earlier draft of',
     'quoted': 'quoted in full in',
-    'redacted': 'redacted version of',
-…
+    'redacted': 'a redacted version of',
+    SAME: 'the same as',
 }
 
 FIELD_SORT_KEY = {
@@ -30,57 +35,114 @@ FIELD_SORT_KEY = {
     'attribution_reason': 'zz',
 }
 
+FINANCIAL_REPORTS_AUTHORS = [
+    BOFA,
+    DEUTSCHE_BANK,
+    ELECTRON_CAPITAL_PARTNERS,
+    GOLDMAN_INVESTMENT_MGMT,
+    'Invesco',
+    JP_MORGAN,
+    'Morgan Stanley',
+    'S&P',
+]
+
+# Fields like timestamp and author are better added from the Document object
+INVALID_FOR_METADATA = [
+    'actual_text',
+    'date',
+    'id',
+    'timestamp',
+    'was_generated',
+]
+
 
 @dataclass(kw_only=True)
-class …
-    """
+class DocCfg:
+    """
+    Encapsulates info about files that needs to be manually configured because it cannot be programmatically inferred.
 
     Attributes:
         id (str): ID of file
         author (str | None): Author of the document (if any)
+        category (str | None): Type of file
         date (str | None): If passed will be immediated parsed into the 'timestamp' field
         dupe_of_id (str | None): If this is a dupe the ID of the duplicated file. This file will be suppressed
-        dupe_type (DuplicateType | None): The type of duplicate this file is
-        duplicate_ids (list[str]): Inverse of 'dupe_of_id' - this file will NOT be suppressed but 'duplicate_ids' will be
+        dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
+        duplicate_ids (list[str]): Inverse of 'dupe_of_id' - this file will NOT be suppressed but 'duplicate_ids' will be
+        is_interesting (bool): Override other considerations and always consider this file interesting
        timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
         was_generated (bool): True if this object was generated by the duplicate_cfgs() method
     """
     id: str
     author: str | None = None
+    category: str | None = None
     date: str | None = None
     description: str | None = None
     dupe_of_id: str | None = None
     dupe_type: DuplicateType | None = None
     duplicate_ids: list[str] = field(default_factory=list)
+    is_interesting: bool = False
     timestamp: datetime | None = None
-    was_generated: bool = False
+    was_generated: bool = False
 
     def __post_init__(self):
-        if self.dupe_of_id:
-            self.dupe_type = self.dupe_type or 'same'
-
         if self.date:
             self.timestamp = parse(self.date)
 
+        if self.dupe_of_id or self.duplicate_ids:
+            self.dupe_type = self.dupe_type or SAME
+
     def duplicate_reason(self) -> str | None:
         if self.dupe_type is not None:
-            return …
+            return DUPE_TYPE_STRS[self.dupe_type]
 
-    def duplicate_cfgs(self) -> Generator['…
+    def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
+        """Create synthetic DocCfg objects that set the 'dupe_of_id' field to point back to this object."""
         for id in self.duplicate_ids:
             dupe_cfg = deepcopy(self)
             dupe_cfg.id = id
             dupe_cfg.dupe_of_id = self.id
-            dupe_cfg.…
+            dupe_cfg.duplicate_ids = []
+            dupe_cfg.dupe_type = self.dupe_type
             dupe_cfg.was_generated = True
             yield dupe_cfg
 
+    def info_str(self) -> str | None:
+        """String that summarizes what is known about this document."""
+        if self.category == REPUTATION:
+            return f"{REPUTATION_MGMT}: {self.description}"
+        elif self.author and self.description:
+            if self.category in [ACADEMIA, BOOK]:
+                return self.title_by_author()
+            elif self.category == FINANCE and self.author in FINANCIAL_REPORTS_AUTHORS:
+                return f"{self.author} report: '{self.description}'"
+        elif self.category and self.author is None and self.description is None:
+            return self.category
+
+        pieces = without_nones([self.author, self.description])
+        return ' '.join(pieces) if pieces else None
+
+    def metadata(self) -> Metadata:
+        non_null_fields = {k: v for k, v in asdict(self).items() if v and k not in INVALID_FOR_METADATA}
+
+        if self.category in [EMAIL, TEXT_MESSAGE]:
+            del non_null_fields['category']
+
+        return non_null_fields
+
     def non_null_field_names(self) -> list[str]:
         return [f.name for f in self.sorted_fields() if getattr(self, f.name)]
 
     def sorted_fields(self) -> list[Field]:
         return sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name))
 
+    def title_by_author(self) -> str:
+        if not (self.author and self.description):
+            raise RuntimeError(f"Can't call title_by_author() without author and description!")
+
+        title = self.description if '"' in self.description else f"'{self.description}'"
+        return f"{title} by {self.author}"
+
     def _props_strs(self) -> list[str]:
         props = []
         add_prop = lambda f, value: props.append(f"{f.name}={value}")
@@ -92,14 +154,16 @@ class FileCfg:
             continue
         elif _field.name == AUTHOR:
             add_prop(_field, constantize_name(str(value)) if CONSTANTIZE_NAMES else f"'{value}'")
+        elif _field.name == 'category' and value in [EMAIL, TEXT_MESSAGE]:
+            continue
         elif _field.name == 'recipients' and isinstance(value, list):
             recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
             add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
+        elif _field.name == 'timestamp' and self.date is not None:
+            continue  # Don't print both timestamp and date
         elif isinstance(value, datetime):
             value_str = re.sub(' 00:00:00', '', str(value))
             add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
-        elif _field.name == 'description':
-            add_prop(_field, value.strip())
         elif isinstance(value, str):
             if "'" in value:
                 value = '"' + value.replace('"', r'\"') + '"'
@@ -112,7 +176,7 @@ class FileCfg:
 
         return props
 
-    def __eq__(self, other: '…
+    def __eq__(self, other: 'DocCfg') -> bool:
         """Return True if everything matches other than the two 'dupe_' fields ('duplicate_ids' is compared)."""
         for _field in self.sorted_fields():
             if _field.name == 'id' or _field.name.startswith('dupe'):
@@ -127,7 +191,7 @@ class FileCfg:
         type_str = f"{type(self).__name__}("
         single_line_repr = type_str + ', '.join(props) + f')'
 
-        if …
+        if len(single_line_repr) < MAX_LINE_LENGTH:
             repr_str = single_line_repr
         else:
             repr_str = f"{type_str}{INDENT_NEWLINE}" + INDENTED_JOIN.join(props)
@@ -142,31 +206,53 @@ class FileCfg:
 
 
 @dataclass(kw_only=True)
-class …
+class CommunicationCfg(DocCfg):
     """
-    Convenience class to unite various configured properties for a given Communication file.
     Manual config is always required for MessengerLog author attribution. It's also often needed for Email
     files to handle the terrible OCR text that Congress provided which messes up a lot of the email headers.
 
     Attributes:
-        actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
         attribution_reason (str | None): Optional explanation of why this email was attributed to this author.
         is_attribution_uncertain (bool): True if we have a good idea of who the author is but are not 100% certain
+    """
+    attribution_reason: str | None = None
+    is_attribution_uncertain: bool = False
+
+    def __repr__(self) -> str:
+        return super().__repr__()
+
+
+@dataclass(kw_only=True)
+class EmailCfg(CommunicationCfg):
+    """
+    Attributes:
+        actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
         is_fwded_article (bool): True if this is a newspaper article someone fwded. Used to exclude articles from word counting.
         recipients (list[str | None]): Who received the email
     """
     actual_text: str | None = None  # Override for the Email._actual_text() method for particularly broken emails
-    attribution_reason: str | None = None
-    is_attribution_uncertain: bool = False
     is_fwded_article: bool = False
     recipients: list[str | None] = field(default_factory=list)
 
-    def …
-
+    def __post_init__(self):
+        super().__post_init__()
+        self.category = EMAIL
 
+    @classmethod
+    def from_doc_cfg(cls, cfg: DocCfg) -> 'EmailCfg':
+        return cls(**asdict(cfg))
+
+    # This is necessary for some dumb reason. @dataclass(repr=False) doesn't cut it
     def __repr__(self) -> str:
         return super().__repr__()
 
-
-
-
+
+@dataclass(kw_only=True)
+class TextCfg(CommunicationCfg):
+    def __post_init__(self):
+        super().__post_init__()
+        self.category = TEXT_MESSAGE
+
+    # This is necessary for some dumb reason. @dataclass(repr=False) doesn't cut it
+    def __repr__(self) -> str:
+        return super().__repr__()
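
A hedged usage sketch of the renamed config classes (not part of the diff; the IDs are hypothetical and the module path is assumed from the rename):

from epstein_files.util.doc_cfg import DocCfg, EmailCfg  # module path assumed

# Declaring duplicates: __post_init__ now defaults dupe_type when either dupe field is set.
cfg = DocCfg(id='012345', duplicate_ids=['012346'])
cfg.dupe_type                # => 'same' (SAME), so duplicate_reason() => 'the same as'

# duplicate_cfgs() yields one synthetic DocCfg per duplicate, pointing back at this file.
dupe = next(cfg.duplicate_cfgs())
(dupe.id, dupe.dupe_of_id, dupe.was_generated)  # => ('012346', '012345', True)

# from_doc_cfg() promotes a DocCfg to an EmailCfg; its __post_init__ forces category to EMAIL.
email_cfg = EmailCfg.from_doc_cfg(cfg)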
epstein_files/util/env.py
CHANGED

@@ -4,7 +4,7 @@ from os import environ
 from pathlib import Path
 from sys import argv
 
-from …
+from epstein_files.util.logging import datefinder_logger, env_log_level, logger
 
 DEFAULT_WIDTH = 154
 HTML_SCRIPTS = ['generate_html.py', 'count_words.py']
@@ -12,9 +12,8 @@ HTML_SCRIPTS = ['generate_html.py', 'count_words.py']
 
 parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML page.")
 parser.add_argument('--build', '-b', action='store_true', help='write HTML to docs/index.html')
-parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails')
-parser.add_argument('--all-…
-parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of a limited selection')
+parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
+parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just interesting ones')
 parser.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
 parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
 parser.add_argument('--output-emails', '-oe', action='store_true', help='generate other files section')
@@ -22,24 +21,25 @@ parser.add_argument('--output-other-files', '-oo', action='store_true', help='ge…
 parser.add_argument('--output-texts', '-ot', action='store_true', help='generate other files section')
 parser.add_argument('--pickled', '-p', action='store_true', help='use pickled EpsteinFiles object')
 parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='generate new pickled EpsteinFiles object')
+parser.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (only used by scripts)')
 parser.add_argument('--sort-alphabetical', '-alpha', action='store_true', help='sort emailers alphabetically in counts table')
 parser.add_argument('--suppress-output', '-s', action='store_true', help='no output to terminal (use with --build)')
-parser.add_argument('--use-epstein-web-links', '-use', action='store_true', help='use epsteinweb.org links instead of …
-parser.add_argument('--search-other', '-so', action='store_true', help='search for string in non email/text files (only used by search script)')
+parser.add_argument('--use-epstein-web-links', '-use', action='store_true', help='use epsteinweb.org links instead of epstein.media')
 parser.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use')
 parser.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (only used by search script)')
 parser.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
 parser.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
+parser.add_argument('--make-clean', '-mc', action='store_true', help='delete all build artifact HTML and JSON files')
 parser.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
+parser.add_argument('--json-metadata', '-jm', action='store_true', help='dump JSON metadata for all files')
 parser.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats at the end')
 parser.add_argument('positional_args', nargs='*', help='Optional args (only used by helper scripts)')
 args = parser.parse_args()
 
-is_env_var_set = lambda s: len(environ.get(s) or '') > 0
 current_script = Path(argv[0]).name
+is_env_var_set = lambda s: len(environ.get(s) or '') > 0
 is_html_script = current_script in HTML_SCRIPTS
 
-args.deep_debug = args.deep_debug or is_env_var_set('DEEP_DEBUG')
 args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
 args.output_emails = args.output_emails or args.all_emails
 args.output_other_files = args.output_other_files or args.all_other_files
@@ -48,27 +48,25 @@ args.width = args.width if is_html_script else None
 specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]
 
 
-# …
-logging.basicConfig(level="NOTSET", format="%(message)s", datefmt="[%X]", handlers=[RichHandler()])
-# logging.basicConfig(level="DEBUG", handlers=[RichHandler()])
-logger = logging.getLogger("rich")
-
+# Log level args
 if args.deep_debug:
     logger.setLevel(logging.DEBUG)
 elif args.debug:
     logger.setLevel(logging.INFO)
 elif args.suppress_logs:
     logger.setLevel(logging.FATAL)
-
+elif not env_log_level:
     logger.setLevel(logging.WARNING)
 
-
+logger.info(f'Log level set to {logger.level}...')
 datefinder_logger.setLevel(logger.level)
 
 
 # Massage args that depend on other args to the appropriate state
-if not (args.output_texts or args.output_emails or args.output_other_files):
-…
+if not (args.json_metadata or args.output_texts or args.output_emails or args.output_other_files):
+    if is_html_script:
+        logger.warning(f"No output section chosen; outputting default of texts, selected emails, and other files...")
+
     args.output_texts = True
     args.output_emails = True
     args.output_other_files = True
@@ -77,4 +75,4 @@ if args.use_epstein_web_links:
     logger.warning(f"Using links to epsteinweb.org links instead of epsteinify.com...")
 
 if args.debug:
-    logger.warning(f"…
+    logger.warning(f"Invocation args:\nis_html_script={is_html_script},\nspecified_names={specified_names},\nargs={args}")
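
A minimal sketch of the new default-section cascade (it mirrors the logic above rather than importing env.py, since argument parsing happens at module import time):

def resolve_output_sections(json_metadata=False, output_texts=False,
                            output_emails=False, output_other_files=False):
    """If no output section (and no --json-metadata) was requested, output everything."""
    if not (json_metadata or output_texts or output_emails or output_other_files):
        output_texts = output_emails = output_other_files = True
    return output_texts, output_emails, output_other_files

resolve_output_sections()                    # => (True, True, True): the default fallback
resolve_output_sections(json_metadata=True)  # => (False, False, False): --json-metadata alone no longer triggers it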
epstein_files/util/file_helper.py
CHANGED

@@ -3,11 +3,12 @@ from os import environ
 from pathlib import Path
 from sys import exit
 
-from epstein_files.util.constant.strings import HOUSE_OVERSIGHT_PREFIX
+from epstein_files.util.constant.strings import FILE_NAME_REGEX, FILE_STEM_REGEX, HOUSE_OVERSIGHT_PREFIX
 
 EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
 DOCS_DIR_ENV = environ[EPSTEIN_DOCS_DIR_ENV_VAR_NAME]
 DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
+PICKLED_PATH = Path("the_epstein_files.pkl.gz")
 
 if not DOCS_DIR_ENV:
     print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!")
@@ -16,30 +17,36 @@ elif not DOCS_DIR.exists():
     print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!")
     exit(1)
 
-JSON_DIR = DOCS_DIR.joinpath('json_files')
 HTML_DIR = Path('docs')
 EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
+EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
 GH_PAGES_HTML_PATH = HTML_DIR.joinpath('index.html')
+JSON_METADATA_PATH = HTML_DIR.joinpath('epstein_files_nov_2025_cryptadamus_metadata.json')
 WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_emails_word_count.html')
-EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
-PICKLED_PATH = Path("the_epstein_files.pkl.gz")
 
-
-
+BUILD_ARTIFACTS = [
+    EPSTEIN_WORD_COUNT_HTML_PATH,
+    GH_PAGES_HTML_PATH,
+    JSON_METADATA_PATH,
+    WORD_COUNT_HTML_PATH,
+]
+
+FILE_ID_REGEX = re.compile(fr".*{FILE_NAME_REGEX.pattern}")
 FILENAME_LENGTH = len(HOUSE_OVERSIGHT_PREFIX) + 6
 KB = 1024
 MB = KB * KB
 
 
 # Handles both string and int 'id' args.
-…
+id_str = lambda id: f"{int(id):06d}"
 filename_for_id = lambda id: file_stem_for_id(id) + '.txt'
 
 
 def coerce_file_stem(filename_or_id: int | str) -> str:
     """Generate a valid file_stem no matter what form the argument comes in."""
     if isinstance(filename_or_id, str) and filename_or_id.startswith(HOUSE_OVERSIGHT_PREFIX):
-        …
+        file_id = extract_file_id(filename_or_id)
+        file_stem = file_stem_for_id(file_id)
     else:
         file_stem = file_stem_for_id(filename_or_id)
 
@@ -49,33 +56,65 @@ def coerce_file_stem(filename_or_id: int | str) -> str:
     return file_stem
 
 
-def …
-…
+def coerce_file_name(filename_or_id: int | str) -> str:
+    return coerce_file_stem(filename_or_id) + '.txt'
+
+
+def coerce_file_path(filename_or_id: int | str) -> Path:
+    return DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
+
+
+def extract_file_id(filename_or_id: int | str | Path) -> str:
+    if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
+        return id_str(filename_or_id)
+
+    file_match = FILE_ID_REGEX.match(str(filename_or_id))
 
     if not file_match:
-        raise RuntimeError(f"Failed to extract file ID from {…
+        raise RuntimeError(f"Failed to extract file ID from {filename_or_id}")
 
     return file_match.group(1)
 
 
+def file_size(file_path: str | Path) -> int:
+    return Path(file_path).stat().st_size
+
+
 def file_size_str(file_path: str | Path) -> str:
-    …
+    size = file_size(file_path)
     digits = 2
 
-    if …
-        size_num = …
+    if size > MB:
+        size_num = float(size) / MB
         size_str = 'MB'
-    elif …
-        size_num = …
+    elif size > KB:
+        size_num = float(size) / KB
         size_str = 'kb'
         digits = 1
     else:
-        return f"{…
+        return f"{size} b"
 
     return f"{size_num:,.{digits}f} {size_str}"
 
 
+def file_stem_for_id(id: int | str) -> str:
+    if isinstance(id, int) or (isinstance(id, str) and len(id) <= 6):
+        return f"{HOUSE_OVERSIGHT_PREFIX}{id_str(id)}"
+    elif len(id) == 8:
+        return f"{HOUSE_OVERSIGHT_PREFIX}{id}"
+    else:
+        raise RuntimeError(f"Unknown kind of file id {id}")
+
+
 def is_local_extract_file(filename) -> bool:
     """Return true if filename is of form 'HOUSE_OVERSIGHT_029835_1.txt'."""
     file_match = FILE_ID_REGEX.match(str(filename))
     return True if file_match and file_match.group(2) else False
+
+
+def make_clean() -> None:
+    """Delete all build artifacts."""
+    for build_file in BUILD_ARTIFACTS:
+        if build_file.exists():
+            print(f"Removing build file '{build_file}'...")
+            build_file.unlink()