epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +55 -23
- epstein_files/documents/communication.py +9 -5
- epstein_files/documents/document.py +231 -135
- epstein_files/documents/doj_file.py +242 -0
- epstein_files/documents/doj_files/full_text.py +166 -0
- epstein_files/documents/email.py +289 -232
- epstein_files/documents/emails/email_header.py +35 -16
- epstein_files/documents/emails/emailers.py +223 -0
- epstein_files/documents/imessage/text_message.py +2 -3
- epstein_files/documents/json_file.py +18 -14
- epstein_files/documents/messenger_log.py +23 -39
- epstein_files/documents/other_file.py +54 -48
- epstein_files/epstein_files.py +65 -29
- epstein_files/person.py +151 -94
- epstein_files/util/constant/names.py +37 -10
- epstein_files/util/constant/output_files.py +2 -0
- epstein_files/util/constant/strings.py +14 -7
- epstein_files/util/constant/urls.py +17 -0
- epstein_files/util/constants.py +556 -391
- epstein_files/util/data.py +2 -0
- epstein_files/util/doc_cfg.py +44 -33
- epstein_files/util/env.py +34 -19
- epstein_files/util/file_helper.py +30 -6
- epstein_files/util/helpers/debugging_helper.py +13 -0
- epstein_files/util/helpers/env_helpers.py +21 -0
- epstein_files/util/highlighted_group.py +121 -37
- epstein_files/util/layout/left_bar_panel.py +26 -0
- epstein_files/util/logging.py +28 -13
- epstein_files/util/output.py +49 -40
- epstein_files/util/rich.py +30 -3
- epstein_files/util/word_count.py +7 -7
- {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/METADATA +16 -3
- epstein_files-1.5.0.dist-info/RECORD +40 -0
- {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +1 -1
- epstein_files-1.2.5.dist-info/RECORD +0 -34
- {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
- {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
epstein_files/util/data.py
CHANGED
|
@@ -19,6 +19,8 @@ MULTINEWLINE_REGEX = re.compile(r"\n{2,}")
|
|
|
19
19
|
CONSTANT_VAR_REGEX = re.compile(r"^[A-Z_]+$")
|
|
20
20
|
ALL_NAMES = [v for k, v in vars(names).items() if isinstance(v, str) and CONSTANT_VAR_REGEX.match(k)]
|
|
21
21
|
|
|
22
|
+
AMERICAN_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
|
|
23
|
+
AMERICAN_TIME_REGEX = re.compile(r"(\d{1,2}/\d{1,2}/\d{2,4}\s+\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)?)")
|
|
22
24
|
PACIFIC_TZ = tz.gettz("America/Los_Angeles")
|
|
23
25
|
TIMEZONE_INFO = {"PDT": PACIFIC_TZ, "PST": PACIFIC_TZ} # Suppresses annoying warnings from parse() calls
|
|
24
26
|
|
epstein_files/util/doc_cfg.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import json
|
|
1
2
|
import re
|
|
2
3
|
from copy import deepcopy
|
|
3
4
|
from dataclasses import Field, asdict, dataclass, field, fields
|
|
@@ -9,20 +10,21 @@ from dateutil.parser import parse
|
|
|
9
10
|
from epstein_files.util.constant.names import *
|
|
10
11
|
from epstein_files.util.constant.strings import *
|
|
11
12
|
from epstein_files.util.data import remove_zero_time, without_falsey
|
|
13
|
+
from epstein_files.util.env import args
|
|
12
14
|
|
|
13
|
-
DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
|
|
15
|
+
DuplicateType = Literal['bounced', 'earlier', 'quoted', 'redacted', 'same']
|
|
14
16
|
Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]
|
|
15
17
|
|
|
16
18
|
# Misc
|
|
17
|
-
CONSTANTIZE_NAMES = False # A flag set to True that causes repr() of these classes to return strings of usable code
|
|
18
19
|
INDENT = ' '
|
|
19
20
|
INDENT_NEWLINE = f'\n{INDENT}'
|
|
20
21
|
INDENTED_JOIN = f',{INDENT_NEWLINE}'
|
|
21
|
-
MAX_LINE_LENGTH =
|
|
22
|
+
MAX_LINE_LENGTH = 135
|
|
22
23
|
REPUTATION_MGMT = f'{REPUTATION} management'
|
|
23
24
|
SAME = 'same'
|
|
24
25
|
|
|
25
26
|
DUPE_TYPE_STRS: dict[DuplicateType, str] = {
|
|
27
|
+
'bounced': 'a bounced copy of',
|
|
26
28
|
'earlier': 'an earlier draft of',
|
|
27
29
|
'quoted': 'quoted in full in',
|
|
28
30
|
'redacted': 'a redacted version of',
|
|
@@ -32,7 +34,10 @@ DUPE_TYPE_STRS: dict[DuplicateType, str] = {
|
|
|
32
34
|
FIELD_SORT_KEY = {
|
|
33
35
|
'id': 'a',
|
|
34
36
|
'author': 'aa',
|
|
35
|
-
'
|
|
37
|
+
'comment': 'zz',
|
|
38
|
+
'duplicate_ids': 'dup',
|
|
39
|
+
'duplicate_of_id': 'dupe',
|
|
40
|
+
'recipients': 'aaa',
|
|
36
41
|
}
|
|
37
42
|
|
|
38
43
|
FINANCIAL_REPORTS_AUTHORS = [
|
|
@@ -49,7 +54,6 @@ FINANCIAL_REPORTS_AUTHORS = [
|
|
|
49
54
|
# Fields like timestamp and author are better added from the Document object
|
|
50
55
|
NON_METADATA_FIELDS = [
|
|
51
56
|
'actual_text',
|
|
52
|
-
'date',
|
|
53
57
|
'id',
|
|
54
58
|
'is_synthetic',
|
|
55
59
|
]
|
|
@@ -64,18 +68,19 @@ class DocCfg:
|
|
|
64
68
|
id (str): ID of file
|
|
65
69
|
author (Name): Author of the document (if any)
|
|
66
70
|
category (str | None): Type of file
|
|
67
|
-
date (str | None):
|
|
71
|
+
date (str | None): Parsed to a datetime by timestamp() if it exists
|
|
68
72
|
dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
|
|
69
73
|
duplicate_ids (list[str]): IDs of *other* documents that are dupes of this document
|
|
70
74
|
duplicate_of_id (str | None): If this is a dupe the ID of the duplicated file. This file will be suppressed
|
|
71
75
|
is_interesting (bool | None): Override other considerations and always consider this file interesting (or not)
|
|
72
|
-
timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
|
|
73
76
|
is_synthetic (bool): True if this config was generated by the duplicate_cfgs() method
|
|
77
|
+
replace_text_with (bool): True if `description` should replace body of the document when printing.
|
|
74
78
|
"""
|
|
75
79
|
id: str
|
|
76
80
|
attached_to_email_id: str | None = None
|
|
77
81
|
author: Name = None
|
|
78
82
|
category: str | None = None
|
|
83
|
+
comment: str = ''
|
|
79
84
|
date: str | None = None
|
|
80
85
|
description: str | None = None
|
|
81
86
|
dupe_type: DuplicateType | None = None
|
|
@@ -84,15 +89,9 @@ class DocCfg:
|
|
|
84
89
|
is_attribution_uncertain: bool = False
|
|
85
90
|
is_interesting: bool | None = None
|
|
86
91
|
is_synthetic: bool = False
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
def __post_init__(self):
|
|
90
|
-
if self.date:
|
|
91
|
-
self.timestamp = parse(self.date)
|
|
92
|
-
|
|
93
|
-
if self.duplicate_of_id or self.duplicate_ids:
|
|
94
|
-
self.dupe_type = self.dupe_type or SAME
|
|
92
|
+
replace_text_with: str = ''
|
|
95
93
|
|
|
94
|
+
@property
|
|
96
95
|
def complete_description(self) -> str | None:
|
|
97
96
|
"""String that summarizes what is known about this document."""
|
|
98
97
|
description = ''
|
|
@@ -130,6 +129,24 @@ class DocCfg:
|
|
|
130
129
|
|
|
131
130
|
return description
|
|
132
131
|
|
|
132
|
+
@property
|
|
133
|
+
def metadata(self) -> Metadata:
|
|
134
|
+
metadata = {k: v for k, v in asdict(self).items() if k not in NON_METADATA_FIELDS and v}
|
|
135
|
+
|
|
136
|
+
if self.is_interesting is False:
|
|
137
|
+
metadata['is_interesting'] = False
|
|
138
|
+
|
|
139
|
+
return metadata
|
|
140
|
+
|
|
141
|
+
@property
|
|
142
|
+
def timestamp(self) -> datetime | None:
|
|
143
|
+
if self.date:
|
|
144
|
+
return parse(self.date)
|
|
145
|
+
|
|
146
|
+
def __post_init__(self):
|
|
147
|
+
if self.duplicate_of_id or self.duplicate_ids:
|
|
148
|
+
self.dupe_type = self.dupe_type or SAME
|
|
149
|
+
|
|
133
150
|
def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
|
|
134
151
|
"""Create synthetic DocCfg objects that set the 'duplicate_of_id' field to point back to this object."""
|
|
135
152
|
for id in self.duplicate_ids:
|
|
@@ -141,9 +158,6 @@ class DocCfg:
|
|
|
141
158
|
dupe_cfg.is_synthetic = True
|
|
142
159
|
yield dupe_cfg
|
|
143
160
|
|
|
144
|
-
def metadata(self) -> Metadata:
|
|
145
|
-
return {k: v for k, v in asdict(self).items() if k not in NON_METADATA_FIELDS and v}
|
|
146
|
-
|
|
147
161
|
def _props_strs(self) -> list[str]:
|
|
148
162
|
props = []
|
|
149
163
|
add_prop = lambda f, value: props.append(f"{f.name}={value}")
|
|
@@ -151,20 +165,16 @@ class DocCfg:
|
|
|
151
165
|
for _field in sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name)):
|
|
152
166
|
value = getattr(self, _field.name)
|
|
153
167
|
|
|
154
|
-
if
|
|
168
|
+
if _field.name in ['actual_text', 'is_fwded_article', 'is_interesting']: # fields can be False or None or ''
|
|
169
|
+
if value is not None:
|
|
170
|
+
add_prop(_field, str(value))
|
|
171
|
+
elif not value or _field.name == 'dupe_type' and value == 'same':
|
|
155
172
|
continue
|
|
156
173
|
elif _field.name == AUTHOR:
|
|
157
|
-
add_prop(_field, constantize_name(str(value)) if
|
|
158
|
-
elif _field.name == '
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
|
|
162
|
-
add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
|
|
163
|
-
elif _field.name == 'timestamp' and self.date is not None:
|
|
164
|
-
continue # Don't print both timestamp and date
|
|
165
|
-
elif isinstance(value, datetime):
|
|
166
|
-
value_str = remove_zero_time(value)
|
|
167
|
-
add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
|
|
174
|
+
add_prop(_field, constantize_name(str(value)) if args.constantize else f"'{value}'")
|
|
175
|
+
elif _field.name == 'recipients':
|
|
176
|
+
recipients_str = str([constantize_name(r) if (args.constantize and r) else r for r in value])
|
|
177
|
+
add_prop(_field, recipients_str.replace("'", '') if args.constantize else recipients_str)
|
|
168
178
|
elif isinstance(value, str):
|
|
169
179
|
if "'" in value:
|
|
170
180
|
value = '"' + value.replace('"', r'\"') + '"'
|
|
@@ -182,14 +192,14 @@ class DocCfg:
|
|
|
182
192
|
type_str = f"{type(self).__name__}("
|
|
183
193
|
single_line_repr = type_str + ', '.join(props) + f')'
|
|
184
194
|
|
|
185
|
-
if len(single_line_repr) < MAX_LINE_LENGTH:
|
|
195
|
+
if len(single_line_repr) < MAX_LINE_LENGTH or (self.comment and getattr(self, 'is_fwded_article')):
|
|
186
196
|
repr_str = single_line_repr
|
|
187
197
|
else:
|
|
188
198
|
repr_str = f"{type_str}{INDENT_NEWLINE}" + INDENTED_JOIN.join(props)
|
|
189
199
|
repr_str += ',' if props else ''
|
|
190
200
|
repr_str += '\n)'
|
|
191
201
|
|
|
192
|
-
if
|
|
202
|
+
if args.constantize:
|
|
193
203
|
repr_str = INDENT + INDENT_NEWLINE.join(repr_str.split('\n'))
|
|
194
204
|
return repr_str.replace(',,', ',').replace(',),', '),').replace(',),', '),')
|
|
195
205
|
else:
|
|
@@ -224,9 +234,10 @@ class EmailCfg(CommunicationCfg):
|
|
|
224
234
|
"""
|
|
225
235
|
actual_text: str | None = None
|
|
226
236
|
fwded_text_after: str | None = None
|
|
227
|
-
is_fwded_article: bool =
|
|
237
|
+
is_fwded_article: bool | None = None
|
|
228
238
|
recipients: list[Name] = field(default_factory=list)
|
|
229
239
|
subject: str | None = None
|
|
240
|
+
truncate_to: int | None = None
|
|
230
241
|
|
|
231
242
|
# This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
|
|
232
243
|
def __repr__(self) -> str:
|
epstein_files/util/env.py
CHANGED
|
@@ -5,23 +5,24 @@ from pathlib import Path
|
|
|
5
5
|
|
|
6
6
|
from rich_argparse_plus import RichHelpFormatterPlus
|
|
7
7
|
|
|
8
|
-
from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, CHRONOLOGICAL_EMAILS_PATH,
|
|
9
|
-
|
|
8
|
+
from epstein_files.util.constant.output_files import (ALL_EMAILS_PATH, CHRONOLOGICAL_EMAILS_PATH,
|
|
9
|
+
DOJ_2026_HTML_PATH, TEXT_MSGS_HTML_PATH)
|
|
10
|
+
from epstein_files.util.helpers.env_helpers import get_env_dir
|
|
11
|
+
from epstein_files.util.logging import env_log_level, exit_with_error, logger, set_log_level
|
|
10
12
|
|
|
11
13
|
DEFAULT_WIDTH = 155
|
|
12
14
|
DEFAULT_FILE = 'default_file'
|
|
13
15
|
EPSTEIN_GENERATE = 'epstein_generate'
|
|
14
16
|
HTML_SCRIPTS = [EPSTEIN_GENERATE, 'epstein_word_count']
|
|
15
17
|
|
|
16
|
-
#
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
18
|
+
# Get dirs from Env vars
|
|
19
|
+
DOCS_DIR_ENV_VAR = 'EPSTEIN_DOCS_DIR'
|
|
20
|
+
DOJ_PDFS_20260130_DIR_ENV_VAR = 'EPSTEIN_DOJ_PDFS_20260130_DIR'
|
|
21
|
+
DOJ_TXTS_20260130_DIR_ENV_VAR = 'EPSTEIN_DOJ_TXTS_20260130_DIR'
|
|
20
22
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
exit_with_error(f"{EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!\n")
|
|
23
|
+
DOCS_DIR: Path = get_env_dir(DOCS_DIR_ENV_VAR, must_exist=True)
|
|
24
|
+
DOJ_PDFS_20260130_DIR: Path = get_env_dir(DOJ_PDFS_20260130_DIR_ENV_VAR, must_exist=False)
|
|
25
|
+
DOJ_TXTS_20260130_DIR: Path = get_env_dir(DOJ_TXTS_20260130_DIR_ENV_VAR, must_exist=False)
|
|
25
26
|
|
|
26
27
|
is_env_var_set = lambda s: len(environ.get(s) or '') > 0
|
|
27
28
|
is_output_arg = lambda arg: any([arg.startswith(pfx) for pfx in ['colors_only', 'json', 'make_clean', 'output']])
|
|
@@ -41,6 +42,7 @@ output.add_argument('--email-timeline', action='store_true', help='print a table
|
|
|
41
42
|
output.add_argument('--emailers-info', '-ei', action='store_true', help='write a .png of the eeailers info table')
|
|
42
43
|
output.add_argument('--json-files', action='store_true', help='pretty print all the raw JSON data files in the collection and exit')
|
|
43
44
|
output.add_argument('--json-metadata', action='store_true', help='dump JSON metadata for all files and exit')
|
|
45
|
+
output.add_argument('--output-doj-files', '-od', action='store_true', help='generate the DOJ files from 2026-01-30')
|
|
44
46
|
output.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
|
|
45
47
|
output.add_argument('--output-other', '-oo', action='store_true', help='generate other files section')
|
|
46
48
|
output.add_argument('--output-texts', '-ot', action='store_true', help='generate text messages section')
|
|
@@ -49,16 +51,19 @@ output.add_argument('--suppress-output', action='store_true', help='no output to
|
|
|
49
51
|
output.add_argument('--uninteresting', action='store_true', help='only output uninteresting other files')
|
|
50
52
|
output.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use (in characters)')
|
|
51
53
|
|
|
52
|
-
scripts = parser.add_argument_group('SCRIPTS', 'Options used by
|
|
54
|
+
scripts = parser.add_argument_group('SCRIPTS', 'Options used by epstein_grep, epstein_show, and epstein_diff.')
|
|
53
55
|
scripts.add_argument('positional_args', nargs='*', help='strings to searchs for, file IDs to show or diff, etc.')
|
|
56
|
+
scripts.add_argument('--email-body', action='store_true', help='epstein_grep but only for the body of the email')
|
|
57
|
+
scripts.add_argument('--min-line-length', type=int, help='epstein_grep minimum length of a matched line')
|
|
54
58
|
scripts.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (used by epstein_show)')
|
|
55
59
|
scripts.add_argument('--whole-file', '-wf', action='store_true', help='print whole files')
|
|
56
60
|
|
|
57
61
|
debug = parser.add_argument_group('DEBUG')
|
|
58
62
|
debug.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
|
|
63
|
+
debug.add_argument('--constantize', action='store_true', help='constantize names when printing repr() of objects')
|
|
59
64
|
debug.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
|
|
60
65
|
debug.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
|
|
61
|
-
debug.add_argument('--
|
|
66
|
+
debug.add_argument('--stats', '-j', action='store_true', help='print JSON formatted stats about the files')
|
|
62
67
|
debug.add_argument('--skip-other-files', '-sof', action='store_true', help='skip parsing non email/text files')
|
|
63
68
|
debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
|
|
64
69
|
debug.add_argument('--truncate', '-t', type=int, help='truncate emails to this many characters')
|
|
@@ -66,7 +71,11 @@ debug.add_argument('--write-txt', '-wt', action='store_true', help='write a plai
|
|
|
66
71
|
|
|
67
72
|
|
|
68
73
|
# Parse args
|
|
69
|
-
|
|
74
|
+
if environ.get('INVOKED_BY_PYTEST'):
|
|
75
|
+
args = parser.parse_args([EPSTEIN_GENERATE])
|
|
76
|
+
else:
|
|
77
|
+
args = parser.parse_args()
|
|
78
|
+
|
|
70
79
|
is_html_script = parser.prog in HTML_SCRIPTS
|
|
71
80
|
|
|
72
81
|
args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
|
|
@@ -78,7 +87,9 @@ args.width = args.width if is_html_script else None
|
|
|
78
87
|
args.any_output_selected = any([is_output_arg(arg) and val for arg, val in vars(args).items()])
|
|
79
88
|
|
|
80
89
|
if not (args.any_output_selected or args.email_timeline or args.emailers_info):
|
|
81
|
-
|
|
90
|
+
if is_html_script:
|
|
91
|
+
logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
|
|
92
|
+
|
|
82
93
|
args.output_emails = args.output_other = args.output_texts = True
|
|
83
94
|
|
|
84
95
|
if is_html_script:
|
|
@@ -95,25 +106,29 @@ if is_html_script:
|
|
|
95
106
|
args.build = ALL_EMAILS_PATH
|
|
96
107
|
elif args.email_timeline:
|
|
97
108
|
args.build = CHRONOLOGICAL_EMAILS_PATH
|
|
109
|
+
elif args.output_doj_files:
|
|
110
|
+
args.build = DOJ_2026_HTML_PATH
|
|
98
111
|
else:
|
|
99
112
|
args.build = TEXT_MSGS_HTML_PATH
|
|
100
|
-
elif parser.prog.startswith('epstein_') and not args.positional_args:
|
|
113
|
+
elif parser.prog.startswith('epstein_') and not args.positional_args and not args.names:
|
|
101
114
|
exit_with_error(f"{parser.prog} requires positional arguments but got none!")
|
|
102
115
|
|
|
103
116
|
if args.names:
|
|
104
117
|
logger.warning(f"Output restricted to {args.names}")
|
|
105
118
|
args.output_other = False
|
|
106
119
|
|
|
120
|
+
if args.truncate and args.whole_file:
|
|
121
|
+
exit_with_error(f"--whole-file and --truncate are incompatible")
|
|
107
122
|
|
|
108
123
|
# Log level args
|
|
109
124
|
if args.deep_debug:
|
|
110
|
-
|
|
125
|
+
set_log_level(logging.DEBUG)
|
|
111
126
|
elif args.debug:
|
|
112
|
-
|
|
127
|
+
set_log_level(logging.INFO)
|
|
113
128
|
elif args.suppress_logs:
|
|
114
|
-
|
|
129
|
+
set_log_level(logging.FATAL)
|
|
115
130
|
elif not env_log_level:
|
|
116
|
-
|
|
131
|
+
set_log_level(logging.WARNING)
|
|
117
132
|
|
|
118
133
|
logger.debug(f'Log level set to {logger.level}...')
|
|
119
134
|
args_str = ',\n'.join([f"{k}={v}" for k, v in vars(args).items() if v])
|
|
@@ -1,43 +1,67 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
|
|
4
|
-
from epstein_files.util.constant.strings import
|
|
5
|
-
|
|
4
|
+
from epstein_files.util.constant.strings import (DOJ_FILE_NAME_REGEX, EFTA_PREFIX,
|
|
5
|
+
HOUSE_OVERSIGHT_NOV_2025_FILE_NAME_REGEX, HOUSE_OVERSIGHT_NOV_2025_FILE_STEM_REGEX,
|
|
6
|
+
HOUSE_OVERSIGHT_PREFIX)
|
|
7
|
+
from epstein_files.util.env import DOCS_DIR, DOJ_TXTS_20260130_DIR
|
|
6
8
|
from epstein_files.util.logging import logger
|
|
7
9
|
|
|
8
10
|
EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
|
|
9
|
-
FILE_ID_REGEX = re.compile(fr".*{
|
|
11
|
+
FILE_ID_REGEX = re.compile(fr".*{HOUSE_OVERSIGHT_NOV_2025_FILE_NAME_REGEX.pattern}")
|
|
10
12
|
FILENAME_LENGTH = len(HOUSE_OVERSIGHT_PREFIX) + 6
|
|
11
13
|
KB = 1024
|
|
12
14
|
MB = KB * KB
|
|
13
15
|
|
|
14
16
|
# Coerce methods handle both string and int arguments.
|
|
15
17
|
coerce_file_name = lambda filename_or_id: coerce_file_stem(filename_or_id) + '.txt'
|
|
16
|
-
coerce_file_path = lambda filename_or_id: DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
|
|
17
18
|
file_size = lambda file_path: Path(file_path).stat().st_size
|
|
18
19
|
id_str = lambda id: f"{int(id):06d}"
|
|
19
20
|
|
|
20
21
|
|
|
22
|
+
def coerce_file_path(filename_or_id: int | str) -> Path:
|
|
23
|
+
"""Returns the `Path` for the file with `filename_or_id` ID."""
|
|
24
|
+
filename = coerce_file_name(filename_or_id)
|
|
25
|
+
|
|
26
|
+
if isinstance(filename_or_id, str) and DOJ_FILE_NAME_REGEX.match(filename_or_id):
|
|
27
|
+
for txt_file in DOJ_TXTS_20260130_DIR.glob('**/*.txt'):
|
|
28
|
+
if txt_file.name == filename:
|
|
29
|
+
return txt_file
|
|
30
|
+
|
|
31
|
+
raise RuntimeError(f"'{filename_or_id}' looks like DOJ file but no file named {filename} in '{DOJ_TXTS_20260130_DIR}'")
|
|
32
|
+
else:
|
|
33
|
+
return DOCS_DIR.joinpath(filename)
|
|
34
|
+
|
|
35
|
+
|
|
21
36
|
def coerce_file_stem(filename_or_id: int | str) -> str:
|
|
22
|
-
"""Generate a valid
|
|
37
|
+
"""Generate a valid file stem no matter what form the argument comes in."""
|
|
38
|
+
if isinstance(filename_or_id, str) and DOJ_FILE_NAME_REGEX.search(filename_or_id):
|
|
39
|
+
return Path(filename_or_id).stem
|
|
40
|
+
|
|
23
41
|
if isinstance(filename_or_id, str) and filename_or_id.startswith(HOUSE_OVERSIGHT_PREFIX):
|
|
24
42
|
file_id = extract_file_id(filename_or_id)
|
|
25
43
|
file_stem = file_stem_for_id(file_id)
|
|
26
44
|
else:
|
|
27
45
|
file_stem = file_stem_for_id(filename_or_id)
|
|
28
46
|
|
|
29
|
-
if not
|
|
47
|
+
if not HOUSE_OVERSIGHT_NOV_2025_FILE_STEM_REGEX.match(file_stem):
|
|
30
48
|
raise RuntimeError(f"Invalid stem '{file_stem}' from '{filename_or_id}'")
|
|
31
49
|
|
|
32
50
|
return file_stem
|
|
33
51
|
|
|
34
52
|
|
|
35
53
|
def extract_file_id(filename_or_id: int | str | Path) -> str:
|
|
54
|
+
# DOJ 2026-01 files have different pattern
|
|
55
|
+
if isinstance(filename_or_id, str) and filename_or_id.startswith(EFTA_PREFIX):
|
|
56
|
+
return Path(filename_or_id).stem
|
|
57
|
+
|
|
36
58
|
if isinstance(filename_or_id, str):
|
|
37
59
|
filename_or_id = filename_or_id.removesuffix(',')
|
|
38
60
|
|
|
39
61
|
if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
|
|
40
62
|
return id_str(filename_or_id)
|
|
63
|
+
elif isinstance(filename_or_id, str) and len(filename_or_id) == 8:
|
|
64
|
+
return f"{HOUSE_OVERSIGHT_PREFIX}{filename_or_id}"
|
|
41
65
|
|
|
42
66
|
file_match = FILE_ID_REGEX.match(str(filename_or_id).upper())
|
|
43
67
|
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
def _show_timestamps(epstein_files):
|
|
4
|
+
for doc in epstein_files.doj_files:
|
|
5
|
+
doc.warn(f"timestamp: {doc.timestamp}")
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _verify_filenames(epstein_files):
|
|
9
|
+
doc_filenames = set([doc.file_path.name for doc in epstein_files.all_documents])
|
|
10
|
+
|
|
11
|
+
for file_path in epstein_files.all_files:
|
|
12
|
+
if file_path.name not in doc_filenames:
|
|
13
|
+
print(f"'{file_path}' is not in list of {len(doc_filenames)} Document obj filenames!")
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Helpers for dealing with environment variables."""
|
|
2
|
+
from os import environ
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from epstein_files.util.logging import exit_with_error, logger
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_env_dir(env_var_name: str, must_exist: bool = True) -> Path | None:
|
|
9
|
+
if (dir := environ.get(env_var_name)):
|
|
10
|
+
dir = Path(dir)
|
|
11
|
+
error_msg = f"env var {env_var_name} set to '{dir}' but that's not a directory"
|
|
12
|
+
|
|
13
|
+
if dir.is_dir():
|
|
14
|
+
return dir.resolve()
|
|
15
|
+
elif must_exist:
|
|
16
|
+
exit_with_error(f"Required {error_msg}.\n")
|
|
17
|
+
else:
|
|
18
|
+
logger.warning(f"Optional {error_msg}. Some features will be unavailable.")
|
|
19
|
+
return None
|
|
20
|
+
else:
|
|
21
|
+
logger.warning(f"Optional env var {env_var_name} not set. Some features will be unavailable.")
|