epstein-files 1.0.10__py3-none-any.whl → 1.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +7 -9
- epstein_files/documents/communication.py +2 -2
- epstein_files/documents/document.py +94 -81
- epstein_files/documents/email.py +47 -5
- epstein_files/documents/imessage/text_message.py +4 -13
- epstein_files/documents/json_file.py +13 -1
- epstein_files/documents/messenger_log.py +32 -19
- epstein_files/documents/other_file.py +67 -44
- epstein_files/epstein_files.py +22 -15
- epstein_files/util/constant/names.py +11 -10
- epstein_files/util/constant/strings.py +2 -1
- epstein_files/util/constants.py +98 -88
- epstein_files/util/data.py +1 -1
- epstein_files/util/doc_cfg.py +32 -62
- epstein_files/util/env.py +29 -17
- epstein_files/util/file_helper.py +12 -29
- epstein_files/util/highlighted_group.py +34 -17
- epstein_files/util/logging.py +1 -7
- epstein_files/util/output.py +13 -8
- epstein_files/util/rich.py +15 -10
- epstein_files/util/word_count.py +65 -5
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/METADATA +1 -1
- epstein_files-1.0.12.dist-info/RECORD +33 -0
- epstein_files/count_words.py +0 -72
- epstein_files-1.0.10.dist-info/RECORD +0 -34
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/WHEEL +0 -0
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.12.dist-info}/entry_points.txt +0 -0
epstein_files/util/doc_cfg.py
CHANGED
@@ -8,7 +8,7 @@ from dateutil.parser import parse
 
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
-from epstein_files.util.data import without_falsey
+from epstein_files.util.data import remove_time_from_timestamp_str, without_falsey
 
 DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
 Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]
@@ -47,12 +47,11 @@ FINANCIAL_REPORTS_AUTHORS = [
 ]
 
 # Fields like timestamp and author are better added from the Document object
-
+NON_METADATA_FIELDS = [
     'actual_text',
     'date',
     'id',
-    '
-    'was_generated',
+    'is_synthetic',
 ]
 
 
@@ -68,10 +67,10 @@ class DocCfg:
         date (str | None): If passed will be immediated parsed into the 'timestamp' field
         dupe_of_id (str | None): If this is a dupe the ID of the duplicated file. This file will be suppressed
         dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
-        duplicate_ids (list[str]):
+        duplicate_ids (list[str]): IDs of *other* documents that are dupes of this document
         is_interesting (bool): Override other considerations and always consider this file interesting
         timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
-
+        is_synthetic (bool): True if this config was generated by the duplicate_cfgs() method
    """
    id: str
    author: str | None = None
@@ -82,8 +81,8 @@ class DocCfg:
    dupe_type: DuplicateType | None = None
    duplicate_ids: list[str] = field(default_factory=list)
    is_interesting: bool = False
+   is_synthetic: bool = False
    timestamp: datetime | None = None
-   was_generated: bool = False
 
    def __post_init__(self):
        if self.date:
@@ -92,66 +91,48 @@ class DocCfg:
        if self.dupe_of_id or self.duplicate_ids:
            self.dupe_type = self.dupe_type or SAME
 
-    def
-        if self.dupe_type is not None:
-            return DUPE_TYPE_STRS[self.dupe_type]
-
-    def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
-        """Create synthetic DocCfg objects that set the 'dupe_of_id' field to point back to this object."""
-        for id in self.duplicate_ids:
-            dupe_cfg = deepcopy(self)
-            dupe_cfg.id = id
-            dupe_cfg.dupe_of_id = self.id
-            dupe_cfg.duplicate_ids = []
-            dupe_cfg.dupe_type = self.dupe_type
-            dupe_cfg.was_generated = True
-            yield dupe_cfg
-
-    def info_str(self) -> str | None:
+    def complete_description(self) -> str | None:
        """String that summarizes what is known about this document."""
-        if self.category and not self.description:
+        if self.category and not self.description and not self.author:
            return self.category
        elif self.category == REPUTATION:
            return f"{REPUTATION_MGMT}: {self.description}"
+        elif self.category == SKYPE_LOG:
+            msg = f"{self.category} of conversation with {self.author}" if self.author else self.category
+            return f"{msg} {self.description}" if self.description else msg
        elif self.author and self.description:
            if self.category in [ACADEMIA, BOOK]:
-
+                title = self.description if '"' in self.description else f"'{self.description}'"
+                return f"{title} by {self.author}"
            elif self.category == FINANCE and self.author in FINANCIAL_REPORTS_AUTHORS:
                return f"{self.author} report: '{self.description}'"
            elif self.category == LEGAL and 'v.' in self.author:
-                return f"{self.author}:
+                return f"{self.author}: {self.description}"
        elif self.category and self.author is None and self.description is None:
            return self.category
 
        pieces = without_falsey([self.author, self.description])
        return ' '.join(pieces) if pieces else None
 
-    def
-
-
-
-
-
-
-
-
-
-
-    def sorted_fields(self) -> list[Field]:
-        return sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name))
-
-    def title_by_author(self) -> str:
-        if not (self.author and self.description):
-            raise RuntimeError(f"Can't call title_by_author() without author and description!")
+    def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
+        """Create synthetic DocCfg objects that set the 'dupe_of_id' field to point back to this object."""
+        for id in self.duplicate_ids:
+            dupe_cfg = deepcopy(self)
+            dupe_cfg.id = id
+            dupe_cfg.dupe_of_id = self.id
+            dupe_cfg.duplicate_ids = []
+            dupe_cfg.dupe_type = self.dupe_type
+            dupe_cfg.is_synthetic = True
+            yield dupe_cfg
 
-
-        return
+    def metadata(self) -> Metadata:
+        return {k: v for k, v in asdict(self).items() if k not in NON_METADATA_FIELDS and v}
 
    def _props_strs(self) -> list[str]:
        props = []
        add_prop = lambda f, value: props.append(f"{f.name}={value}")
 
-        for _field in self.
+        for _field in sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name)):
            value = getattr(self, _field.name)
 
            if value is None or value is False or (isinstance(value, list) and len(value) == 0):
@@ -160,13 +141,13 @@ class DocCfg:
                add_prop(_field, constantize_name(str(value)) if CONSTANTIZE_NAMES else f"'{value}'")
            elif _field.name == 'category' and value in [EMAIL, TEXT_MESSAGE]:
                continue
-            elif _field.name == 'recipients' and
+            elif _field.name == 'recipients' and value:
                recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
                add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
            elif _field.name == 'timestamp' and self.date is not None:
                continue  # Don't print both timestamp and date
            elif isinstance(value, datetime):
-                value_str =
+                value_str = remove_time_from_timestamp_str(value)
                add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
            elif isinstance(value, str):
                if "'" in value:
@@ -221,22 +202,15 @@ class EmailCfg(CommunicationCfg):
    """
    Attributes:
        actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
+        fwded_text_after (str | None): If set, any text after this is a fwd of an article or similar
        is_fwded_article (bool): True if this is a newspaper article someone fwded. Used to exclude articles from word counting.
        recipients (list[str | None]): Who received the email
    """
-    actual_text: str | None = None
-    fwded_text_after: str | None = None
+    actual_text: str | None = None
+    fwded_text_after: str | None = None
    is_fwded_article: bool = False
    recipients: list[str | None] = field(default_factory=list)
 
-    def __post_init__(self):
-        super().__post_init__()
-        self.category = EMAIL
-
-    @classmethod
-    def from_doc_cfg(cls, cfg: DocCfg) -> 'EmailCfg':
-        return cls(**asdict(cfg))
-
    # This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
    def __repr__(self) -> str:
        return super().__repr__()
@@ -244,10 +218,6 @@ class EmailCfg(CommunicationCfg):
 
 @dataclass(kw_only=True)
 class TextCfg(CommunicationCfg):
-    def __post_init__(self):
-        super().__post_init__()
-        self.category = TEXT_MESSAGE
-
    # This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
    def __repr__(self) -> str:
        return super().__repr__()
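As a rough illustration of the renamed flag (was_generated → is_synthetic) and the reorganized DocCfg helpers, the sketch below shows how the new methods appear to fit together; it is not part of the package diff, the document IDs are hypothetical, and the constructor signature is assumed from the fields visible above.

    from epstein_files.util.doc_cfg import DocCfg

    # Hypothetical document IDs, purely for illustration
    cfg = DocCfg(id='029835', duplicate_ids=['029836', '029837'])

    for dupe_cfg in cfg.duplicate_cfgs():
        # Each synthetic config points back at the original and is now flagged
        # via 'is_synthetic' rather than the old 'was_generated' field
        assert dupe_cfg.dupe_of_id == cfg.id
        assert dupe_cfg.is_synthetic

    # metadata() drops NON_METADATA_FIELDS ('id', 'date', 'is_synthetic', ...) and falsey values
    print(cfg.metadata())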
epstein_files/util/env.py
CHANGED

@@ -2,7 +2,7 @@ import logging
 from argparse import ArgumentParser
 from os import environ
 from pathlib import Path
-from sys import argv
+from sys import argv, exit
 
 from rich_argparse_plus import RichHelpFormatterPlus
 
@@ -11,28 +11,30 @@ from epstein_files.util.logging import env_log_level, logger
 COUNT_WORDS_SCRIPT = 'epstein_word_count'
 DEFAULT_WIDTH = 145
 HTML_SCRIPTS = ['epstein_generate', COUNT_WORDS_SCRIPT]
+EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
+
 
 RichHelpFormatterPlus.choose_theme('morning_glory')
 parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML pages.", formatter_class=RichHelpFormatterPlus)
+parser.add_argument('--make-clean', action='store_true', help='delete all HTML build artifact and write latest URLs to .urls.env')
 parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
-parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='ovewrite cached
+parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='re-parse the files and ovewrite cached data')
 
-output = parser.add_argument_group('OUTPUT')
+output = parser.add_argument_group('OUTPUT', 'Options used by epstein_generate.')
 output.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
 output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
-output.add_argument('--build', '-b', action='store_true', help='write output to
-output.add_argument('--json-
-output.add_argument('--
-output.add_argument('--output-emails', '-oe', action='store_true', help='generate
-output.add_argument('--output-
-output.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
+output.add_argument('--build', '-b', action='store_true', help='write HTML output to a file')
+output.add_argument('--json-files', action='store_true', help='pretty print all the raw JSON data files in the collection and exit')
+output.add_argument('--json-metadata', action='store_true', help='dump JSON metadata for all files and exit')
+output.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
+output.add_argument('--output-other', '-oo', action='store_true', help='generate other files section')
 output.add_argument('--output-texts', '-ot', action='store_true', help='generate text messages section')
 output.add_argument('--sort-alphabetical', action='store_true', help='sort emailers alphabetically intead of by email count')
 output.add_argument('--suppress-output', action='store_true', help='no output to terminal (use with --build)')
 output.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use (in characters)')
-output.add_argument('--use-epstein-web
+output.add_argument('--use-epstein-web', action='store_true', help='use epsteinweb.org links instead of epstein.media')
 
-scripts = parser.add_argument_group('SCRIPTS', '
+scripts = parser.add_argument_group('SCRIPTS', 'Options used by epstein_search, epstein_show, and epstein_diff.')
 scripts.add_argument('positional_args', nargs='*', help='strings to searchs for, file IDs to show or diff, etc.')
 scripts.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (used by epstein_show)')
 scripts.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (used by epstein_search)')
@@ -42,23 +44,35 @@ debug.add_argument('--colors-only', '-c', action='store_true', help='print heade
 debug.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
 debug.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
 debug.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats about the files')
+debug.add_argument('--skip-other-files', '-sof', action='store_true', help='skip parsing non email/text files')
 debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
 args = parser.parse_args()
 
+
+# Verify Epstein docs can be found
+DOCS_DIR_ENV = environ.get(EPSTEIN_DOCS_DIR_ENV_VAR_NAME)
+DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
+
+if not DOCS_DIR_ENV:
+    print(f"\n ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!\n")
+    exit(1)
+elif not DOCS_DIR.exists():
+    print(f"\n ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!\n")
+    exit(1)
+
 current_script = Path(argv[0]).name
 is_env_var_set = lambda s: len(environ.get(s) or '') > 0
 is_html_script = current_script in HTML_SCRIPTS
 
 args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
 args.output_emails = args.output_emails or args.all_emails
-args.
+args.output_other = args.output_other or args.all_other_files
 args.overwrite_pickle = args.overwrite_pickle or (is_env_var_set('OVERWRITE_PICKLE') and not is_env_var_set('PICKLED'))
 args.width = args.width if is_html_script else None
 is_output_selected = any([arg.startswith('output_') and value for arg, value in vars(args).items()])
 is_output_selected = is_output_selected or args.json_metadata or args.colors_only
 specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]
 
-
 # Log level args
 if args.deep_debug:
     logger.setLevel(logging.DEBUG)
@@ -74,11 +88,9 @@ logger.info(f'Log level set to {logger.level}...')
 # Massage args that depend on other args to the appropriate state
 if current_script == 'epstein_generate' and not (is_output_selected or args.make_clean):
     logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
-    args.output_texts = True
-    args.output_emails = True
-    args.output_other_files = True
+    args.output_texts = args.output_emails = args.output_other = True
 
-if args.
+if args.use_epstein_web:
     logger.warning(f"Using links to epsteinweb.org links instead of epsteinify.com...")
 
 if args.debug:
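Because the docs-directory check now runs at import time in util/env.py rather than in file_helper.py, the EPSTEIN_DOCS_DIR environment variable has to be set before the module is imported. A minimal sketch (not part of the package diff; the path is hypothetical):

    import os

    # Must point at an existing directory of the released .txt files, otherwise
    # the module prints an ERROR and calls exit(1) during import
    os.environ['EPSTEIN_DOCS_DIR'] = '/data/house_oversight_docs'

    from epstein_files.util.env import DOCS_DIR
    print(DOCS_DIR)  # resolved Path to the docs directory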
epstein_files/util/file_helper.py
CHANGED

@@ -1,20 +1,9 @@
 import re
-from os import environ
 from pathlib import Path
-from sys import exit
 
 from epstein_files.util.constant.strings import FILE_NAME_REGEX, FILE_STEM_REGEX, HOUSE_OVERSIGHT_PREFIX
-
-
-DOCS_DIR_ENV = environ[EPSTEIN_DOCS_DIR_ENV_VAR_NAME]
-DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
-
-if not DOCS_DIR_ENV:
-    print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!")
-    exit(1)
-elif not DOCS_DIR.exists():
-    print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!")
-    exit(1)
+from epstein_files.util.env import DOCS_DIR
+from epstein_files.util.logging import logger
 
 EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
 FILE_ID_REGEX = re.compile(fr".*{FILE_NAME_REGEX.pattern}")
@@ -22,10 +11,13 @@ FILENAME_LENGTH = len(HOUSE_OVERSIGHT_PREFIX) + 6
 KB = 1024
 MB = KB * KB
 
+file_size = lambda file_path: Path(file_path).stat().st_size
+file_size_str = lambda file_path: file_size_to_str(file_size(file_path))
 
-#
+# Coerce methods handle both string and int arguments.
+coerce_file_name = lambda filename_or_id: coerce_file_stem(filename_or_id) + '.txt'
+coerce_file_path = lambda filename_or_id: DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
 id_str = lambda id: f"{int(id):06d}"
-filename_for_id = lambda id: file_stem_for_id(id) + '.txt'
 
 
 def coerce_file_stem(filename_or_id: int | str) -> str:
@@ -42,14 +34,6 @@ def coerce_file_stem(filename_or_id: int | str) -> str:
     return file_stem
 
 
-def coerce_file_name(filename_or_id: int | str) -> str:
-    return coerce_file_stem(filename_or_id) + '.txt'
-
-
-def coerce_file_path(filename_or_id: int | str) -> Path:
-    return DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
-
-
 def extract_file_id(filename_or_id: int | str | Path) -> str:
     if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
         return id_str(filename_or_id)
@@ -62,12 +46,7 @@ def extract_file_id(filename_or_id: int | str | Path) -> str:
         return file_match.group(1)
 
 
-def
-    return Path(file_path).stat().st_size
-
-
-def file_size_str(file_path: str | Path) -> str:
-    size = file_size(file_path)
+def file_size_to_str(size: int) -> str:
     digits = 2
 
     if size > MB:
@@ -96,3 +75,7 @@ def is_local_extract_file(filename) -> bool:
     """Return true if filename is of form 'HOUSE_OVERSIGHT_029835_1.txt'."""
     file_match = FILE_ID_REGEX.match(str(filename))
     return True if file_match and file_match.group(2) else False
+
+
+def log_file_write(file_path: str | Path) -> None:
+    logger.warning(f"Wrote {file_size_str(file_path)} to '{file_path}'")
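The former coerce_file_name()/coerce_file_path()/file_size() functions are now one-line lambdas, file_size_str() is rebuilt on top of the new file_size_to_str(size), and log_file_write() has moved here from util/logging.py. A rough usage sketch (not part of the package diff), assuming coerce_file_stem() still expands a bare numeric ID into the HOUSE_OVERSIGHT file stem:

    from epstein_files.util.file_helper import (
        coerce_file_name, coerce_file_path, extract_file_id, file_size_str, id_str
    )

    print(id_str(29835))            # '029835' (zero padded to 6 digits)
    print(coerce_file_name(29835))  # presumably 'HOUSE_OVERSIGHT_029835.txt'
    print(coerce_file_path(29835))  # that filename joined onto DOCS_DIR

    # Pulls the 6 digit ID back out of a filename or Path
    print(extract_file_id('HOUSE_OVERSIGHT_029835.txt'))

    # file_size_str() now delegates to file_size_to_str(size_in_bytes),
    # assuming the file actually exists on disk
    print(file_size_str(coerce_file_path(29835)))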
epstein_files/util/highlighted_group.py
CHANGED

@@ -2,6 +2,7 @@ import re
 from dataclasses import dataclass, field
 
 from rich.highlighter import RegexHighlighter
+from rich.text import Text
 
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
@@ -21,7 +22,7 @@ EPSTEIN_ESTATE_EXECUTOR = f"Epstein {ESTATE_EXECUTOR}"
 REGEX_STYLE_PREFIX = 'regex'
 SIMPLE_NAME_REGEX = re.compile(r"^[-\w ]+$", re.IGNORECASE)
 
-
+CATEGORY_STYLE_MAPPING = {
     ARTICLE: JOURNALIST,
     ARTS: ENTERTAINER,
     BOOK: JOURNALIST,
@@ -31,6 +32,12 @@ CATEGORY_LABEL_MAPPING = {
     REPUTATION: PUBLICIST,
 }
 
+CATEGORY_STYLES = {
+    JSON: 'dark_red',
+    JUNK: 'grey19',
+    'letter': 'medium_orchid1'
+}
+
 
 @dataclass(kw_only=True)
 class HighlightedText:
@@ -156,7 +163,7 @@ HIGHLIGHTED_NAMES = [
    HighlightedNames(
        label=BUSINESS,
        style='spring_green4',
-        pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
+        pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|New Leaf Ventures|Park Partners|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
        emailers = {
            ALIREZA_ITTIHADIEH: 'CEO Freestream Aircraft Limited',
            BARBRO_C_EHNBOM: 'Swedish pharmaceuticals, SALSS',
@@ -216,6 +223,7 @@ HIGHLIGHTED_NAMES = [
            'Linda Pinto': 'interior design at Alberto Pinto Cabinet',
            MERWIN_DELA_CRUZ: None, # HOUSE_OVERSIGHT_032652 Groff says "Jojo and Merwin both requested off Nov. 25 and 26"
            NADIA_MARCINKO: 'pilot',
+            'Sean J. Lancaster': 'airplane reseller',
        }
    ),
    HighlightedNames(
@@ -253,6 +261,8 @@
            MARTIN_WEINBERG: CRIMINAL_DEFENSE_ATTORNEY,
            MICHAEL_MILLER: 'Steptoe LLP partner',
            REID_WEINGARTEN: 'Steptoe LLP partner',
+            ROBERT_D_CRITTON_JR: 'criminal defense attorney',
+            'Robert Gold': None,
            'Roy Black': CRIMINAL_DEFENSE_2008,
            SCOTT_J_LINK: None,
            TONJA_HADDAD_COLEMAN: f'{EPSTEIN_V_ROTHSTEIN_EDWARDS_ATTORNEY}, maybe daughter of Fred Haddad?',
@@ -303,15 +313,17 @@
        }
    ),
    HighlightedNames(
-        label=
+        label=FINANCE,
        style='green',
-        pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
+        pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|((anti.?)?money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
        emailers={
            AMANDA_ENS: 'Citigroup',
+            BRAD_WECHSLER: f"head of {LEON_BLACK}'s personal investment vehicle according to FT",
            DANIEL_SABBA: 'UBS Investment Bank',
            DAVID_FISZEL: 'CIO Honeycomb Asset Management',
            JES_STALEY: 'former CEO of Barclays',
            JIDE_ZEITLIN: 'former partner at Goldman Sachs, allegations of sexual misconduct',
+            'Laurie Cameron': 'currency trading',
            LEON_BLACK: 'Apollo CEO',
            MARC_LEON: 'Luxury Properties Sari Morrocco',
            MELANIE_SPINELLA: f'representative of {LEON_BLACK}',
@@ -325,6 +337,7 @@
        style='deep_pink2',
        pattern=r'Cambridge|(Derek\s*)?Bok|Elisa(\s*New)?|Harvard(\s*(Business|Law|University)(\s*School)?)?|(Jonathan\s*)?Zittrain|(Stephen\s*)?Kosslyn',
        emailers = {
+            "Donald Rubin": f"Professor of Statistics",
            "Kelly Friendly": f"longtime aide and spokesperson of {LARRY_SUMMERS}",
            LARRY_SUMMERS: 'board of Digital Currency Group (DCG), Harvard president, Obama economic advisor',
            'Leah Reis-Dennis': 'producer for Lisa New\'s Poetry in America',
@@ -370,7 +383,7 @@
    HighlightedNames(
        label=JOURNALIST,
        style='bright_yellow',
-        pattern=r'Palm\s*Beach\s*(Daily\s*News|Post)|ABC(\s*News)?|Alex\s*Yablon|(Andrew\s*)?Marra|Arianna(\s*Huffington)?|(Arthur\s*)?Kretchmer|BBC|Bloomberg|Breitbart|Charlie\s*Rose|China\s*Daily|CNBC|CNN(politics?)?|Con[cs]hita|Sarnoff|(?<!Virgin[-\s]Islands[-\s])Daily\s*(Beast|Mail|News|Telegraph)|(David\s*)?Pecker|David\s*Brooks|Ed\s*Krassenstein|(Emily\s*)?Michot|Ezra\s*Klein|(George\s*)?Stephanopoulus|Globe\s*and\s*Mail|Good\s*Morning\s*America|Graydon(\s*Carter)?|Huffington(\s*Post)?|Ingram, David|(James\s*)?Patterson|Jonathan\s*Karl|Julie\s*(K.?\s*)?Brown|(Katie\s*)?Couric|Keith\s*Larsen|L\.?A\.?\s*Times|Miami\s*Herald|(Michele\s*)?Dargan|(National\s*)?Enquirer|(The\s*)?N(ew\s*)?Y(ork\s*)?(P(ost)?|T(imes)?)|(The\s*)?New\s*Yorker|NYer|PERVERSION\s*OF\s*JUSTICE|Politico|Pro\s*Publica|(Sean\s*)?Hannity|Sulzberger|SunSentinel|Susan Edelman|(Uma\s*)?Sanghvi|(The\s*)?Wa(shington\s*)?Po(st)?|Viceland|Vick[iy]\s*Ward|Vox|WGBH|(The\s*)?Wall\s*Street\s*Journal|WSJ|[-\w.]+@(bbc|independent|mailonline|mirror|thetimes)\.co\.uk',
+        pattern=r'Palm\s*Beach\s*(Daily\s*News|Post)|ABC(\s*News)?|Alex\s*Yablon|(Andrew\s*)?Marra|Arianna(\s*Huffington)?|(Arthur\s*)?Kretchmer|BBC|Bloomberg|Breitbart|Charlie\s*Rose|China\s*Daily|CNBC|CNN(politics?)?|Con[cs]hita|Sarnoff|(?<!Virgin[-\s]Islands[-\s])Daily\s*(Beast|Mail|News|Telegraph)|(David\s*)?Pecker|David\s*Brooks|Ed\s*Krassenstein|(Emily\s*)?Michot|Ezra\s*Klein|(George\s*)?Stephanopoulus|Globe\s*and\s*Mail|Good\s*Morning\s*America|Graydon(\s*Carter)?|Huffington(\s*Post)?|Ingram, David|(James\s*)?(Hill|Patterson)|Jonathan\s*Karl|Julie\s*(K.?\s*)?Brown|(Katie\s*)?Couric|Keith\s*Larsen|L\.?A\.?\s*Times|Miami\s*Herald|(Michele\s*)?Dargan|(National\s*)?Enquirer|(The\s*)?N(ew\s*)?Y(ork\s*)?(P(ost)?|T(imes)?)|(The\s*)?New\s*Yorker|NYer|PERVERSION\s*OF\s*JUSTICE|Politico|Pro\s*Publica|(Sean\s*)?Hannity|Sulzberger|SunSentinel|Susan Edelman|(Uma\s*)?Sanghvi|(The\s*)?Wa(shington\s*)?Po(st)?|Viceland|Vick[iy]\s*Ward|Vox|WGBH|(The\s*)?Wall\s*Street\s*Journal|WSJ|[-\w.]+@(bbc|independent|mailonline|mirror|thetimes)\.co\.uk',
        emailers = {
            EDWARD_JAY_EPSTEIN: 'reporter who wrote about the kinds of crimes Epstein was involved in, no relation to Jeffrey',
            'James Hill': 'ABC News',
@@ -390,7 +403,7 @@
    HighlightedNames(
        label='law enforcement',
        style='color(24) bold',
-        pattern=r'ag|(Alicia\s*)?Valle|attorney|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC
+        pattern=r'ag|(Alicia\s*)?Valle|AML|attorney|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC?|CIA|CIS|CVRA|Dep(artmen)?t\.?\s*of\s*(the\s*)?(Justice|Treasury)|DHS|DOJ|FBI|FCPA|FDIC|Federal\s*Bureau\s*of\s*Investigation|FinCEN|FINRA|FOIA|FTC|IRS|(James\s*)?Comey|(Jennifer\s*Shasky\s*)?Calvery|((Judge|Mark)\s*)?(Carney|Filip)|(Kirk )?Blouin|KYC|NIH|NS(A|C)|OCC|OFAC|(Lann?a\s*)?Belohlavek|lawyer|(Michael\s*)?Reiter|OGE|Office\s*of\s*Government\s*Ethics|Police Code Enforcement|(Preet\s*)?Bharara|SCOTUS|SD(FL|NY)|Southern\s*District\s*of\s*(Florida|New\s*York)|SEC|Secret\s*Service|Securities\s*and\s*Exchange\s*Commission|State\s*Dep(artmen)?t|Strzok|Supreme\s*Court|Treasury\s*(Dep(artmen)?t|Secretary)|TSA|USAID|(William\s*J\.?\s*)?Zloch',
        emailers = {
            ANN_MARIE_VILLAFANA: 'southern district of Florida U.S. Attorney',
            DANNY_FROST: 'Director of Communications at Manhattan DA',
@@ -450,6 +463,7 @@
            IAN_OSBORNE: f"{OSBORNE_LLP} reputation repairer possibly hired by Epstein ca. 2011-06",
            MICHAEL_SITRICK: 'crisis PR',
            PEGGY_SIEGAL: 'socialite',
+            'R. Couri Hay': None,
            ROSS_GOW: 'Acuity Reputation Management',
            TYLER_SHEARS: f"{REPUTATION_MGMT}, worked on Epstein's Google search results with {CHRISTINA_GALBRAITH}",
        }
@@ -477,6 +491,7 @@
        style='red bold',
        pattern=r'Alfa\s*Bank|Anya\s*Rasulova|Chernobyl|Day\s+One\s+Ventures|(Dmitry\s)?(Kiselyov|(Lana\s*)?Pozhidaeva|Medvedev|Rybolo(o?l?ev|vlev))|Dmitry|FSB|GRU|KGB|Kislyak|Kremlin|Kuznetsova|Lavrov|Lukoil|Moscow|(Oleg\s*)?Deripaska|Oleksandr Vilkul|Rosneft|RT|St.?\s*?Petersburg|Russian?|Sberbank|Soviet(\s*Union)?|USSR|Vladimir|(Vladimir\s*)?(Putin|Yudashkin)|Women\s*Empowerment|Xitrans',
        emailers = {
+            'Dasha Zhukova': 'art collector, daughter of Alexander Zhukov',
            MASHA_DROKOVA: 'silicon valley VC, former Putin Youth',
            RENATA_BOLOTOVA: 'former aspiring model, now fund manager at New York State Insurance Fund',
            SVETLANA_POZHIDAEVA: f'Epstein\'s Russian assistant who was recommended for a visa by Sergei Belyakov (FSB) and {DAVID_BLAINE}',
@@ -485,14 +500,16 @@
    HighlightedNames(
        label=ACADEMIA,
        style='light_goldenrod2',
-        pattern=r'Alain Forget|Brotherton|Carl\s*Sagan|Columbia|David Grosof|J(ames|im)\s*Watson|(Lord\s*)?Martin\s*Rees|Massachusetts\s*Institute\s*of\s*Technology|MIT(\s*Media\s*Lab)?|Media\s*Lab|Minsky|((Noam|Valeria)\s*)?Chomsky|Praluent|Regeneron|(Richard\s*)?Dawkins|Sanofi|Stanford|(Stephen\s*)?Hawking|(Steven?\s*)?Pinker|UCLA',
+        pattern=r'Alain Forget|Brotherton|Carl\s*Sagan|Columbia|David Grosof|J(ames|im)\s*Watson|(Lord\s*)?Martin\s*Rees|Massachusetts\s*Institute\s*of\s*Technology|MIT(\s*Media\s*Lab)?|Media\s*Lab|Minsky|((Noam|Valeria)\s*)?Chomsky|Norman\s*Finkelstein|Praluent|Regeneron|(Richard\s*)?Dawkins|Sanofi|Stanford|(Stephen\s*)?Hawking|(Steven?\s*)?Pinker|UCLA',
        emailers = {
            DAVID_HAIG: None,
            JOSCHA_BACH: 'cognitive science / AI research',
            'Daniel Kahneman': 'Nobel economic sciences laureate and cognitivie psychologist (?)',
+            'Ed Boyden': 'Associate Professor, MIT Media Lab neurobiology',
            LAWRENCE_KRAUSS: 'theoretical physicist',
            LINDA_STONE: 'ex-Microsoft, MIT Media Lab',
            MARK_TRAMO: 'professor of neurology at UCLA',
+            'Nancy Dahl': f'wife of {LAWRENCE_KRAUSS}',
            NEAL_KASSELL: 'professor of neurosurgery at University of Virginia',
            PETER_ATTIA: 'longevity medicine',
            ROBERT_TRIVERS: 'evolutionary biology',
@@ -588,7 +605,7 @@
    HighlightedText(
        label='phone_number',
        style='bright_green',
-        pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})
+        pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|(\b|\+)[\d+]{10,12}\b",
    ),
 ]
 
@@ -648,18 +665,14 @@ def get_info_for_name(name: str) -> str | None:
 
 
 def get_style_for_category(category: str) -> str | None:
-    if category in
+    if category in CATEGORY_STYLES:
+        return CATEGORY_STYLES[category]
+    elif category in [CONFERENCE, SPEECH]:
        return f"{get_style_for_category(ACADEMIA)} dim"
-    elif category == JSON:
-        return 'dark_red'
-    elif category == JUNK:
-        return 'grey19'
-    elif category == 'letter':
-        return 'medium_orchid1'
    elif category == SOCIAL:
-        return
+        return get_style_for_category(PUBLICIST)
 
-    category =
+    category = CATEGORY_STYLE_MAPPING.get(category, category)
 
    for highlight_group in HIGHLIGHTED_NAMES:
        if highlight_group.label == category:
@@ -672,6 +685,10 @@ def get_style_for_name(name: str | None, default_style: str = DEFAULT, allow_bol
    return style if allow_bold else style.replace('bold', '').strip()
 
 
+def styled_category(category: str) -> Text:
+    return Text(category, get_style_for_category(category) or 'wheat4')
+
+
 def _get_highlight_group_for_name(name: str) -> HighlightedNames | None:
    for highlight_group in HIGHLIGHTED_NAMES:
        if highlight_group.regex.search(name):
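The per-category colors that previously lived in an if/elif chain inside get_style_for_category() are now data in CATEGORY_STYLES, and the new styled_category() wraps the result in a rich Text. A small sketch of how that appears to be used (not part of the package diff; the Console setup is assumed):

    from rich.console import Console
    from epstein_files.util.highlighted_group import get_style_for_category, styled_category

    console = Console()
    print(get_style_for_category('letter'))   # 'medium_orchid1' via CATEGORY_STYLES
    console.print(styled_category('letter'))  # same label rendered as a styled rich Text

    # Categories with no configured style fall back to 'wheat4' in styled_category()
    console.print(styled_category('some unknown category'))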
epstein_files/util/logging.py
CHANGED

@@ -1,6 +1,5 @@
 import logging
 from os import environ
-from pathlib import Path
 
 import datefinder
 import rich_argparse_plus
@@ -10,7 +9,6 @@ from rich.logging import RichHandler
 from rich.theme import Theme
 
 from epstein_files.util.constant.strings import *
-from epstein_files.util.file_helper import file_size_str
 
 FILENAME_STYLE = 'gray27'
 
@@ -34,7 +32,7 @@ LOG_LEVEL_ENV_VAR = 'LOG_LEVEL'
 # Augment the standard log highlighter with 'epstein_filename' matcher
 class LogHighlighter(ReprHighlighter):
     highlights = ReprHighlighter.highlights + [
-        *[fr"(?P<{doc_type}>{doc_type})" for doc_type in DOC_TYPE_STYLES.keys()],
+        *[fr"(?P<{doc_type}>{doc_type}(Cfg)?)" for doc_type in DOC_TYPE_STYLES.keys()],
         "(?P<epstein_filename>" + FILE_NAME_REGEX.pattern + ')',
     ]
 
@@ -60,7 +58,3 @@ if env_log_level_str:
 
     logger.warning(f"Setting log level to {env_log_level} based on {LOG_LEVEL_ENV_VAR} env var...")
     logger.setLevel(env_log_level)
-
-
-def log_file_write(file_path: str | Path) -> None:
-    logger.warning(f"Wrote {file_size_str(file_path)} to '{file_path}'")
epstein_files/util/output.py
CHANGED

@@ -11,7 +11,8 @@ from epstein_files.util.constant.names import *
 from epstein_files.util.constant.output_files import JSON_FILES_JSON_PATH, JSON_METADATA_PATH
 from epstein_files.util.data import dict_sets_to_lists
 from epstein_files.util.env import args, specified_names
-from epstein_files.util.
+from epstein_files.util.file_helper import log_file_write
+from epstein_files.util.logging import logger
 from epstein_files.util.rich import *
 
 PRINT_COLOR_KEY_EVERY_N_EMAILS = 150
@@ -60,7 +61,6 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
     """Returns number of emails printed."""
     print_section_header(('Selections from ' if not args.all_emails else '') + 'His Emails')
     print_other_site_link(is_header=False)
-
     emailers_to_print: list[str | None]
     emailer_tables: list[str | None] = []
     already_printed_emails: list[Email] = []
@@ -106,8 +106,8 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
     _verify_all_emails_were_printed(epstein_files, already_printed_emails)
 
     fwded_articles = [e for e in already_printed_emails if e.config and e.config.is_fwded_article]
-
-    logger.warning(f"
+    log_msg = f"Rewrote {len(Email.rewritten_header_ids)} email headers (out of {len(already_printed_emails)})"
+    logger.warning(f"{log_msg}, {len(fwded_articles)} of the emails were forwarded articles.")
     return len(already_printed_emails)
 
 
@@ -121,11 +121,11 @@ def print_json_files(epstein_files: EpsteinFiles):
     else:
         for json_file in epstein_files.json_files:
             console.line(2)
-            console.print(json_file.
+            console.print(json_file.summary_panel())
             console.print_json(json_file.json_str(), indent=4, sort_keys=False)
 
 
-def
+def write_json_metadata(epstein_files: EpsteinFiles) -> None:
     json_str = epstein_files.json_metadata()
 
     if args.build:
@@ -187,8 +187,13 @@ def write_urls() -> None:
 def _verify_all_emails_were_printed(epstein_files: EpsteinFiles, already_printed_emails: list[Email]) -> None:
     """Log warnings if some emails were never printed."""
     email_ids_that_were_printed = set([email.file_id for email in already_printed_emails])
-    logger.warning(f"Printed {len(already_printed_emails)} emails of {len(email_ids_that_were_printed)} unique file IDs.")
+    logger.warning(f"Printed {len(already_printed_emails):,} emails of {len(email_ids_that_were_printed):,} unique file IDs.")
+    missed_an_email = False
 
     for email in epstein_files.emails:
-        if email.file_id not in email_ids_that_were_printed and not email.is_duplicate:
+        if email.file_id not in email_ids_that_were_printed and not email.is_duplicate():
             logger.warning(f"Failed to print {email.summary()}")
+            missed_an_email = True
+
+    if not missed_an_email:
+        logger.warning(f"All {len(epstein_files.emails):,} emails printed at least once.")