epstein-files 1.0.0-py3-none-any.whl → 1.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +75 -135
- epstein_files/documents/communication.py +9 -9
- epstein_files/documents/document.py +115 -87
- epstein_files/documents/email.py +154 -85
- epstein_files/documents/emails/email_header.py +7 -6
- epstein_files/documents/imessage/text_message.py +3 -2
- epstein_files/documents/json_file.py +17 -0
- epstein_files/documents/messenger_log.py +62 -3
- epstein_files/documents/other_file.py +165 -17
- epstein_files/epstein_files.py +128 -169
- epstein_files/util/constant/names.py +8 -1
- epstein_files/util/constant/output_files.py +29 -0
- epstein_files/util/constant/strings.py +27 -0
- epstein_files/util/constant/urls.py +25 -9
- epstein_files/util/constants.py +1018 -1045
- epstein_files/util/data.py +20 -55
- epstein_files/util/{file_cfg.py → doc_cfg.py} +121 -43
- epstein_files/util/env.py +19 -20
- epstein_files/util/file_helper.py +38 -21
- epstein_files/util/highlighted_group.py +229 -177
- epstein_files/util/logging.py +63 -0
- epstein_files/util/output.py +180 -0
- epstein_files/util/rich.py +29 -17
- epstein_files/util/search_result.py +14 -6
- epstein_files/util/timer.py +24 -0
- epstein_files/util/word_count.py +2 -1
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/METADATA +20 -4
- epstein_files-1.0.2.dist-info/RECORD +33 -0
- epstein_files-1.0.2.dist-info/entry_points.txt +7 -0
- epstein_files-1.0.0.dist-info/RECORD +0 -28
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/WHEEL +0 -0
epstein_files/util/data.py
CHANGED
@@ -3,17 +3,13 @@ Helpers for dealing with various kinds of data.
 """
 import itertools
 import re
-import time
-from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from dateutil import tz
 from typing import TypeVar

-from dateutil.parser import parse
-from rich.text import Text
-
 from epstein_files.util.constant import names
-from epstein_files.util.env import args
+from epstein_files.util.env import args
+from epstein_files.util.logging import logger

 T = TypeVar('T')

@@ -23,37 +19,22 @@ CONSTANT_VAR_REGEX = re.compile(r"^[A-Z_]+$")
 ALL_NAMES = [v for k, v in vars(names).items() if isinstance(v, str) and CONSTANT_VAR_REGEX.match(k)]

 PACIFIC_TZ = tz.gettz("America/Los_Angeles")
-TIMEZONE_INFO = {"
-
-
-def collapse_newlines(text: str) -> str:
-    return MULTINEWLINE_REGEX.sub('\n\n', text)
+TIMEZONE_INFO = {"PDT": PACIFIC_TZ, "PST": PACIFIC_TZ} # Suppresses annoying warnings from parse() calls


-
-
+collapse_newlines = lambda text: MULTINEWLINE_REGEX.sub('\n\n', text)
+date_str = lambda dt: dt.isoformat()[0:10] if dt else None
+escape_double_quotes = lambda text: text.replace('"', r'\"')
+escape_single_quotes = lambda text: text.replace("'", r"\'")
+iso_timestamp = lambda dt: dt.isoformat().replace('T', ' ')
+uniquify = lambda _list: list(set(_list))
+without_nones = lambda _list: [e for e in _list if e]


 def dict_sets_to_lists(d: dict[str, set]) -> dict[str, list]:
     return {k: sorted(list(v)) for k, v in d.items()}


-def extract_datetime(s: str) -> datetime | None:
-    match = ISO_DATE_REGEX.search(s)
-
-    if not match:
-        return None
-
-    date_str = match.group(0)
-
-    if len(date_str) == 4:
-        date_str += '-01-01'
-    elif len(date_str) == 7:
-        date_str += '-01'
-
-    return parse(date_str, tzinfos=TIMEZONE_INFO)
-
-
 def extract_last_name(name: str) -> str:
     if ' ' not in name:
         return name
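The helpers added above are one-line lambdas; a rough sketch of their behavior (example values invented, not taken from the package):

    from datetime import datetime

    dt = datetime(2019, 7, 6, 14, 30)
    date_str(dt)                     # '2019-07-06' (first ten chars of the ISO string)
    iso_timestamp(dt)                # '2019-07-06 14:30:00' (ISO string with 'T' replaced)
    uniquify(['a', 'b', 'a'])        # ['a', 'b'] (order not guaranteed: set round-trip)
    without_nones([1, None, 0, 2])   # [1, 2] (drops every falsy element, not just None)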
@@ -70,14 +51,19 @@ def flatten(_list: list[list[T]]) -> list[T]:
     return list(itertools.chain.from_iterable(_list))


-def
-    return
+def json_safe(d: dict) -> dict:
+    return {
+        'None' if k is None else k: v.isoformat() if isinstance(v, datetime) else v
+        for k,v in d.items()
+    }


-def listify(listlike
+def listify(listlike) -> list:
     """Create a list of 'listlike'. Returns empty list if 'listlike' is None or empty string."""
     if isinstance(listlike, list):
         return listlike
+    elif listlike is None:
+        return [None]
     elif listlike:
         return [listlike]
     else:
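json_safe() makes a dict JSON-serializable by stringifying None keys and converting datetime values to ISO strings, and listify() now wraps a bare None as [None]. A minimal sketch of the behavior defined in this hunk (values invented):

    from datetime import datetime

    json_safe({None: 1, 'sent': datetime(2019, 7, 6)})
    # => {'None': 1, 'sent': '2019-07-06T00:00:00'}

    listify(None)    # [None] (new branch)
    listify('x')     # ['x']
    listify(['x'])   # ['x'] (lists pass through unchanged)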
@@ -93,8 +79,8 @@ def ordinal_str(n: int) -> str:
     return str(n) + suffix


-def patternize(_pattern: str | re.Pattern):
-    return _pattern if isinstance(_pattern, re.Pattern) else re.compile(
+def patternize(_pattern: str | re.Pattern) -> re.Pattern:
+    return _pattern if isinstance(_pattern, re.Pattern) else re.compile(fr"({_pattern})", re.IGNORECASE)


 def remove_timezone(timestamp: datetime) -> datetime:
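patternize() now always returns a compiled pattern, wrapping string arguments in a capture group and compiling them case-insensitively. Sketch:

    import re

    patternize('maxwell')          # equivalent to re.compile(r'(maxwell)', re.IGNORECASE)
    patternize(re.compile('x'))    # already-compiled patterns pass through unchanged
    patternize('maxwell').search('Ghislaine Maxwell').group(1)   # 'Maxwell'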
@@ -108,24 +94,3 @@ def remove_timezone(timestamp: datetime) -> datetime:
 def sort_dict(d: dict[str | None, int] | dict[str, int]) -> list[tuple[str | None, int]]:
     sort_key = lambda e: (e[0] or '').lower() if args.sort_alphabetical else [-e[1], (e[0] or '').lower()]
     return sorted(d.items(), key=sort_key)
-
-
-@dataclass
-class Timer:
-    started_at: float = field(default_factory=lambda: time.perf_counter())
-    checkpoint_at: float = field(default_factory=lambda: time.perf_counter())
-
-    def print_at_checkpoint(self, msg: str) -> None:
-        logger.warning(f"{msg} in {self.seconds_since_checkpoint()}")
-        self.checkpoint_at = time.perf_counter()
-
-    def seconds_since_checkpoint(self) -> str:
-        return f"{(time.perf_counter() - self.checkpoint_at):.2f} seconds"
-
-    def seconds_since_start(self) -> str:
-        return f"{(time.perf_counter() - self.started_at):.2f} seconds"
-
-
-escape_double_quotes = lambda text: text.replace('"', r'\"')
-escape_single_quotes = lambda text: text.replace("'", r"\'")
-uniquify = lambda _list: list(set(_list))
epstein_files/util/{file_cfg.py → doc_cfg.py}
RENAMED
@@ -6,22 +6,27 @@ from typing import Generator, Literal

 from dateutil.parser import parse

-from epstein_files.util.constant.names import
-from epstein_files.util.constant.strings import
+from epstein_files.util.constant.names import *
+from epstein_files.util.constant.strings import *
+from epstein_files.util.data import without_nones

-DuplicateType = Literal['
+DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
+Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]

+# Misc
+CONSTANTIZE_NAMES = False # A flag set to True that causes repr() of these classes to return strings of usable code
 INDENT = ' '
 INDENT_NEWLINE = f'\n{INDENT}'
 INDENTED_JOIN = f',{INDENT_NEWLINE}'
-
-
+MAX_LINE_LENGTH = 150
+REPUTATION_MGMT = f'{REPUTATION} management'
+SAME = 'same'

-
-    'earlier': 'earlier draft of',
+DUPE_TYPE_STRS: dict[DuplicateType, str] = {
+    'earlier': 'an earlier draft of',
     'quoted': 'quoted in full in',
-    'redacted': 'redacted version of',
-
+    'redacted': 'a redacted version of',
+    SAME: 'the same as',
 }

 FIELD_SORT_KEY = {
@@ -30,57 +35,116 @@ FIELD_SORT_KEY = {
     'attribution_reason': 'zz',
 }

+FINANCIAL_REPORTS_AUTHORS = [
+    BOFA,
+    DEUTSCHE_BANK,
+    ELECTRON_CAPITAL_PARTNERS,
+    GOLDMAN_INVESTMENT_MGMT,
+    'Invesco',
+    JP_MORGAN,
+    'Morgan Stanley',
+    'S&P',
+]
+
+# Fields like timestamp and author are better added from the Document object
+INVALID_FOR_METADATA = [
+    'actual_text',
+    'date',
+    'id',
+    'timestamp',
+    'was_generated',
+]
+

 @dataclass(kw_only=True)
-class
-    """
+class DocCfg:
+    """
+    Encapsulates info about files that needs to be manually configured because it cannot be programmatically inferred.

     Attributes:
         id (str): ID of file
         author (str | None): Author of the document (if any)
+        category (str | None): Type of file
         date (str | None): If passed will be immediated parsed into the 'timestamp' field
         dupe_of_id (str | None): If this is a dupe the ID of the duplicated file. This file will be suppressed
-        dupe_type (DuplicateType | None): The type of duplicate this file is
-        duplicate_ids (list[str]): Inverse of 'dupe_of_id' - this file will NOT be suppressed but 'duplicate_ids' will be
+        dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
+        duplicate_ids (list[str]): Inverse of 'dupe_of_id' - this file will NOT be suppressed but 'duplicate_ids' will be
+        is_interesting (bool): Override other considerations and always consider this file interesting
         timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
         was_generated (bool): True if this object was generated by the duplicate_cfgs() method
     """
     id: str
     author: str | None = None
+    category: str | None = None
     date: str | None = None
     description: str | None = None
     dupe_of_id: str | None = None
     dupe_type: DuplicateType | None = None
     duplicate_ids: list[str] = field(default_factory=list)
+    is_interesting: bool = False
     timestamp: datetime | None = None
-    was_generated: bool = False
+    was_generated: bool = False

     def __post_init__(self):
-        if self.dupe_of_id:
-            self.dupe_type = self.dupe_type or 'same'
-
         if self.date:
             self.timestamp = parse(self.date)

+        if self.dupe_of_id or self.duplicate_ids:
+            self.dupe_type = self.dupe_type or SAME
+
     def duplicate_reason(self) -> str | None:
         if self.dupe_type is not None:
-            return
+            return DUPE_TYPE_STRS[self.dupe_type]

-    def duplicate_cfgs(self) -> Generator['
+    def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
+        """Create synthetic DocCfg objects that set the 'dupe_of_id' field to point back to this object."""
         for id in self.duplicate_ids:
             dupe_cfg = deepcopy(self)
             dupe_cfg.id = id
             dupe_cfg.dupe_of_id = self.id
-            dupe_cfg.
+            dupe_cfg.duplicate_ids = []
+            dupe_cfg.dupe_type = self.dupe_type
             dupe_cfg.was_generated = True
             yield dupe_cfg

+    def info_str(self) -> str | None:
+        """String that summarizes what is known about this document."""
+        if self.category == REPUTATION:
+            return f"{REPUTATION_MGMT}: {self.description}"
+        elif self.author and self.description:
+            if self.category in [ACADEMIA, BOOK]:
+                return self.title_by_author()
+            elif self.category == FINANCE and self.author in FINANCIAL_REPORTS_AUTHORS:
+                return f"{self.author} report: '{self.description}'"
+            elif self.category == LEGAL and 'v.' in self.author:
+                return f"{self.author}: '{self.description}'"
+        elif self.category and self.author is None and self.description is None:
+            return self.category
+
+        pieces = without_nones([self.author, self.description])
+        return ' '.join(pieces) if pieces else None
+
+    def metadata(self) -> Metadata:
+        non_null_fields = {k: v for k, v in asdict(self).items() if v and k not in INVALID_FOR_METADATA}
+
+        if self.category in [EMAIL, TEXT_MESSAGE]:
+            del non_null_fields['category']
+
+        return non_null_fields
+
     def non_null_field_names(self) -> list[str]:
         return [f.name for f in self.sorted_fields() if getattr(self, f.name)]

     def sorted_fields(self) -> list[Field]:
         return sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name))

+    def title_by_author(self) -> str:
+        if not (self.author and self.description):
+            raise RuntimeError(f"Can't call title_by_author() without author and description!")
+
+        title = self.description if '"' in self.description else f"'{self.description}'"
+        return f"{title} by {self.author}"
+
     def _props_strs(self) -> list[str]:
         props = []
         add_prop = lambda f, value: props.append(f"{f.name}={value}")
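Putting the new DocCfg pieces together: a config with duplicate_ids now defaults dupe_type to SAME and can emit synthetic configs for its dupes. An illustrative sketch (the IDs and field values are invented):

    cfg = DocCfg(id='012345', description='Q3 outlook', duplicate_ids=['012399'])
    cfg.dupe_type                                   # 'same' (set by __post_init__)
    cfg.duplicate_reason()                          # 'the same as'
    [d.dupe_of_id for d in cfg.duplicate_cfgs()]    # ['012345']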
@@ -92,14 +156,16 @@ class FileCfg:
                 continue
             elif _field.name == AUTHOR:
                 add_prop(_field, constantize_name(str(value)) if CONSTANTIZE_NAMES else f"'{value}'")
+            elif _field.name == 'category' and value in [EMAIL, TEXT_MESSAGE]:
+                continue
             elif _field.name == 'recipients' and isinstance(value, list):
                 recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
                 add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
+            elif _field.name == 'timestamp' and self.date is not None:
+                continue # Don't print both timestamp and date
             elif isinstance(value, datetime):
                 value_str = re.sub(' 00:00:00', '', str(value))
                 add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
-            elif _field.name == 'description':
-                add_prop(_field, value.strip())
             elif isinstance(value, str):
                 if "'" in value:
                     value = '"' + value.replace('"', r'\"') + '"'
@@ -112,22 +178,12 @@ class FileCfg:

         return props

-    def __eq__(self, other: 'FileCfg') -> bool:
-        """Return True if everything matches other than the two 'dupe_' fields ('duplicate_ids' is compared)."""
-        for _field in self.sorted_fields():
-            if _field.name == 'id' or _field.name.startswith('dupe'):
-                continue
-            elif getattr(self, _field.name) != getattr(other, _field.name):
-                return False
-
-        return True
-
     def __repr__(self) -> str:
         props = self._props_strs()
         type_str = f"{type(self).__name__}("
         single_line_repr = type_str + ', '.join(props) + f')'

-        if
+        if len(single_line_repr) < MAX_LINE_LENGTH:
             repr_str = single_line_repr
         else:
             repr_str = f"{type_str}{INDENT_NEWLINE}" + INDENTED_JOIN.join(props)
@@ -142,31 +198,53 @@ class FileCfg:


 @dataclass(kw_only=True)
-class
+class CommunicationCfg(DocCfg):
     """
-    Convenience class to unite various configured properties for a given Communication file.
     Manual config is always required for MessengerLog author attribution. It's also often needed for Email
     files to handle the terrible OCR text that Congress provided which messes up a lot of the email headers.

     Attributes:
-        actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
         attribution_reason (str | None): Optional explanation of why this email was attributed to this author.
         is_attribution_uncertain (bool): True if we have a good idea of who the author is but are not 100% certain
+    """
+    attribution_reason: str | None = None
+    is_attribution_uncertain: bool = False
+
+    def __repr__(self) -> str:
+        return super().__repr__()
+
+
+@dataclass(kw_only=True)
+class EmailCfg(CommunicationCfg):
+    """
+    Attributes:
+        actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
         is_fwded_article (bool): True if this is a newspaper article someone fwded. Used to exclude articles from word counting.
         recipients (list[str | None]): Who received the email
     """
     actual_text: str | None = None # Override for the Email._actual_text() method for particularly broken emails
-    attribution_reason: str | None = None
-    is_attribution_uncertain: bool = False
     is_fwded_article: bool = False
     recipients: list[str | None] = field(default_factory=list)

-    def
-
+    def __post_init__(self):
+        super().__post_init__()
+        self.category = EMAIL
+
+    @classmethod
+    def from_doc_cfg(cls, cfg: DocCfg) -> 'EmailCfg':
+        return cls(**asdict(cfg))

+    # This is necessary for some dumb reason. @dataclass(repr=False) doesn't cut it
     def __repr__(self) -> str:
         return super().__repr__()

-
-
-
+
+@dataclass(kw_only=True)
+class TextCfg(CommunicationCfg):
+    def __post_init__(self):
+        super().__post_init__()
+        self.category = TEXT_MESSAGE
+
+    # This is necessary for some dumb reason. @dataclass(repr=False) doesn't cut it
+    def __repr__(self) -> str:
+        return super().__repr__()
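The new subclasses pin category in __post_init__, so callers never set it by hand, and EmailCfg.from_doc_cfg() upgrades a plain DocCfg. Roughly (field values invented):

    email_cfg = EmailCfg(id='000001', recipients=['Jeffrey Epstein'])
    email_cfg.category               # EMAIL (forced by __post_init__)

    TextCfg(id='000002').category    # TEXT_MESSAGE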
epstein_files/util/env.py
CHANGED
@@ -4,42 +4,43 @@ from os import environ
 from pathlib import Path
 from sys import argv

-from
+from epstein_files.util.logging import datefinder_logger, env_log_level, logger

 DEFAULT_WIDTH = 154
-HTML_SCRIPTS = ['generate_html.py', 'count_words.py']
+HTML_SCRIPTS = ['epstein_generate', 'generate_html.py', 'count_words.py']


 parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML page.")
-parser.add_argument('--build', '-b', action='store_true', help='write
-parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails')
-parser.add_argument('--all-
-parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of a limited selection')
+parser.add_argument('--build', '-b', action='store_true', help='write output to file')
+parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
+parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just interesting ones')
 parser.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
 parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
+parser.add_argument('--output-file', '-out', metavar='FILE', default='index.html', help='write output to FILE in docs/ (default=index.html)')
 parser.add_argument('--output-emails', '-oe', action='store_true', help='generate other files section')
 parser.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
 parser.add_argument('--output-texts', '-ot', action='store_true', help='generate other files section')
 parser.add_argument('--pickled', '-p', action='store_true', help='use pickled EpsteinFiles object')
 parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='generate new pickled EpsteinFiles object')
+parser.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (only used by scripts)')
 parser.add_argument('--sort-alphabetical', '-alpha', action='store_true', help='sort emailers alphabetically in counts table')
 parser.add_argument('--suppress-output', '-s', action='store_true', help='no output to terminal (use with --build)')
-parser.add_argument('--use-epstein-web-links', '-use', action='store_true', help='use epsteinweb.org links instead of
-parser.add_argument('--search-other', '-so', action='store_true', help='search for string in non email/text files (only used by search script)')
+parser.add_argument('--use-epstein-web-links', '-use', action='store_true', help='use epsteinweb.org links instead of epstein.media')
 parser.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use')
 parser.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (only used by search script)')
 parser.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
 parser.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
+parser.add_argument('--make-clean', '-mc', action='store_true', help='delete all build artifact HTML and JSON files')
 parser.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
+parser.add_argument('--json-metadata', '-jm', action='store_true', help='dump JSON metadata for all files')
 parser.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats at the end')
 parser.add_argument('positional_args', nargs='*', help='Optional args (only used by helper scripts)')
 args = parser.parse_args()

-is_env_var_set = lambda s: len(environ.get(s) or '') > 0
 current_script = Path(argv[0]).name
+is_env_var_set = lambda s: len(environ.get(s) or '') > 0
 is_html_script = current_script in HTML_SCRIPTS

-args.deep_debug = args.deep_debug or is_env_var_set('DEEP_DEBUG')
 args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
 args.output_emails = args.output_emails or args.all_emails
 args.output_other_files = args.output_other_files or args.all_other_files
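Combined with the new epstein_generate entry point (see entry_points.txt in the file list above), a build invocation using the new flags might look like this (illustrative only):

    epstein_generate --build --output-file index.html --all-emails --json-metadata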
@@ -48,27 +49,25 @@ args.width = args.width if is_html_script else None
 specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]


-#
-logging.basicConfig(level="NOTSET", format="%(message)s", datefmt="[%X]", handlers=[RichHandler()])
-# logging.basicConfig(level="DEBUG", handlers=[RichHandler()])
-logger = logging.getLogger("rich")
-
+# Log level args
 if args.deep_debug:
     logger.setLevel(logging.DEBUG)
 elif args.debug:
     logger.setLevel(logging.INFO)
 elif args.suppress_logs:
     logger.setLevel(logging.FATAL)
-
+elif not env_log_level:
     logger.setLevel(logging.WARNING)

-
+logger.info(f'Log level set to {logger.level}...')
 datefinder_logger.setLevel(logger.level)


 # Massage args that depend on other args to the appropriate state
-if not (args.output_texts or args.output_emails or args.output_other_files):
-
+if not (args.json_metadata or args.output_texts or args.output_emails or args.output_other_files):
+    if is_html_script:
+        logger.warning(f"No output section chosen; outputting default of texts, selected emails, and other files...")
+
     args.output_texts = True
     args.output_emails = True
     args.output_other_files = True
@@ -77,4 +76,4 @@ if args.use_epstein_web_links:
     logger.warning(f"Using links to epsteinweb.org links instead of epsteinify.com...")

 if args.debug:
-    logger.warning(f"
+    logger.warning(f"Invocation args:\nis_html_script={is_html_script},\nspecified_names={specified_names},\nargs={args}")
epstein_files/util/file_helper.py
CHANGED
@@ -3,7 +3,7 @@ from os import environ
 from pathlib import Path
 from sys import exit

-from epstein_files.util.constant.strings import HOUSE_OVERSIGHT_PREFIX
+from epstein_files.util.constant.strings import FILE_NAME_REGEX, FILE_STEM_REGEX, HOUSE_OVERSIGHT_PREFIX

 EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
 DOCS_DIR_ENV = environ[EPSTEIN_DOCS_DIR_ENV_VAR_NAME]
@@ -16,30 +16,23 @@ elif not DOCS_DIR.exists():
     print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!")
     exit(1)

-JSON_DIR = DOCS_DIR.joinpath('json_files')
-HTML_DIR = Path('docs')
 EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
-
-WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_emails_word_count.html')
-EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
-PICKLED_PATH = Path("the_epstein_files.pkl.gz")
-
-FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}(\d{{6}})")
-FILE_ID_REGEX = re.compile(fr".*{FILE_STEM_REGEX.pattern}(_\d{{1,2}})?(\.txt(\.json)?)?")
+FILE_ID_REGEX = re.compile(fr".*{FILE_NAME_REGEX.pattern}")
 FILENAME_LENGTH = len(HOUSE_OVERSIGHT_PREFIX) + 6
 KB = 1024
 MB = KB * KB


 # Handles both string and int 'id' args.
-
+id_str = lambda id: f"{int(id):06d}"
 filename_for_id = lambda id: file_stem_for_id(id) + '.txt'


 def coerce_file_stem(filename_or_id: int | str) -> str:
     """Generate a valid file_stem no matter what form the argument comes in."""
     if isinstance(filename_or_id, str) and filename_or_id.startswith(HOUSE_OVERSIGHT_PREFIX):
-
+        file_id = extract_file_id(filename_or_id)
+        file_stem = file_stem_for_id(file_id)
     else:
         file_stem = file_stem_for_id(filename_or_id)

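coerce_file_stem() and the helpers added in the next hunk accept ints, short ID strings, or full filenames. Assuming HOUSE_OVERSIGHT_PREFIX == 'HOUSE_OVERSIGHT_' and that FILE_NAME_REGEX still captures the six-digit ID as group 1:

    coerce_file_stem(29835)                           # 'HOUSE_OVERSIGHT_029835' (id_str zero-pads to six digits)
    coerce_file_stem('HOUSE_OVERSIGHT_029835.txt')    # 'HOUSE_OVERSIGHT_029835'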
@@ -49,32 +42,56 @@ def coerce_file_stem(filename_or_id: int | str) -> str:
     return file_stem


-def
-
+def coerce_file_name(filename_or_id: int | str) -> str:
+    return coerce_file_stem(filename_or_id) + '.txt'
+
+
+def coerce_file_path(filename_or_id: int | str) -> Path:
+    return DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
+
+
+def extract_file_id(filename_or_id: int | str | Path) -> str:
+    if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
+        return id_str(filename_or_id)
+
+    file_match = FILE_ID_REGEX.match(str(filename_or_id))

     if not file_match:
-        raise RuntimeError(f"Failed to extract file ID from {
+        raise RuntimeError(f"Failed to extract file ID from {filename_or_id}")

     return file_match.group(1)


+def file_size(file_path: str | Path) -> int:
+    return Path(file_path).stat().st_size
+
+
 def file_size_str(file_path: str | Path) -> str:
-
+    size = file_size(file_path)
     digits = 2

-    if
-        size_num =
+    if size > MB:
+        size_num = float(size) / MB
         size_str = 'MB'
-    elif
-        size_num =
+    elif size > KB:
+        size_num = float(size) / KB
         size_str = 'kb'
         digits = 1
     else:
-        return f"{
+        return f"{size} b"

     return f"{size_num:,.{digits}f} {size_str}"


+def file_stem_for_id(id: int | str) -> str:
+    if isinstance(id, int) or (isinstance(id, str) and len(id) <= 6):
+        return f"{HOUSE_OVERSIGHT_PREFIX}{id_str(id)}"
+    elif len(id) == 8:
+        return f"{HOUSE_OVERSIGHT_PREFIX}{id}"
+    else:
+        raise RuntimeError(f"Unknown kind of file id {id}")
+
+
 def is_local_extract_file(filename) -> bool:
     """Return true if filename is of form 'HOUSE_OVERSIGHT_029835_1.txt'."""
     file_match = FILE_ID_REGEX.match(str(filename))