epstein-files 1.2.1__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +55 -11
- epstein_files/documents/document.py +13 -2
- epstein_files/documents/email.py +329 -258
- epstein_files/documents/emails/email_header.py +17 -8
- epstein_files/documents/other_file.py +8 -6
- epstein_files/epstein_files.py +18 -4
- epstein_files/person.py +65 -20
- epstein_files/util/constant/names.py +18 -12
- epstein_files/util/constant/output_files.py +8 -5
- epstein_files/util/constant/strings.py +4 -2
- epstein_files/util/constant/urls.py +13 -2
- epstein_files/util/constants.py +486 -224
- epstein_files/util/data.py +1 -0
- epstein_files/util/doc_cfg.py +33 -27
- epstein_files/util/env.py +18 -8
- epstein_files/util/file_helper.py +2 -0
- epstein_files/util/highlighted_group.py +321 -132
- epstein_files/util/output.py +19 -24
- epstein_files/util/rich.py +9 -3
- epstein_files/util/word_count.py +2 -2
- {epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/METADATA +3 -3
- epstein_files-1.4.1.dist-info/RECORD +34 -0
- {epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/entry_points.txt +1 -1
- epstein_files-1.2.1.dist-info/RECORD +0 -34
- {epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/LICENSE +0 -0
- {epstein_files-1.2.1.dist-info → epstein_files-1.4.1.dist-info}/WHEEL +0 -0
epstein_files/util/data.py
CHANGED
|
@@ -22,6 +22,7 @@ ALL_NAMES = [v for k, v in vars(names).items() if isinstance(v, str) and CONSTAN
|
|
|
22
22
|
PACIFIC_TZ = tz.gettz("America/Los_Angeles")
|
|
23
23
|
TIMEZONE_INFO = {"PDT": PACIFIC_TZ, "PST": PACIFIC_TZ} # Suppresses annoying warnings from parse() calls
|
|
24
24
|
|
|
25
|
+
all_elements_same = lambda _list: len(_list) == 0 or all(x == _list[0] for x in _list)
|
|
25
26
|
collapse_newlines = lambda text: MULTINEWLINE_REGEX.sub('\n\n', text)
|
|
26
27
|
date_str = lambda dt: dt.isoformat()[0:10] if dt else None
|
|
27
28
|
escape_double_quotes = lambda text: text.replace('"', r'\"')
|
epstein_files/util/doc_cfg.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import json
|
|
1
2
|
import re
|
|
2
3
|
from copy import deepcopy
|
|
3
4
|
from dataclasses import Field, asdict, dataclass, field, fields
|
|
@@ -9,20 +10,21 @@ from dateutil.parser import parse
|
|
|
9
10
|
from epstein_files.util.constant.names import *
|
|
10
11
|
from epstein_files.util.constant.strings import *
|
|
11
12
|
from epstein_files.util.data import remove_zero_time, without_falsey
|
|
13
|
+
from epstein_files.util.env import args
|
|
12
14
|
|
|
13
|
-
DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
|
|
15
|
+
DuplicateType = Literal['bounced', 'earlier', 'quoted', 'redacted', 'same']
|
|
14
16
|
Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]
|
|
15
17
|
|
|
16
18
|
# Misc
|
|
17
|
-
CONSTANTIZE_NAMES = False # A flag set to True that causes repr() of these classes to return strings of usable code
|
|
18
19
|
INDENT = ' '
|
|
19
20
|
INDENT_NEWLINE = f'\n{INDENT}'
|
|
20
21
|
INDENTED_JOIN = f',{INDENT_NEWLINE}'
|
|
21
|
-
MAX_LINE_LENGTH =
|
|
22
|
+
MAX_LINE_LENGTH = 135
|
|
22
23
|
REPUTATION_MGMT = f'{REPUTATION} management'
|
|
23
24
|
SAME = 'same'
|
|
24
25
|
|
|
25
26
|
DUPE_TYPE_STRS: dict[DuplicateType, str] = {
|
|
27
|
+
'bounced': 'a bounced copy of',
|
|
26
28
|
'earlier': 'an earlier draft of',
|
|
27
29
|
'quoted': 'quoted in full in',
|
|
28
30
|
'redacted': 'a redacted version of',
|
|
@@ -32,7 +34,10 @@ DUPE_TYPE_STRS: dict[DuplicateType, str] = {
|
|
|
32
34
|
FIELD_SORT_KEY = {
|
|
33
35
|
'id': 'a',
|
|
34
36
|
'author': 'aa',
|
|
35
|
-
'
|
|
37
|
+
'comment': 'zz',
|
|
38
|
+
'duplicate_ids': 'dup',
|
|
39
|
+
'duplicate_of_id': 'dupe',
|
|
40
|
+
'recipients': 'aaa',
|
|
36
41
|
}
|
|
37
42
|
|
|
38
43
|
FINANCIAL_REPORTS_AUTHORS = [
|
|
@@ -49,7 +54,6 @@ FINANCIAL_REPORTS_AUTHORS = [
|
|
|
49
54
|
# Fields like timestamp and author are better added from the Document object
|
|
50
55
|
NON_METADATA_FIELDS = [
|
|
51
56
|
'actual_text',
|
|
52
|
-
'date',
|
|
53
57
|
'id',
|
|
54
58
|
'is_synthetic',
|
|
55
59
|
]
|
|
@@ -64,18 +68,18 @@ class DocCfg:
|
|
|
64
68
|
id (str): ID of file
|
|
65
69
|
author (Name): Author of the document (if any)
|
|
66
70
|
category (str | None): Type of file
|
|
67
|
-
date (str | None):
|
|
71
|
+
date (str | None): Parsed to a datetime by timestamp() if it exists
|
|
68
72
|
dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
|
|
69
73
|
duplicate_ids (list[str]): IDs of *other* documents that are dupes of this document
|
|
70
74
|
duplicate_of_id (str | None): If this is a dupe the ID of the duplicated file. This file will be suppressed
|
|
71
75
|
is_interesting (bool | None): Override other considerations and always consider this file interesting (or not)
|
|
72
|
-
timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
|
|
73
76
|
is_synthetic (bool): True if this config was generated by the duplicate_cfgs() method
|
|
74
77
|
"""
|
|
75
78
|
id: str
|
|
76
79
|
attached_to_email_id: str | None = None
|
|
77
80
|
author: Name = None
|
|
78
81
|
category: str | None = None
|
|
82
|
+
comment: str = ''
|
|
79
83
|
date: str | None = None
|
|
80
84
|
description: str | None = None
|
|
81
85
|
dupe_type: DuplicateType | None = None
|
|
@@ -84,12 +88,8 @@ class DocCfg:
|
|
|
84
88
|
is_attribution_uncertain: bool = False
|
|
85
89
|
is_interesting: bool | None = None
|
|
86
90
|
is_synthetic: bool = False
|
|
87
|
-
timestamp: datetime | None = None
|
|
88
91
|
|
|
89
92
|
def __post_init__(self):
|
|
90
|
-
if self.date:
|
|
91
|
-
self.timestamp = parse(self.date)
|
|
92
|
-
|
|
93
93
|
if self.duplicate_of_id or self.duplicate_ids:
|
|
94
94
|
self.dupe_type = self.dupe_type or SAME
|
|
95
95
|
|
|
@@ -142,7 +142,16 @@ class DocCfg:
|
|
|
142
142
|
yield dupe_cfg
|
|
143
143
|
|
|
144
144
|
def metadata(self) -> Metadata:
|
|
145
|
-
|
|
145
|
+
metadata = {k: v for k, v in asdict(self).items() if k not in NON_METADATA_FIELDS and v}
|
|
146
|
+
|
|
147
|
+
if self.is_interesting is False:
|
|
148
|
+
metadata['is_interesting'] = False
|
|
149
|
+
|
|
150
|
+
return metadata
|
|
151
|
+
|
|
152
|
+
def timestamp(self) -> datetime | None:
|
|
153
|
+
if self.date:
|
|
154
|
+
return parse(self.date)
|
|
146
155
|
|
|
147
156
|
def _props_strs(self) -> list[str]:
|
|
148
157
|
props = []
|
|
@@ -151,20 +160,16 @@ class DocCfg:
|
|
|
151
160
|
for _field in sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name)):
|
|
152
161
|
value = getattr(self, _field.name)
|
|
153
162
|
|
|
154
|
-
if
|
|
163
|
+
if _field.name in ['actual_text', 'is_fwded_article', 'is_interesting']: # fields can be False or None or ''
|
|
164
|
+
if value is not None:
|
|
165
|
+
add_prop(_field, str(value))
|
|
166
|
+
elif not value or _field.name == 'dupe_type' and value == 'same':
|
|
155
167
|
continue
|
|
156
168
|
elif _field.name == AUTHOR:
|
|
157
|
-
add_prop(_field, constantize_name(str(value)) if CONSTANTIZE_NAMES else f"'{value}'")
|
|
158
|
-
elif _field.name == '
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
|
|
162
|
-
add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
|
|
163
|
-
elif _field.name == 'timestamp' and self.date is not None:
|
|
164
|
-
continue # Don't print both timestamp and date
|
|
165
|
-
elif isinstance(value, datetime):
|
|
166
|
-
value_str = remove_zero_time(value)
|
|
167
|
-
add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
|
|
169
|
+
add_prop(_field, constantize_name(str(value)) if args.constantize else f"'{value}'")
|
|
170
|
+
elif _field.name == 'recipients':
|
|
171
|
+
recipients_str = str([constantize_name(r) if (args.constantize and r) else r for r in value])
|
|
172
|
+
add_prop(_field, recipients_str.replace("'", '') if args.constantize else recipients_str)
|
|
168
173
|
elif isinstance(value, str):
|
|
169
174
|
if "'" in value:
|
|
170
175
|
value = '"' + value.replace('"', r'\"') + '"'
|
|
@@ -182,14 +187,14 @@ class DocCfg:
|
|
|
182
187
|
type_str = f"{type(self).__name__}("
|
|
183
188
|
single_line_repr = type_str + ', '.join(props) + f')'
|
|
184
189
|
|
|
185
|
-
if len(single_line_repr) < MAX_LINE_LENGTH:
|
|
190
|
+
if len(single_line_repr) < MAX_LINE_LENGTH or (self.comment and getattr(self, 'is_fwded_article')):
|
|
186
191
|
repr_str = single_line_repr
|
|
187
192
|
else:
|
|
188
193
|
repr_str = f"{type_str}{INDENT_NEWLINE}" + INDENTED_JOIN.join(props)
|
|
189
194
|
repr_str += ',' if props else ''
|
|
190
195
|
repr_str += '\n)'
|
|
191
196
|
|
|
192
|
-
if CONSTANTIZE_NAMES:
|
|
197
|
+
if args.constantize:
|
|
193
198
|
repr_str = INDENT + INDENT_NEWLINE.join(repr_str.split('\n'))
|
|
194
199
|
return repr_str.replace(',,', ',').replace(',),', '),').replace(',),', '),')
|
|
195
200
|
else:
|
|
@@ -224,9 +229,10 @@ class EmailCfg(CommunicationCfg):
|
|
|
224
229
|
"""
|
|
225
230
|
actual_text: str | None = None
|
|
226
231
|
fwded_text_after: str | None = None
|
|
227
|
-
is_fwded_article: bool =
|
|
232
|
+
is_fwded_article: bool | None = None
|
|
228
233
|
recipients: list[Name] = field(default_factory=list)
|
|
229
234
|
subject: str | None = None
|
|
235
|
+
truncate_to: int | None = None
|
|
230
236
|
|
|
231
237
|
# This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
|
|
232
238
|
def __repr__(self) -> str:
|
epstein_files/util/env.py
CHANGED
|
@@ -38,7 +38,7 @@ output.add_argument('--all-emails', '-ae', action='store_true', help='all the em
|
|
|
38
38
|
output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
|
|
39
39
|
parser.add_argument('--build', '-b', nargs="?", default=None, const=DEFAULT_FILE, help='write output to HTML file')
|
|
40
40
|
output.add_argument('--email-timeline', action='store_true', help='print a table of all emails in chronological order')
|
|
41
|
-
output.add_argument('--emailers-info', action='store_true', help='write a .png of the eeailers info table')
|
|
41
|
+
output.add_argument('--emailers-info', '-ei', action='store_true', help='write a .png of the eeailers info table')
|
|
42
42
|
output.add_argument('--json-files', action='store_true', help='pretty print all the raw JSON data files in the collection and exit')
|
|
43
43
|
output.add_argument('--json-metadata', action='store_true', help='dump JSON metadata for all files and exit')
|
|
44
44
|
output.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
|
|
@@ -49,43 +49,51 @@ output.add_argument('--suppress-output', action='store_true', help='no output to
|
|
|
49
49
|
output.add_argument('--uninteresting', action='store_true', help='only output uninteresting other files')
|
|
50
50
|
output.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use (in characters)')
|
|
51
51
|
|
|
52
|
-
scripts = parser.add_argument_group('SCRIPTS', 'Options used by
|
|
52
|
+
scripts = parser.add_argument_group('SCRIPTS', 'Options used by epstein_grep, epstein_show, and epstein_diff.')
|
|
53
53
|
scripts.add_argument('positional_args', nargs='*', help='strings to searchs for, file IDs to show or diff, etc.')
|
|
54
|
+
scripts.add_argument('--email-body', action='store_true', help='epstein_grep but only for the body of the email')
|
|
55
|
+
scripts.add_argument('--min-line-length', type=int, help='epstein_grep minimum length of a matched line')
|
|
54
56
|
scripts.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (used by epstein_show)')
|
|
55
57
|
scripts.add_argument('--whole-file', '-wf', action='store_true', help='print whole files')
|
|
56
58
|
|
|
57
59
|
debug = parser.add_argument_group('DEBUG')
|
|
58
60
|
debug.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
|
|
61
|
+
debug.add_argument('--constantize', action='store_true', help='constantize names when printing repr() of objects')
|
|
59
62
|
debug.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
|
|
60
63
|
debug.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
|
|
61
64
|
debug.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats about the files')
|
|
62
65
|
debug.add_argument('--skip-other-files', '-sof', action='store_true', help='skip parsing non email/text files')
|
|
63
66
|
debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
|
|
67
|
+
debug.add_argument('--truncate', '-t', type=int, help='truncate emails to this many characters')
|
|
68
|
+
debug.add_argument('--write-txt', '-wt', action='store_true', help='write a plain text version of output')
|
|
64
69
|
|
|
65
70
|
|
|
66
71
|
# Parse args
|
|
67
72
|
args = parser.parse_args()
|
|
68
73
|
is_html_script = parser.prog in HTML_SCRIPTS
|
|
69
74
|
|
|
70
|
-
args.build = args.build
|
|
71
75
|
args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
|
|
72
76
|
args.names = [None if n == 'None' else n.strip() for n in (args.names or [])]
|
|
73
77
|
args.output_emails = args.output_emails or args.all_emails
|
|
74
78
|
args.output_other = args.output_other or args.all_other_files or args.uninteresting
|
|
75
79
|
args.overwrite_pickle = args.overwrite_pickle or (is_env_var_set('OVERWRITE_PICKLE') and not is_env_var_set('PICKLED'))
|
|
76
80
|
args.width = args.width if is_html_script else None
|
|
81
|
+
args.any_output_selected = any([is_output_arg(arg) and val for arg, val in vars(args).items()])
|
|
82
|
+
|
|
83
|
+
if not (args.any_output_selected or args.email_timeline or args.emailers_info):
|
|
84
|
+
if is_html_script:
|
|
85
|
+
logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
|
|
86
|
+
|
|
87
|
+
args.output_emails = args.output_other = args.output_texts = True
|
|
77
88
|
|
|
78
89
|
if is_html_script:
|
|
79
90
|
if args.positional_args:
|
|
80
91
|
exit_with_error(f"{parser.prog} does not accept positional arguments (receeived {args.positional_args})")
|
|
81
92
|
|
|
82
93
|
if parser.prog == EPSTEIN_GENERATE:
|
|
83
|
-
if
|
|
94
|
+
if args.any_output_selected:
|
|
84
95
|
if args.email_timeline:
|
|
85
96
|
exit_with_error(f"--email-timeline option is mutually exlusive with other output options")
|
|
86
|
-
elif not args.email_timeline and not args.emailers_info:
|
|
87
|
-
logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
|
|
88
|
-
args.output_texts = args.output_emails = args.output_other = True
|
|
89
97
|
|
|
90
98
|
if args.build == DEFAULT_FILE:
|
|
91
99
|
if args.all_emails:
|
|
@@ -94,13 +102,15 @@ if is_html_script:
|
|
|
94
102
|
args.build = CHRONOLOGICAL_EMAILS_PATH
|
|
95
103
|
else:
|
|
96
104
|
args.build = TEXT_MSGS_HTML_PATH
|
|
97
|
-
elif parser.prog.startswith('epstein_') and not args.positional_args:
|
|
105
|
+
elif parser.prog.startswith('epstein_') and not args.positional_args and not args.names:
|
|
98
106
|
exit_with_error(f"{parser.prog} requires positional arguments but got none!")
|
|
99
107
|
|
|
100
108
|
if args.names:
|
|
101
109
|
logger.warning(f"Output restricted to {args.names}")
|
|
102
110
|
args.output_other = False
|
|
103
111
|
|
|
112
|
+
if args.truncate and args.whole_file:
|
|
113
|
+
exit_with_error(f"--whole-file and --truncate are incompatible")
|
|
104
114
|
|
|
105
115
|
# Log level args
|
|
106
116
|
if args.deep_debug:
|
|
@@ -38,6 +38,8 @@ def extract_file_id(filename_or_id: int | str | Path) -> str:
|
|
|
38
38
|
|
|
39
39
|
if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
|
|
40
40
|
return id_str(filename_or_id)
|
|
41
|
+
elif isinstance(filename_or_id, str) and len(filename_or_id) == 8:
|
|
42
|
+
return f"{HOUSE_OVERSIGHT_PREFIX}{filename_or_id}"
|
|
41
43
|
|
|
42
44
|
file_match = FILE_ID_REGEX.match(str(filename_or_id).upper())
|
|
43
45
|
|