epstein-files 1.2.5__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- epstein_files/__init__.py +32 -13
- epstein_files/documents/document.py +8 -1
- epstein_files/documents/email.py +179 -97
- epstein_files/documents/emails/email_header.py +17 -8
- epstein_files/documents/other_file.py +8 -6
- epstein_files/epstein_files.py +16 -1
- epstein_files/person.py +40 -15
- epstein_files/util/constant/names.py +10 -6
- epstein_files/util/constant/strings.py +2 -1
- epstein_files/util/constants.py +463 -225
- epstein_files/util/doc_cfg.py +33 -27
- epstein_files/util/env.py +10 -3
- epstein_files/util/file_helper.py +2 -0
- epstein_files/util/highlighted_group.py +66 -23
- epstein_files/util/output.py +17 -31
- epstein_files/util/rich.py +2 -1
- epstein_files/util/word_count.py +1 -1
- {epstein_files-1.2.5.dist-info → epstein_files-1.4.1.dist-info}/METADATA +3 -3
- epstein_files-1.4.1.dist-info/RECORD +34 -0
- {epstein_files-1.2.5.dist-info → epstein_files-1.4.1.dist-info}/entry_points.txt +1 -1
- epstein_files-1.2.5.dist-info/RECORD +0 -34
- {epstein_files-1.2.5.dist-info → epstein_files-1.4.1.dist-info}/LICENSE +0 -0
- {epstein_files-1.2.5.dist-info → epstein_files-1.4.1.dist-info}/WHEEL +0 -0
epstein_files/util/doc_cfg.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import json
|
|
1
2
|
import re
|
|
2
3
|
from copy import deepcopy
|
|
3
4
|
from dataclasses import Field, asdict, dataclass, field, fields
|
|
@@ -9,20 +10,21 @@ from dateutil.parser import parse
|
|
|
9
10
|
from epstein_files.util.constant.names import *
|
|
10
11
|
from epstein_files.util.constant.strings import *
|
|
11
12
|
from epstein_files.util.data import remove_zero_time, without_falsey
|
|
13
|
+
from epstein_files.util.env import args
|
|
12
14
|
|
|
13
|
-
DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
|
|
15
|
+
DuplicateType = Literal['bounced', 'earlier', 'quoted', 'redacted', 'same']
|
|
14
16
|
Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]
|
|
15
17
|
|
|
16
18
|
# Misc
|
|
17
|
-
CONSTANTIZE_NAMES = False # A flag set to True that causes repr() of these classes to return strings of usable code
|
|
18
19
|
INDENT = ' '
|
|
19
20
|
INDENT_NEWLINE = f'\n{INDENT}'
|
|
20
21
|
INDENTED_JOIN = f',{INDENT_NEWLINE}'
|
|
21
|
-
MAX_LINE_LENGTH =
|
|
22
|
+
MAX_LINE_LENGTH = 135
|
|
22
23
|
REPUTATION_MGMT = f'{REPUTATION} management'
|
|
23
24
|
SAME = 'same'
|
|
24
25
|
|
|
25
26
|
DUPE_TYPE_STRS: dict[DuplicateType, str] = {
|
|
27
|
+
'bounced': 'a bounced copy of',
|
|
26
28
|
'earlier': 'an earlier draft of',
|
|
27
29
|
'quoted': 'quoted in full in',
|
|
28
30
|
'redacted': 'a redacted version of',
|
|
@@ -32,7 +34,10 @@ DUPE_TYPE_STRS: dict[DuplicateType, str] = {
|
|
|
32
34
|
FIELD_SORT_KEY = {
|
|
33
35
|
'id': 'a',
|
|
34
36
|
'author': 'aa',
|
|
35
|
-
'
|
|
37
|
+
'comment': 'zz',
|
|
38
|
+
'duplicate_ids': 'dup',
|
|
39
|
+
'duplicate_of_id': 'dupe',
|
|
40
|
+
'recipients': 'aaa',
|
|
36
41
|
}
|
|
37
42
|
|
|
38
43
|
FINANCIAL_REPORTS_AUTHORS = [
|
|
@@ -49,7 +54,6 @@ FINANCIAL_REPORTS_AUTHORS = [
|
|
|
49
54
|
# Fields like timestamp and author are better added from the Document object
|
|
50
55
|
NON_METADATA_FIELDS = [
|
|
51
56
|
'actual_text',
|
|
52
|
-
'date',
|
|
53
57
|
'id',
|
|
54
58
|
'is_synthetic',
|
|
55
59
|
]
|
|
@@ -64,18 +68,18 @@ class DocCfg:
|
|
|
64
68
|
id (str): ID of file
|
|
65
69
|
author (Name): Author of the document (if any)
|
|
66
70
|
category (str | None): Type of file
|
|
67
|
-
date (str | None):
|
|
71
|
+
date (str | None): Parsed to a datetime by timestamp() if it exists
|
|
68
72
|
dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
|
|
69
73
|
duplicate_ids (list[str]): IDs of *other* documents that are dupes of this document
|
|
70
74
|
duplicate_of_id (str | None): If this is a dupe the ID of the duplicated file. This file will be suppressed
|
|
71
75
|
is_interesting (bool | None): Override other considerations and always consider this file interesting (or not)
|
|
72
|
-
timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
|
|
73
76
|
is_synthetic (bool): True if this config was generated by the duplicate_cfgs() method
|
|
74
77
|
"""
|
|
75
78
|
id: str
|
|
76
79
|
attached_to_email_id: str | None = None
|
|
77
80
|
author: Name = None
|
|
78
81
|
category: str | None = None
|
|
82
|
+
comment: str = ''
|
|
79
83
|
date: str | None = None
|
|
80
84
|
description: str | None = None
|
|
81
85
|
dupe_type: DuplicateType | None = None
|
|
@@ -84,12 +88,8 @@ class DocCfg:
|
|
|
84
88
|
is_attribution_uncertain: bool = False
|
|
85
89
|
is_interesting: bool | None = None
|
|
86
90
|
is_synthetic: bool = False
|
|
87
|
-
timestamp: datetime | None = None
|
|
88
91
|
|
|
89
92
|
def __post_init__(self):
|
|
90
|
-
if self.date:
|
|
91
|
-
self.timestamp = parse(self.date)
|
|
92
|
-
|
|
93
93
|
if self.duplicate_of_id or self.duplicate_ids:
|
|
94
94
|
self.dupe_type = self.dupe_type or SAME
|
|
95
95
|
|
|
@@ -142,7 +142,16 @@ class DocCfg:
|
|
|
142
142
|
yield dupe_cfg
|
|
143
143
|
|
|
144
144
|
def metadata(self) -> Metadata:
|
|
145
|
-
|
|
145
|
+
metadata = {k: v for k, v in asdict(self).items() if k not in NON_METADATA_FIELDS and v}
|
|
146
|
+
|
|
147
|
+
if self.is_interesting is False:
|
|
148
|
+
metadata['is_interesting'] = False
|
|
149
|
+
|
|
150
|
+
return metadata
|
|
151
|
+
|
|
152
|
+
def timestamp(self) -> datetime | None:
|
|
153
|
+
if self.date:
|
|
154
|
+
return parse(self.date)
|
|
146
155
|
|
|
147
156
|
def _props_strs(self) -> list[str]:
|
|
148
157
|
props = []
|
|
@@ -151,20 +160,16 @@ class DocCfg:
|
|
|
151
160
|
for _field in sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name)):
|
|
152
161
|
value = getattr(self, _field.name)
|
|
153
162
|
|
|
154
|
-
if
|
|
163
|
+
if _field.name in ['actual_text', 'is_fwded_article', 'is_interesting']: # fields can be False or None or ''
|
|
164
|
+
if value is not None:
|
|
165
|
+
add_prop(_field, str(value))
|
|
166
|
+
elif not value or _field.name == 'dupe_type' and value == 'same':
|
|
155
167
|
continue
|
|
156
168
|
elif _field.name == AUTHOR:
|
|
157
|
-
add_prop(_field, constantize_name(str(value)) if
|
|
158
|
-
elif _field.name == '
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
|
|
162
|
-
add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
|
|
163
|
-
elif _field.name == 'timestamp' and self.date is not None:
|
|
164
|
-
continue # Don't print both timestamp and date
|
|
165
|
-
elif isinstance(value, datetime):
|
|
166
|
-
value_str = remove_zero_time(value)
|
|
167
|
-
add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
|
|
169
|
+
add_prop(_field, constantize_name(str(value)) if args.constantize else f"'{value}'")
|
|
170
|
+
elif _field.name == 'recipients':
|
|
171
|
+
recipients_str = str([constantize_name(r) if (args.constantize and r) else r for r in value])
|
|
172
|
+
add_prop(_field, recipients_str.replace("'", '') if args.constantize else recipients_str)
|
|
168
173
|
elif isinstance(value, str):
|
|
169
174
|
if "'" in value:
|
|
170
175
|
value = '"' + value.replace('"', r'\"') + '"'
|
|
@@ -182,14 +187,14 @@ class DocCfg:
|
|
|
182
187
|
type_str = f"{type(self).__name__}("
|
|
183
188
|
single_line_repr = type_str + ', '.join(props) + f')'
|
|
184
189
|
|
|
185
|
-
if len(single_line_repr) < MAX_LINE_LENGTH:
|
|
190
|
+
if len(single_line_repr) < MAX_LINE_LENGTH or (self.comment and getattr(self, 'is_fwded_article')):
|
|
186
191
|
repr_str = single_line_repr
|
|
187
192
|
else:
|
|
188
193
|
repr_str = f"{type_str}{INDENT_NEWLINE}" + INDENTED_JOIN.join(props)
|
|
189
194
|
repr_str += ',' if props else ''
|
|
190
195
|
repr_str += '\n)'
|
|
191
196
|
|
|
192
|
-
if
|
|
197
|
+
if args.constantize:
|
|
193
198
|
repr_str = INDENT + INDENT_NEWLINE.join(repr_str.split('\n'))
|
|
194
199
|
return repr_str.replace(',,', ',').replace(',),', '),').replace(',),', '),')
|
|
195
200
|
else:
|
|
@@ -224,9 +229,10 @@ class EmailCfg(CommunicationCfg):
|
|
|
224
229
|
"""
|
|
225
230
|
actual_text: str | None = None
|
|
226
231
|
fwded_text_after: str | None = None
|
|
227
|
-
is_fwded_article: bool =
|
|
232
|
+
is_fwded_article: bool | None = None
|
|
228
233
|
recipients: list[Name] = field(default_factory=list)
|
|
229
234
|
subject: str | None = None
|
|
235
|
+
truncate_to: int | None = None
|
|
230
236
|
|
|
231
237
|
# This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
|
|
232
238
|
def __repr__(self) -> str:
|
epstein_files/util/env.py
CHANGED
|
@@ -49,13 +49,16 @@ output.add_argument('--suppress-output', action='store_true', help='no output to
|
|
|
49
49
|
output.add_argument('--uninteresting', action='store_true', help='only output uninteresting other files')
|
|
50
50
|
output.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use (in characters)')
|
|
51
51
|
|
|
52
|
-
scripts = parser.add_argument_group('SCRIPTS', 'Options used by
|
|
52
|
+
scripts = parser.add_argument_group('SCRIPTS', 'Options used by epstein_grep, epstein_show, and epstein_diff.')
|
|
53
53
|
scripts.add_argument('positional_args', nargs='*', help='strings to searchs for, file IDs to show or diff, etc.')
|
|
54
|
+
scripts.add_argument('--email-body', action='store_true', help='epstein_grep but only for the body of the email')
|
|
55
|
+
scripts.add_argument('--min-line-length', type=int, help='epstein_grep minimum length of a matched line')
|
|
54
56
|
scripts.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (used by epstein_show)')
|
|
55
57
|
scripts.add_argument('--whole-file', '-wf', action='store_true', help='print whole files')
|
|
56
58
|
|
|
57
59
|
debug = parser.add_argument_group('DEBUG')
|
|
58
60
|
debug.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
|
|
61
|
+
debug.add_argument('--constantize', action='store_true', help='constantize names when printing repr() of objects')
|
|
59
62
|
debug.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
|
|
60
63
|
debug.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
|
|
61
64
|
debug.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats about the files')
|
|
@@ -78,7 +81,9 @@ args.width = args.width if is_html_script else None
|
|
|
78
81
|
args.any_output_selected = any([is_output_arg(arg) and val for arg, val in vars(args).items()])
|
|
79
82
|
|
|
80
83
|
if not (args.any_output_selected or args.email_timeline or args.emailers_info):
|
|
81
|
-
|
|
84
|
+
if is_html_script:
|
|
85
|
+
logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
|
|
86
|
+
|
|
82
87
|
args.output_emails = args.output_other = args.output_texts = True
|
|
83
88
|
|
|
84
89
|
if is_html_script:
|
|
@@ -97,13 +102,15 @@ if is_html_script:
|
|
|
97
102
|
args.build = CHRONOLOGICAL_EMAILS_PATH
|
|
98
103
|
else:
|
|
99
104
|
args.build = TEXT_MSGS_HTML_PATH
|
|
100
|
-
elif parser.prog.startswith('epstein_') and not args.positional_args:
|
|
105
|
+
elif parser.prog.startswith('epstein_') and not args.positional_args and not args.names:
|
|
101
106
|
exit_with_error(f"{parser.prog} requires positional arguments but got none!")
|
|
102
107
|
|
|
103
108
|
if args.names:
|
|
104
109
|
logger.warning(f"Output restricted to {args.names}")
|
|
105
110
|
args.output_other = False
|
|
106
111
|
|
|
112
|
+
if args.truncate and args.whole_file:
|
|
113
|
+
exit_with_error(f"--whole-file and --truncate are incompatible")
|
|
107
114
|
|
|
108
115
|
# Log level args
|
|
109
116
|
if args.deep_debug:
|
|
epstein_files/util/file_helper.py
CHANGED
|
@@ -38,6 +38,8 @@ def extract_file_id(filename_or_id: int | str | Path) -> str:
|
|
|
38
38
|
|
|
39
39
|
if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
|
|
40
40
|
return id_str(filename_or_id)
|
|
41
|
+
elif isinstance(filename_or_id, str) and len(filename_or_id) == 8:
|
|
42
|
+
return f"{HOUSE_OVERSIGHT_PREFIX}{filename_or_id}"
|
|
41
43
|
|
|
42
44
|
file_match = FILE_ID_REGEX.match(str(filename_or_id).upper())
|
|
43
45
|
|
|
epstein_files/util/highlighted_group.py
CHANGED
|
@@ -31,6 +31,8 @@ REGEX_STYLE_PREFIX = 'regex'
|
|
|
31
31
|
SIMPLE_NAME_REGEX = re.compile(r"^[-\w, ]+$", re.IGNORECASE)
|
|
32
32
|
TECH_BRO = 'tech bro'
|
|
33
33
|
|
|
34
|
+
VICTIM_COLOR = 'orchid1'
|
|
35
|
+
|
|
34
36
|
CATEGORY_STYLE_MAPPING = {
|
|
35
37
|
ARTICLE: JOURNALIST,
|
|
36
38
|
BOOK: JOURNALIST,
|
|
@@ -160,7 +162,7 @@ class HighlightedNames(HighlightedText):
|
|
|
160
162
|
|
|
161
163
|
pattern = '|'.join(name_patterns)
|
|
162
164
|
|
|
163
|
-
if args.deep_debug:
|
|
165
|
+
if args.deep_debug and args.colors_only:
|
|
164
166
|
debug_console.print(Text('').append(f"{name:25s}", style=self.style).append(f" '{pattern}'", style='dim'))
|
|
165
167
|
|
|
166
168
|
return pattern
|
|
@@ -215,7 +217,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
215
217
|
ManualHighlight(
|
|
216
218
|
label='email_subject',
|
|
217
219
|
style='light_yellow3',
|
|
218
|
-
pattern=r"^(> )?(Classification|Flag|Subject): (?P<email_subject>.*)",
|
|
220
|
+
pattern=r"^(> )?(Classification|Flag|Subject|Sujet ?): (?P<email_subject>.*)",
|
|
219
221
|
),
|
|
220
222
|
HighlightedNames(
|
|
221
223
|
label=ACADEMIA,
|
|
@@ -245,12 +247,13 @@ HIGHLIGHTED_NAMES = [
|
|
|
245
247
|
r"Bard\s+((Early )?College|High School|Schools)",
|
|
246
248
|
r"Brotherton",
|
|
247
249
|
r"Carl\s*Sagan",
|
|
248
|
-
r"Columbia",
|
|
250
|
+
r"Columbia(\s*(Business\s*School|University))?",
|
|
249
251
|
r"Dan(iel|ny) Kahneman",
|
|
250
252
|
r"(Francis\s*)?Crick",
|
|
251
253
|
r"J(ames|im)\s*Watson",
|
|
252
254
|
r"(Lord\s*)?Martin\s*Rees",
|
|
253
255
|
r"Massachusetts\s*Institute\s*of\s*Technology",
|
|
256
|
+
r"Mayo\s*Clinic",
|
|
254
257
|
r"Media\s*Lab",
|
|
255
258
|
r"(Marvin\s*)?Minsky",
|
|
256
259
|
r"MIT(\s*Media\s*Lab)?",
|
|
@@ -260,10 +263,11 @@ HIGHLIGHTED_NAMES = [
|
|
|
260
263
|
r"Princeton(\s*University)?",
|
|
261
264
|
r"Regeneron",
|
|
262
265
|
r"(Richard\s*)?Dawkins",
|
|
266
|
+
r"Rockefeller\s*University",
|
|
263
267
|
r"(Sandy\s*)?Pentland", # Media Lab
|
|
264
268
|
r"Sanofi",
|
|
265
269
|
r"Stanford(\s*University)?(\s*Hospital)?",
|
|
266
|
-
r"(
|
|
270
|
+
r"(Ste(ph|v)en\s*)?Hawking",
|
|
267
271
|
r"(Steven?\s*)?Pinker",
|
|
268
272
|
r"Texas\s*A&M",
|
|
269
273
|
r"Tulane",
|
|
@@ -319,6 +323,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
319
323
|
patterns=[
|
|
320
324
|
r"(Art )?Spiegelman",
|
|
321
325
|
r"Artspace",
|
|
326
|
+
r"Ayn\s*Rand",
|
|
322
327
|
r"Bobby slayton",
|
|
323
328
|
r"bono\s*mick",
|
|
324
329
|
r"Errol(\s*Morris)?",
|
|
@@ -413,9 +418,10 @@ HIGHLIGHTED_NAMES = [
|
|
|
413
418
|
'Philip Kafka': 'president of Prince Concepts (and son of Terry Kafka?)',
|
|
414
419
|
ROBERT_LAWRENCE_KUHN: 'investment banker, China expert',
|
|
415
420
|
TERRY_KAFKA: 'CEO of Impact Outdoor (highway billboards)',
|
|
416
|
-
TOM_PRITZKER: '
|
|
421
|
+
TOM_PRITZKER: 'chairman of The Pritzker Organization and Hyatt Hotels',
|
|
417
422
|
},
|
|
418
423
|
patterns=[
|
|
424
|
+
r"Arthur Klein",
|
|
419
425
|
r"((Bill|David)\s*)?Koch(\s*(Bro(s|thers)|Industries))?",
|
|
420
426
|
r"Gruterite",
|
|
421
427
|
r"((John|Patricia)\s*)?Kluge",
|
|
@@ -423,6 +429,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
423
429
|
r"(Mi(chael|ke)\s*)?Ovitz",
|
|
424
430
|
r"(Steve\s+)?Wynn",
|
|
425
431
|
r"(Les(lie)?\s+)?Wexner",
|
|
432
|
+
r"Michael\s*Klein",
|
|
426
433
|
r"New Leaf Ventures",
|
|
427
434
|
r"Park Partners",
|
|
428
435
|
r"SALSS",
|
|
@@ -500,6 +507,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
500
507
|
r"Dem(ocrat(ic)?)?",
|
|
501
508
|
r"(Diana\s*)?DeGette",
|
|
502
509
|
r"DNC",
|
|
510
|
+
r"(Ed(ward)?\s*)?Mezvinsky",
|
|
503
511
|
r"Elena\s*Kagan",
|
|
504
512
|
r"(Eliott?\s*)?Spitzer(, Eliot)?",
|
|
505
513
|
r"Eric Holder",
|
|
@@ -550,7 +558,6 @@ HIGHLIGHTED_NAMES = [
|
|
|
550
558
|
MERWIN_DELA_CRUZ: None, # HOUSE_OVERSIGHT_032652 Groff says "Jojo and Merwin both requested off Nov. 25 and 26"
|
|
551
559
|
NADIA_MARCINKO: "Epstein's pilot",
|
|
552
560
|
'Sean J. Lancaster': 'airplane reseller',
|
|
553
|
-
ZUBAIR_KHAN: 'Tranchulas cybersecurity, InsightsPod founder, Islamabad / Dubai',
|
|
554
561
|
},
|
|
555
562
|
patterns=[
|
|
556
563
|
r"Adriana\s*Ross",
|
|
@@ -566,7 +573,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
566
573
|
MARK_EPSTEIN: 'brother of Jeffrey',
|
|
567
574
|
},
|
|
568
575
|
patterns=[
|
|
569
|
-
r"JEGE(\s*Inc)",
|
|
576
|
+
r"JEGE(\s*Inc)?",
|
|
570
577
|
r"LSJ",
|
|
571
578
|
],
|
|
572
579
|
),
|
|
@@ -640,6 +647,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
640
647
|
r"AfD",
|
|
641
648
|
r"(Angela )?Merk(el|le)",
|
|
642
649
|
r"Austria",
|
|
650
|
+
r"Belgi(an|um)",
|
|
643
651
|
r"(Benjamin\s*)?Harnwell",
|
|
644
652
|
r"Berlin",
|
|
645
653
|
r"Borge",
|
|
@@ -649,6 +657,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
649
657
|
r"Brussels",
|
|
650
658
|
r"Cannes",
|
|
651
659
|
r"Cypr(iot|us)",
|
|
660
|
+
r"David\s*Cameron",
|
|
652
661
|
r"Davos",
|
|
653
662
|
r"ECB",
|
|
654
663
|
r"England",
|
|
@@ -663,6 +672,8 @@ HIGHLIGHTED_NAMES = [
|
|
|
663
672
|
r"Ital(ian|y)",
|
|
664
673
|
r"Jacques",
|
|
665
674
|
r"Kiev",
|
|
675
|
+
r"Latvian?",
|
|
676
|
+
r"Lithuanian?",
|
|
666
677
|
r"Le\s*Pen",
|
|
667
678
|
r"London",
|
|
668
679
|
r"Macron",
|
|
@@ -672,11 +683,13 @@ HIGHLIGHTED_NAMES = [
|
|
|
672
683
|
r"NATO",
|
|
673
684
|
r"(Nicholas\s*)?Sarkozy",
|
|
674
685
|
r"Nigel(\s*Farage)?",
|
|
686
|
+
r"(Northern\s*)?Ireland",
|
|
675
687
|
r"Norw(ay|egian)",
|
|
676
688
|
r"Oslo",
|
|
677
689
|
r"Paris",
|
|
678
690
|
r"Polish",
|
|
679
691
|
r"pope",
|
|
692
|
+
r"Portugal",
|
|
680
693
|
r"Scotland",
|
|
681
694
|
r"(Sebastian )?Kurz",
|
|
682
695
|
r"Stockholm",
|
|
@@ -685,6 +698,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
685
698
|
r"Swed(en|ish)(?![-\s]+American Life Scienc)",
|
|
686
699
|
r"Swi(ss|tzerland)",
|
|
687
700
|
r"(Tony\s)?Blair",
|
|
701
|
+
r"United\s*Kingdom",
|
|
688
702
|
r"U\.K\.",
|
|
689
703
|
r"Ukrain(e|ian)",
|
|
690
704
|
r"Venice",
|
|
@@ -748,6 +762,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
748
762
|
r"(Janet\s*)?Yellen",
|
|
749
763
|
r"(Jerome\s*)?Powell(?! M\. Cabot)",
|
|
750
764
|
r"(Jimmy\s*)?Cayne",
|
|
765
|
+
r"Joon\s*Yun",
|
|
751
766
|
r"JPMC?",
|
|
752
767
|
r"j\.?p\.?\s*morgan(\.?com|\s*Chase)?",
|
|
753
768
|
r"Madoff",
|
|
@@ -760,6 +775,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
760
775
|
r"(Peter L. )?Scher",
|
|
761
776
|
r"(Ray\s*)?Dalio",
|
|
762
777
|
r"(Richard\s*)?LeFrak",
|
|
778
|
+
r"Rockefeller(?! University)(\s*Foundation)?",
|
|
763
779
|
r"(Ste(phen|ve)\s*)?Schwart?z?man",
|
|
764
780
|
r"Serageldin",
|
|
765
781
|
r"UBS",
|
|
@@ -823,6 +839,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
823
839
|
r"FTC",
|
|
824
840
|
r"(General\s*)?P(a|e)traeus",
|
|
825
841
|
r"Geoff\s*Ling",
|
|
842
|
+
r"Homeland\s*Security",
|
|
826
843
|
r"IRS",
|
|
827
844
|
r"(James\s*)?Comey",
|
|
828
845
|
r"(Jennifer\s*Shasky\s*)?Calvery",
|
|
@@ -952,7 +969,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
952
969
|
'Alain Forget': 'author of "How To Get Out Of This World ALIVE"',
|
|
953
970
|
'Alex Yablon': 'New York Magazine fact checker (?)',
|
|
954
971
|
EDWARD_JAY_EPSTEIN: 'no relation, wrote books about spies',
|
|
955
|
-
HENRY_HOLT: f"{MICHAEL_WOLFF}'s book publisher",
|
|
972
|
+
HENRY_HOLT: f"{MICHAEL_WOLFF}'s book publisher (company not a person)",
|
|
956
973
|
JAMES_HILL: 'ABC News',
|
|
957
974
|
JENNIFER_JACQUET: 'Future Science magazine',
|
|
958
975
|
JOHN_BROCKMAN: 'literary agent and author specializing in scientific literature',
|
|
@@ -972,14 +989,14 @@ HIGHLIGHTED_NAMES = [
|
|
|
972
989
|
r'Associated\s*Press',
|
|
973
990
|
r"Axios",
|
|
974
991
|
r"BBC",
|
|
975
|
-
r"(Bob|Robert)\s*Costa",
|
|
992
|
+
r"(Bob|Robert)\s*(Costa|Woodward)",
|
|
976
993
|
r"Breitbart",
|
|
977
994
|
r"BuzzFeed(\s*News)?",
|
|
978
995
|
r"C-?Span",
|
|
979
996
|
r"CBS(\s*(4|Corp|News))?",
|
|
980
997
|
r"Charlie\s*Rose",
|
|
981
998
|
r"China\s*Daily",
|
|
982
|
-
r"
|
|
999
|
+
r"(C|MS)?NBC(\s*News)?",
|
|
983
1000
|
r"CNN(politics?)?",
|
|
984
1001
|
r"Con[cs]hita", r"Sarnoff",
|
|
985
1002
|
r"Daily Business Review",
|
|
@@ -1000,6 +1017,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
1000
1017
|
r"Globe\s*and\s*Mail",
|
|
1001
1018
|
r"Good\s*Morning\s*America",
|
|
1002
1019
|
r"Graydon(\s*Carter)?",
|
|
1020
|
+
r"Hollywood\s*Reporter",
|
|
1003
1021
|
r"Huff(ington)?(\s*Po(st)?)?",
|
|
1004
1022
|
r"Ingram, David",
|
|
1005
1023
|
r"James\s*Hill",
|
|
@@ -1007,6 +1025,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
1007
1025
|
r"Jesse Kornbluth",
|
|
1008
1026
|
r"John\s*Connolly",
|
|
1009
1027
|
r"Jonathan\s*Karl",
|
|
1028
|
+
r"Journal of Criminal Law and Criminology",
|
|
1010
1029
|
r"Julie\s*(K.?\s*)?Brown", r'jbrown@miamiherald.com',
|
|
1011
1030
|
r"(Katie\s*)?Couric",
|
|
1012
1031
|
r"Keith\s*Larsen",
|
|
@@ -1025,7 +1044,6 @@ HIGHLIGHTED_NAMES = [
|
|
|
1025
1044
|
r"PERVERSION\s*OF\s*JUSTICE",
|
|
1026
1045
|
r"Politico",
|
|
1027
1046
|
r"Pro\s*Publica",
|
|
1028
|
-
r"Reuters",
|
|
1029
1047
|
r"(Sean\s*)?Hannity",
|
|
1030
1048
|
r"Sharon Churcher", # Daily Mail
|
|
1031
1049
|
r"Sulzberger",
|
|
@@ -1038,7 +1056,9 @@ HIGHLIGHTED_NAMES = [
|
|
|
1038
1056
|
r"(The\s*)?New\s*Yorker",
|
|
1039
1057
|
r"(The\s*)?Wall\s*Street\s*Journal",
|
|
1040
1058
|
r"(The\s*)?Wa(shington\s*)?Po(st)?",
|
|
1059
|
+
r"(Thomson\s*)?Reuters",
|
|
1041
1060
|
r"(Uma\s*)?Sanghvi",
|
|
1061
|
+
r"USA\s*Today",
|
|
1042
1062
|
r"Vanity\s*Fair",
|
|
1043
1063
|
r"Viceland",
|
|
1044
1064
|
r"Vick[iy]\s*Ward",
|
|
@@ -1072,6 +1092,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
1072
1092
|
r"Chile",
|
|
1073
1093
|
r"Colombian?",
|
|
1074
1094
|
r"Cuban?",
|
|
1095
|
+
r"el chapo",
|
|
1075
1096
|
r"El\s*Salvador",
|
|
1076
1097
|
r"((Enrique )?Pena )?Nieto",
|
|
1077
1098
|
r"Lat(in)?\s*Am(erican?)?",
|
|
@@ -1113,12 +1134,16 @@ HIGHLIGHTED_NAMES = [
|
|
|
1113
1134
|
r"Arizona(?! State University)",
|
|
1114
1135
|
r"Aspen",
|
|
1115
1136
|
r"Berkeley",
|
|
1137
|
+
r"Boston",
|
|
1116
1138
|
r"Brooklyn",
|
|
1117
1139
|
r"California",
|
|
1118
1140
|
r"Canada",
|
|
1119
1141
|
r"Cape Cod",
|
|
1142
|
+
r"Charlottesville",
|
|
1143
|
+
r"Colorado",
|
|
1120
1144
|
r"Connecticut",
|
|
1121
1145
|
r"Florida",
|
|
1146
|
+
r"Los Angeles",
|
|
1122
1147
|
r"Loudoun\s*County?",
|
|
1123
1148
|
r"Martha's\s*Vineyard",
|
|
1124
1149
|
r"Miami(?!\s?Herald)",
|
|
@@ -1128,9 +1153,12 @@ HIGHLIGHTED_NAMES = [
|
|
|
1128
1153
|
r"NY(C|\s*State)",
|
|
1129
1154
|
r"Orange\s*County",
|
|
1130
1155
|
r"Oregon",
|
|
1156
|
+
r"Palo Alto",
|
|
1157
|
+
r"Pennsylvania",
|
|
1131
1158
|
r"Phoenix",
|
|
1132
1159
|
r"Portland",
|
|
1133
|
-
r"
|
|
1160
|
+
r"San Francisco",
|
|
1161
|
+
r"Sant[ae]\s*Fe",
|
|
1134
1162
|
r"Telluride",
|
|
1135
1163
|
r"Teterboro",
|
|
1136
1164
|
r"Texas(?! A&M)",
|
|
@@ -1157,6 +1185,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
1157
1185
|
r"Afghanistan",
|
|
1158
1186
|
r"Al[-\s]?Qa[ei]da",
|
|
1159
1187
|
r"Ahmadinejad",
|
|
1188
|
+
r"(Rakhat )?Aliyev",
|
|
1160
1189
|
r"Arab",
|
|
1161
1190
|
r"Aramco",
|
|
1162
1191
|
r"Armenia",
|
|
@@ -1185,6 +1214,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
1185
1214
|
r"Hamas",
|
|
1186
1215
|
r"Hezbollah",
|
|
1187
1216
|
r"HBJ",
|
|
1217
|
+
r"Hourani",
|
|
1188
1218
|
r"Houthi",
|
|
1189
1219
|
r"Imran\s+Khan",
|
|
1190
1220
|
r"Iran(ian)?([-\s]Contra)?",
|
|
@@ -1207,10 +1237,11 @@ HIGHLIGHTED_NAMES = [
|
|
|
1207
1237
|
r"MB(N|S|Z)",
|
|
1208
1238
|
r"Mid(dle)?\s*East(ern)?",
|
|
1209
1239
|
r"Mohammed\s+bin\s+Salman",
|
|
1210
|
-
r"
|
|
1240
|
+
r"Morocc(an|o)",
|
|
1211
1241
|
r"Mubarak",
|
|
1212
1242
|
r"Muslim(\s*Brotherhood)?",
|
|
1213
1243
|
r"Nayaf",
|
|
1244
|
+
r"Nazarbayev",
|
|
1214
1245
|
r"Pakistani?",
|
|
1215
1246
|
r"Omar",
|
|
1216
1247
|
r"(Osama\s*)?Bin\s*Laden",
|
|
@@ -1230,10 +1261,10 @@ HIGHLIGHTED_NAMES = [
|
|
|
1230
1261
|
r"Syrian?",
|
|
1231
1262
|
r"(Tarek\s*)?El\s*Sayed",
|
|
1232
1263
|
r"Tehran",
|
|
1264
|
+
r"Timur\s*Kulibayev",
|
|
1233
1265
|
r"Tripoli",
|
|
1234
1266
|
r"Tunisian?",
|
|
1235
|
-
r"Turk(ey|ish)",
|
|
1236
|
-
r"Turks(?! & Caicos)",
|
|
1267
|
+
r"Turk(ey|ish)?(?!s & Caicos)",
|
|
1237
1268
|
r"UAE",
|
|
1238
1269
|
r"((Iraq|Iran|Kuwait|Qatar|Yemen)i?)",
|
|
1239
1270
|
],
|
|
@@ -1281,6 +1312,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
1281
1312
|
},
|
|
1282
1313
|
patterns=[
|
|
1283
1314
|
r"(Matt(hew)? )?Hiltzi[gk]",
|
|
1315
|
+
r"Philip\s*Barden",
|
|
1284
1316
|
r"PR\s*Newswire",
|
|
1285
1317
|
REPUTATION_MGMT,
|
|
1286
1318
|
r"Reputation.com",
|
|
@@ -1308,6 +1340,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
1308
1340
|
r"Broidy",
|
|
1309
1341
|
r"(Chris\s)?Christie",
|
|
1310
1342
|
r"(?<!Merwin Dela )Cruz",
|
|
1343
|
+
r"Darrell\s*Issa",
|
|
1311
1344
|
r"Devin\s*Nunes",
|
|
1312
1345
|
r"(Don\s*)?McGa[hn]n",
|
|
1313
1346
|
r"Erik Prince",
|
|
@@ -1333,7 +1366,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
1333
1366
|
r"(Michael\s)?Hayden",
|
|
1334
1367
|
r"((General|Mike)\s*)?(Flynn|Pence)",
|
|
1335
1368
|
r"(Mitt\s*)?Romney",
|
|
1336
|
-
r"Mnuchin",
|
|
1369
|
+
r"(Steven?\s*)?Mnuchin",
|
|
1337
1370
|
r"(Newt\s*)Gingrich",
|
|
1338
1371
|
r"Nikki",
|
|
1339
1372
|
r"Haley",
|
|
@@ -1346,7 +1379,9 @@ HIGHLIGHTED_NAMES = [
|
|
|
1346
1379
|
r"(Rex\s*)?Till?erson",
|
|
1347
1380
|
r"(?<!Cynthia )(Richard\s*)?Nixon",
|
|
1348
1381
|
r"RNC",
|
|
1382
|
+
r"(Roy|Stephen)\s*Moore",
|
|
1349
1383
|
r"Tea\s*Party",
|
|
1384
|
+
r"Wilbur\s*Ross",
|
|
1350
1385
|
],
|
|
1351
1386
|
),
|
|
1352
1387
|
HighlightedNames(
|
|
@@ -1396,7 +1431,6 @@ HIGHLIGHTED_NAMES = [
|
|
|
1396
1431
|
r"Russian?",
|
|
1397
1432
|
r"Sberbank",
|
|
1398
1433
|
r"Soviet(\s*Union)?",
|
|
1399
|
-
r"Timur\s*Kulibayev",
|
|
1400
1434
|
r"USSR",
|
|
1401
1435
|
r"Vlad(imir)?(?! Yudash)",
|
|
1402
1436
|
r"(Vladimir\s*)?Putin",
|
|
@@ -1435,6 +1469,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
1435
1469
|
REID_HOFFMAN: 'PayPal mafia member, founder of LinkedIn',
|
|
1436
1470
|
STEVEN_SINOFSKY: 'ex-Microsoft, loves bitcoin',
|
|
1437
1471
|
VINCENZO_IOZZO: 'CEO of the identity-security company SlashID',
|
|
1472
|
+
ZUBAIR_KHAN: 'Tranchulas cybersecurity, InsightsPod founder, Islamabad / Dubai',
|
|
1438
1473
|
},
|
|
1439
1474
|
patterns=[
|
|
1440
1475
|
r"AG?I",
|
|
@@ -1443,6 +1478,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
1443
1478
|
r"Danny\s*Hillis",
|
|
1444
1479
|
r"deep learning",
|
|
1445
1480
|
r"Drew\s*Houston",
|
|
1481
|
+
r"Eliezer\s*Yudkowsky",
|
|
1446
1482
|
r"Eric\s*Schmidt",
|
|
1447
1483
|
r"Greylock(\s*Partners)?",
|
|
1448
1484
|
r"(?<!(ustin|Moshe)\s)Hoffmand?",
|
|
@@ -1462,6 +1498,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
1462
1498
|
r"Softbank",
|
|
1463
1499
|
r"SpaceX",
|
|
1464
1500
|
r"Tim\s*Ferriss?",
|
|
1501
|
+
r"Vision\s*Fund",
|
|
1465
1502
|
r"WikiLeak(ed|s)",
|
|
1466
1503
|
],
|
|
1467
1504
|
),
|
|
@@ -1518,6 +1555,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
1518
1555
|
r"(Kenneth E\. )?Mapp",
|
|
1519
1556
|
r"PBI",
|
|
1520
1557
|
r"Puerto\s*Ric(an|o)",
|
|
1558
|
+
r"San\s*Juan",
|
|
1521
1559
|
r"S(ain)?t.?\s*Thomas",
|
|
1522
1560
|
r"USVI",
|
|
1523
1561
|
r"(?<!stein |vis-a-)VI(?!s-a-)",
|
|
@@ -1527,9 +1565,9 @@ HIGHLIGHTED_NAMES = [
|
|
|
1527
1565
|
),
|
|
1528
1566
|
HighlightedNames(
|
|
1529
1567
|
label='victim',
|
|
1530
|
-
style=
|
|
1568
|
+
style=VICTIM_COLOR,
|
|
1531
1569
|
patterns=[
|
|
1532
|
-
r"
|
|
1570
|
+
r"child\s*pornography",
|
|
1533
1571
|
r"(David\s*)?Bo[il]es(,?\s*Schiller( & Flexner)?)?",
|
|
1534
1572
|
r"(Gloria\s*)?Allred",
|
|
1535
1573
|
r"(Jane|Tiffany)\s*Doe",
|
|
@@ -1595,6 +1633,11 @@ HIGHLIGHTED_NAMES = [
|
|
|
1595
1633
|
HighlightedNames(emailers={SULTAN_BIN_SULAYEM: 'chairman of ports in Dubai, CEO of DP World'}, style='green1', category=MIDEAST),
|
|
1596
1634
|
|
|
1597
1635
|
# HighlightedText not HighlightedNames bc of word boundary issue
|
|
1636
|
+
HighlightedText(
|
|
1637
|
+
label='metoo',
|
|
1638
|
+
style=VICTIM_COLOR,
|
|
1639
|
+
patterns=[r"#metoo"]
|
|
1640
|
+
),
|
|
1598
1641
|
HighlightedText(
|
|
1599
1642
|
label='phone_number',
|
|
1600
1643
|
style='bright_green',
|
|
@@ -1615,7 +1658,7 @@ HIGHLIGHTED_TEXTS = [
|
|
|
1615
1658
|
HighlightedText(
|
|
1616
1659
|
label='header_field',
|
|
1617
1660
|
style='plum4',
|
|
1618
|
-
patterns=[r'
|
|
1661
|
+
patterns=[r'^[>• ]{,4}(Date ?|From|Sent|To|C[cC]|Importance|Reply[- ]?To|Subject|Bee|B[cC]{2}|Attachments|Flag|Classification|((A|Debut du message transfer[&e]|De(stinataire)?|Envoye|Expe(cl|d)iteur|Objet|Q|Sujet) ?)):|^on behalf of'],
|
|
1619
1662
|
),
|
|
1620
1663
|
HighlightedText(
|
|
1621
1664
|
label='http_links',
|
|
@@ -1625,16 +1668,16 @@ HIGHLIGHTED_TEXTS = [
|
|
|
1625
1668
|
HighlightedText(
|
|
1626
1669
|
label='quoted_reply_line',
|
|
1627
1670
|
style='dim',
|
|
1628
|
-
patterns=[REPLY_REGEX.pattern],
|
|
1671
|
+
patterns=[REPLY_REGEX.pattern, r"^(> )?wrote:$"],
|
|
1629
1672
|
),
|
|
1630
1673
|
HighlightedText(
|
|
1631
1674
|
label='redacted',
|
|
1632
1675
|
style='grey58',
|
|
1633
|
-
patterns=[fr"{REDACTED}
|
|
1676
|
+
patterns=[fr"{REDACTED}|<?Privileged - Redacted>?"],
|
|
1634
1677
|
),
|
|
1635
1678
|
HighlightedText(
|
|
1636
1679
|
label='sent_from',
|
|
1637
|
-
style='
|
|
1680
|
+
style='light_cyan3 italic dim',
|
|
1638
1681
|
patterns=[SENT_FROM_REGEX.pattern],
|
|
1639
1682
|
),
|
|
1640
1683
|
HighlightedText(
|