epstein-files 1.0.10__py3-none-any.whl → 1.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ from dateutil.parser import parse
8
8
 
9
9
  from epstein_files.util.constant.names import *
10
10
  from epstein_files.util.constant.strings import *
11
- from epstein_files.util.data import without_falsey
11
+ from epstein_files.util.data import remove_time_from_timestamp_str, without_falsey
12
12
 
13
13
  DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
14
14
  Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]
@@ -47,12 +47,11 @@ FINANCIAL_REPORTS_AUTHORS = [
47
47
  ]
48
48
 
49
49
  # Fields like timestamp and author are better added from the Document object
50
- INVALID_FOR_METADATA = [
50
+ NON_METADATA_FIELDS = [
51
51
  'actual_text',
52
52
  'date',
53
53
  'id',
54
- 'timestamp',
55
- 'was_generated',
54
+ 'is_synthetic',
56
55
  ]
57
56
 
58
57
 
@@ -68,10 +67,10 @@ class DocCfg:
68
67
  date (str | None): If passed, will be immediately parsed into the 'timestamp' field
69
68
  dupe_of_id (str | None): If this is a dupe, the ID of the duplicated file. This file will be suppressed
70
69
  dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
71
- duplicate_ids (list[str]): Inverse of 'dupe_of_id' - this file will NOT be suppressed but 'duplicate_ids' will be
70
+ duplicate_ids (list[str]): IDs of *other* documents that are dupes of this document
72
71
  is_interesting (bool): Override other considerations and always consider this file interesting
73
72
  timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
74
- was_generated (bool): True if this object was generated by the duplicate_cfgs() method
73
+ is_synthetic (bool): True if this config was generated by the duplicate_cfgs() method
75
74
  """
76
75
  id: str
77
76
  author: str | None = None
@@ -82,8 +81,8 @@ class DocCfg:
82
81
  dupe_type: DuplicateType | None = None
83
82
  duplicate_ids: list[str] = field(default_factory=list)
84
83
  is_interesting: bool = False
84
+ is_synthetic: bool = False
85
85
  timestamp: datetime | None = None
86
- was_generated: bool = False
87
86
 
88
87
  def __post_init__(self):
89
88
  if self.date:
@@ -92,66 +91,48 @@ class DocCfg:
92
91
  if self.dupe_of_id or self.duplicate_ids:
93
92
  self.dupe_type = self.dupe_type or SAME
94
93
 
95
- def duplicate_reason(self) -> str | None:
96
- if self.dupe_type is not None:
97
- return DUPE_TYPE_STRS[self.dupe_type]
98
-
99
- def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
100
- """Create synthetic DocCfg objects that set the 'dupe_of_id' field to point back to this object."""
101
- for id in self.duplicate_ids:
102
- dupe_cfg = deepcopy(self)
103
- dupe_cfg.id = id
104
- dupe_cfg.dupe_of_id = self.id
105
- dupe_cfg.duplicate_ids = []
106
- dupe_cfg.dupe_type = self.dupe_type
107
- dupe_cfg.was_generated = True
108
- yield dupe_cfg
109
-
110
- def info_str(self) -> str | None:
94
+ def complete_description(self) -> str | None:
111
95
  """String that summarizes what is known about this document."""
112
- if self.category and not self.description:
96
+ if self.category and not self.description and not self.author:
113
97
  return self.category
114
98
  elif self.category == REPUTATION:
115
99
  return f"{REPUTATION_MGMT}: {self.description}"
100
+ elif self.category == SKYPE_LOG:
101
+ msg = f"{self.category} of conversation with {self.author}" if self.author else self.category
102
+ return f"{msg} {self.description}" if self.description else msg
116
103
  elif self.author and self.description:
117
104
  if self.category in [ACADEMIA, BOOK]:
118
- return self.title_by_author()
105
+ title = self.description if '"' in self.description else f"'{self.description}'"
106
+ return f"{title} by {self.author}"
119
107
  elif self.category == FINANCE and self.author in FINANCIAL_REPORTS_AUTHORS:
120
108
  return f"{self.author} report: '{self.description}'"
121
109
  elif self.category == LEGAL and 'v.' in self.author:
122
- return f"{self.author}: '{self.description}'"
110
+ return f"{self.author}: {self.description}"
123
111
  elif self.category and self.author is None and self.description is None:
124
112
  return self.category
125
113
 
126
114
  pieces = without_falsey([self.author, self.description])
127
115
  return ' '.join(pieces) if pieces else None
128
116
 
129
- def metadata(self) -> Metadata:
130
- non_null_fields = {k: v for k, v in asdict(self).items() if v and k not in INVALID_FOR_METADATA}
131
-
132
- if self.category in [EMAIL, TEXT_MESSAGE]:
133
- del non_null_fields['category']
134
-
135
- return non_null_fields
136
-
137
- def non_null_field_names(self) -> list[str]:
138
- return [f.name for f in self.sorted_fields() if getattr(self, f.name)]
139
-
140
- def sorted_fields(self) -> list[Field]:
141
- return sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name))
142
-
143
- def title_by_author(self) -> str:
144
- if not (self.author and self.description):
145
- raise RuntimeError(f"Can't call title_by_author() without author and description!")
117
+ def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
118
+ """Create synthetic DocCfg objects that set the 'dupe_of_id' field to point back to this object."""
119
+ for id in self.duplicate_ids:
120
+ dupe_cfg = deepcopy(self)
121
+ dupe_cfg.id = id
122
+ dupe_cfg.dupe_of_id = self.id
123
+ dupe_cfg.duplicate_ids = []
124
+ dupe_cfg.dupe_type = self.dupe_type
125
+ dupe_cfg.is_synthetic = True
126
+ yield dupe_cfg
146
127
 
147
- title = self.description if '"' in self.description else f"'{self.description}'"
148
- return f"{title} by {self.author}"
128
+ def metadata(self) -> Metadata:
129
+ return {k: v for k, v in asdict(self).items() if k not in NON_METADATA_FIELDS and v}
149
130
 
150
131
  def _props_strs(self) -> list[str]:
151
132
  props = []
152
133
  add_prop = lambda f, value: props.append(f"{f.name}={value}")
153
134
 
154
- for _field in self.sorted_fields():
135
+ for _field in sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name)):
155
136
  value = getattr(self, _field.name)
156
137
 
157
138
  if value is None or value is False or (isinstance(value, list) and len(value) == 0):
@@ -160,13 +141,13 @@ class DocCfg:
160
141
  add_prop(_field, constantize_name(str(value)) if CONSTANTIZE_NAMES else f"'{value}'")
161
142
  elif _field.name == 'category' and value in [EMAIL, TEXT_MESSAGE]:
162
143
  continue
163
- elif _field.name == 'recipients' and isinstance(value, list):
144
+ elif _field.name == 'recipients' and value:
164
145
  recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
165
146
  add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
166
147
  elif _field.name == 'timestamp' and self.date is not None:
167
148
  continue # Don't print both timestamp and date
168
149
  elif isinstance(value, datetime):
169
- value_str = re.sub(' 00:00:00', '', str(value))
150
+ value_str = remove_time_from_timestamp_str(value)
170
151
  add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
171
152
  elif isinstance(value, str):
172
153
  if "'" in value:
@@ -221,22 +202,15 @@ class EmailCfg(CommunicationCfg):
221
202
  """
222
203
  Attributes:
223
204
  actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
205
+ fwded_text_after (str | None): If set, any text after this is a fwd of an article or similar
224
206
  is_fwded_article (bool): True if this is a newspaper article someone fwded. Used to exclude articles from word counting.
225
207
  recipients (list[str | None]): Who received the email
226
208
  """
227
- actual_text: str | None = None # Override for the Email._actual_text() method for particularly broken emails
228
- fwded_text_after: str | None = None # If set, any text after this is a fwd of an article or similar
209
+ actual_text: str | None = None
210
+ fwded_text_after: str | None = None
229
211
  is_fwded_article: bool = False
230
212
  recipients: list[str | None] = field(default_factory=list)
231
213
 
232
- def __post_init__(self):
233
- super().__post_init__()
234
- self.category = EMAIL
235
-
236
- @classmethod
237
- def from_doc_cfg(cls, cfg: DocCfg) -> 'EmailCfg':
238
- return cls(**asdict(cfg))
239
-
240
214
  # This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
241
215
  def __repr__(self) -> str:
242
216
  return super().__repr__()
@@ -244,10 +218,6 @@ class EmailCfg(CommunicationCfg):
244
218
 
245
219
  @dataclass(kw_only=True)
246
220
  class TextCfg(CommunicationCfg):
247
- def __post_init__(self):
248
- super().__post_init__()
249
- self.category = TEXT_MESSAGE
250
-
251
221
  # This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
252
222
  def __repr__(self) -> str:
253
223
  return super().__repr__()
epstein_files/util/env.py CHANGED
@@ -2,7 +2,7 @@ import logging
2
2
  from argparse import ArgumentParser
3
3
  from os import environ
4
4
  from pathlib import Path
5
- from sys import argv
5
+ from sys import argv, exit
6
6
 
7
7
  from rich_argparse_plus import RichHelpFormatterPlus
8
8
 
@@ -11,28 +11,30 @@ from epstein_files.util.logging import env_log_level, logger
11
11
  COUNT_WORDS_SCRIPT = 'epstein_word_count'
12
12
  DEFAULT_WIDTH = 145
13
13
  HTML_SCRIPTS = ['epstein_generate', COUNT_WORDS_SCRIPT]
14
+ EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
15
+
14
16
 
15
17
  RichHelpFormatterPlus.choose_theme('morning_glory')
16
18
  parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML pages.", formatter_class=RichHelpFormatterPlus)
19
+ parser.add_argument('--make-clean', action='store_true', help='delete all HTML build artifact and write latest URLs to .urls.env')
17
20
  parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
18
- parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='ovewrite cached EpsteinFiles')
21
+ parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='re-parse the files and ovewrite cached data')
19
22
 
20
- output = parser.add_argument_group('OUTPUT')
23
+ output = parser.add_argument_group('OUTPUT', 'Options used by epstein_generate.')
21
24
  output.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
22
25
  output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
23
- output.add_argument('--build', '-b', action='store_true', help='write output to an HTML file')
24
- output.add_argument('--json-metadata', '-jm', action='store_true', help='dump JSON metadata for all files and exit')
25
- output.add_argument('--make-clean', action='store_true', help='delete all HTML build artifact and write latest URLs to .urls.env')
26
- output.add_argument('--output-emails', '-oe', action='store_true', help='generate other files section')
27
- output.add_argument('--output-json-files', action='store_true', help='pretty print all the raw JSON data files in the collection and exit')
28
- output.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
26
+ output.add_argument('--build', '-b', action='store_true', help='write HTML output to a file')
27
+ output.add_argument('--json-files', action='store_true', help='pretty print all the raw JSON data files in the collection and exit')
28
+ output.add_argument('--json-metadata', action='store_true', help='dump JSON metadata for all files and exit')
29
+ output.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
30
+ output.add_argument('--output-other', '-oo', action='store_true', help='generate other files section')
29
31
  output.add_argument('--output-texts', '-ot', action='store_true', help='generate text messages section')
30
32
  output.add_argument('--sort-alphabetical', action='store_true', help='sort emailers alphabetically intead of by email count')
31
33
  output.add_argument('--suppress-output', action='store_true', help='no output to terminal (use with --build)')
32
34
  output.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use (in characters)')
33
- output.add_argument('--use-epstein-web-links', action='store_true', help='use epsteinweb.org links instead of epstein.media')
35
+ output.add_argument('--use-epstein-web', action='store_true', help='use epsteinweb.org links instead of epstein.media')
34
36
 
35
- scripts = parser.add_argument_group('SCRIPTS', 'Arguments used only by epstein_search, epstein_show, epstein_diff')
37
+ scripts = parser.add_argument_group('SCRIPTS', 'Options used by epstein_search, epstein_show, and epstein_diff.')
36
38
  scripts.add_argument('positional_args', nargs='*', help='strings to searchs for, file IDs to show or diff, etc.')
37
39
  scripts.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (used by epstein_show)')
38
40
  scripts.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (used by epstein_search)')
@@ -42,23 +44,35 @@ debug.add_argument('--colors-only', '-c', action='store_true', help='print heade
42
44
  debug.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
43
45
  debug.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
44
46
  debug.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats about the files')
47
+ debug.add_argument('--skip-other-files', '-sof', action='store_true', help='skip parsing non email/text files')
45
48
  debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
46
49
  args = parser.parse_args()
47
50
 
51
+
52
+ # Verify Epstein docs can be found
53
+ DOCS_DIR_ENV = environ.get(EPSTEIN_DOCS_DIR_ENV_VAR_NAME)
54
+ DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
55
+
56
+ if not DOCS_DIR_ENV:
57
+ print(f"\n ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!\n")
58
+ exit(1)
59
+ elif not DOCS_DIR.exists():
60
+ print(f"\n ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!\n")
61
+ exit(1)
62
+
48
63
  current_script = Path(argv[0]).name
49
64
  is_env_var_set = lambda s: len(environ.get(s) or '') > 0
50
65
  is_html_script = current_script in HTML_SCRIPTS
51
66
 
52
67
  args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
53
68
  args.output_emails = args.output_emails or args.all_emails
54
- args.output_other_files = args.output_other_files or args.all_other_files
69
+ args.output_other = args.output_other or args.all_other_files
55
70
  args.overwrite_pickle = args.overwrite_pickle or (is_env_var_set('OVERWRITE_PICKLE') and not is_env_var_set('PICKLED'))
56
71
  args.width = args.width if is_html_script else None
57
72
  is_output_selected = any([arg.startswith('output_') and value for arg, value in vars(args).items()])
58
73
  is_output_selected = is_output_selected or args.json_metadata or args.colors_only
59
74
  specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]
60
75
 
61
-
62
76
  # Log level args
63
77
  if args.deep_debug:
64
78
  logger.setLevel(logging.DEBUG)
@@ -74,11 +88,9 @@ logger.info(f'Log level set to {logger.level}...')
74
88
  # Massage args that depend on other args to the appropriate state
75
89
  if current_script == 'epstein_generate' and not (is_output_selected or args.make_clean):
76
90
  logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
77
- args.output_texts = True
78
- args.output_emails = True
79
- args.output_other_files = True
91
+ args.output_texts = args.output_emails = args.output_other = True
80
92
 
81
- if args.use_epstein_web_links:
93
+ if args.use_epstein_web:
82
94
  logger.warning(f"Using links to epsteinweb.org links instead of epsteinify.com...")
83
95
 
84
96
  if args.debug:
@@ -1,20 +1,9 @@
1
1
  import re
2
- from os import environ
3
2
  from pathlib import Path
4
- from sys import exit
5
3
 
6
4
  from epstein_files.util.constant.strings import FILE_NAME_REGEX, FILE_STEM_REGEX, HOUSE_OVERSIGHT_PREFIX
7
-
8
- EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
9
- DOCS_DIR_ENV = environ[EPSTEIN_DOCS_DIR_ENV_VAR_NAME]
10
- DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
11
-
12
- if not DOCS_DIR_ENV:
13
- print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!")
14
- exit(1)
15
- elif not DOCS_DIR.exists():
16
- print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!")
17
- exit(1)
5
+ from epstein_files.util.env import DOCS_DIR
6
+ from epstein_files.util.logging import logger
18
7
 
19
8
  EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
20
9
  FILE_ID_REGEX = re.compile(fr".*{FILE_NAME_REGEX.pattern}")
@@ -22,10 +11,13 @@ FILENAME_LENGTH = len(HOUSE_OVERSIGHT_PREFIX) + 6
22
11
  KB = 1024
23
12
  MB = KB * KB
24
13
 
14
+ file_size = lambda file_path: Path(file_path).stat().st_size
15
+ file_size_str = lambda file_path: file_size_to_str(file_size(file_path))
25
16
 
26
- # Handles both string and int 'id' args.
17
+ # Coerce methods handle both string and int arguments.
18
+ coerce_file_name = lambda filename_or_id: coerce_file_stem(filename_or_id) + '.txt'
19
+ coerce_file_path = lambda filename_or_id: DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
27
20
  id_str = lambda id: f"{int(id):06d}"
28
- filename_for_id = lambda id: file_stem_for_id(id) + '.txt'
29
21
 
30
22
 
31
23
  def coerce_file_stem(filename_or_id: int | str) -> str:
@@ -42,14 +34,6 @@ def coerce_file_stem(filename_or_id: int | str) -> str:
42
34
  return file_stem
43
35
 
44
36
 
45
- def coerce_file_name(filename_or_id: int | str) -> str:
46
- return coerce_file_stem(filename_or_id) + '.txt'
47
-
48
-
49
- def coerce_file_path(filename_or_id: int | str) -> Path:
50
- return DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
51
-
52
-
53
37
  def extract_file_id(filename_or_id: int | str | Path) -> str:
54
38
  if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
55
39
  return id_str(filename_or_id)
@@ -62,12 +46,7 @@ def extract_file_id(filename_or_id: int | str | Path) -> str:
62
46
  return file_match.group(1)
63
47
 
64
48
 
65
- def file_size(file_path: str | Path) -> int:
66
- return Path(file_path).stat().st_size
67
-
68
-
69
- def file_size_str(file_path: str | Path) -> str:
70
- size = file_size(file_path)
49
+ def file_size_to_str(size: int) -> str:
71
50
  digits = 2
72
51
 
73
52
  if size > MB:
@@ -96,3 +75,7 @@ def is_local_extract_file(filename) -> bool:
96
75
  """Return true if filename is of form 'HOUSE_OVERSIGHT_029835_1.txt'."""
97
76
  file_match = FILE_ID_REGEX.match(str(filename))
98
77
  return True if file_match and file_match.group(2) else False
78
+
79
+
80
+ def log_file_write(file_path: str | Path) -> None:
81
+ logger.warning(f"Wrote {file_size_str(file_path)} to '{file_path}'")
@@ -2,6 +2,7 @@ import re
2
2
  from dataclasses import dataclass, field
3
3
 
4
4
  from rich.highlighter import RegexHighlighter
5
+ from rich.text import Text
5
6
 
6
7
  from epstein_files.util.constant.names import *
7
8
  from epstein_files.util.constant.strings import *
@@ -21,7 +22,7 @@ EPSTEIN_ESTATE_EXECUTOR = f"Epstein {ESTATE_EXECUTOR}"
21
22
  REGEX_STYLE_PREFIX = 'regex'
22
23
  SIMPLE_NAME_REGEX = re.compile(r"^[-\w ]+$", re.IGNORECASE)
23
24
 
24
- CATEGORY_LABEL_MAPPING = {
25
+ CATEGORY_STYLE_MAPPING = {
25
26
  ARTICLE: JOURNALIST,
26
27
  ARTS: ENTERTAINER,
27
28
  BOOK: JOURNALIST,
@@ -31,6 +32,12 @@ CATEGORY_LABEL_MAPPING = {
31
32
  REPUTATION: PUBLICIST,
32
33
  }
33
34
 
35
+ CATEGORY_STYLES = {
36
+ JSON: 'dark_red',
37
+ JUNK: 'grey19',
38
+ 'letter': 'medium_orchid1'
39
+ }
40
+
34
41
 
35
42
  @dataclass(kw_only=True)
36
43
  class HighlightedText:
@@ -156,7 +163,7 @@ HIGHLIGHTED_NAMES = [
156
163
  HighlightedNames(
157
164
  label=BUSINESS,
158
165
  style='spring_green4',
159
- pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
166
+ pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|New Leaf Ventures|Park Partners|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
160
167
  emailers = {
161
168
  ALIREZA_ITTIHADIEH: 'CEO Freestream Aircraft Limited',
162
169
  BARBRO_C_EHNBOM: 'Swedish pharmaceuticals, SALSS',
@@ -216,6 +223,7 @@ HIGHLIGHTED_NAMES = [
216
223
  'Linda Pinto': 'interior design at Alberto Pinto Cabinet',
217
224
  MERWIN_DELA_CRUZ: None, # HOUSE_OVERSIGHT_032652 Groff says "Jojo and Merwin both requested off Nov. 25 and 26"
218
225
  NADIA_MARCINKO: 'pilot',
226
+ 'Sean J. Lancaster': 'airplane reseller',
219
227
  }
220
228
  ),
221
229
  HighlightedNames(
@@ -253,6 +261,8 @@ HIGHLIGHTED_NAMES = [
253
261
  MARTIN_WEINBERG: CRIMINAL_DEFENSE_ATTORNEY,
254
262
  MICHAEL_MILLER: 'Steptoe LLP partner',
255
263
  REID_WEINGARTEN: 'Steptoe LLP partner',
264
+ ROBERT_D_CRITTON_JR: 'criminal defense attorney',
265
+ 'Robert Gold': None,
256
266
  'Roy Black': CRIMINAL_DEFENSE_2008,
257
267
  SCOTT_J_LINK: None,
258
268
  TONJA_HADDAD_COLEMAN: f'{EPSTEIN_V_ROTHSTEIN_EDWARDS_ATTORNEY}, maybe daughter of Fred Haddad?',
@@ -303,15 +313,17 @@ HIGHLIGHTED_NAMES = [
303
313
  }
304
314
  ),
305
315
  HighlightedNames(
306
- label='finance',
316
+ label=FINANCE,
307
317
  style='green',
308
- pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
318
+ pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|((anti.?)?money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
309
319
  emailers={
310
320
  AMANDA_ENS: 'Citigroup',
321
+ BRAD_WECHSLER: f"head of {LEON_BLACK}'s personal investment vehicle according to FT",
311
322
  DANIEL_SABBA: 'UBS Investment Bank',
312
323
  DAVID_FISZEL: 'CIO Honeycomb Asset Management',
313
324
  JES_STALEY: 'former CEO of Barclays',
314
325
  JIDE_ZEITLIN: 'former partner at Goldman Sachs, allegations of sexual misconduct',
326
+ 'Laurie Cameron': 'currency trading',
315
327
  LEON_BLACK: 'Apollo CEO',
316
328
  MARC_LEON: 'Luxury Properties Sari Morrocco',
317
329
  MELANIE_SPINELLA: f'representative of {LEON_BLACK}',
@@ -325,6 +337,7 @@ HIGHLIGHTED_NAMES = [
325
337
  style='deep_pink2',
326
338
  pattern=r'Cambridge|(Derek\s*)?Bok|Elisa(\s*New)?|Harvard(\s*(Business|Law|University)(\s*School)?)?|(Jonathan\s*)?Zittrain|(Stephen\s*)?Kosslyn',
327
339
  emailers = {
340
+ "Donald Rubin": f"Professor of Statistics",
328
341
  "Kelly Friendly": f"longtime aide and spokesperson of {LARRY_SUMMERS}",
329
342
  LARRY_SUMMERS: 'board of Digital Currency Group (DCG), Harvard president, Obama economic advisor',
330
343
  'Leah Reis-Dennis': 'producer for Lisa New\'s Poetry in America',
@@ -370,7 +383,7 @@ HIGHLIGHTED_NAMES = [
370
383
  HighlightedNames(
371
384
  label=JOURNALIST,
372
385
  style='bright_yellow',
373
- pattern=r'Palm\s*Beach\s*(Daily\s*News|Post)|ABC(\s*News)?|Alex\s*Yablon|(Andrew\s*)?Marra|Arianna(\s*Huffington)?|(Arthur\s*)?Kretchmer|BBC|Bloomberg|Breitbart|Charlie\s*Rose|China\s*Daily|CNBC|CNN(politics?)?|Con[cs]hita|Sarnoff|(?<!Virgin[-\s]Islands[-\s])Daily\s*(Beast|Mail|News|Telegraph)|(David\s*)?Pecker|David\s*Brooks|Ed\s*Krassenstein|(Emily\s*)?Michot|Ezra\s*Klein|(George\s*)?Stephanopoulus|Globe\s*and\s*Mail|Good\s*Morning\s*America|Graydon(\s*Carter)?|Huffington(\s*Post)?|Ingram, David|(James\s*)?Patterson|Jonathan\s*Karl|Julie\s*(K.?\s*)?Brown|(Katie\s*)?Couric|Keith\s*Larsen|L\.?A\.?\s*Times|Miami\s*Herald|(Michele\s*)?Dargan|(National\s*)?Enquirer|(The\s*)?N(ew\s*)?Y(ork\s*)?(P(ost)?|T(imes)?)|(The\s*)?New\s*Yorker|NYer|PERVERSION\s*OF\s*JUSTICE|Politico|Pro\s*Publica|(Sean\s*)?Hannity|Sulzberger|SunSentinel|Susan Edelman|(Uma\s*)?Sanghvi|(The\s*)?Wa(shington\s*)?Po(st)?|Viceland|Vick[iy]\s*Ward|Vox|WGBH|(The\s*)?Wall\s*Street\s*Journal|WSJ|[-\w.]+@(bbc|independent|mailonline|mirror|thetimes)\.co\.uk',
386
+ pattern=r'Palm\s*Beach\s*(Daily\s*News|Post)|ABC(\s*News)?|Alex\s*Yablon|(Andrew\s*)?Marra|Arianna(\s*Huffington)?|(Arthur\s*)?Kretchmer|BBC|Bloomberg|Breitbart|Charlie\s*Rose|China\s*Daily|CNBC|CNN(politics?)?|Con[cs]hita|Sarnoff|(?<!Virgin[-\s]Islands[-\s])Daily\s*(Beast|Mail|News|Telegraph)|(David\s*)?Pecker|David\s*Brooks|Ed\s*Krassenstein|(Emily\s*)?Michot|Ezra\s*Klein|(George\s*)?Stephanopoulus|Globe\s*and\s*Mail|Good\s*Morning\s*America|Graydon(\s*Carter)?|Huffington(\s*Post)?|Ingram, David|(James\s*)?(Hill|Patterson)|Jonathan\s*Karl|Julie\s*(K.?\s*)?Brown|(Katie\s*)?Couric|Keith\s*Larsen|L\.?A\.?\s*Times|Miami\s*Herald|(Michele\s*)?Dargan|(National\s*)?Enquirer|(The\s*)?N(ew\s*)?Y(ork\s*)?(P(ost)?|T(imes)?)|(The\s*)?New\s*Yorker|NYer|PERVERSION\s*OF\s*JUSTICE|Politico|Pro\s*Publica|(Sean\s*)?Hannity|Sulzberger|SunSentinel|Susan Edelman|(Uma\s*)?Sanghvi|(The\s*)?Wa(shington\s*)?Po(st)?|Viceland|Vick[iy]\s*Ward|Vox|WGBH|(The\s*)?Wall\s*Street\s*Journal|WSJ|[-\w.]+@(bbc|independent|mailonline|mirror|thetimes)\.co\.uk',
374
387
  emailers = {
375
388
  EDWARD_JAY_EPSTEIN: 'reporter who wrote about the kinds of crimes Epstein was involved in, no relation to Jeffrey',
376
389
  'James Hill': 'ABC News',
@@ -390,7 +403,7 @@ HIGHLIGHTED_NAMES = [
390
403
  HighlightedNames(
391
404
  label='law enforcement',
392
405
  style='color(24) bold',
393
- pattern=r'ag|(Alicia\s*)?Valle|attorney|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC|CIA|CIS|CVRA|Dep(artmen)?t\.?\s*of\s*(the\s*)?(Justice|Treasury)|DHS|DOJ|FBI|FCPA|FDIC|Federal\s*Bureau\s*of\s*Investigation|FinCEN|FINRA|FOIA|FTC|IRS|(James\s*)?Comey|(Jennifer\s*Shasky\s*)?Calvery|((Judge|Mark)\s*)?(Carney|Filip)|(Kirk )?Blouin|KYC|NIH|NS(A|C)|OCC|OFAC|(Lann?a\s*)?Belohlavek|lawyer|(Michael\s*)?Reiter|OGE|Office\s*of\s*Government\s*Ethics|Police Code Enforcement|(Preet\s*)?Bharara|SCOTUS|SD(FL|NY)|Southern\s*District\s*of\s*(Florida|New\s*York)|SEC|Secret\s*Service|Securities\s*and\s*Exchange\s*Commission|State\s*Dep(artmen)?t|Strzok|Supreme\s*Court|Treasury\s*(Dep(artmen)?t|Secretary)|TSA|USAID|(William\s*J\.?\s*)?Zloch',
406
+ pattern=r'ag|(Alicia\s*)?Valle|AML|attorney|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC?|CIA|CIS|CVRA|Dep(artmen)?t\.?\s*of\s*(the\s*)?(Justice|Treasury)|DHS|DOJ|FBI|FCPA|FDIC|Federal\s*Bureau\s*of\s*Investigation|FinCEN|FINRA|FOIA|FTC|IRS|(James\s*)?Comey|(Jennifer\s*Shasky\s*)?Calvery|((Judge|Mark)\s*)?(Carney|Filip)|(Kirk )?Blouin|KYC|NIH|NS(A|C)|OCC|OFAC|(Lann?a\s*)?Belohlavek|lawyer|(Michael\s*)?Reiter|OGE|Office\s*of\s*Government\s*Ethics|Police Code Enforcement|(Preet\s*)?Bharara|SCOTUS|SD(FL|NY)|Southern\s*District\s*of\s*(Florida|New\s*York)|SEC|Secret\s*Service|Securities\s*and\s*Exchange\s*Commission|State\s*Dep(artmen)?t|Strzok|Supreme\s*Court|Treasury\s*(Dep(artmen)?t|Secretary)|TSA|USAID|(William\s*J\.?\s*)?Zloch',
394
407
  emailers = {
395
408
  ANN_MARIE_VILLAFANA: 'southern district of Florida U.S. Attorney',
396
409
  DANNY_FROST: 'Director of Communications at Manhattan DA',
@@ -450,6 +463,7 @@ HIGHLIGHTED_NAMES = [
450
463
  IAN_OSBORNE: f"{OSBORNE_LLP} reputation repairer possibly hired by Epstein ca. 2011-06",
451
464
  MICHAEL_SITRICK: 'crisis PR',
452
465
  PEGGY_SIEGAL: 'socialite',
466
+ 'R. Couri Hay': None,
453
467
  ROSS_GOW: 'Acuity Reputation Management',
454
468
  TYLER_SHEARS: f"{REPUTATION_MGMT}, worked on Epstein's Google search results with {CHRISTINA_GALBRAITH}",
455
469
  }
@@ -477,6 +491,7 @@ HIGHLIGHTED_NAMES = [
477
491
  style='red bold',
478
492
  pattern=r'Alfa\s*Bank|Anya\s*Rasulova|Chernobyl|Day\s+One\s+Ventures|(Dmitry\s)?(Kiselyov|(Lana\s*)?Pozhidaeva|Medvedev|Rybolo(o?l?ev|vlev))|Dmitry|FSB|GRU|KGB|Kislyak|Kremlin|Kuznetsova|Lavrov|Lukoil|Moscow|(Oleg\s*)?Deripaska|Oleksandr Vilkul|Rosneft|RT|St.?\s*?Petersburg|Russian?|Sberbank|Soviet(\s*Union)?|USSR|Vladimir|(Vladimir\s*)?(Putin|Yudashkin)|Women\s*Empowerment|Xitrans',
479
493
  emailers = {
494
+ 'Dasha Zhukova': 'art collector, daughter of Alexander Zhukov',
480
495
  MASHA_DROKOVA: 'silicon valley VC, former Putin Youth',
481
496
  RENATA_BOLOTOVA: 'former aspiring model, now fund manager at New York State Insurance Fund',
482
497
  SVETLANA_POZHIDAEVA: f'Epstein\'s Russian assistant who was recommended for a visa by Sergei Belyakov (FSB) and {DAVID_BLAINE}',
@@ -485,14 +500,16 @@ HIGHLIGHTED_NAMES = [
485
500
  HighlightedNames(
486
501
  label=ACADEMIA,
487
502
  style='light_goldenrod2',
488
- pattern=r'Alain Forget|Brotherton|Carl\s*Sagan|Columbia|David Grosof|J(ames|im)\s*Watson|(Lord\s*)?Martin\s*Rees|Massachusetts\s*Institute\s*of\s*Technology|MIT(\s*Media\s*Lab)?|Media\s*Lab|Minsky|((Noam|Valeria)\s*)?Chomsky|Praluent|Regeneron|(Richard\s*)?Dawkins|Sanofi|Stanford|(Stephen\s*)?Hawking|(Steven?\s*)?Pinker|UCLA',
503
+ pattern=r'Alain Forget|Brotherton|Carl\s*Sagan|Columbia|David Grosof|J(ames|im)\s*Watson|(Lord\s*)?Martin\s*Rees|Massachusetts\s*Institute\s*of\s*Technology|MIT(\s*Media\s*Lab)?|Media\s*Lab|Minsky|((Noam|Valeria)\s*)?Chomsky|Norman\s*Finkelstein|Praluent|Regeneron|(Richard\s*)?Dawkins|Sanofi|Stanford|(Stephen\s*)?Hawking|(Steven?\s*)?Pinker|UCLA',
489
504
  emailers = {
490
505
  DAVID_HAIG: None,
491
506
  JOSCHA_BACH: 'cognitive science / AI research',
492
507
  'Daniel Kahneman': 'Nobel economic sciences laureate and cognitivie psychologist (?)',
508
+ 'Ed Boyden': 'Associate Professor, MIT Media Lab neurobiology',
493
509
  LAWRENCE_KRAUSS: 'theoretical physicist',
494
510
  LINDA_STONE: 'ex-Microsoft, MIT Media Lab',
495
511
  MARK_TRAMO: 'professor of neurology at UCLA',
512
+ 'Nancy Dahl': f'wife of {LAWRENCE_KRAUSS}',
496
513
  NEAL_KASSELL: 'professor of neurosurgery at University of Virginia',
497
514
  PETER_ATTIA: 'longevity medicine',
498
515
  ROBERT_TRIVERS: 'evolutionary biology',
@@ -588,7 +605,7 @@ HIGHLIGHTED_NAMES = [
588
605
  HighlightedText(
589
606
  label='phone_number',
590
607
  style='bright_green',
591
- pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|\b[\d+]{10,12}\b",
608
+ pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|(\b|\+)[\d+]{10,12}\b",
592
609
  ),
593
610
  ]
594
611
 
@@ -648,18 +665,14 @@ def get_info_for_name(name: str) -> str | None:
648
665
 
649
666
 
650
667
  def get_style_for_category(category: str) -> str | None:
651
- if category in [CONFERENCE, SPEECH]:
668
+ if category in CATEGORY_STYLES:
669
+ return CATEGORY_STYLES[category]
670
+ elif category in [CONFERENCE, SPEECH]:
652
671
  return f"{get_style_for_category(ACADEMIA)} dim"
653
- elif category == JSON:
654
- return 'dark_red'
655
- elif category == JUNK:
656
- return 'grey19'
657
- elif category == 'letter':
658
- return 'medium_orchid1'
659
672
  elif category == SOCIAL:
660
- return f"{get_style_for_category(PUBLICIST)} dim"
673
+ return get_style_for_category(PUBLICIST)
661
674
 
662
- category = CATEGORY_LABEL_MAPPING.get(category, category)
675
+ category = CATEGORY_STYLE_MAPPING.get(category, category)
663
676
 
664
677
  for highlight_group in HIGHLIGHTED_NAMES:
665
678
  if highlight_group.label == category:
@@ -672,6 +685,10 @@ def get_style_for_name(name: str | None, default_style: str = DEFAULT, allow_bol
672
685
  return style if allow_bold else style.replace('bold', '').strip()
673
686
 
674
687
 
688
+ def styled_category(category: str) -> Text:
689
+ return Text(category, get_style_for_category(category) or 'wheat4')
690
+
691
+
675
692
  def _get_highlight_group_for_name(name: str) -> HighlightedNames | None:
676
693
  for highlight_group in HIGHLIGHTED_NAMES:
677
694
  if highlight_group.regex.search(name):
@@ -1,6 +1,5 @@
1
1
  import logging
2
2
  from os import environ
3
- from pathlib import Path
4
3
 
5
4
  import datefinder
6
5
  import rich_argparse_plus
@@ -10,7 +9,6 @@ from rich.logging import RichHandler
10
9
  from rich.theme import Theme
11
10
 
12
11
  from epstein_files.util.constant.strings import *
13
- from epstein_files.util.file_helper import file_size_str
14
12
 
15
13
  FILENAME_STYLE = 'gray27'
16
14
 
@@ -34,7 +32,7 @@ LOG_LEVEL_ENV_VAR = 'LOG_LEVEL'
34
32
  # Augment the standard log highlighter with 'epstein_filename' matcher
35
33
  class LogHighlighter(ReprHighlighter):
36
34
  highlights = ReprHighlighter.highlights + [
37
- *[fr"(?P<{doc_type}>{doc_type})" for doc_type in DOC_TYPE_STYLES.keys()],
35
+ *[fr"(?P<{doc_type}>{doc_type}(Cfg)?)" for doc_type in DOC_TYPE_STYLES.keys()],
38
36
  "(?P<epstein_filename>" + FILE_NAME_REGEX.pattern + ')',
39
37
  ]
40
38
 
@@ -60,7 +58,3 @@ if env_log_level_str:
60
58
 
61
59
  logger.warning(f"Setting log level to {env_log_level} based on {LOG_LEVEL_ENV_VAR} env var...")
62
60
  logger.setLevel(env_log_level)
63
-
64
-
65
- def log_file_write(file_path: str | Path) -> None:
66
- logger.warning(f"Wrote {file_size_str(file_path)} to '{file_path}'")
@@ -11,7 +11,8 @@ from epstein_files.util.constant.names import *
11
11
  from epstein_files.util.constant.output_files import JSON_FILES_JSON_PATH, JSON_METADATA_PATH
12
12
  from epstein_files.util.data import dict_sets_to_lists
13
13
  from epstein_files.util.env import args, specified_names
14
- from epstein_files.util.logging import log_file_write, logger
14
+ from epstein_files.util.file_helper import log_file_write
15
+ from epstein_files.util.logging import logger
15
16
  from epstein_files.util.rich import *
16
17
 
17
18
  PRINT_COLOR_KEY_EVERY_N_EMAILS = 150
@@ -60,7 +61,6 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
60
61
  """Returns number of emails printed."""
61
62
  print_section_header(('Selections from ' if not args.all_emails else '') + 'His Emails')
62
63
  print_other_site_link(is_header=False)
63
-
64
64
  emailers_to_print: list[str | None]
65
65
  emailer_tables: list[str | None] = []
66
66
  already_printed_emails: list[Email] = []
@@ -106,8 +106,8 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
106
106
  _verify_all_emails_were_printed(epstein_files, already_printed_emails)
107
107
 
108
108
  fwded_articles = [e for e in already_printed_emails if e.config and e.config.is_fwded_article]
109
- logger.warning(f"{len(fwded_articles)} of {len(already_printed_emails)} emails were forwarded articles.")
110
- logger.warning(f"Rewrote {len(Email.rewritten_header_ids)} headers of {len(epstein_files.emails)} emails.")
109
+ log_msg = f"Rewrote {len(Email.rewritten_header_ids)} email headers (out of {len(already_printed_emails)})"
110
+ logger.warning(f"{log_msg}, {len(fwded_articles)} of the emails were forwarded articles.")
111
111
  return len(already_printed_emails)
112
112
 
113
113
 
@@ -121,11 +121,11 @@ def print_json_files(epstein_files: EpsteinFiles):
121
121
  else:
122
122
  for json_file in epstein_files.json_files:
123
123
  console.line(2)
124
- console.print(json_file.description_panel())
124
+ console.print(json_file.summary_panel())
125
125
  console.print_json(json_file.json_str(), indent=4, sort_keys=False)
126
126
 
127
127
 
128
- def print_json_metadata(epstein_files: EpsteinFiles) -> None:
128
+ def write_json_metadata(epstein_files: EpsteinFiles) -> None:
129
129
  json_str = epstein_files.json_metadata()
130
130
 
131
131
  if args.build:
@@ -187,8 +187,13 @@ def write_urls() -> None:
187
187
  def _verify_all_emails_were_printed(epstein_files: EpsteinFiles, already_printed_emails: list[Email]) -> None:
188
188
  """Log warnings if some emails were never printed."""
189
189
  email_ids_that_were_printed = set([email.file_id for email in already_printed_emails])
190
- logger.warning(f"Printed {len(already_printed_emails)} emails of {len(email_ids_that_were_printed)} unique file IDs.")
190
+ logger.warning(f"Printed {len(already_printed_emails):,} emails of {len(email_ids_that_were_printed):,} unique file IDs.")
191
+ missed_an_email = False
191
192
 
192
193
  for email in epstein_files.emails:
193
- if email.file_id not in email_ids_that_were_printed and not email.is_duplicate:
194
+ if email.file_id not in email_ids_that_were_printed and not email.is_duplicate():
194
195
  logger.warning(f"Failed to print {email.summary()}")
196
+ missed_an_email = True
197
+
198
+ if not missed_an_email:
199
+ logger.warning(f"All {len(epstein_files.emails):,} emails printed at least once.")