epstein-files 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,15 +3,10 @@ Helpers for dealing with various kinds of data.
3
3
  """
4
4
  import itertools
5
5
  import re
6
- import time
7
- from dataclasses import dataclass, field
8
6
  from datetime import datetime, timezone
9
7
  from dateutil import tz
10
8
  from typing import TypeVar
11
9
 
12
- from dateutil.parser import parse
13
- from rich.text import Text
14
-
15
10
  from epstein_files.util.constant import names
16
11
  from epstein_files.util.env import args
17
12
  from epstein_files.util.logging import logger
@@ -24,27 +19,20 @@ CONSTANT_VAR_REGEX = re.compile(r"^[A-Z_]+$")
24
19
  ALL_NAMES = [v for k, v in vars(names).items() if isinstance(v, str) and CONSTANT_VAR_REGEX.match(k)]
25
20
 
26
21
  PACIFIC_TZ = tz.gettz("America/Los_Angeles")
27
- TIMEZONE_INFO = {"PST": PACIFIC_TZ, "PDT": PACIFIC_TZ} # Suppresses annoying warnings from parse() calls
28
-
29
-
30
- def dict_sets_to_lists(d: dict[str, set]) -> dict[str, list]:
31
- return {k: sorted(list(v)) for k, v in d.items()}
32
-
22
+ TIMEZONE_INFO = {"PDT": PACIFIC_TZ, "PST": PACIFIC_TZ} # Suppresses annoying warnings from parse() calls
33
23
 
34
- def extract_datetime(s: str) -> datetime | None:
35
- match = ISO_DATE_REGEX.search(s)
36
24
 
37
- if not match:
38
- return None
39
-
40
- date_str = match.group(0)
25
+ collapse_newlines = lambda text: MULTINEWLINE_REGEX.sub('\n\n', text)
26
+ date_str = lambda dt: dt.isoformat()[0:10] if dt else None
27
+ escape_double_quotes = lambda text: text.replace('"', r'\"')
28
+ escape_single_quotes = lambda text: text.replace("'", r"\'")
29
+ iso_timestamp = lambda dt: dt.isoformat().replace('T', ' ')
30
+ uniquify = lambda _list: list(set(_list))
31
+ without_falsey = lambda _list: [e for e in _list if e]
41
32
 
42
- if len(date_str) == 4:
43
- date_str += '-01-01'
44
- elif len(date_str) == 7:
45
- date_str += '-01'
46
33
 
47
- return parse(date_str, tzinfos=TIMEZONE_INFO)
34
def dict_sets_to_lists(d: dict[str, set]) -> dict[str, list]:
    """Return a new dict with the same keys as ``d`` whose set values are replaced by sorted lists."""
    # sorted() accepts any iterable and always returns a new list, so no intermediate list() copy is needed.
    return {key: sorted(members) for key, members in d.items()}
48
36
 
49
37
 
50
38
  def extract_last_name(name: str) -> str:
@@ -91,8 +79,8 @@ def ordinal_str(n: int) -> str:
91
79
  return str(n) + suffix
92
80
 
93
81
 
94
- def patternize(_pattern: str | re.Pattern):
95
- return _pattern if isinstance(_pattern, re.Pattern) else re.compile(rf"({_pattern})", re.IGNORECASE)
82
+ def patternize(_pattern: str | re.Pattern) -> re.Pattern:
83
+ return _pattern if isinstance(_pattern, re.Pattern) else re.compile(fr"({_pattern})", re.IGNORECASE)
96
84
 
97
85
 
98
86
  def remove_timezone(timestamp: datetime) -> datetime:
@@ -106,12 +94,3 @@ def remove_timezone(timestamp: datetime) -> datetime:
106
94
def sort_dict(d: dict[str | None, int] | dict[str, int]) -> list[tuple[str | None, int]]:
    """Return the items of ``d`` as a sorted list of (key, count) tuples.

    If ``args.sort_alphabetical`` is truthy the items are ordered by lowercased key (a None key
    sorts as ''). Otherwise they are ordered by descending count, with the lowercased key as
    the tie-breaker.
    """
    def sort_key(item):
        lowered_name = (item[0] or '').lower()

        if args.sort_alphabetical:
            return lowered_name

        # Negating the count makes the ascending sort put the largest counts first.
        return [-item[1], lowered_name]

    return sorted(d.items(), key=sort_key)
109
-
110
-
111
- collapse_newlines = lambda text: MULTINEWLINE_REGEX.sub('\n\n', text)
112
- date_str = lambda dt: dt.isoformat()[0:10] if dt else None
113
- escape_double_quotes = lambda text: text.replace('"', r'\"')
114
- escape_single_quotes = lambda text: text.replace("'", r"\'")
115
- iso_timestamp = lambda dt: dt.isoformat().replace('T', ' ')
116
- uniquify = lambda _list: list(set(_list))
117
- without_nones = lambda _list: [e for e in _list if e]
@@ -8,7 +8,7 @@ from dateutil.parser import parse
8
8
 
9
9
  from epstein_files.util.constant.names import *
10
10
  from epstein_files.util.constant.strings import *
11
- from epstein_files.util.data import without_nones
11
+ from epstein_files.util.data import without_falsey
12
12
 
13
13
  DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
14
14
  Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]
@@ -116,10 +116,12 @@ class DocCfg:
116
116
  return self.title_by_author()
117
117
  elif self.category == FINANCE and self.author in FINANCIAL_REPORTS_AUTHORS:
118
118
  return f"{self.author} report: '{self.description}'"
119
+ elif self.category == LEGAL and 'v.' in self.author:
120
+ return f"{self.author}: '{self.description}'"
119
121
  elif self.category and self.author is None and self.description is None:
120
122
  return self.category
121
123
 
122
- pieces = without_nones([self.author, self.description])
124
+ pieces = without_falsey([self.author, self.description])
123
125
  return ' '.join(pieces) if pieces else None
124
126
 
125
127
  def metadata(self) -> Metadata:
@@ -176,16 +178,6 @@ class DocCfg:
176
178
 
177
179
  return props
178
180
 
179
- def __eq__(self, other: 'DocCfg') -> bool:
180
- """Return True if everything matches other than the two 'dupe_' fields ('duplicate_ids' is compared)."""
181
- for _field in self.sorted_fields():
182
- if _field.name == 'id' or _field.name.startswith('dupe'):
183
- continue
184
- elif getattr(self, _field.name) != getattr(other, _field.name):
185
- return False
186
-
187
- return True
188
-
189
181
  def __repr__(self) -> str:
190
182
  props = self._props_strs()
191
183
  type_str = f"{type(self).__name__}("
@@ -231,6 +223,7 @@ class EmailCfg(CommunicationCfg):
231
223
  recipients (list[str | None]): Who received the email
232
224
  """
233
225
  actual_text: str | None = None # Override for the Email._actual_text() method for particularly broken emails
226
+ fwded_text_after: str | None = None # If set, any text after this is a fwd of an article or similar
234
227
  is_fwded_article: bool = False
235
228
  recipients: list[str | None] = field(default_factory=list)
236
229
 
@@ -242,7 +235,7 @@ class EmailCfg(CommunicationCfg):
242
235
  def from_doc_cfg(cls, cfg: DocCfg) -> 'EmailCfg':
243
236
  return cls(**asdict(cfg))
244
237
 
245
- # This is necessary for some dumb reason. @dataclass(repr=False) doesn't cut it
238
+ # This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
246
239
  def __repr__(self) -> str:
247
240
  return super().__repr__()
248
241
 
@@ -253,6 +246,6 @@ class TextCfg(CommunicationCfg):
253
246
  super().__post_init__()
254
247
  self.category = TEXT_MESSAGE
255
248
 
256
- # This is necessary for some dumb reason. @dataclass(repr=False) doesn't cut it
249
+ # This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
257
250
  def __repr__(self) -> str:
258
251
  return super().__repr__()
epstein_files/util/env.py CHANGED
@@ -6,16 +6,18 @@ from sys import argv
6
6
 
7
7
  from epstein_files.util.logging import datefinder_logger, env_log_level, logger
8
8
 
9
+ COUNT_WORDS_SCRIPT = 'count_words.py'
9
10
  DEFAULT_WIDTH = 154
10
- HTML_SCRIPTS = ['generate_html.py', 'count_words.py']
11
+ HTML_SCRIPTS = ['epstein_generate', 'generate_html.py', COUNT_WORDS_SCRIPT]
11
12
 
12
13
 
13
14
  parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML page.")
14
- parser.add_argument('--build', '-b', action='store_true', help='write HTML to docs/index.html')
15
+ parser.add_argument('--build', '-b', action='store_true', help='write output to file')
15
16
  parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
16
17
  parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just interesting ones')
17
18
  parser.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
18
19
  parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
20
+ parser.add_argument('--output-file', '-out', metavar='FILE', default='index.html', help='write output to FILE in docs/ (default=index.html)')
19
21
  parser.add_argument('--output-emails', '-oe', action='store_true', help='generate other files section')
20
22
  parser.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
21
23
  parser.add_argument('--output-texts', '-ot', action='store_true', help='generate other files section')
@@ -64,7 +66,7 @@ datefinder_logger.setLevel(logger.level)
64
66
 
65
67
  # Massage args that depend on other args to the appropriate state
66
68
  if not (args.json_metadata or args.output_texts or args.output_emails or args.output_other_files):
67
- if is_html_script:
69
+ if is_html_script and current_script != COUNT_WORDS_SCRIPT and not args.make_clean:
68
70
  logger.warning(f"No output section chosen; outputting default of texts, selected emails, and other files...")
69
71
 
70
72
  args.output_texts = True
@@ -8,7 +8,6 @@ from epstein_files.util.constant.strings import FILE_NAME_REGEX, FILE_STEM_REGEX
8
8
  EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
9
9
  DOCS_DIR_ENV = environ[EPSTEIN_DOCS_DIR_ENV_VAR_NAME]
10
10
  DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
11
- PICKLED_PATH = Path("the_epstein_files.pkl.gz")
12
11
 
13
12
  if not DOCS_DIR_ENV:
14
13
  print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!")
@@ -17,20 +16,7 @@ elif not DOCS_DIR.exists():
17
16
  print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!")
18
17
  exit(1)
19
18
 
20
- HTML_DIR = Path('docs')
21
19
  EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
22
- EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
23
- GH_PAGES_HTML_PATH = HTML_DIR.joinpath('index.html')
24
- JSON_METADATA_PATH = HTML_DIR.joinpath('epstein_files_nov_2025_cryptadamus_metadata.json')
25
- WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_emails_word_count.html')
26
-
27
- BUILD_ARTIFACTS = [
28
- EPSTEIN_WORD_COUNT_HTML_PATH,
29
- GH_PAGES_HTML_PATH,
30
- JSON_METADATA_PATH,
31
- WORD_COUNT_HTML_PATH,
32
- ]
33
-
34
20
  FILE_ID_REGEX = re.compile(fr".*{FILE_NAME_REGEX.pattern}")
35
21
  FILENAME_LENGTH = len(HOUSE_OVERSIGHT_PREFIX) + 6
36
22
  KB = 1024
@@ -110,11 +96,3 @@ def is_local_extract_file(filename) -> bool:
110
96
  """Return true if filename is of form 'HOUSE_OVERSIGHT_029835_1.txt'."""
111
97
  file_match = FILE_ID_REGEX.match(str(filename))
112
98
  return True if file_match and file_match.group(2) else False
113
-
114
-
115
- def make_clean() -> None:
116
- """Delete all build artifacts."""
117
- for build_file in BUILD_ARTIFACTS:
118
- if build_file.exists():
119
- print(f"Removing build file '{build_file}'...")
120
- build_file.unlink()
@@ -2,7 +2,6 @@ import re
2
2
  from dataclasses import dataclass, field
3
3
 
4
4
  from rich.highlighter import RegexHighlighter
5
- from rich.text import Text
6
5
 
7
6
  from epstein_files.util.constant.names import *
8
7
  from epstein_files.util.constant.strings import *
@@ -10,7 +9,7 @@ from epstein_files.util.constant.urls import ARCHIVE_LINK_COLOR
10
9
  from epstein_files.util.constants import (EMAILER_ID_REGEXES, EPSTEIN_V_ROTHSTEIN_EDWARDS, HEADER_ABBREVIATIONS,
11
10
  OSBORNE_LLP, REPLY_REGEX, SENT_FROM_REGEX, VIRGIN_ISLANDS)
12
11
  from epstein_files.util.doc_cfg import *
13
- from epstein_files.util.data import extract_last_name, listify
12
+ from epstein_files.util.data import extract_last_name, listify, without_falsey
14
13
 
15
14
  CIVIL_ATTORNEY = 'civil attorney'
16
15
  CRIMINAL_DEFENSE_ATTORNEY = 'criminal defense attorney'
@@ -48,7 +47,6 @@ class HighlightedText:
48
47
  label: str = ''
49
48
  pattern: str = ''
50
49
  style: str
51
- # Computed fields
52
50
  regex: re.Pattern = field(init=False)
53
51
  theme_style_name: str = field(init=False)
54
52
  _capture_group_label: str = field(init=False)
@@ -76,7 +74,7 @@ class HighlightedNames(HighlightedText):
76
74
  Attributes:
77
75
  category (str): optional string to use as an override for self.label in some contexts
78
76
  emailers (dict[str, str | None]): optional names to construct regexes for (values are descriptions)
79
- _pattern (str): complete regex pattern that combines 'pattern' with 'emailers'
77
+ _pattern (str): regex pattern combining 'pattern' with first & last names of all 'emailers'
80
78
  """
81
79
  category: str = ''
82
80
  emailers: dict[str, str | None] = field(default_factory=dict)
@@ -102,7 +100,7 @@ class HighlightedNames(HighlightedText):
102
100
  self.emailers.get(name),
103
101
  ]
104
102
 
105
- info_pieces = [p for p in info_pieces if p is not None]
103
+ info_pieces = without_falsey(info_pieces)
106
104
  return ', '.join(info_pieces) if info_pieces else None
107
105
 
108
106
  def _emailer_pattern(self, name: str) -> str:
@@ -114,10 +112,10 @@ class HighlightedNames(HighlightedText):
114
112
  if name in EMAILER_ID_REGEXES:
115
113
  pattern = EMAILER_ID_REGEXES[name].pattern
116
114
 
117
- # Include regex for last name
118
- # TODO: handle word boundary issue for names that end in symbols
119
- if SIMPLE_NAME_REGEX.match(last_name) and last_name.lower() not in NAMES_TO_NOT_HIGHLIGHT:
120
- pattern += fr"|{last_name}"
115
+ # Include regex for first and last names
116
+ for partial_name in [first_name, last_name]:
117
+ if SIMPLE_NAME_REGEX.match(partial_name) and partial_name.lower() not in NAMES_TO_NOT_HIGHLIGHT:
118
+ pattern += fr"|{partial_name}"
121
119
 
122
120
  return pattern
123
121
  elif ' ' not in name:
@@ -163,7 +161,7 @@ HIGHLIGHTED_NAMES = [
163
161
  ALIREZA_ITTIHADIEH: 'CEO Freestream Aircraft Limited',
164
162
  BARBRO_C_EHNBOM: 'Swedish pharmaceuticals',
165
163
  FRED_HADDAD: "co-founder of Heck's in West Virginia",
166
- GERALD_BARTON: "Maryland property developer, fan of Trump's Irish golf course",
164
+ GERALD_BARTON: "Maryland property developer Landmark Land Company, fan of Trump's Irish golf course",
167
165
  GORDON_GETTY: 'heir of oil tycoon J. Paul Getty',
168
166
  NICHOLAS_RIBIS: 'Hilton CEO, former president of Trump Organization',
169
167
  'Philip Kafka': 'president of Prince Concepts (and son of Terry Kafka?)',
@@ -272,7 +270,7 @@ HIGHLIGHTED_NAMES = [
272
270
  HighlightedNames(
273
271
  label='europe',
274
272
  style='light_sky_blue3',
275
- pattern=r'(Angela )?Merk(el|le)|Austria|(Benjamin\s*)?Harnwell|Berlin|Brexit(eers?)?|Brit(ain|ish)|Brussels|Cannes|(Caroline|Jack)?\s*Lang(, Caroline)?|Cypr(iot|us)|Davos|ECB|EU|Europe(an)?(\s*Union)?|France|Geneva|Germany?|Gillard|Gree(ce|k)|Ital(ian|y)|Jacques|(Kevin\s*)?Rudd|Le\s*Pen|London|Macron|Melusine|Munich|(Natalia\s*)?Veselnitskaya|(Nicholas\s*)?Sarkozy|Nigel(\s*Farage)?|Oslo|Paris|Polish|(Sebastian )?Kurz|(Vi(c|k)tor\s+)?Orbah?n|Edward Rod Larsen|Strasbourg|Strauss[- ]?Kahn|Swed(en|ish)(?![-\s]+America)|Switzerland|(Tony\s)?Blair|Ukrain(e|ian)|Vienna|(Vitaly\s*)?Churkin|Zug',
273
+ pattern=r'(Angela )?Merk(el|le)|Austria|(Benjamin\s*)?Harnwell|Berlin|Borge|Boris\s*Johnson|Brexit(eers?)?|Brit(ain|ish)|Brussels|Cannes|(Caroline|Jack)?\s*Lang(, Caroline)?|Cypr(iot|us)|Davos|ECB|England|EU|Europe(an)?(\s*Union)?|Fr(ance|ench)|Geneva|Germany?|Gillard|Gree(ce|k)|Ital(ian|y)|Jacques|(Kevin\s*)?Rudd|Le\s*Pen|London|Macron|Melusine|Munich|(Natalia\s*)?Veselnitskaya|(Nicholas\s*)?Sarkozy|Nigel(\s*Farage)?|Norw(ay|egian)|Oslo|Paris|Polish|(Sebastian )?Kurz|(Vi(c|k)tor\s+)?Orbah?n|Edward Rod Larsen|Strasbourg|Strauss[- ]?Kahn|Swed(en|ish)(?![-\s]+America)|Switzerland|(Tony\s)?Blair|Ukrain(e|ian)|Vienna|(Vitaly\s*)?Churkin|Zug',
276
274
  emailers = {
277
275
  ANDRZEJ_DUDA: 'former president of Poland',
278
276
  MIROSLAV_LAJCAK: 'Russia-friendly Slovakian politician, friend of Steve Bannon',
@@ -306,7 +304,7 @@ HIGHLIGHTED_NAMES = [
306
304
  HighlightedNames(
307
305
  label='finance',
308
306
  style='green',
309
- pattern=r'Apollo|Ari\s*Glass|(Bernie\s*)?Madoff|Black(rock|stone)|BofA|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
307
+ pattern=r'Apollo|Ari\s*Glass|(Bernie\s*)?Madoff|Black(rock|stone)|BofA|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
310
308
  emailers={
311
309
  AMANDA_ENS: 'Citigroup',
312
310
  DANIEL_SABBA: 'UBS Investment Bank',
@@ -342,7 +340,7 @@ HIGHLIGHTED_NAMES = [
342
340
  emailers = {
343
341
  ANIL_AMBANI: 'chairman of Reliance Group',
344
342
  VINIT_SAHNI: None,
345
- ZUBAIR_KHAN: 'Tranchulas CEO, InsightsPod founder',
343
+ ZUBAIR_KHAN: 'cybersecurity firm Tranchulas CEO, InsightsPod founder, based in Islamabad and Dubai',
346
344
  }
347
345
  ),
348
346
  HighlightedNames(
@@ -391,7 +389,7 @@ HIGHLIGHTED_NAMES = [
391
389
  HighlightedNames(
392
390
  label='law enforcement',
393
391
  style='color(24) bold',
394
- pattern=r'ag|(Alicia\s*)?Valle|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC|CIA|CIS|CVRA|Dep(artmen)?t\.?\s*of\s*(the\s*)?(Justice|Treasury)|DHS|DOJ|FBI|FCPA|FDIC|Federal\s*Bureau\s*of\s*Investigation|FinCEN|FINRA|FOIA|FTC|IRS|(James\s*)?Comey|(Jennifer\s*Shasky\s*)?Calvery|((Judge|Mark)\s*)?(Carney|Filip)|(Kirk )?Blouin|KYC|NIH|NS(A|C)|OCC|OFAC|(Lann?a\s*)?Belohlavek|(Michael\s*)?Reiter|OGE|Office\s*of\s*Government\s*Ethics|Police Code Enforcement|(Preet\s*)?Bharara|SCOTUS|SD(FL|NY)|Southern\s*District\s*of\s*(Florida|New\s*York)|SEC|Securities\s*and\s*Exchange\s*Commission|State\s*Dep(artmen)?t|Strzok|Supreme\s*Court|Treasury\s*(Dep(artmen)?t|Secretary)|TSA|USAID|(William\s*J\.?\s*)?Zloch',
392
+ pattern=r'ag|(Alicia\s*)?Valle|attorney|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC|CIA|CIS|CVRA|Dep(artmen)?t\.?\s*of\s*(the\s*)?(Justice|Treasury)|DHS|DOJ|FBI|FCPA|FDIC|Federal\s*Bureau\s*of\s*Investigation|FinCEN|FINRA|FOIA|FTC|IRS|(James\s*)?Comey|(Jennifer\s*Shasky\s*)?Calvery|((Judge|Mark)\s*)?(Carney|Filip)|(Kirk )?Blouin|KYC|NIH|NS(A|C)|OCC|OFAC|(Lann?a\s*)?Belohlavek|lawyer|(Michael\s*)?Reiter|OGE|Office\s*of\s*Government\s*Ethics|Police Code Enforcement|(Preet\s*)?Bharara|SCOTUS|SD(FL|NY)|Southern\s*District\s*of\s*(Florida|New\s*York)|SEC|Securities\s*and\s*Exchange\s*Commission|State\s*Dep(artmen)?t|Strzok|Supreme\s*Court|Treasury\s*(Dep(artmen)?t|Secretary)|TSA|USAID|(William\s*J\.?\s*)?Zloch',
395
393
  emailers = {
396
394
  ANN_MARIE_VILLAFANA: 'southern district of Florida U.S. Attorney',
397
395
  DANNY_FROST: 'Director of Communications at Manhattan DA',
@@ -426,7 +424,7 @@ HIGHLIGHTED_NAMES = [
426
424
  HighlightedNames(
427
425
  label='modeling',
428
426
  style='pale_violet_red1',
429
- pattern=r'\w+@mc2mm.com|(Nicole\s*)?Junkerman',
427
+ pattern=r'\w+@mc2mm.com|model(ed|ing)|(Nicole\s*)?Junkerman',
430
428
  emailers = {
431
429
  'Abi Schwinck': 'MC2 Model Management (?)',
432
430
  DANIEL_SIAD: None,
@@ -458,7 +456,8 @@ HIGHLIGHTED_NAMES = [
458
456
  HighlightedNames(
459
457
  label='republicans',
460
458
  style='bold dark_red',
461
- pattern=r'Alberto\sGonzale[sz]|(Alex\s*)?Acosta|(Bill\s*)?Barr|Bill\s*Shine|(Bob\s*)?Corker|(John\s*(R.?\s*)?)Bolton|Broidy|(Chris\s)?Christie|Devin\s*Nunes|(Don\s*)?McGa[hn]n|McMaster|(George\s*)?Nader|GOP|(Brett\s*)?Kavanaugh|Kissinger|Kobach|Koch\s*Brothers|Kolfage|Kudlow|Lewandowski|(Marco\s)?Rubio|(Mark\s*)Meadows|Mattis|(?<!Merwin Dela )Cruz|(Michael\s)?Hayden|((General|Mike)\s*)?(Flynn|Pence)|(Mitt\s*)?Romney|Mnuchin|Nikki|Haley|(Paul\s+)?Manafort|(Peter\s)?Navarro|Pompeo|Reagan|Republican|(?<!Cynthia )(Richard\s*)?Nixon|Sasse|(Rex\s*)?Tillerson',
459
+ pattern=r'Alberto\sGonzale[sz]|(Alex\s*)?Acosta|(Bill\s*)?Barr|Bill\s*Shine|(Bob\s*)?Corker|(John\s*(R.?\s*)?)Bolton|Broidy|(Chris\s)?Christie|Devin\s*Nunes|(Don\s*)?McGa[hn]n|McMaster|(George\s*)?Nader|GOP|(Brett\s*)?Kavanaugh|Kissinger|Kobach|Koch\s*Brothers|Kolfage|Kudlow|Lewandowski|(Marco\s)?Rubio|(Mark\s*)Meadows|Mattis|(?<!Merwin Dela )Cruz|(Michael\s)?Hayden|((General|Mike)\s*)?(Flynn|Pence)|(Mitt\s*)?Romney|Mnuchin|Nikki|Haley|(Paul\s+)?Manafort|(Peter\s)?Navarro|Pompeo|Reagan|Reince|Priebus|Republican|(?<!Cynthia )(Richard\s*)?Nixon|Sasse|(Rex\s*)?Tillerson',
460
+ # There are no emails from these people; they're only listed here to automate creating the regexes for both first + last names
462
461
  emailers = {
463
462
  RUDY_GIULIANI: 'disbarred formed mayor of New York City',
464
463
  TULSI_GABBARD: None,
@@ -475,7 +474,7 @@ HIGHLIGHTED_NAMES = [
475
474
  HighlightedNames(
476
475
  label='russia',
477
476
  style='red bold',
478
- pattern=r'Alfa\s*Bank|Anya\s*Rasulova|Chernobyl|Day\s+One\s+Ventures|(Dmitry\s)?(Kiselyov|(Lana\s*)?Pozhidaeva|Medvedev|Rybolo(o?l?ev|vlev))|Dmitry|FSB|GRU|KGB|Kislyak|Kremlin|Kuznetsova|Lavrov|Lukoil|Moscow|(Oleg\s*)?Deripaska|Oleksandr Vilkul|Rosneft|RT|St.?\s*?Petersburg|Russian?|Sberbank|Soviet(\s*Union)?|USSR|(Vladimir\s*)?(Putin|Yudashkin)|Women\s*Empowerment|Xitrans',
477
+ pattern=r'Alfa\s*Bank|Anya\s*Rasulova|Chernobyl|Day\s+One\s+Ventures|(Dmitry\s)?(Kiselyov|(Lana\s*)?Pozhidaeva|Medvedev|Rybolo(o?l?ev|vlev))|Dmitry|FSB|GRU|KGB|Kislyak|Kremlin|Kuznetsova|Lavrov|Lukoil|Moscow|(Oleg\s*)?Deripaska|Oleksandr Vilkul|Rosneft|RT|St.?\s*?Petersburg|Russian?|Sberbank|Soviet(\s*Union)?|USSR|Vladimir|(Vladimir\s*)?(Putin|Yudashkin)|Women\s*Empowerment|Xitrans',
479
478
  emailers = {
480
479
  MASHA_DROKOVA: 'silicon valley VC, former Putin Youth',
481
480
  RENATA_BOLOTOVA: 'former aspiring model, now fund manager at New York State Insurance Fund',
@@ -519,7 +518,7 @@ HIGHLIGHTED_NAMES = [
519
518
  HighlightedNames(
520
519
  label='trump',
521
520
  style='red3 bold',
522
- pattern=r"@?realDonaldTrump|(Alan\s*)?Weiss?elberg|\bDJ?T\b|Donald J. Tramp|(Donald\s+(J\.\s+)?)?Trump(ism|\s*Properties)?|Don(ald| *Jr)(?! Rubin)|Ivana|(Madeleine\s*)?Westerhout|Mar[-\s]*a[-\s]*Lago|(Marla\s*)?Maples|(Matt(hew)? )?Calamari|\bMatt C\b|Melania|(Michael (J.? )?)?Boccio|Roger\s+Stone|rona|(The\s*)?Art\s*of\s*the\s*Deal",
521
+ pattern=r"@?realDonaldTrump|(Alan\s*)?Weiss?elberg|\bDJ?T\b|Donald J. Tramp|(Donald\s+(J\.\s+)?)?Trump(ism|\s*Properties)?|Don(ald| *Jr)(?! Rubin)|Ivana|(Madeleine\s*)?Westerhout|Mar[-\s]*a[-\s]*Lago|(Marla\s*)?Maples|(Matt(hew)? )?Calamari|\bMatt C\b|Melania|(Michael (J.? )?)?Boccio|Rebekah\s*Mercer|Roger\s+Stone|rona|(The\s*)?Art\s*of\s*the\s*Deal",
523
522
  emailers = {
524
523
  'Bruce Moskowitz': "'Trump's health guy' according to Epstein",
525
524
  },
@@ -541,7 +540,7 @@ HIGHLIGHTED_NAMES = [
541
540
  HighlightedNames(
542
541
  label=VIRGIN_ISLANDS,
543
542
  style='sea_green1',
544
- pattern=r'Bahamas|Caribb?ean|Dominican\s*Republic|(Great|Little)\s*St.?\s*James|Haiti(an)?|(John\s*)deJongh(\s*Jr\.?)|(Kenneth E\. )?Mapp|Palm\s*Beach(?!\s*Post)|PBI|S(ain)?t.?\s*Thomas|USVI|VI|(The\s*)?Virgin\s*Islands(\s*Daily\s*News)?', # TODO: VI Daily News should be yellow but it's hard bc Daily News xists
543
+ pattern=r'Antigua|Bahamas|Caribb?ean|Dominican\s*Republic|(Great|Little)\s*St.?\s*James|Haiti(an)?|(John\s*)deJongh(\s*Jr\.?)|(Kenneth E\. )?Mapp|Palm\s*Beach(?!\s*Post)|PBI|S(ain)?t.?\s*Thomas|USVI|VI|(The\s*)?Virgin\s*Islands(\s*Daily\s*News)?', # TODO: VI Daily News should be yellow but it's hard bc Daily News xists
545
544
  emailers = {
546
545
  CECILE_DE_JONGH: f'First lady 2007-2015',
547
546
  STACEY_PLASKETT: 'non-voting member of Congress',
@@ -561,7 +560,7 @@ HIGHLIGHTED_NAMES = [
561
560
  HighlightedNames(
562
561
  label=STEVE_BANNON,
563
562
  style='color(58)',
564
- pattern=r'((Steve|Sean)\s*)?Bannon?',
563
+ pattern=r'((Steve|Sean)\s*)?Bannon?|(American\s*)?Dharma',
565
564
  ),
566
565
  HighlightedNames(
567
566
  emailers={STEVEN_HOFFENBERG: HEADER_ABBREVIATIONS['Hoffenberg']},
@@ -578,7 +577,18 @@ HIGHLIGHTED_NAMES = [
578
577
  HighlightedNames(emailers={PRINCE_ANDREW: 'British royal family'}, style='dodger_blue1'),
579
578
  HighlightedNames(emailers={SOON_YI_PREVIN: "wife of Woody Allen"}, style='hot_pink'),
580
579
  HighlightedNames(emailers={SULTAN_BIN_SULAYEM: 'CEO of DP World, chairman of ports in Dubai'}, style='green1'),
581
- HighlightedText(label='unknown', style='cyan', pattern=r'\(unknown\)'), # HighlightedText bc of word boundary issue
580
+
581
+ # HighlightedText not HighlightedNames bc of word boundary issue
582
+ HighlightedText(
583
+ label='unknown',
584
+ style='cyan',
585
+ pattern=r'\(unknown\)'
586
+ ),
587
+ HighlightedText(
588
+ label='phone_number',
589
+ style='bright_green',
590
+ pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|[\d+]{10,12}",
591
+ ),
582
592
  ]
583
593
 
584
594
  # Highlight regexes for things other than names, only used by RegexHighlighter pattern matching
@@ -593,11 +603,6 @@ HIGHLIGHTED_TEXTS = [
593
603
  style=f'{ARCHIVE_LINK_COLOR} underline',
594
604
  pattern=r"https?:[^\s]+",
595
605
  ),
596
- HighlightedText(
597
- label='phone_number',
598
- style='bright_green',
599
- pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|[\d+]{10,12}",
600
- ),
601
606
  HighlightedText(
602
607
  label='quoted_reply_line',
603
608
  style='dim',
@@ -1,5 +1,6 @@
1
1
  import logging
2
2
  from os import environ
3
+ from pathlib import Path
3
4
 
4
5
  from rich.console import Console
5
6
  from rich.highlighter import ReprHighlighter
@@ -7,6 +8,7 @@ from rich.logging import RichHandler
7
8
  from rich.theme import Theme
8
9
 
9
10
  from epstein_files.util.constant.strings import *
11
+ from epstein_files.util.file_helper import file_size_str
10
12
 
11
13
  FILENAME_STYLE = 'gray27'
12
14
 
@@ -27,6 +29,7 @@ LOG_THEME[f"{ReprHighlighter.base_style}epstein_filename"] = FILENAME_STYLE
27
29
  LOG_LEVEL_ENV_VAR = 'LOG_LEVEL'
28
30
 
29
31
 
32
+ # Augment the standard log highlighter with 'epstein_filename' matcher
30
33
  class LogHighlighter(ReprHighlighter):
31
34
  highlights = ReprHighlighter.highlights + [
32
35
  *[fr"(?P<{doc_type}>{doc_type})" for doc_type in DOC_TYPE_STYLES.keys()],
@@ -55,3 +58,7 @@ if env_log_level_str:
55
58
  logger.warning(f"Setting log level to {env_log_level} based on {LOG_LEVEL_ENV_VAR} env var...")
56
59
  logger.setLevel(env_log_level)
57
60
  datefinder_logger.setLevel(env_log_level)
61
+
62
+
63
def log_file_write(file_path: str | Path) -> None:
    """Log a warning-level "Wrote <size> to '<path>'" message, with <size> formatted by file_size_str()."""
    size_description = file_size_str(file_path)
    logger.warning(f"Wrote {size_description} to '{file_path}'")
@@ -0,0 +1,179 @@
1
+ from rich.padding import Padding
2
+
3
+ from epstein_files.documents.email import Email
4
+ from epstein_files.documents.messenger_log import MessengerLog
5
+ from epstein_files.epstein_files import EpsteinFiles, count_by_month
6
+ from epstein_files.util.constant.output_files import JSON_METADATA_PATH
7
+ from epstein_files.util.constant import urls
8
+ from epstein_files.util.constant.html import *
9
+ from epstein_files.util.constant.names import *
10
+ from epstein_files.util.constant.strings import EMAIL_CLASS, MESSENGER_LOG_CLASS
11
+ from epstein_files.util.data import dict_sets_to_lists
12
+ from epstein_files.util.env import args, specified_names
13
+ from epstein_files.util.logging import log_file_write, logger
14
+ from epstein_files.util.rich import *
15
+
16
# print_emails() re-prints the color key once more than this many emails have been printed since the last key.
PRINT_COLOR_KEY_EVERY_N_EMAILS = 150

# Order matters. Default names to print emails for.
DEFAULT_EMAILERS: list[str | None] = [
    JEREMY_RUBIN,
    AL_SECKEL,
    JOI_ITO,
    JABOR_Y,
    STEVEN_SINOFSKY,
    DANIEL_SIAD,
    JEAN_LUC_BRUNEL,
    STEVEN_HOFFENBERG,
    EHUD_BARAK,
    MARTIN_NOWAK,
    MASHA_DROKOVA,
    RENATA_BOLOTOVA,
    STEVE_BANNON,
    OLIVIER_COLOM,
    BORIS_NIKOLIC,
    PRINCE_ANDREW,
    JIDE_ZEITLIN,
    DAVID_STERN,
    MOHAMED_WAHEED_HASSAN,
    JENNIFER_JACQUET,
    TYLER_SHEARS,
    CHRISTINA_GALBRAITH,
    None,  # NOTE(review): presumably stands for emails whose author is unknown - confirm
]

# Order matters. Default names to print tables w/email subject, timestamp, etc for.  # TODO: get rid of this ?
DEFAULT_EMAILER_TABLES: list[str | None] = [
    GHISLAINE_MAXWELL,
    LEON_BLACK,
    SULTAN_BIN_SULAYEM,
    DEEPAK_CHOPRA,
    ARIANE_DE_ROTHSCHILD,
]

# Fail fast at import time if any name is configured in both lists, and say which ones.
_names_in_both_lists = set(DEFAULT_EMAILERS) & set(DEFAULT_EMAILER_TABLES)

if _names_in_both_lists:
    raise RuntimeError(
        "Some names appear in both DEFAULT_EMAILERS and DEFAULT_EMAILER_TABLES: "
        f"{sorted(_names_in_both_lists, key=str)}"  # key=str so a None entry can be sorted with the strings
    )
56
+
57
+
58
def print_emails(epstein_files: EpsteinFiles) -> int:
    """Print the emails section and return the number of emails printed.

    Which emailers are printed depends on the command line:
      * names given by the user (``specified_names``): only those names;
      * ``args.all_emails``: every emailer, ordered by ``epstein_files.earliest_email_at()``;
      * otherwise: DEFAULT_EMAILERS in full, followed by tables for DEFAULT_EMAILER_TABLES.

    Args:
        epstein_files: the loaded collection whose emails are printed.

    Returns:
        Total length of the email lists returned by ``epstein_files.print_emails_for()``.
    """
    print_section_header(('Selections from ' if not args.all_emails else '') + 'His Emails')
    print_other_site_link(is_header=False)

    emailers_to_print: list[str | None]
    emailer_tables: list[str | None] = []
    already_printed_emails: list[Email] = []
    num_emails_printed_since_last_color_key = 0

    if specified_names:
        emailers_to_print = specified_names
    else:
        epstein_files.print_emailer_counts_table()

        if args.all_emails:
            emailers_to_print = sorted(epstein_files.all_emailers(), key=epstein_files.earliest_email_at)
            console.print('Email conversations are sorted chronologically based on time of the first email.')
            print_numbered_list_of_emailers(emailers_to_print, epstein_files)
        else:
            emailers_to_print = DEFAULT_EMAILERS
            emailer_tables = DEFAULT_EMAILER_TABLES
            console.print('Email conversations grouped by counterparty can be found in the order listed below.')
            print_numbered_list_of_emailers(emailers_to_print)
            console.print("\nAfter that there's tables linking to (but not displaying) all known emails for each of these people:")
            print_numbered_list_of_emailers(emailer_tables)

    for author in emailers_to_print:
        author_emails = epstein_files.print_emails_for(author)
        already_printed_emails.extend(author_emails)
        num_emails_printed_since_last_color_key += len(author_emails)

        # Print color key every once in a while
        if num_emails_printed_since_last_color_key > PRINT_COLOR_KEY_EVERY_N_EMAILS:
            print_color_key()
            num_emails_printed_since_last_color_key = 0

    if emailer_tables:
        print_author_header(f"Email Tables for {len(emailer_tables)} Other People", 'white')

        # Iterate the list selected above, not the module constant, so the tables printed
        # always match the count announced in the header.
        for name in emailer_tables:
            epstein_files.print_emails_table_for(name)

    if not specified_names:
        epstein_files.print_email_device_info()

    if args.all_emails:
        _verify_all_emails_were_printed(epstein_files, already_printed_emails)

    logger.warning(f"Rewrote {len(Email.rewritten_header_ids)} headers of {len(epstein_files.emails)} emails")
    return len(already_printed_emails)
109
+
110
+
111
def print_json_metadata(epstein_files: EpsteinFiles) -> None:
    """Output the JSON string returned by ``epstein_files.json_metadata()``.

    With ``args.build`` set, the JSON is written to JSON_METADATA_PATH and the write is logged.
    Otherwise it is pretty-printed to the console (4-space indent, sorted keys).
    """
    json_str = epstein_files.json_metadata()

    if not args.build:
        console.print_json(json_str, indent=4, sort_keys=True)
        return

    # Explicit encoding: the platform default text encoding is not UTF-8 everywhere.
    with open(JSON_METADATA_PATH, 'w', encoding='utf-8') as f:
        f.write(json_str)

    # Log only after the `with` block has closed (and therefore flushed) the file.
    # NOTE(review): presumably log_file_write() reports the on-disk size - confirm.
    log_file_write(JSON_METADATA_PATH)
120
+
121
+
122
def print_json_stats(epstein_files: EpsteinFiles) -> None:
    """Print a 'JSON Stats Dump' panel followed by a series of labeled JSON blocks of statistics.

    Each block is emitted with print_json(); the count dictionaries are passed ``skip_falsey=True``.
    Set-valued dicts are converted with dict_sets_to_lists() before printing.
    """
    console.line(5)
    console.print(Panel('JSON Stats Dump', expand=True, style='reverse bold'), '\n')
    print_json(f"{MESSENGER_LOG_CLASS} Sender Counts", MessengerLog.count_authors(epstein_files.imessage_logs), skip_falsey=True)
    print_json(f"{EMAIL_CLASS} Author Counts", epstein_files.email_author_counts, skip_falsey=True)
    print_json(f"{EMAIL_CLASS} Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
    # Label previously read "signature_substitution_countss" (doubled trailing 's').
    print_json("Email signature_substitution_counts", epstein_files.email_signature_substitution_counts(), skip_falsey=True)
    print_json("email_author_device_signatures", dict_sets_to_lists(epstein_files.email_authors_to_device_signatures))
    print_json("email_sent_from_devices", dict_sets_to_lists(epstein_files.email_device_signatures_to_authors))
    print_json("email_unknown_recipient_file_ids", epstein_files.email_unknown_recipient_file_ids())
    print_json("count_by_month", count_by_month(epstein_files.all_documents()))
133
+
134
+
135
def print_text_messages(epstein_files: EpsteinFiles) -> None:
    """Print the 'Text Messages' section.

    Prints every log returned by ``epstein_files.imessage_logs_for()`` for the user-specified
    names (or for JEFFREY_EPSTEIN when none were specified), then the iMessage summary.
    """
    print_section_header('Text Messages')
    print_centered("(conversations are sorted chronologically based on timestamp of first message)\n", style='gray30')
    authors: list[str | None] = specified_names or [JEFFREY_EPSTEIN]

    for message_log in epstein_files.imessage_logs_for(authors):
        console.print(Padding(message_log))
        console.line(2)

    epstein_files.print_imessage_summary()
146
+
147
+
148
def write_urls() -> None:
    """Write _URL style constant variables to a file bash scripts can load as env vars.

    Only string attributes of the ``urls`` module are written whose name ends in the '_'-separated
    token 'URL', whose name does not contain 'BASE', and whose value contains 'github.io'.
    Each is written as a ``NAME='value'`` line and, unless ``args.suppress_output`` is set,
    echoed to the console.
    """
    # An output file of 'index.html' is not used for env vars; args.output_file is redirected to URLS_ENV.
    if args.output_file == 'index.html':
        logger.warning(f"Can't write env vars to '{args.output_file}', writing to '{URLS_ENV}' instead.\n")
        args.output_file = URLS_ENV

    url_vars = {}

    for name, value in vars(urls).items():
        if not isinstance(value, str):
            continue

        if name.split('_')[-1] in ['URL'] and 'github.io' in value and 'BASE' not in name:
            url_vars[name] = value

    echo_to_console = not args.suppress_output

    with open(args.output_file, 'w') as f:
        for var_name, url in url_vars.items():
            key_value = f"{var_name}='{url}'"

            if echo_to_console:
                console.print(key_value, style='dim')

            f.write(key_value + "\n")

    console.line()
    logger.warning(f"Wrote {len(url_vars)} URL variables to '{args.output_file}'\n")
170
+
171
+
172
def _verify_all_emails_were_printed(epstein_files: EpsteinFiles, already_printed_emails: list[Email]) -> None:
    """Log a warning for each email in ``epstein_files.emails`` that was never printed.

    An email counts as printed if its ``file_id`` matches one in ``already_printed_emails``.
    Emails flagged ``is_duplicate`` are not reported.
    """
    printed_ids = {printed.file_id for printed in already_printed_emails}
    logger.warning(f"Printed {len(already_printed_emails)} emails of {len(printed_ids)} unique file IDs.")

    for email in epstein_files.emails:
        if email.file_id in printed_ids or email.is_duplicate:
            continue

        logger.warning(f"Failed to print {email.summary()}")