epstein-files 1.0.9__py3-none-any.whl → 1.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,20 +1,9 @@
1
1
  import re
2
- from os import environ
3
2
  from pathlib import Path
4
- from sys import exit
5
3
 
6
4
  from epstein_files.util.constant.strings import FILE_NAME_REGEX, FILE_STEM_REGEX, HOUSE_OVERSIGHT_PREFIX
7
-
8
- EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
9
- DOCS_DIR_ENV = environ[EPSTEIN_DOCS_DIR_ENV_VAR_NAME]
10
- DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
11
-
12
- if not DOCS_DIR_ENV:
13
- print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!")
14
- exit(1)
15
- elif not DOCS_DIR.exists():
16
- print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!")
17
- exit(1)
5
+ from epstein_files.util.env import DOCS_DIR
6
+ from epstein_files.util.logging import logger
18
7
 
19
8
  EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
20
9
  FILE_ID_REGEX = re.compile(fr".*{FILE_NAME_REGEX.pattern}")
@@ -23,9 +12,10 @@ KB = 1024
23
12
  MB = KB * KB
24
13
 
25
14
 
26
- # Handles both string and int 'id' args.
15
+ # Coerce methods hands both string and int arguments.
16
+ coerce_file_name = lambda filename_or_id: coerce_file_stem(filename_or_id) + '.txt'
17
+ coerce_file_path = lambda filename_or_id: DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
27
18
  id_str = lambda id: f"{int(id):06d}"
28
- filename_for_id = lambda id: file_stem_for_id(id) + '.txt'
29
19
 
30
20
 
31
21
  def coerce_file_stem(filename_or_id: int | str) -> str:
@@ -42,14 +32,6 @@ def coerce_file_stem(filename_or_id: int | str) -> str:
42
32
  return file_stem
43
33
 
44
34
 
45
- def coerce_file_name(filename_or_id: int | str) -> str:
46
- return coerce_file_stem(filename_or_id) + '.txt'
47
-
48
-
49
- def coerce_file_path(filename_or_id: int | str) -> Path:
50
- return DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
51
-
52
-
53
35
  def extract_file_id(filename_or_id: int | str | Path) -> str:
54
36
  if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
55
37
  return id_str(filename_or_id)
@@ -67,7 +49,10 @@ def file_size(file_path: str | Path) -> int:
67
49
 
68
50
 
69
51
  def file_size_str(file_path: str | Path) -> str:
70
- size = file_size(file_path)
52
+ return file_size_to_str(file_size(file_path))
53
+
54
+
55
+ def file_size_to_str(size: int) -> str:
71
56
  digits = 2
72
57
 
73
58
  if size > MB:
@@ -96,3 +81,7 @@ def is_local_extract_file(filename) -> bool:
96
81
  """Return true if filename is of form 'HOUSE_OVERSIGHT_029835_1.txt'."""
97
82
  file_match = FILE_ID_REGEX.match(str(filename))
98
83
  return True if file_match and file_match.group(2) else False
84
+
85
+
86
+ def log_file_write(file_path: str | Path) -> None:
87
+ logger.warning(f"Wrote {file_size_str(file_path)} to '{file_path}'")
@@ -2,6 +2,7 @@ import re
2
2
  from dataclasses import dataclass, field
3
3
 
4
4
  from rich.highlighter import RegexHighlighter
5
+ from rich.text import Text
5
6
 
6
7
  from epstein_files.util.constant.names import *
7
8
  from epstein_files.util.constant.strings import *
@@ -21,7 +22,7 @@ EPSTEIN_ESTATE_EXECUTOR = f"Epstein {ESTATE_EXECUTOR}"
21
22
  REGEX_STYLE_PREFIX = 'regex'
22
23
  SIMPLE_NAME_REGEX = re.compile(r"^[-\w ]+$", re.IGNORECASE)
23
24
 
24
- CATEGORY_LABEL_MAPPING = {
25
+ CATEGORY_STYLE_MAPPING = {
25
26
  ARTICLE: JOURNALIST,
26
27
  ARTS: ENTERTAINER,
27
28
  BOOK: JOURNALIST,
@@ -31,6 +32,12 @@ CATEGORY_LABEL_MAPPING = {
31
32
  REPUTATION: PUBLICIST,
32
33
  }
33
34
 
35
+ CATEGORY_STYLES = {
36
+ JSON: 'dark_red',
37
+ JUNK: 'grey19',
38
+ 'letter': 'medium_orchid1'
39
+ }
40
+
34
41
 
35
42
  @dataclass(kw_only=True)
36
43
  class HighlightedText:
@@ -156,7 +163,7 @@ HIGHLIGHTED_NAMES = [
156
163
  HighlightedNames(
157
164
  label=BUSINESS,
158
165
  style='spring_green4',
159
- pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
166
+ pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|New Leaf Ventures|Park Partners|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
160
167
  emailers = {
161
168
  ALIREZA_ITTIHADIEH: 'CEO Freestream Aircraft Limited',
162
169
  BARBRO_C_EHNBOM: 'Swedish pharmaceuticals, SALSS',
@@ -270,7 +277,7 @@ HIGHLIGHTED_NAMES = [
270
277
  HighlightedNames(
271
278
  label='europe',
272
279
  style='light_sky_blue3',
273
- pattern=r'(Angela )?Merk(el|le)|Austria|(Benjamin\s*)?Harnwell|Berlin|Borge|Boris\s*Johnson|Brexit(eers?)?|Brit(ain|ish)|Brussels|Cannes|(Caroline|Jack)?\s*Lang(, Caroline)?|Cypr(iot|us)|Davos|ECB|England|EU|Europe(an)?(\s*Union)?|Fr(ance|ench)|Geneva|Germany?|Gillard|Gree(ce|k)|Ital(ian|y)|Jacques|(Kevin\s*)?Rudd|Le\s*Pen|London|Macron|Melusine|Munich|(Natalia\s*)?Veselnitskaya|(Nicholas\s*)?Sarkozy|Nigel(\s*Farage)?|Norw(ay|egian)|Oslo|Paris|Polish|(Sebastian )?Kurz|(Vi(c|k)tor\s+)?Orbah?n|Edward Rod Larsen|Strasbourg|Strauss[- ]?Kahn|Swed(en|ish)(?![-\s]+America)|Switzerland|(Tony\s)?Blair|Ukrain(e|ian)|Vienna|(Vitaly\s*)?Churkin|Zug',
280
+ pattern=r'(Angela )?Merk(el|le)|Austria|(Benjamin\s*)?Harnwell|Berlin|Borge|Boris\s*Johnson|Brexit(eers?)?|Brit(ain|ish)|Brussels|Cannes|(Caroline|Jack)?\s*Lang(, Caroline)?|Cypr(iot|us)|Davos|ECB|England|EU|Europe(an)?(\s*Union)?|Fr(ance|ench)|Geneva|Germany?|Gillard|Gree(ce|k)|Ital(ian|y)|Jacques|(Kevin\s*)?Rudd|Le\s*Pen|London|Macron|Melusine|Munich|(Natalia\s*)?Veselnitskaya|(Nicholas\s*)?Sarkozy|Nigel(\s*Farage)?|Norw(ay|egian)|Oslo|Paris|Polish|(Sebastian )?Kurz|(Vi(c|k)tor\s+)?Orbah?n|Edward Rod Larsen|Strasbourg|Strauss[- ]?Kahn|Swed(en|ish)(?![-\s]+America)|Switzerland|(Tony\s)?Blair|U\.?K\.?|Ukrain(e|ian)|Vienna|(Vitaly\s*)?Churkin|Zug',
274
281
  emailers = {
275
282
  ANDRZEJ_DUDA: 'former president of Poland',
276
283
  MIROSLAV_LAJCAK: 'Russia-friendly Slovakian politician, friend of Steve Bannon',
@@ -305,7 +312,7 @@ HIGHLIGHTED_NAMES = [
305
312
  HighlightedNames(
306
313
  label='finance',
307
314
  style='green',
308
- pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
315
+ pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|((anti.?)?money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
309
316
  emailers={
310
317
  AMANDA_ENS: 'Citigroup',
311
318
  DANIEL_SABBA: 'UBS Investment Bank',
@@ -325,6 +332,7 @@ HIGHLIGHTED_NAMES = [
325
332
  style='deep_pink2',
326
333
  pattern=r'Cambridge|(Derek\s*)?Bok|Elisa(\s*New)?|Harvard(\s*(Business|Law|University)(\s*School)?)?|(Jonathan\s*)?Zittrain|(Stephen\s*)?Kosslyn',
327
334
  emailers = {
335
+ "Donald Rubin": f"Professor of Statistics",
328
336
  "Kelly Friendly": f"longtime aide and spokesperson of {LARRY_SUMMERS}",
329
337
  LARRY_SUMMERS: 'board of Digital Currency Group (DCG), Harvard president, Obama economic advisor',
330
338
  'Leah Reis-Dennis': 'producer for Lisa New\'s Poetry in America',
@@ -390,7 +398,7 @@ HIGHLIGHTED_NAMES = [
390
398
  HighlightedNames(
391
399
  label='law enforcement',
392
400
  style='color(24) bold',
393
- pattern=r'ag|(Alicia\s*)?Valle|attorney|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC|CIA|CIS|CVRA|Dep(artmen)?t\.?\s*of\s*(the\s*)?(Justice|Treasury)|DHS|DOJ|FBI|FCPA|FDIC|Federal\s*Bureau\s*of\s*Investigation|FinCEN|FINRA|FOIA|FTC|IRS|(James\s*)?Comey|(Jennifer\s*Shasky\s*)?Calvery|((Judge|Mark)\s*)?(Carney|Filip)|(Kirk )?Blouin|KYC|NIH|NS(A|C)|OCC|OFAC|(Lann?a\s*)?Belohlavek|lawyer|(Michael\s*)?Reiter|OGE|Office\s*of\s*Government\s*Ethics|Police Code Enforcement|(Preet\s*)?Bharara|SCOTUS|SD(FL|NY)|Southern\s*District\s*of\s*(Florida|New\s*York)|SEC|Securities\s*and\s*Exchange\s*Commission|State\s*Dep(artmen)?t|Strzok|Supreme\s*Court|Treasury\s*(Dep(artmen)?t|Secretary)|TSA|USAID|(William\s*J\.?\s*)?Zloch',
401
+ pattern=r'ag|(Alicia\s*)?Valle|AML|attorney|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC?|CIA|CIS|CVRA|Dep(artmen)?t\.?\s*of\s*(the\s*)?(Justice|Treasury)|DHS|DOJ|FBI|FCPA|FDIC|Federal\s*Bureau\s*of\s*Investigation|FinCEN|FINRA|FOIA|FTC|IRS|(James\s*)?Comey|(Jennifer\s*Shasky\s*)?Calvery|((Judge|Mark)\s*)?(Carney|Filip)|(Kirk )?Blouin|KYC|NIH|NS(A|C)|OCC|OFAC|(Lann?a\s*)?Belohlavek|lawyer|(Michael\s*)?Reiter|OGE|Office\s*of\s*Government\s*Ethics|Police Code Enforcement|(Preet\s*)?Bharara|SCOTUS|SD(FL|NY)|Southern\s*District\s*of\s*(Florida|New\s*York)|SEC|Secret\s*Service|Securities\s*and\s*Exchange\s*Commission|State\s*Dep(artmen)?t|Strzok|Supreme\s*Court|Treasury\s*(Dep(artmen)?t|Secretary)|TSA|USAID|(William\s*J\.?\s*)?Zloch',
394
402
  emailers = {
395
403
  ANN_MARIE_VILLAFANA: 'southern district of Florida U.S. Attorney',
396
404
  DANNY_FROST: 'Director of Communications at Manhattan DA',
@@ -399,7 +407,7 @@ HIGHLIGHTED_NAMES = [
399
407
  HighlightedNames(
400
408
  label=LOBBYIST,
401
409
  style='light_coral',
402
- pattern=r'[BR]ob Crowe|Stanley Rosenberg',
410
+ pattern=r'[BR]ob Crowe|CSIS|Stanley Rosenberg',
403
411
  emailers = {
404
412
  'Joshua Cooper Ramo': 'co-CEO of Henry Kissinger Associates',
405
413
  KATHERINE_KEATING: 'Daughter of former Australian PM',
@@ -457,7 +465,7 @@ HIGHLIGHTED_NAMES = [
457
465
  HighlightedNames(
458
466
  label='republicans',
459
467
  style='bold dark_red',
460
- pattern=r'Alberto\sGonzale[sz]|(Alex\s*)?Acosta|(Bill\s*)?Barr|Bill\s*Shine|(Bob\s*)?Corker|(John\s*(R.?\s*)?)Bolton|Broidy|(Chris\s)?Christie|Devin\s*Nunes|(Don\s*)?McGa[hn]n|McMaster|(George\s*)?Nader|GOP|(Brett\s*)?Kavanaugh|Kissinger|Kobach|Koch\s*Brothers|Kolfage|Kudlow|Lewandowski|(Marco\s)?Rubio|(Mark\s*)Meadows|Mattis|(?<!Merwin Dela )Cruz|(Michael\s)?Hayden|((General|Mike)\s*)?(Flynn|Pence)|(Mitt\s*)?Romney|Mnuchin|Nikki|Haley|(Paul\s+)?Manafort|(Peter\s)?Navarro|Pompeo|Reagan|Reince|Priebus|Republican|(?<!Cynthia )(Richard\s*)?Nixon|Sasse|(Rex\s*)?Tillerson',
468
+ pattern=r'Alberto\sGonzale[sz]|(Alex\s*)?Acosta|(Bill\s*)?Barr|Bill\s*Shine|(Bob\s*)?Corker|(John\s*(R.?\s*)?)Bolton|Broidy|(Chris\s)?Christie|Devin\s*Nunes|(Don\s*)?McGa[hn]n|McMaster|(George\s*)?Nader|GOP|(Brett\s*)?Kavanaugh|Kissinger|Kobach|Koch\s*Brothers|Kolfage|Kudlow|Lewandowski|(Marco\s)?Rubio|(Mark\s*)Meadows|Mattis|McCain|(?<!Merwin Dela )Cruz|(Michael\s)?Hayden|((General|Mike)\s*)?(Flynn|Pence)|(Mitt\s*)?Romney|Mnuchin|Nikki|Haley|(Paul\s+)?(Manafort|Volcker)|(Peter\s)?Navarro|Pompeo|Reagan|Reince|Priebus|Republican|(Rex\s*)?Tillerson|(?<!Cynthia )(Richard\s*)?Nixon|Sasse',
461
469
  # There's no emails from these people, they're just here to automate the regex creation for both first + last names
462
470
  emailers = {
463
471
  RUDY_GIULIANI: 'disbarred formed mayor of New York City',
@@ -588,7 +596,7 @@ HIGHLIGHTED_NAMES = [
588
596
  HighlightedText(
589
597
  label='phone_number',
590
598
  style='bright_green',
591
- pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|\b[\d+]{10,12}\b",
599
+ pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|(\b|\+)[\d+]{10,12}\b",
592
600
  ),
593
601
  ]
594
602
 
@@ -648,18 +656,14 @@ def get_info_for_name(name: str) -> str | None:
648
656
 
649
657
 
650
658
  def get_style_for_category(category: str) -> str | None:
651
- if category in [CONFERENCE, SPEECH]:
659
+ if category in CATEGORY_STYLES:
660
+ return CATEGORY_STYLES[category]
661
+ elif category in [CONFERENCE, SPEECH]:
652
662
  return f"{get_style_for_category(ACADEMIA)} dim"
653
- elif category == JSON:
654
- return 'dark_red'
655
- elif category == JUNK:
656
- return 'grey19'
657
- elif category == 'letter':
658
- return 'medium_orchid1'
659
663
  elif category == SOCIAL:
660
- return f"{get_style_for_category(PUBLICIST)} dim"
664
+ return f"{get_style_for_category(PUBLICIST)}"
661
665
 
662
- category = CATEGORY_LABEL_MAPPING.get(category, category)
666
+ category = CATEGORY_STYLE_MAPPING.get(category, category)
663
667
 
664
668
  for highlight_group in HIGHLIGHTED_NAMES:
665
669
  if highlight_group.label == category:
@@ -672,6 +676,10 @@ def get_style_for_name(name: str | None, default_style: str = DEFAULT, allow_bol
672
676
  return style if allow_bold else style.replace('bold', '').strip()
673
677
 
674
678
 
679
+ def styled_category(category: str) -> Text:
680
+ return Text(category, get_style_for_category(category) or 'wheat4')
681
+
682
+
675
683
  def _get_highlight_group_for_name(name: str) -> HighlightedNames | None:
676
684
  for highlight_group in HIGHLIGHTED_NAMES:
677
685
  if highlight_group.regex.search(name):
@@ -1,6 +1,5 @@
1
1
  import logging
2
2
  from os import environ
3
- from pathlib import Path
4
3
 
5
4
  import datefinder
6
5
  import rich_argparse_plus
@@ -10,7 +9,6 @@ from rich.logging import RichHandler
10
9
  from rich.theme import Theme
11
10
 
12
11
  from epstein_files.util.constant.strings import *
13
- from epstein_files.util.file_helper import file_size_str
14
12
 
15
13
  FILENAME_STYLE = 'gray27'
16
14
 
@@ -60,7 +58,3 @@ if env_log_level_str:
60
58
 
61
59
  logger.warning(f"Setting log level to {env_log_level} based on {LOG_LEVEL_ENV_VAR} env var...")
62
60
  logger.setLevel(env_log_level)
63
-
64
-
65
- def log_file_write(file_path: str | Path) -> None:
66
- logger.warning(f"Wrote {file_size_str(file_path)} to '{file_path}'")
@@ -11,7 +11,8 @@ from epstein_files.util.constant.names import *
11
11
  from epstein_files.util.constant.output_files import JSON_FILES_JSON_PATH, JSON_METADATA_PATH
12
12
  from epstein_files.util.data import dict_sets_to_lists
13
13
  from epstein_files.util.env import args, specified_names
14
- from epstein_files.util.logging import log_file_write, logger
14
+ from epstein_files.util.file_helper import log_file_write
15
+ from epstein_files.util.logging import logger
15
16
  from epstein_files.util.rich import *
16
17
 
17
18
  PRINT_COLOR_KEY_EVERY_N_EMAILS = 150
@@ -60,7 +61,6 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
60
61
  """Returns number of emails printed."""
61
62
  print_section_header(('Selections from ' if not args.all_emails else '') + 'His Emails')
62
63
  print_other_site_link(is_header=False)
63
-
64
64
  emailers_to_print: list[str | None]
65
65
  emailer_tables: list[str | None] = []
66
66
  already_printed_emails: list[Email] = []
@@ -106,8 +106,8 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
106
106
  _verify_all_emails_were_printed(epstein_files, already_printed_emails)
107
107
 
108
108
  fwded_articles = [e for e in already_printed_emails if e.config and e.config.is_fwded_article]
109
- logger.warning(f"{len(fwded_articles)} of {len(already_printed_emails)} emails were forwarded articles.")
110
- logger.warning(f"Rewrote {len(Email.rewritten_header_ids)} headers of {len(epstein_files.emails)} emails.")
109
+ log_msg = f"Rewrote {len(Email.rewritten_header_ids)} email headers (out of {len(already_printed_emails)})"
110
+ logger.warning(f"{log_msg}, {len(fwded_articles)} of the emails were forwarded articles.")
111
111
  return len(already_printed_emails)
112
112
 
113
113
 
@@ -121,7 +121,7 @@ def print_json_files(epstein_files: EpsteinFiles):
121
121
  else:
122
122
  for json_file in epstein_files.json_files:
123
123
  console.line(2)
124
- console.print(json_file.description_panel())
124
+ console.print(json_file.summary_panel())
125
125
  console.print_json(json_file.json_str(), indent=4, sort_keys=False)
126
126
 
127
127
 
@@ -187,8 +187,13 @@ def write_urls() -> None:
187
187
  def _verify_all_emails_were_printed(epstein_files: EpsteinFiles, already_printed_emails: list[Email]) -> None:
188
188
  """Log warnings if some emails were never printed."""
189
189
  email_ids_that_were_printed = set([email.file_id for email in already_printed_emails])
190
- logger.warning(f"Printed {len(already_printed_emails)} emails of {len(email_ids_that_were_printed)} unique file IDs.")
190
+ logger.warning(f"Printed {len(already_printed_emails):,} emails of {len(email_ids_that_were_printed):,} unique file IDs.")
191
+ missed_an_email = False
191
192
 
192
193
  for email in epstein_files.emails:
193
- if email.file_id not in email_ids_that_were_printed and not email.is_duplicate:
194
+ if email.file_id not in email_ids_that_were_printed and not email.is_duplicate():
194
195
  logger.warning(f"Failed to print {email.summary()}")
196
+ missed_an_email = True
197
+
198
+ if not missed_an_email:
199
+ logger.warning(f"All {len(epstein_files.emails):,} emails printed at least once.")
@@ -20,8 +20,9 @@ from epstein_files.util.constant.urls import *
20
20
  from epstein_files.util.constants import FALLBACK_TIMESTAMP, HEADER_ABBREVIATIONS
21
21
  from epstein_files.util.data import json_safe
22
22
  from epstein_files.util.env import args
23
+ from epstein_files.util.file_helper import log_file_write
23
24
  from epstein_files.util.highlighted_group import ALL_HIGHLIGHTS, HIGHLIGHTED_NAMES, EpsteinHighlighter
24
- from epstein_files.util.logging import log_file_write, logger
25
+ from epstein_files.util.logging import logger
25
26
 
26
27
  TITLE_WIDTH = 50
27
28
  NUM_COLOR_KEY_COLS = 4
@@ -30,6 +31,7 @@ QUESTION_MARK_TXT = Text(QUESTION_MARKS, style='dim')
30
31
  GREY_NUMBERS = [58, 39, 39, 35, 30, 27, 23, 23, 19, 19, 15, 15, 15]
31
32
 
32
33
  DEFAULT_NAME_STYLE = 'gray46'
34
+ INFO_STYLE = 'white dim italic'
33
35
  KEY_STYLE='honeydew2 bold'
34
36
  SECTION_HEADER_STYLE = 'bold white on blue3'
35
37
  SOCIAL_MEDIA_LINK_STYLE = 'pale_turquoise4'
@@ -239,23 +241,26 @@ def print_numbered_list_of_emailers(_list: list[str | None], epstein_files = Non
239
241
  def print_other_site_link(is_header: bool = True) -> None:
240
242
  """Print a link to the emails site if we're building text messages site and vice versa."""
241
243
  site_type: SiteType = EMAIL if args.all_emails else TEXT_MESSAGE
244
+ link_style = OTHER_SITE_LINK_STYLE if is_header else 'light_slate_grey bold'
242
245
 
243
246
  if is_header:
244
247
  print_starred_header(f"This is the Epstein {site_type.title()}s site", num_spaces=4, num_stars=14)
245
248
 
246
249
  other_site_type: SiteType = TEXT_MESSAGE if site_type == EMAIL else EMAIL
247
- other_site_msg = "another site for" + (' all of' if other_site_type == EMAIL else '')
250
+ other_site_msg = "another site with" + (' all of' if other_site_type == EMAIL else '')
248
251
  other_site_msg += f" Epstein's {other_site_type}s also generated by this code"
249
- markup_msg = link_markup(SITE_URLS[other_site_type], other_site_msg, OTHER_SITE_LINK_STYLE)
252
+ markup_msg = link_markup(SITE_URLS[other_site_type], other_site_msg, link_style)
250
253
  print_centered(parenthesize(Text.from_markup(markup_msg)), style='bold')
251
254
 
252
- if is_header:
253
- word_count_link = link_text_obj(WORD_COUNT_URL, 'most frequently used words in the emails and texts', AUX_SITE_LINK_STYLE)
254
- print_centered(parenthesize(word_count_link))
255
- metadata_link = link_text_obj(JSON_METADATA_URL, 'author attribution explanations', AUX_SITE_LINK_STYLE)
256
- print_centered(parenthesize(metadata_link))
257
- json_link = link_text_obj(WORD_COUNT_URL, "epstein's json files", AUX_SITE_LINK_STYLE)
258
- print_centered(parenthesize(json_link))
255
+ if not is_header:
256
+ return
257
+
258
+ word_count_link = link_text_obj(WORD_COUNT_URL, 'most frequently used words in the emails and texts', AUX_SITE_LINK_STYLE)
259
+ print_centered(parenthesize(word_count_link))
260
+ metadata_link = link_text_obj(JSON_METADATA_URL, 'author attribution explanations', AUX_SITE_LINK_STYLE)
261
+ print_centered(parenthesize(metadata_link))
262
+ json_link = link_text_obj(WORD_COUNT_URL, "epstein's json files", AUX_SITE_LINK_STYLE)
263
+ print_centered(parenthesize(json_link))
259
264
 
260
265
 
261
266
  def print_page_title(expand: bool = True, width: int | None = None) -> None:
@@ -9,18 +9,22 @@ from rich.padding import Padding
9
9
  from rich.text import Text
10
10
 
11
11
  from epstein_files.documents.emails.email_header import EmailHeader
12
- from epstein_files.util.constant.common_words import COMMON_WORDS, UNSINGULARIZABLE_WORDS
12
+ from epstein_files.epstein_files import EpsteinFiles
13
+ from epstein_files.util.constant.common_words import COMMON_WORDS_LIST, COMMON_WORDS, UNSINGULARIZABLE_WORDS
13
14
  from epstein_files.util.constant.names import OTHER_NAMES
15
+ from epstein_files.util.constant.output_files import WORD_COUNT_HTML_PATH
14
16
  from epstein_files.util.data import ALL_NAMES, flatten, sort_dict
15
- from epstein_files.util.env import args
17
+ from epstein_files.util.env import args, specified_names
16
18
  from epstein_files.util.logging import logger
17
- from epstein_files.util.rich import highlighter
18
- from epstein_files.util.search_result import SearchResult
19
+ from epstein_files.util.rich import (console, highlighter, print_centered, print_color_key, print_page_title,
20
+ print_panel, print_starred_header, write_html)
21
+ from epstein_files.util.search_result import MatchedLine, SearchResult
22
+ from epstein_files.util.timer import Timer
19
23
 
20
24
  FIRST_AND_LAST_NAMES = flatten([n.split() for n in ALL_NAMES])
21
25
  FIRST_AND_LAST_NAMES = [n.lower() for n in FIRST_AND_LAST_NAMES] + OTHER_NAMES
22
26
 
23
- HTML_REGEX = re.compile(r"com/|cae-v2w=|content-(transfe|type)|font(/|-(family|size))|http|\.html?\??|margin-bottom|padding-left|quoted-printable|region=|text-decoration|ttps|www|\.(gif|jpe?g|png);?$")
27
+ HTML_REGEX = re.compile(r"^http|#yiv|com/|cae-v2w=|content-(transfe|type)|font(/|-(family|size))|http|\.html?\??|margin-bottom|padding-left|quoted-printable|region=|text-decoration|ttps|www|\.(gif|jpe?g|png);?$")
24
28
  HYPHENATED_WORD_REGEX = re.compile(r"[a-z]+-[a-z]+", re.IGNORECASE)
25
29
  OK_SYMBOL_WORDS = ['mar-a-lago', 'p/e', 's&p', ':)', ':).', ';)', ':-)', ';-)']
26
30
  ONLY_SYMBOLS_REGEX = re.compile(r"^[^a-zA-Z0-9]+$")
@@ -187,6 +191,62 @@ class WordCount:
187
191
  yield f"Showing {len(word_txts):,} words appearing at least {MIN_COUNT_CUTOFF} times (out of {len(self.count):,} words)."
188
192
 
189
193
 
194
+ def write_word_counts_html() -> None:
195
+ timer = Timer()
196
+ epstein_files = EpsteinFiles.get_files(timer)
197
+ email_subjects: set[str] = set()
198
+ word_count = WordCount()
199
+
200
+ # Remove dupes, junk mail, and fwded articles from emails
201
+ emails = [e for e in epstein_files.emails if not (e.is_duplicate() or e.is_junk_mail() or e.is_fwded_article())]
202
+
203
+ for email in emails:
204
+ if specified_names and email.author not in specified_names:
205
+ continue
206
+
207
+ logger.info(f"Counting words in {email}\n [SUBJECT] {email.subject()}")
208
+ lines = email.actual_text.split('\n')
209
+
210
+ if email.subject() not in email_subjects and f'Re: {email.subject()}' not in email_subjects:
211
+ email_subjects.add(email.subject())
212
+ lines.append(email.subject())
213
+
214
+ for i, line in enumerate(lines):
215
+ if HTML_REGEX.search(line):
216
+ continue
217
+
218
+ for word in line.split():
219
+ word_count.tally_word(word, SearchResult(email, [MatchedLine(line, i)]))
220
+
221
+ # Add in iMessage conversation words
222
+ imessage_logs = epstein_files.imessage_logs_for(specified_names) if specified_names else epstein_files.imessage_logs
223
+
224
+ for imessage_log in imessage_logs:
225
+ logger.info(f"Counting words in {imessage_log}")
226
+
227
+ for i, msg in enumerate(imessage_log.messages):
228
+ if specified_names and msg.author not in specified_names:
229
+ continue
230
+ elif HTML_REGEX.search(line):
231
+ continue
232
+
233
+ for word in msg.text.split():
234
+ word_count.tally_word(word, SearchResult(imessage_log, [MatchedLine(msg.text, i)]))
235
+
236
+ print_page_title(expand=False)
237
+ print_starred_header(f"Most Common Words in {len(emails):,} Emails and {len(imessage_logs)} iMessage Logs")
238
+ print_centered(f"(excluding {len(COMMON_WORDS_LIST)} particularly common words at bottom)", style='dim')
239
+ console.line()
240
+ print_color_key()
241
+ console.line()
242
+ console.print(word_count)
243
+ console.line(2)
244
+ print_panel(f"{len(COMMON_WORDS_LIST):,} Excluded Words", centered=True)
245
+ console.print(', '.join(COMMON_WORDS_LIST), highlight=False)
246
+ write_html(WORD_COUNT_HTML_PATH)
247
+ timer.print_at_checkpoint(f"Finished counting words")
248
+
249
+
190
250
  def _word_style(word: str | None) -> str:
191
251
  word = word or ''
192
252
  return 'bright_white' if word in FIRST_AND_LAST_NAMES else 'grey53'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: epstein-files
3
- Version: 1.0.9
3
+ Version: 1.0.11
4
4
  Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
5
5
  Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
6
6
  License: GPL-3.0-or-later
@@ -0,0 +1,33 @@
1
+ epstein_files/__init__.py,sha256=4zxX1tw-0xMwpM-Sbq7PezV0YNS9zN-P6gc9BQ1BqKU,4710
2
+ epstein_files/documents/communication.py,sha256=SunZdjMhR9v6y8LlQ6jhIu8vYjSndaBK0Su1mKnhfj0,2060
3
+ epstein_files/documents/document.py,sha256=dECV0bSnOJzPfOIHyHeG5rNxKd6uwuiso35-sQZg9No,18353
4
+ epstein_files/documents/email.py,sha256=yXiW7mB4myU8G9DY7PnnqazaCqeAR3dHr35NfBplfRU,38519
5
+ epstein_files/documents/emails/email_header.py,sha256=wkPfSLbmzkAeQwvhf0bAeFDLPbQT-EeG0v8vNNLYktM,7502
6
+ epstein_files/documents/imessage/text_message.py,sha256=3HlNp75JIoMlWj7PaUWIFry3qlGEmpGu5OmdmsBYS34,2807
7
+ epstein_files/documents/json_file.py,sha256=HsnVWPZXVxTF_DadL2YtJtsiXKXOd18PUs05O33tjNc,1317
8
+ epstein_files/documents/messenger_log.py,sha256=uSPlg85jGTwod1cV9f7MtxSNqmMZ61JBFzoiRNqg52M,6263
9
+ epstein_files/documents/other_file.py,sha256=S_Y-SxYYYXtx42JHmhFWl5BbTduNI7cwQjeYHBJA7sc,9950
10
+ epstein_files/epstein_files.py,sha256=SaD4DJJ5tRxY97Ei4BdOgLzHQ9wrBVGrP64CSqdmk-w,18691
11
+ epstein_files/util/constant/common_words.py,sha256=aR0UjoWmxyR49XS-DtHECQ1CiA_bK8hNP6CQ1TS9yZA,3696
12
+ epstein_files/util/constant/html.py,sha256=9U098TGzlghGg4WfxLYHyub5JGR17Dv7VP5i2MSu8Kk,1415
13
+ epstein_files/util/constant/names.py,sha256=KKJEYFpdOp4xDwXe5dhrqYgF12oJODvVSFpAB28Q76A,10153
14
+ epstein_files/util/constant/output_files.py,sha256=BkV4_gmdj46RfGy5SFYp6dgTty3FtlBth5YGmaGutls,1700
15
+ epstein_files/util/constant/strings.py,sha256=FDtksfH50PSxtSBw9XhmqxtrgRgGxdIvGiAR2bbPpu4,1899
16
+ epstein_files/util/constant/urls.py,sha256=0IdCVVvXib0i-4TZFkVHoS4zCbjOBZWcr6NkGxsmQWM,4981
17
+ epstein_files/util/constants.py,sha256=LPSI6Z0n3ChFDnMGYVO80cGuSKZf0OoyUzLih_jlRKI,111434
18
+ epstein_files/util/data.py,sha256=xwTqrbAi7ZDJM0iyFVOevnokP_oIQ2npkRjHzF1KGGY,2908
19
+ epstein_files/util/doc_cfg.py,sha256=OZlocAWldfR8Nomiad4FxQeyhNMbd0PQ-rumKn2nWBg,9641
20
+ epstein_files/util/env.py,sha256=HnYcfHSNkwVJ_T75Woy43_OpDyxD0KHPj3GxcVx86N4,5751
21
+ epstein_files/util/file_helper.py,sha256=-higKqc9J5IfNpzMzg-9j1ps3beV4N2cw8kdAxfm7NA,2835
22
+ epstein_files/util/highlighted_group.py,sha256=fU-8ns50uUolzPEAxadF5AnPLjn383KpEeyRXfFbv_U,35971
23
+ epstein_files/util/logging.py,sha256=8e22WaBfDAKEmkcr3Gb4TdqtFSkU4FQDpk3Z6hfSzbw,1977
24
+ epstein_files/util/output.py,sha256=UzTU0mNHEmeJr3w2TXAp19X497GB6_-HyW0mfztI1jk,8120
25
+ epstein_files/util/rich.py,sha256=8-4IA5bwPBdDPqkPdymq3zVKB9hfy3nrT7fUrN_XevY,14744
26
+ epstein_files/util/search_result.py,sha256=1fxe0KPBQXBk4dLfu6m0QXIzYfZCzvaSkWqvghJGzxY,567
27
+ epstein_files/util/timer.py,sha256=8hxW4Y1JcTUfnBrHh7sL2pM9xu1sL4HFQM4CmmzTarU,837
28
+ epstein_files/util/word_count.py,sha256=8qBTuq3d0Q-3fwiuECKWi2RfL-KUiZD8TciwvfL0D_o,9353
29
+ epstein_files-1.0.11.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
30
+ epstein_files-1.0.11.dist-info/METADATA,sha256=HBW3t1F9lkoN6GIR7ySV2kBYnJhNEF9otDZWnf03jUo,5480
31
+ epstein_files-1.0.11.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
32
+ epstein_files-1.0.11.dist-info/entry_points.txt,sha256=5qYgwAXpxegeAicD_rzda_trDRnUC51F5UVDpcZ7j6Q,240
33
+ epstein_files-1.0.11.dist-info/RECORD,,
@@ -1,72 +0,0 @@
1
- # Count word usage in emails and texts
2
- import re
3
-
4
- from epstein_files.epstein_files import EpsteinFiles
5
- from epstein_files.util.constant.common_words import COMMON_WORDS_LIST
6
- from epstein_files.util.constant.output_files import WORD_COUNT_HTML_PATH
7
- from epstein_files.util.env import args, specified_names
8
- from epstein_files.util.logging import logger
9
- from epstein_files.util.rich import (console, print_centered, print_color_key, print_page_title, print_panel,
10
- print_starred_header, write_html)
11
- from epstein_files.util.search_result import MatchedLine, SearchResult
12
- from epstein_files.util.timer import Timer
13
- from epstein_files.util.word_count import WordCount
14
-
15
- HTML_REGEX = re.compile(r"^http|#yiv")
16
-
17
-
18
- def write_word_counts_html() -> None:
19
- timer = Timer()
20
- epstein_files = EpsteinFiles.get_files(timer)
21
- email_subjects: set[str] = set()
22
- word_count = WordCount()
23
-
24
- # Remove dupes, junk mail, and fwded articles from emails
25
- emails = [
26
- e for e in epstein_files.emails
27
- if not (e.is_duplicate or e.is_junk_mail() or (e.config and e.config.is_fwded_article)) \
28
- and (len(specified_names) == 0 or e.author in specified_names)
29
- ]
30
-
31
- for email in emails:
32
- logger.info(f"Counting words in {email}\n [SUBJECT] {email.subject()}")
33
- lines = email.actual_text.split('\n')
34
-
35
- if email.subject() not in email_subjects and f'Re: {email.subject()}' not in email_subjects:
36
- email_subjects.add(email.subject())
37
- lines.append(email.subject())
38
-
39
- for i, line in enumerate(lines):
40
- if HTML_REGEX.search(line):
41
- continue
42
-
43
- for word in line.split():
44
- word_count.tally_word(word, SearchResult(email, [MatchedLine(line, i)]))
45
-
46
- # Add in iMessage conversation words
47
- imessage_logs = epstein_files.imessage_logs_for(specified_names) if specified_names else epstein_files.imessage_logs
48
-
49
- for imessage_log in imessage_logs:
50
- logger.info(f"Counting words in {imessage_log}")
51
-
52
- for msg in imessage_log.messages():
53
- if len(specified_names) > 0 and msg.author not in specified_names:
54
- continue
55
- elif HTML_REGEX.search(line):
56
- continue
57
-
58
- for word in msg.text.split():
59
- word_count.tally_word(word, SearchResult(imessage_log, [msg.text]))
60
-
61
- print_page_title(expand=False)
62
- print_starred_header(f"Most Common Words in {len(emails):,} Emails and {len(imessage_logs)} iMessage Logs")
63
- print_centered(f"(excluding {len(COMMON_WORDS_LIST)} particularly common words at bottom)", style='dim')
64
- console.line()
65
- print_color_key()
66
- console.line()
67
- console.print(word_count)
68
- console.line(2)
69
- print_panel(f"{len(COMMON_WORDS_LIST):,} Excluded Words", centered=True)
70
- console.print(', '.join(COMMON_WORDS_LIST), highlight=False)
71
- write_html(WORD_COUNT_HTML_PATH)
72
- timer.print_at_checkpoint(f"Finished counting words")
@@ -1,34 +0,0 @@
1
- epstein_files/__init__.py,sha256=w68EUhAzri8a_c9HggKMfoHTvPbVAO-u3NuHVizDc7U,4940
2
- epstein_files/count_words.py,sha256=i1pYaQzX7b9S3pyV3RM_8asbQJ1PEk8wJgLOG6Mf0D8,2966
3
- epstein_files/documents/communication.py,sha256=SunZdjMhR9v6y8LlQ6jhIu8vYjSndaBK0Su1mKnhfj0,2060
4
- epstein_files/documents/document.py,sha256=BUaioSvOmfsR-ULa6hJy3WYg-hBDC-kqafUheMJ-jFY,16665
5
- epstein_files/documents/email.py,sha256=H34b2zt_TrPUgXHwZXybjmLE9-QNAtezs9NVSCPOSGM,38462
6
- epstein_files/documents/emails/email_header.py,sha256=wkPfSLbmzkAeQwvhf0bAeFDLPbQT-EeG0v8vNNLYktM,7502
7
- epstein_files/documents/imessage/text_message.py,sha256=wfWPQhwGG5Yzyhbr1NAQAY0bzRjjqVZmh8SPl48XmAM,3025
8
- epstein_files/documents/json_file.py,sha256=1Cx_3uM38Dwgrbs8fU55TUZKSrCsmd4QpHKWtfWdudw,1089
9
- epstein_files/documents/messenger_log.py,sha256=DHlQpbLbMITMpMtCYk2vcRc7-CTvYvOXql-9nDUc3tQ,5887
10
- epstein_files/documents/other_file.py,sha256=NdVlCYcyzHvOInReqF-zvHJI1hwtzMWW9ekDojHIb4U,9091
11
- epstein_files/epstein_files.py,sha256=EEx8Auwv8z0FkRrCi7wE8iuuRQd6K1rQDMc2vdbrsh4,18298
12
- epstein_files/util/constant/common_words.py,sha256=aR0UjoWmxyR49XS-DtHECQ1CiA_bK8hNP6CQ1TS9yZA,3696
13
- epstein_files/util/constant/html.py,sha256=9U098TGzlghGg4WfxLYHyub5JGR17Dv7VP5i2MSu8Kk,1415
14
- epstein_files/util/constant/names.py,sha256=flIZCafFXHiaSy-G2QhYH0hNfkeJBH6Gz7p9AdvYgC0,10125
15
- epstein_files/util/constant/output_files.py,sha256=BkV4_gmdj46RfGy5SFYp6dgTty3FtlBth5YGmaGutls,1700
16
- epstein_files/util/constant/strings.py,sha256=FDtksfH50PSxtSBw9XhmqxtrgRgGxdIvGiAR2bbPpu4,1899
17
- epstein_files/util/constant/urls.py,sha256=0IdCVVvXib0i-4TZFkVHoS4zCbjOBZWcr6NkGxsmQWM,4981
18
- epstein_files/util/constants.py,sha256=MsWZQs3qd9N1HlgC7MoSKRF6ssbmWlUXX49REsp3qQs,110867
19
- epstein_files/util/data.py,sha256=xwTqrbAi7ZDJM0iyFVOevnokP_oIQ2npkRjHzF1KGGY,2908
20
- epstein_files/util/doc_cfg.py,sha256=5Pb__bP00mKi9ACv33omZQA-TBzumc7D2Td_Mk4M5DY,9822
21
- epstein_files/util/env.py,sha256=PaPBi27-npU9egt9LHxr5qR65B2DPHwt7Xc9sx5VN-M,5225
22
- epstein_files/util/file_helper.py,sha256=v_bE10MHEcXti9DVJo4WqyOsG83Xrv05S3Vc70cYJkk,3082
23
- epstein_files/util/highlighted_group.py,sha256=L7R63oyDG_lQ9Vv5gB_rRwJgHS2sdMfXHs9xcuDzqdc,35667
24
- epstein_files/util/logging.py,sha256=4hVl1Qw1qRMSVEYKXZxrvdQuSIMBgTPskzvNMNu8268,2185
25
- epstein_files/util/output.py,sha256=wLjFBGR5ffn4cLep12G3OmUR0H3WtEMXeVMOXtd-6ig,7909
26
- epstein_files/util/rich.py,sha256=rdHzn4XRB2erQSf2yYyPakRmd9ixqBUdS8-BVOUAXnE,14603
27
- epstein_files/util/search_result.py,sha256=1fxe0KPBQXBk4dLfu6m0QXIzYfZCzvaSkWqvghJGzxY,567
28
- epstein_files/util/timer.py,sha256=8hxW4Y1JcTUfnBrHh7sL2pM9xu1sL4HFQM4CmmzTarU,837
29
- epstein_files/util/word_count.py,sha256=eGzcsoAvMcutRUFOJnVuEp9_28H74to7T9jTdGUZnuI,6757
30
- epstein_files-1.0.9.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
31
- epstein_files-1.0.9.dist-info/METADATA,sha256=QTK8iM7ZkD2742Gk9c8yPyG5LV1QLNOrjKguJALSX1c,5479
32
- epstein_files-1.0.9.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
33
- epstein_files-1.0.9.dist-info/entry_points.txt,sha256=5qYgwAXpxegeAicD_rzda_trDRnUC51F5UVDpcZ7j6Q,240
34
- epstein_files-1.0.9.dist-info/RECORD,,