epstein-files 1.0.4__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
1
  import re
2
2
  from copy import deepcopy
3
+ from typing import cast
3
4
 
4
5
  from dateutil.parser import parse
5
6
 
@@ -84,7 +85,7 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
84
85
  JAMES_HILL: re.compile(r"hill, james e.|james.e.hill@abc.com", re.IGNORECASE),
85
86
  JEAN_LUC_BRUNEL: re.compile(r'Jean[- ]Luc Brunel?', re.IGNORECASE),
86
87
  JEFF_FULLER: re.compile(r"jeff@mc2mm.com|Jeff Fuller", re.IGNORECASE),
87
- JEFFREY_EPSTEIN: re.compile(r'[djl]ee[vy]acation[©@]?g?(mail.com)?|Epstine|\bJEE?\b|Jeffrey E((sp|ps)tein?)?|jeeproject@yahoo.com|J Jep|Jeffery Edwards|(?<!Mark L. )Epstein', re.IGNORECASE),
88
+ JEFFREY_EPSTEIN: re.compile(r'[djl]\s?ee[vy]acation[©@]?g?(mail.com)?|Epstine|\bJEE?\b|Jeffrey E((sp|ps)tein?)?|jeeproject@yahoo.com|J Jep|Jeffery Edwards|(?<!Mark L. )Epstein', re.IGNORECASE),
88
89
  JESSICA_CADWELL: re.compile(r'Jessica Cadwell?', re.IGNORECASE),
89
90
  JOHNNY_EL_HACHEM: re.compile(r'el hachem johnny|johnny el hachem', re.IGNORECASE),
90
91
  JOI_ITO: re.compile(r'ji@media.mit.?edu|(joichi|joi)( Ito)?', re.IGNORECASE),
@@ -94,7 +95,7 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
94
95
  LANDON_THOMAS: re.compile(r'lando[nr] thomas( jr)?|thomas jr.?, lando[nr]', re.IGNORECASE),
95
96
  LARRY_SUMMERS: re.compile(r'(La(wrence|rry).{1,5})?Summers?|^LH$|LHS|Ihsofficel', re.IGNORECASE),
96
97
  LAWRANCE_VISOSKI: re.compile(r'La(rry|wrance) Visoski?|Lvjet', re.IGNORECASE),
97
- LAWRENCE_KRAUSS: re.compile(r'Lawrence Kraus|lawkrauss', re.IGNORECASE),
98
+ LAWRENCE_KRAUSS: re.compile(r'Lawrence Kraus|[jl]awkrauss', re.IGNORECASE),
98
99
  LEON_BLACK: re.compile(r'Leon Black?', re.IGNORECASE),
99
100
  MANUELA_MARTINEZ: re.compile(fr'Manuela (- Mega Partners|Martinez)', re.IGNORECASE),
100
101
  MARIANA_IDZKOWSKA: re.compile(r'Mariana [Il]d[źi]kowska?', re.IGNORECASE),
@@ -268,7 +269,7 @@ SHIMON_POST = 'The Shimon Post'
268
269
  SHIMON_POST_ARTICLE = f'selection of articles about the mideast'
269
270
  SINGLE_PAGE = 'single page of'
270
271
  STRANGE_BEDFELLOWS = "'Strange Bedfellows' list of invitees f. Johnny Depp, Woody Allen, Obama, and more"
271
- SWEDISH_LIFE_SCIENCES_SUMMIT = f"{BARBRO_C_EHNBOM}'s Swedish American Life Science Summit"
272
+ SWEDISH_LIFE_SCIENCES_SUMMIT = f"{BARBRO_C_EHNBOM}'s Swedish American Life Science Summit (SALSS)"
272
273
  THE_REAL_DEAL_ARTICLE = 'article by Keith Larsen'
273
274
  TRUMP_DISCLOSURES = f"Donald Trump financial disclosures from U.S. Office of Government Ethics"
274
275
  UBS_CIO_REPORT = 'CIO Monthly Extended report'
@@ -371,8 +372,8 @@ TEXTS_CONFIG = CONFIRMED_TEXTS_CONFIG + UNCONFIRMED_TEXTS_CONFIG
371
372
  ########################################################################################################
372
373
 
373
374
  # Some emails have a lot of uninteresting CCs
374
- IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS: list[str | None] = ['Allen West', 'Rafael Bardaji', 'Philip Kafka', 'Herb Goodman', 'Grant Seeger', 'Lisa Albert', 'Janet Kafka', 'James Ramsey', 'ACT for America', 'John Zouzelka', 'Joel Dunn', 'Nate McClain', 'Bennet Greenwald', 'Taal Safdie', 'Uri Fouzailov', 'Neil Anderson', 'Nate White', 'Rita Hortenstine', 'Henry Hortenstine', 'Gary Gross', 'Forrest Miller', 'Bennett Schmidt', 'Val Sherman', 'Marcie Brown', 'Michael Horowitz', 'Marshall Funk']
375
- FLIGHT_IN_2012_PEOPLE: list[str | None] = ['Francis Derby', 'Januiz Banasiak', 'Louella Rabuyo', 'Richard Barnnet']
375
+ IRAN_DEAL_RECIPIENTS = ['Allen West', 'Rafael Bardaji', 'Philip Kafka', 'Herb Goodman', 'Grant Seeger', 'Lisa Albert', 'Janet Kafka', 'James Ramsey', 'ACT for America', 'John Zouzelka', 'Joel Dunn', 'Nate McClain', 'Bennet Greenwald', 'Taal Safdie', 'Uri Fouzailov', 'Neil Anderson', 'Nate White', 'Rita Hortenstine', 'Henry Hortenstine', 'Gary Gross', 'Forrest Miller', 'Bennett Schmidt', 'Val Sherman', 'Marcie Brown', 'Michael Horowitz', 'Marshall Funk']
376
+ FLIGHT_IN_2012_PEOPLE = ['Francis Derby', 'Januiz Banasiak', 'Louella Rabuyo', 'Richard Barnnet']
376
377
 
377
378
  EMAILS_CONFIG = [
378
379
  EmailCfg(id='032436', author=ALIREZA_ITTIHADIEH, attribution_reason='Signature'),
@@ -491,9 +492,6 @@ EMAILS_CONFIG = [
491
492
  EmailCfg(id='032727', author=KATHRYN_RUEMMLER, attribution_reason=KATHY_REASON, is_attribution_uncertain=True),
492
493
  EmailCfg(id='030478', author=LANDON_THOMAS),
493
494
  EmailCfg(id='029013', author=LARRY_SUMMERS, recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
494
- EmailCfg(id='032206', author=LAWRENCE_KRAUSS), # More of a text convo?
495
- EmailCfg(id='032208', author=LAWRENCE_KRAUSS, recipients=[JEFFREY_EPSTEIN]), # More of a text convo?
496
- EmailCfg(id='032209', author=LAWRENCE_KRAUSS, recipients=[JEFFREY_EPSTEIN]), # More of a text convo?
497
495
  EmailCfg(id='029196', author=LAWRENCE_KRAUSS, recipients=[JEFFREY_EPSTEIN], actual_text='Talk in 40?'),
498
496
  EmailCfg(id='033593', author=LAWRANCE_VISOSKI, attribution_reason='Signature'),
499
497
  EmailCfg(id='033370', author=LAWRANCE_VISOSKI, attribution_reason=LARRY_REASON),
@@ -575,7 +573,7 @@ EMAILS_CONFIG = [
575
573
  attribution_reason='ends with "Respectfully, terry"',
576
574
  author=TERRY_KAFKA,
577
575
  fwded_text_after='From: Mike Cohen',
578
- recipients=[JEFFREY_EPSTEIN, MARK_EPSTEIN, MICHAEL_BUCHHOLTZ] + IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS,
576
+ recipients=[JEFFREY_EPSTEIN, MARK_EPSTEIN, MICHAEL_BUCHHOLTZ] + IRAN_DEAL_RECIPIENTS,
579
577
  duplicate_ids=['028482'],
580
578
  ),
581
579
  EmailCfg(id='029992', author=TERRY_KAFKA, attribution_reason='Quoted reply'),
@@ -600,7 +598,6 @@ EMAILS_CONFIG = [
600
598
  EmailCfg(id='022202', recipients=[JEAN_LUC_BRUNEL], attribution_reason='Follow up / reply', duplicate_ids=['029975']),
601
599
  EmailCfg(id='022187', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
602
600
  EmailCfg(id='031489', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (unfixable)
603
- EmailCfg(id='032210', recipients=[JEFFREY_EPSTEIN]), # More of a text convo?
604
601
  EmailCfg(id='030347', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
605
602
  EmailCfg(id='030367', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
606
603
  EmailCfg(id='033274', recipients=[JEFFREY_EPSTEIN]), # this is a note sent to self
@@ -751,7 +748,7 @@ EMAILS_CONFIG = [
751
748
  EmailCfg(id='031118', duplicate_ids=['019465']),
752
749
  EmailCfg(id='031912', duplicate_ids=['032158']),
753
750
  EmailCfg(id='030587', duplicate_ids=['030514']),
754
- EmailCfg(id='029773', duplicate_ids=['012685']),
751
+ EmailCfg(id='029773', duplicate_ids=['012685'], fwded_text_after='Omar Quadhafi'),
755
752
  EmailCfg(id='033297', duplicate_ids=['033586']),
756
753
  EmailCfg(id='031089', duplicate_ids=['018084']),
757
754
  EmailCfg(id='031088', duplicate_ids=['030885']),
@@ -1195,7 +1192,7 @@ OTHER_FILES_CONFERENCES = [
1195
1192
  DocCfg(id='019300', author=SVETLANA_POZHIDAEVA, description=f'{WOMEN_EMPOWERMENT} f. {KATHRYN_RUEMMLER}', date='2019-04-05'),
1196
1193
  DocCfg(id='022267', author=SVETLANA_POZHIDAEVA, description=f'{WOMEN_EMPOWERMENT} founder essay about growing the seminar business'),
1197
1194
  DocCfg(id='022407', author=SVETLANA_POZHIDAEVA, description=f'{WOMEN_EMPOWERMENT} seminar pitch deck'),
1198
- DocCfg(id='017524', author=SWEDISH_LIFE_SCIENCES_SUMMIT, description=f"2012 program"),
1195
+ DocCfg(id='017524', author=SWEDISH_LIFE_SCIENCES_SUMMIT, description=f"2012 program emailed to Epstein by {BARBRO_C_EHNBOM} in 031226", date='2012-08-18'),
1199
1196
  DocCfg(id='026747', author=SWEDISH_LIFE_SCIENCES_SUMMIT, description=f"2017 program", date='2017-08-23'),
1200
1197
  DocCfg(id='014951', author='TED Talks', description=f"2017 program", date='2017-04-20'),
1201
1198
  DocCfg(id='024179', author=UN_GENERAL_ASSEMBLY, description=f'president and first lady schedule', date='2012-09-21'),
@@ -1326,7 +1323,7 @@ OTHER_FILES_LETTERS = [
1326
1323
  ]
1327
1324
 
1328
1325
  OTHER_FILES_PROPERTY = [
1329
- DocCfg(id='026759', author='Great Bay Condominium Owners Association', description=f'{PRESS_RELEASE} by about Hurricane Irma damage', date='2017-09-13'),
1326
+ DocCfg(id='026759', author='Great Bay Condominium Owners Association', description=f'{PRESS_RELEASE} about Hurricane Irma damage', date='2017-09-13'),
1330
1327
  DocCfg(id='016602', author=PALM_BEACH_CODE_ENFORCEMENT, description='board minutes', date='2008-04-17'),
1331
1328
  DocCfg(id='016554', author=PALM_BEACH_CODE_ENFORCEMENT, description='board minutes', date='2008-07-17', duplicate_ids=['016616', '016574']),
1332
1329
  DocCfg(id='027068', author=THE_REAL_DEAL, description=f"{THE_REAL_DEAL_ARTICLE} Palm House Hotel Bankruptcy and EB-5 Visa Fraud Allegations"),
@@ -1379,8 +1376,8 @@ OTHER_FILES_SOCIAL = [
1379
1376
  ]
1380
1377
 
1381
1378
  OTHER_FILES_POLITICS = [
1382
- DocCfg(id='029918', author=DIANA_DEGETTE_CAMPAIGN, description=f"bio", date='2012-01-01'),
1383
- DocCfg(id='031184', author=DIANA_DEGETTE_CAMPAIGN, description=f"fundraiser invitation"),
1379
+ DocCfg(id='029918', author=DIANA_DEGETTE_CAMPAIGN, description=f"bio", date='2012-09-27'),
1380
+ DocCfg(id='031184', author=DIANA_DEGETTE_CAMPAIGN, description=f"invitation to fundraiser hosted by {BARBRO_C_EHNBOM}", date='2012-09-27'),
1384
1381
  DocCfg(id='026827', author='Scowcroft Group', description=f'report on ISIS', date='2015-11-14'),
1385
1382
  DocCfg(id='024294', author=STACEY_PLASKETT, description=f"campaign flier", date='2016-10-01'),
1386
1383
  DocCfg(
@@ -1482,6 +1479,11 @@ OTHER_FILES_ARTS = [
1482
1479
  OTHER_FILES_MISC = [
1483
1480
  DocCfg(id='022780', category=FLIGHT_LOGS),
1484
1481
  DocCfg(id='022816', category=FLIGHT_LOGS),
1482
+ DocCfg(id='032206', category=SKYPE_LOG, author=LAWRENCE_KRAUSS),
1483
+ DocCfg(id='032208', category=SKYPE_LOG, author=LAWRENCE_KRAUSS),
1484
+ DocCfg(id='032209', category=SKYPE_LOG, author=LAWRENCE_KRAUSS),
1485
+ DocCfg(id='018224', category=SKYPE_LOG, author=LAWRENCE_KRAUSS, description=f'conversations with linkspirit (French?) and {LAWRENCE_KRAUSS}'),
1486
+ DocCfg(id='032210', category=SKYPE_LOG, description=f'conversation with linkspirit'),
1485
1487
  DocCfg(
1486
1488
  id='025147',
1487
1489
  author=BROCKMAN_INC,
@@ -1496,7 +1498,6 @@ OTHER_FILES_MISC = [
1496
1498
  DocCfg(id='027074', author=FEMALE_HEALTH_COMPANY, description=f"pitch deck (USAID was a customer)"),
1497
1499
  DocCfg(id='032735', author=GORDON_GETTY, description=f"on Trump", date='2018-03-20'), # Dated based on concurrent emails from Getty
1498
1500
  DocCfg(id='025540', author=JEFFREY_EPSTEIN, description=f"rough draft of Epstein's side of the story?"),
1499
- DocCfg(id='018224', author=LAWRENCE_KRAUSS, description=f"Skype conversation log"),
1500
1501
  DocCfg(id='026634', author='Michael Carrier', description=f"comments about an Apollo linked hedge fund 'DE Fund VIII'"),
1501
1502
  DocCfg(id='031425', author=SCOTT_J_LINK, description=f'completely redacted email from'),
1502
1503
  DocCfg(id='020447', author='Working Group on Chinese Influence Activities in the U.S.', description=f'Promoting Constructive Vigilance'),
@@ -1589,8 +1590,8 @@ SENT_FROM_REGEX = re.compile(r'^(?:(Please forgive|Sorry for all the) typos.{1,4
1589
1590
 
1590
1591
 
1591
1592
  # Error checking.
1592
- if len(OTHER_FILES_CONFIG) != 438:
1593
- logger.warning(f"Only {len(OTHER_FILES_CONFIG)} configured other files!")
1593
+ if len(OTHER_FILES_CONFIG) != 442:
1594
+ logger.warning(f"Found {len(OTHER_FILES_CONFIG)} configured other files!")
1594
1595
 
1595
1596
  encountered_file_ids = set()
1596
1597
 
@@ -109,7 +109,9 @@ class DocCfg:
109
109
 
110
110
  def info_str(self) -> str | None:
111
111
  """String that summarizes what is known about this document."""
112
- if self.category == REPUTATION:
112
+ if self.category and not self.description:
113
+ return self.category
114
+ elif self.category == REPUTATION:
113
115
  return f"{REPUTATION_MGMT}: {self.description}"
114
116
  elif self.author and self.description:
115
117
  if self.category in [ACADEMIA, BOOK]:
epstein_files/util/env.py CHANGED
@@ -6,36 +6,41 @@ from sys import argv
6
6
 
7
7
  from epstein_files.util.logging import datefinder_logger, env_log_level, logger
8
8
 
9
- COUNT_WORDS_SCRIPT = 'count_words.py'
10
- DEFAULT_WIDTH = 154
11
- HTML_SCRIPTS = ['epstein_generate', 'generate_html.py', COUNT_WORDS_SCRIPT]
9
+ COUNT_WORDS_SCRIPT = 'epstein_word_count'
10
+ DEFAULT_WIDTH = 145
11
+ HTML_SCRIPTS = ['epstein_generate', COUNT_WORDS_SCRIPT]
12
12
 
13
13
 
14
14
  parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML page.")
15
- parser.add_argument('--build', '-b', action='store_true', help='write output to file')
16
- parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
17
- parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just interesting ones')
18
- parser.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
19
15
  parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
20
- parser.add_argument('--output-file', '-out', metavar='FILE', default='index.html', help='write output to FILE in docs/ (default=index.html)')
21
- parser.add_argument('--output-emails', '-oe', action='store_true', help='generate other files section')
22
- parser.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
23
- parser.add_argument('--output-texts', '-ot', action='store_true', help='generate other files section')
24
- parser.add_argument('--pickled', '-p', action='store_true', help='use pickled EpsteinFiles object')
25
- parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='generate new pickled EpsteinFiles object')
26
- parser.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (only used by scripts)')
27
- parser.add_argument('--sort-alphabetical', '-alpha', action='store_true', help='sort emailers alphabetically in counts table')
28
- parser.add_argument('--suppress-output', '-s', action='store_true', help='no output to terminal (use with --build)')
29
- parser.add_argument('--use-epstein-web-links', '-use', action='store_true', help='use epsteinweb.org links instead of epstein.media')
30
- parser.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use')
31
- parser.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (only used by search script)')
32
- parser.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
33
- parser.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
34
- parser.add_argument('--make-clean', '-mc', action='store_true', help='delete all build artifact HTML and JSON files')
35
- parser.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
36
- parser.add_argument('--json-metadata', '-jm', action='store_true', help='dump JSON metadata for all files')
37
- parser.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats at the end')
38
- parser.add_argument('positional_args', nargs='*', help='Optional args (only used by helper scripts)')
16
+ parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='overwrite cached EpsteinFiles')
17
+
18
+ output = parser.add_argument_group('OUTPUT')
19
+ output.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
20
+ output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
21
+ output.add_argument('--build', '-b', action='store_true', help='write output to HTML file')
22
+ output.add_argument('--make-clean', action='store_true', help='delete all HTML build artifacts and write latest URLs to .urls.env')
23
+ output.add_argument('--output-emails', '-oe', action='store_true', help='generate other files section')
24
+ output.add_argument('--output-json-files', action='store_true', help='pretty print all the raw JSON data files in the collection')
25
+ output.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
26
+ output.add_argument('--output-texts', '-ot', action='store_true', help='generate other files section')
27
+ output.add_argument('--suppress-output', action='store_true', help='no output to terminal (use with --build)')
28
+ output.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use (in characters)')
29
+ output.add_argument('--use-epstein-web-links', action='store_true', help='use epsteinweb.org links instead of epstein.media')
30
+
31
+ scripts = parser.add_argument_group('SCRIPTS', 'Arguments used only by epstein_search, epstein_show, epstein_diff')
32
+ scripts.add_argument('positional_args', nargs='*', help='strings to search for, file IDs to show or diff, etc.')
33
+ scripts.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (only used by scripts)')
34
+ scripts.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (only used by epstein_search)')
35
+
36
+ debug = parser.add_argument_group('DEBUG')
37
+ debug.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
38
+ debug.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
39
+ debug.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
40
+ debug.add_argument('--json-metadata', '-jm', action='store_true', help='dump JSON metadata for all files')
41
+ debug.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats at the end')
42
+ debug.add_argument('--sort-alphabetical', action='store_true', help='sort emailers alphabetically in counts table')
43
+ debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
39
44
  args = parser.parse_args()
40
45
 
41
46
  current_script = Path(argv[0]).name
@@ -45,7 +50,7 @@ is_html_script = current_script in HTML_SCRIPTS
45
50
  args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
46
51
  args.output_emails = args.output_emails or args.all_emails
47
52
  args.output_other_files = args.output_other_files or args.all_other_files
48
- args.pickled = args.pickled or is_env_var_set('PICKLED') or args.colors_only or len(args.names or []) > 0
53
+ args.overwrite_pickle = args.overwrite_pickle or (is_env_var_set('OVERWRITE_PICKLE') and not is_env_var_set('PICKLED'))
49
54
  args.width = args.width if is_html_script else None
50
55
  specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]
51
56
 
@@ -66,8 +71,8 @@ datefinder_logger.setLevel(logger.level)
66
71
 
67
72
  # Massage args that depend on other args to the appropriate state
68
73
  if not (args.json_metadata or args.output_texts or args.output_emails or args.output_other_files):
69
- if is_html_script and current_script != COUNT_WORDS_SCRIPT and not args.make_clean:
70
- logger.warning(f"No output section chosen; outputting default of texts, selected emails, and other files...")
74
+ if is_html_script and current_script != COUNT_WORDS_SCRIPT and not args.make_clean and not args.colors_only:
75
+ logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
71
76
 
72
77
  args.output_texts = True
73
78
  args.output_emails = True
@@ -77,4 +82,4 @@ if args.use_epstein_web_links:
77
82
  logger.warning(f"Using links to epsteinweb.org links instead of epsteinify.com...")
78
83
 
79
84
  if args.debug:
80
- logger.warning(f"Invocation args:\nis_html_script={is_html_script},\nspecified_names={specified_names},\nargs={args}")
85
+ logger.warning(f"Invocation args:\ncurrent_script={current_script}\nis_html_script={is_html_script},\nspecified_names={specified_names},\nargs={args}")
@@ -159,7 +159,7 @@ HIGHLIGHTED_NAMES = [
159
159
  pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
160
160
  emailers = {
161
161
  ALIREZA_ITTIHADIEH: 'CEO Freestream Aircraft Limited',
162
- BARBRO_C_EHNBOM: 'Swedish pharmaceuticals',
162
+ BARBRO_C_EHNBOM: 'Swedish pharmaceuticals, SALSS',
163
163
  FRED_HADDAD: "co-founder of Heck's in West Virginia",
164
164
  GERALD_BARTON: "Maryland property developer Landmark Land Company, fan of Trump's Irish golf course",
165
165
  GORDON_GETTY: 'heir of oil tycoon J. Paul Getty',
@@ -296,6 +296,7 @@ HIGHLIGHTED_NAMES = [
296
296
  emailers = {
297
297
  DAVID_STERN: f'emailed Epstein from Moscow, appears to know chairman of {DEUTSCHE_BANK}',
298
298
  JONATHAN_FARKAS: "heir to the Alexander's department store fortune",
299
+ 'linkspirit': "Skype username of someone Epstein communicated with",
299
300
  'Peter Thomas Roth': 'student of Epstein at Dalton, skincare company founder',
300
301
  STEPHEN_HANSON: None,
301
302
  TOM_BARRACK: 'long time friend of Trump',
@@ -304,7 +305,7 @@ HIGHLIGHTED_NAMES = [
304
305
  HighlightedNames(
305
306
  label='finance',
306
307
  style='green',
307
- pattern=r'Apollo|Ari\s*Glass|(Bernie\s*)?Madoff|Black(rock|stone)|BofA|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
308
+ pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
308
309
  emailers={
309
310
  AMANDA_ENS: 'Citigroup',
310
311
  DANIEL_SABBA: 'UBS Investment Bank',
@@ -587,7 +588,7 @@ HIGHLIGHTED_NAMES = [
587
588
  HighlightedText(
588
589
  label='phone_number',
589
590
  style='bright_green',
590
- pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|[\d+]{10,12}",
591
+ pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|\b[\d+]{10,12}\b",
591
592
  ),
592
593
  ]
593
594
 
@@ -1,13 +1,14 @@
1
+ import json
2
+
1
3
  from rich.padding import Padding
2
4
 
3
5
  from epstein_files.documents.email import Email
4
6
  from epstein_files.documents.messenger_log import MessengerLog
5
7
  from epstein_files.epstein_files import EpsteinFiles, count_by_month
6
- from epstein_files.util.constant.output_files import JSON_METADATA_PATH
7
- from epstein_files.util.constant import urls
8
+ from epstein_files.util.constant import output_files
8
9
  from epstein_files.util.constant.html import *
9
10
  from epstein_files.util.constant.names import *
10
- from epstein_files.util.constant.strings import EMAIL_CLASS, MESSENGER_LOG_CLASS
11
+ from epstein_files.util.constant.output_files import JSON_FILES_JSON_PATH, JSON_METADATA_PATH
11
12
  from epstein_files.util.data import dict_sets_to_lists
12
13
  from epstein_files.util.env import args, specified_names
13
14
  from epstein_files.util.logging import log_file_write, logger
@@ -108,6 +109,20 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
108
109
  return len(already_printed_emails)
109
110
 
110
111
 
112
+ def print_json_files(epstein_files: EpsteinFiles):
113
+ if args.build:
114
+ json_data = {json_file.url_slug: json_file.json_data() for json_file in epstein_files.json_files}
115
+
116
+ with open(JSON_FILES_JSON_PATH, 'w') as f:
117
+ f.write(json.dumps(json_data, sort_keys=True))
118
+ log_file_write(JSON_FILES_JSON_PATH)
119
+ else:
120
+ for json_file in epstein_files.json_files:
121
+ console.line(2)
122
+ console.print(json_file.description_panel())
123
+ console.print_json(json_file.json_str(), indent=4, sort_keys=False)
124
+
125
+
111
126
  def print_json_metadata(epstein_files: EpsteinFiles) -> None:
112
127
  json_str = epstein_files.json_metadata()
113
128
 
@@ -122,9 +137,9 @@ def print_json_metadata(epstein_files: EpsteinFiles) -> None:
122
137
  def print_json_stats(epstein_files: EpsteinFiles) -> None:
123
138
  console.line(5)
124
139
  console.print(Panel('JSON Stats Dump', expand=True, style='reverse bold'), '\n')
125
- print_json(f"{MESSENGER_LOG_CLASS} Sender Counts", MessengerLog.count_authors(epstein_files.imessage_logs), skip_falsey=True)
126
- print_json(f"{EMAIL_CLASS} Author Counts", epstein_files.email_author_counts, skip_falsey=True)
127
- print_json(f"{EMAIL_CLASS} Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
140
+ print_json(f"MessengerLog Sender Counts", MessengerLog.count_authors(epstein_files.imessage_logs), skip_falsey=True)
141
+ print_json(f"Email Author Counts", epstein_files.email_author_counts, skip_falsey=True)
142
+ print_json(f"Email Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
128
143
  print_json("Email signature_substitution_countss", epstein_files.email_signature_substitution_counts(), skip_falsey=True)
129
144
  print_json("email_author_device_signatures", dict_sets_to_lists(epstein_files.email_authors_to_device_signatures))
130
145
  print_json("email_sent_from_devices", dict_sets_to_lists(epstein_files.email_device_signatures_to_authors))
@@ -147,16 +162,12 @@ def print_text_messages(epstein_files: EpsteinFiles) -> None:
147
162
 
148
163
  def write_urls() -> None:
149
164
  """Write _URL style constant variables to a file bash scripts can load as env vars."""
150
- if args.output_file == 'index.html':
151
- logger.warning(f"Can't write env vars to '{args.output_file}', writing to '{URLS_ENV}' instead.\n")
152
- args.output_file = URLS_ENV
165
+ url_vars = {k: v for k, v in vars(output_files).items() if k.endswith('URL') and not k.startswith('GH')}
153
166
 
154
- url_vars = {
155
- k: v for k, v in vars(urls).items()
156
- if isinstance(v, str) and k.split('_')[-1] in ['URL'] and 'github.io' in v and 'BASE' not in k
157
- }
167
+ if not args.suppress_output:
168
+ console.line()
158
169
 
159
- with open(args.output_file, 'w') as f:
170
+ with open(URLS_ENV, 'w') as f:
160
171
  for var_name, url in url_vars.items():
161
172
  key_value = f"{var_name}='{url}'"
162
173
 
@@ -165,8 +176,10 @@ def write_urls() -> None:
165
176
 
166
177
  f.write(f"{key_value}\n")
167
178
 
168
- console.line()
169
- logger.warning(f"Wrote {len(url_vars)} URL variables to '{args.output_file}'\n")
179
+ if not args.suppress_output:
180
+ console.line()
181
+
182
+ logger.warning(f"Wrote {len(url_vars)} URL variables to '{URLS_ENV}'\n")
170
183
 
171
184
 
172
185
  def _verify_all_emails_were_printed(epstein_files: EpsteinFiles, already_printed_emails: list[Email]) -> None:
@@ -14,7 +14,8 @@ from rich.theme import Theme
14
14
 
15
15
  from epstein_files.util.constant.html import CONSOLE_HTML_FORMAT, HTML_TERMINAL_THEME, PAGE_TITLE
16
16
  from epstein_files.util.constant.names import UNKNOWN
17
- from epstein_files.util.constant.strings import DEFAULT, EMAIL, NA, OTHER_SITE_LINK_STYLE, QUESTION_MARKS, SiteType
17
+ from epstein_files.util.constant.output_files import SITE_URLS
18
+ from epstein_files.util.constant.strings import DEFAULT, EMAIL, NA, QUESTION_MARKS, TEXT_MESSAGE, SiteType
18
19
  from epstein_files.util.constant.urls import *
19
20
  from epstein_files.util.constants import FALLBACK_TIMESTAMP, HEADER_ABBREVIATIONS
20
21
  from epstein_files.util.data import json_safe
@@ -31,11 +32,22 @@ GREY_NUMBERS = [58, 39, 39, 35, 30, 27, 23, 23, 19, 19, 15, 15, 15]
31
32
  DEFAULT_NAME_STYLE = 'gray46'
32
33
  KEY_STYLE='honeydew2 bold'
33
34
  SECTION_HEADER_STYLE = 'bold white on blue3'
34
- SOCIAL_MEDIA_LINK_STYLE = 'cyan3 bold'
35
+ SOCIAL_MEDIA_LINK_STYLE = 'pale_turquoise4'
35
36
  SUBSTACK_POST_LINK_STYLE = 'bright_cyan'
36
37
  SYMBOL_STYLE = 'grey70'
38
+ TABLE_BORDER_STYLE = 'grey46'
39
+ TABLE_TITLE_STYLE = f"gray85 italic"
37
40
  TITLE_STYLE = 'black on bright_white bold'
38
41
 
42
+ AUX_SITE_LINK_STYLE = 'dark_orange3'
43
+ OTHER_SITE_LINK_STYLE = 'dark_goldenrod'
44
+
45
+ DEFAULT_TABLE_KWARGS = {
46
+ 'border_style': TABLE_BORDER_STYLE,
47
+ 'header_style': "bold",
48
+ 'title_style': TABLE_TITLE_STYLE,
49
+ }
50
+
39
51
  HIGHLIGHTED_GROUP_COLOR_KEYS = [
40
52
  Text(highlight_group.label.replace('_', ' '), style=highlight_group.style)
41
53
  for highlight_group in sorted(HIGHLIGHTED_NAMES, key=lambda hg: hg.label)
@@ -79,7 +91,11 @@ def build_highlighter(pattern: str) -> EpsteinHighlighter:
79
91
  return TempHighlighter()
80
92
 
81
93
 
82
- def join_texts(txts: list[Text], join: str = ' ', encloser: str = '') -> Text:
94
+ def build_table(title: str | None, **kwargs) -> Table:
95
+ return Table(title=title, **{**DEFAULT_TABLE_KWARGS, **kwargs})
96
+
97
+
98
+ def join_texts(txts: list[Text], join: str = ' ', encloser: str = '', encloser_style: str = 'wheat4') -> Text:
83
99
  """Join rich.Text objs into one."""
84
100
  if encloser:
85
101
  if len(encloser) != 2:
@@ -91,8 +107,9 @@ def join_texts(txts: list[Text], join: str = ' ', encloser: str = '') -> Text:
91
107
 
92
108
  txt = Text('')
93
109
 
94
- for i, link in enumerate(txts):
95
- txt.append(join if i >= 1 else '').append(enclose_start).append(link).append(enclose_end)
110
+ for i, _txt in enumerate(txts):
111
+ txt.append(join if i >= 1 else '').append(enclose_start, style=encloser_style)
112
+ txt.append(_txt).append(enclose_end, style=encloser_style)
96
113
 
97
114
  return txt
98
115
 
@@ -132,7 +149,7 @@ def print_centered_link(url: str, link_text: str, style: str | None = None) -> N
132
149
 
133
150
 
134
151
  def print_color_key() -> None:
135
- color_table = Table(title=f'Rough Guide to Highlighted Colors', show_header=False)
152
+ color_table = build_table('Rough Guide to Highlighted Colors', show_header=False)
136
153
  num_colors = len(HIGHLIGHTED_GROUP_COLOR_KEYS)
137
154
  row_number = 0
138
155
 
@@ -164,7 +181,7 @@ def print_header(epstein_files: 'EpsteinFiles') -> None:
164
181
  print_centered(f"if you think there's an attribution error or can deanonymize an {UNKNOWN} contact {CRYPTADAMUS_TWITTER}", 'grey46')
165
182
  print_centered('note this site is based on the OCR text provided by Congress which is not always the greatest', 'grey23')
166
183
  print_centered(f"(thanks to {link_markup('https://x.com/ImDrinknWyn', '@ImDrinknWyn', 'dodger_blue3')} + others for help attributing redacted emails)")
167
- print_centered_link(ATTRIBUTIONS_URL, "(some explanations of author attributions)", style='magenta')
184
+ print_centered_link(JSON_METADATA_URL, "(explanations of author attributions)", style='magenta')
168
185
 
169
186
 
170
187
  def print_json(label: str, obj: object, skip_falsey: bool = False) -> None:
@@ -231,24 +248,28 @@ def print_other_site_link(is_header: bool = True) -> None:
231
248
  other_site_msg += f" Epstein's {other_site_type}s also generated by this code"
232
249
  markup_msg = link_markup(SITE_URLS[other_site_type], other_site_msg, OTHER_SITE_LINK_STYLE)
233
250
  print_centered(parenthesize(Text.from_markup(markup_msg)), style='bold')
234
- word_count_link = link_text_obj(WORD_COUNT_URL, 'site showing the most frequently used words in these communiques', OTHER_SITE_LINK_STYLE)
235
- print_centered(parenthesize(word_count_link))
236
- metadata_link = link_text_obj(JSON_METADATA_URL, 'metadata with author attribution explanations', OTHER_SITE_LINK_STYLE)
237
- print_centered(parenthesize(metadata_link))
251
+
252
+ if is_header:
253
+ word_count_link = link_text_obj(WORD_COUNT_URL, 'most frequently used words in the emails and texts', AUX_SITE_LINK_STYLE)
254
+ print_centered(parenthesize(word_count_link))
255
+ metadata_link = link_text_obj(JSON_METADATA_URL, 'author attribution explanations', AUX_SITE_LINK_STYLE)
256
+ print_centered(parenthesize(metadata_link))
257
+ json_link = link_text_obj(WORD_COUNT_URL, "epstein's json files", AUX_SITE_LINK_STYLE)
258
+ print_centered(parenthesize(json_link))
238
259
 
239
260
 
240
261
  def print_page_title(expand: bool = True, width: int | None = None) -> None:
241
262
  title_panel = Panel(Text(PAGE_TITLE, justify='center'), expand=expand, style=TITLE_STYLE, width=width)
242
263
  console.print(Align.center(vertically_pad(title_panel)))
243
- print_social_media_links()
264
+ _print_social_media_links()
244
265
  console.line(2)
245
266
 
246
267
 
247
268
  def print_panel(msg: str, style: str = 'black on white', padding: tuple | None = None, centered: bool = False) -> None:
248
269
  _padding: list[int] = list(padding or [0, 0, 0, 0])
249
270
  _padding[2] += 1 # Bottom pad
250
- panel = Panel(Text.from_markup(msg, justify='center'), width=70, style=style)
251
271
  actual_padding: tuple[int, int, int, int] = tuple(_padding)
272
+ panel = Panel(Text.from_markup(msg, justify='center'), width=70, style=style)
252
273
 
253
274
  if centered:
254
275
  console.print(Align.center(Padding(panel, actual_padding)))
@@ -262,19 +283,6 @@ def print_section_header(msg: str, style: str = SECTION_HEADER_STYLE, is_centere
262
283
  console.print(Padding(panel, (3, 0, 1, 0)))
263
284
 
264
285
 
265
- def print_social_media_links() -> None:
266
- print_centered_link(SUBSTACK_URL, "I Made Epstein's Text Messages Great Again (And You Should Read Them)", style=f'{SUBSTACK_POST_LINK_STYLE} bold')
267
- print_centered_link(SUBSTACK_URL, SUBSTACK_URL.removeprefix('https://'), style=f'{SUBSTACK_POST_LINK_STYLE} dim')
268
-
269
- social_links = [
270
- link_text_obj('https://x.com/Cryptadamist/status/1990866804630036988', '@cryptadamist', style=SOCIAL_MEDIA_LINK_STYLE),
271
- link_text_obj('https://cryptadamus.substack.com/', 'substack', style=SOCIAL_MEDIA_LINK_STYLE),
272
- link_text_obj('https://universeodon.com/@cryptadamist/115572634993386057', 'mastodon', style=SOCIAL_MEDIA_LINK_STYLE),
273
- ]
274
-
275
- print_centered(join_texts(social_links, join=' ', encloser='[]'))
276
-
277
-
278
286
  def print_starred_header(msg: str, num_stars: int = 7, num_spaces: int = 2, style: str = TITLE_STYLE) -> None:
279
287
  stars = '*' * num_stars
280
288
  spaces = ' ' * num_spaces
@@ -314,7 +322,7 @@ def write_html(output_path: Path) -> None:
314
322
 
315
323
 
316
324
  def _print_abbreviations_table() -> None:
317
- table = Table(title="Abbreviations Used Frequently In These Conversations", header_style="bold", show_header=False)
325
+ table = build_table(title="Abbreviations Used Frequently In These Conversations", show_header=False)
318
326
  table.add_column("Abbreviation", justify="center", style='bold')
319
327
  table.add_column("Translation", style="white", justify="center")
320
328
 
@@ -326,7 +334,7 @@ def _print_abbreviations_table() -> None:
326
334
 
327
335
  def _print_external_links() -> None:
328
336
  console.line()
329
- print_starred_header('External Links', num_stars=0, num_spaces=20, style=f"italic")
337
+ print_centered(Text('External Links', style=TABLE_TITLE_STYLE))
330
338
  presser_link = link_text_obj(OVERSIGHT_REPUBLICANS_PRESSER_URL, 'Official Oversight Committee Press Release')
331
339
  raw_docs_link = join_texts([link_text_obj(RAW_OVERSIGHT_DOCS_GOOGLE_DRIVE_URL, 'raw files', style=f"{ARCHIVE_LINK_COLOR} dim")], encloser='()')
332
340
  print_centered(join_texts([presser_link, raw_docs_link]))
@@ -335,6 +343,26 @@ def _print_external_links() -> None:
335
343
  print_centered(link_markup(COURIER_NEWSROOM_ARCHIVE_URL, 'Searchable Archive') + " (Courier Newsroom)")
336
344
  print_centered(link_markup(EPSTEINIFY_URL) + " (raw document images)")
337
345
  print_centered(link_markup(EPSTEIN_WEB_URL) + " (character summaries)")
346
+ print_centered(link_markup(EPSTEIN_MEDIA_URL) + " (raw document images)")
347
+
348
+
349
+ def _print_social_media_links() -> None:
350
+ print_centered_link(
351
+ SUBSTACK_URL,
352
+ "I Made Epstein's Text Messages Great Again (And You Should Read Them)",
353
+ style=f'{SUBSTACK_POST_LINK_STYLE} bold'
354
+ )
355
+
356
+ print_centered_link(SUBSTACK_URL, SUBSTACK_URL.removeprefix('https://'), style=f'{SUBSTACK_POST_LINK_STYLE} dim')
357
+
358
+ social_links = [
359
+ link_text_obj('https://universeodon.com/@cryptadamist/115572634993386057', '@mastodon', style=SOCIAL_MEDIA_LINK_STYLE),
360
+ link_text_obj(SUBSTACK_URL, '@substack', style=SOCIAL_MEDIA_LINK_STYLE),
361
+ link_text_obj('https://x.com/Cryptadamist/status/1990866804630036988', '@twitter', style=SOCIAL_MEDIA_LINK_STYLE),
362
+ link_text_obj('https://github.com/michelcrypt4d4mus/epstein_text_messages', '@github', style=SOCIAL_MEDIA_LINK_STYLE)
363
+ ]
364
+
365
+ print_centered(join_texts(social_links, join=' / '))#, encloser='()'))#, encloser='‹›'))
338
366
 
339
367
 
340
368
  # if args.deep_debug:
@@ -20,6 +20,14 @@ from epstein_files.util.search_result import SearchResult
20
20
  FIRST_AND_LAST_NAMES = flatten([n.split() for n in ALL_NAMES])
21
21
  FIRST_AND_LAST_NAMES = [n.lower() for n in FIRST_AND_LAST_NAMES] + OTHER_NAMES
22
22
 
23
+ HTML_REGEX = re.compile(r"com/|cae-v2w=|content-(transfe|type)|font(/|-(family|size))|http|\.html?\??|margin-bottom|padding-left|quoted-printable|region=|text-decoration|ttps|www|\.(gif|jpe?g|png);?$")
24
+ HYPHENATED_WORD_REGEX = re.compile(r"[a-z]+-[a-z]+", re.IGNORECASE)
25
+ OK_SYMBOL_WORDS = ['mar-a-lago', 'p/e', 's&p', ':)', ':).', ';)', ':-)', ';-)']
26
+ ONLY_SYMBOLS_REGEX = re.compile(r"^[^a-zA-Z0-9]+$")
27
+ SYMBOL_WORD_REGEX = re.compile(r"^[-—–@%/?.,&=]+$")
28
+ SPLIT_WORDS_BY = ['@', '/']
29
+ FLAGGED_WORDS = [] # For debugging, log extra info when one of these is encountered
30
+
23
31
  NON_SINGULARIZABLE = UNSINGULARIZABLE_WORDS + [n for n in FIRST_AND_LAST_NAMES if n.endswith('s')]
24
32
  SKIP_WORDS_REGEX = re.compile(r"^(asmallworld@|enwiki|http|imagepng|nymagcomnymetro|addresswww|mailto|www|/font|colordu|classdms|targetdblank|nymagcom|palmbeachdailynews)|jee[vy]acation|fontfamily|(gif|html?|jpe?g|utm)$")
25
33
  BAD_CHARS_REGEX = re.compile(r"[-–=+()$€£©°«—^&%!#_`,.;:'‘’\"„“”?\d\\]")
@@ -100,21 +108,13 @@ SINGULARIZATIONS = {
100
108
  'twittercom': 'twitter',
101
109
  }
102
110
 
103
- HTML_REGEX = re.compile(r"com/|cae-v2w=|content-(transfe|type)|font(/|-(family|size))|http|\.html?\??|margin-bottom|padding-left|quoted-printable|region=|text-decoration|ttps|www|\.(gif|jpe?g|png);?$")
104
- HYPHENATED_WORD_REGEX = re.compile(r"[a-z]+-[a-z]+", re.IGNORECASE)
105
- OK_SYMBOL_WORDS = ['mar-a-lago', 'p/e', 's&p', ':)', ':).', ';)', ':-)', ';-)']
106
- SYMBOL_WORD_REGEX = re.compile(r"^[-—–@%/?.,&=]+$")
107
- ONLY_SYMBOLS_REGEX = re.compile(r"^[^a-zA-Z0-9]+$")
108
- SPLIT_WORDS_BY = ['@', '/']
109
- FLAGGED_WORDS = [] # For debugging, log extra info when one of these is encountered
110
-
111
111
 
112
112
  @dataclass
113
113
  class WordCount:
114
114
  count: dict[str, int] = field(default_factory=lambda: defaultdict(int))
115
115
  singularized: dict[str, int] = field(default_factory=lambda: defaultdict(int))
116
116
 
117
- def count_word(self, word: str, document_line: SearchResult) -> None:
117
+ def tally_word(self, word: str, document_line: SearchResult) -> None:
118
118
  word = EmailHeader.cleanup_str(word).lower().strip()
119
119
  raw_word = word
120
120
 
@@ -148,7 +148,7 @@ class WordCount:
148
148
  continue
149
149
 
150
150
  for w in word.split(symbol):
151
- self.count_word(w, document_line)
151
+ self.tally_word(w, document_line)
152
152
 
153
153
  logger.info(f" Split word with '{symbol}' in it '{word}'...")
154
154
  return