epstein-files 1.1.2__py3-none-any.whl → 1.1.5__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -24,7 +24,7 @@ HEADER_ABBREVIATIONS = {
24
24
  'Jared': "Jared Kushner",
25
25
  'Jagland': 'Thorbjørn Jagland (former Norwegian prime minister)',
26
26
  'JEGE': "Epstein's airplane holding company",
27
- 'Jeffrey Wernick': 'right wing crypto bro, former COO of Parler',
27
+ JEFFREY_WERNICK: 'right wing crypto bro, former COO of Parler',
28
28
  'Joi': f"Joi Ito ({MIT_MEDIA_LAB}, MIT Digital Currency Initiative)",
29
29
  "Hoffenberg": f"{STEVEN_HOFFENBERG} (Epstein's ponzi scheme partner)",
30
30
  'KSA': "Kingdom of Saudi Arabia",
@@ -89,11 +89,11 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
89
89
  JESSICA_CADWELL: re.compile(r'Jessica Cadwell?', re.IGNORECASE),
90
90
  JOHNNY_EL_HACHEM: re.compile(r'el hachem johnny|johnny el hachem', re.IGNORECASE),
91
91
  JOI_ITO: re.compile(r'ji@media.mit.?edu|(joichi|joi)( Ito)?', re.IGNORECASE),
92
- JONATHAN_FARKAS: re.compile(r'Jonathan Farka(s|il)', re.IGNORECASE),
92
+ JONATHAN_FARKAS: re.compile(r'Jonathan Fark(a|u)(s|il)', re.IGNORECASE),
93
93
  KATHRYN_RUEMMLER: re.compile(r'Kathr?yn? Ruemmler?', re.IGNORECASE),
94
94
  KEN_STARR: re.compile(r'starr, ken|Ken(neth\s*(W.\s*)?)?\s+starr?|starr', re.IGNORECASE),
95
95
  LANDON_THOMAS: re.compile(r'lando[nr] thomas( jr)?|thomas jr.?, lando[nr]', re.IGNORECASE),
96
- LARRY_SUMMERS: re.compile(r'(La(wrence|rry).{1,5})?Summers?|^LH$|LHS|Ihsofficel', re.IGNORECASE),
96
+ LARRY_SUMMERS: re.compile(r'(La(wrence|rry).{1,5})?Summers?|^LH$|LHS|[Il]hsofficel?', re.IGNORECASE),
97
97
  LAWRANCE_VISOSKI: re.compile(r'La(rry|wrance) Visoski?|Lvjet', re.IGNORECASE),
98
98
  LAWRENCE_KRAUSS: re.compile(r'Lawrence Kraus[es]?|[jl]awkrauss|kruase', re.IGNORECASE),
99
99
  LEON_BLACK: re.compile(r'Leon\s*Black?|(?<!Marc )Leon(?! (Botstein|Jaworski|Wieseltier))', re.IGNORECASE),
@@ -309,8 +309,47 @@ IRAN_DEAL_RECIPIENTS = ['Allen West', 'Rafael Bardaji', 'Philip Kafka', 'Herb Go
309
309
  FLIGHT_IN_2012_PEOPLE = ['Francis Derby', 'Januiz Banasiak', 'Louella Rabuyo', 'Richard Barnnet']
310
310
 
311
311
  EMAILS_CONFIG = [
312
+ # 026294 and 026296 might also be Ittihadieh based on timing
312
313
  EmailCfg(id='032436', author=ALIREZA_ITTIHADIEH, attribution_reason='Signature'),
314
+ # 032542 026078 026080 026083 026086 026090 might also be Anas based on discussion of Dubai and Kuwait
313
315
  EmailCfg(id='032543', author=ANAS_ALRASHEED, attribution_reason='Later reply 033000 has quote'),
316
+ EmailCfg(id='026167', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
317
+ EmailCfg(id='032571', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
318
+ EmailCfg(id='032573', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
319
+ EmailCfg(id='032575', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
320
+ EmailCfg(id='032577', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
321
+ EmailCfg(id='032579', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
322
+ EmailCfg(id='032582', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
323
+ EmailCfg(id='032585', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
324
+ EmailCfg(id='032588', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
325
+ EmailCfg(id='032591', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
326
+ EmailCfg(id='032595', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
327
+ EmailCfg(id='032599', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
328
+ EmailCfg(id='032611', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
329
+ EmailCfg(id='023661', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
330
+ EmailCfg(id='032616', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
331
+ EmailCfg(id='032622', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
332
+ EmailCfg(id='032628', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
333
+ EmailCfg(id='032629', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
334
+ EmailCfg(id='032631', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
335
+ EmailCfg(id='026168', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
336
+ EmailCfg(id='026170', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
337
+ EmailCfg(id='026173', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
338
+ EmailCfg(id='026176', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
339
+ EmailCfg(id='026180', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
340
+ EmailCfg(id='026184', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
341
+ EmailCfg(id='026188', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
342
+ EmailCfg(id='026193', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
343
+ EmailCfg(id='026198', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
344
+ EmailCfg(id='026210', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
345
+ EmailCfg(id='026204', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
346
+ EmailCfg(id='032660', author=ANAS_ALRASHEED, attribution_reason='name visible in 029113 reply'),
347
+ EmailCfg(id='032663', author=ANAS_ALRASHEED, attribution_reason='name visible in 029113 reply'),
348
+ EmailCfg(id='032667', author=ANAS_ALRASHEED, attribution_reason='name visible in 029113 reply'),
349
+ EmailCfg(id='032672', author=ANAS_ALRASHEED, attribution_reason='name visible in 029113 reply'),
350
+ EmailCfg(id='032676', author=ANAS_ALRASHEED, attribution_reason='name visible in 029113 reply'),
351
+ EmailCfg(id='026237', author=ANAS_ALRASHEED, attribution_reason='name visible in 029113 reply'),
352
+ EmailCfg(id='032682', author=ANAS_ALRASHEED, attribution_reason='name visible in 029113 reply'),
314
353
  EmailCfg(id='026064', author=ARIANE_DE_ROTHSCHILD, attribution_reason='signature'),
315
354
  EmailCfg(id='026069', author=ARIANE_DE_ROTHSCHILD, attribution_reason='signature'),
316
355
  EmailCfg(id='030741', author=ARIANE_DE_ROTHSCHILD, attribution_reason='signature'),
@@ -351,6 +390,7 @@ EMAILS_CONFIG = [
351
390
  actual_text='',
352
391
  author=DARREN_INDYKE,
353
392
  description=f"heavily redacted email, quoted replies are from {STEVEN_HOFFENBERG} about James Patterson's book",
393
+ recipients=['Charles Michael'],
354
394
  timestamp=parse('2016-08-17 11:26:00'),
355
395
  attribution_reason='Quoted replies are in 019109',
356
396
  ),
@@ -444,10 +484,20 @@ EMAILS_CONFIG = [
444
484
  EmailCfg(id='017581', author='Lisa Randall', attribution_reason='reply header'),
445
485
  EmailCfg(id='026609', author='Mark Green', attribution_reason='Actually a fwd, Mark Green is in signature'),
446
486
  EmailCfg(id='030472', author=MARTIN_WEINBERG, attribution_reason='Maybe. in reply', is_attribution_uncertain=True),
487
+ EmailCfg(id='032563', author=MASHA_DROKOVA, attribution_reason='replied to in 033014'),
488
+ EmailCfg(id='032564', author=MASHA_DROKOVA, attribution_reason='follow up to 032563 about huffpo article with link'),
489
+ EmailCfg(id='031544', author=MASHA_DROKOVA, attribution_reason='follow up to 032563 about huffpo article with link'),
490
+ EmailCfg(id='032605', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
491
+ EmailCfg(id='032606', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
492
+ EmailCfg(id='032607', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
493
+ EmailCfg(id='032609', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
494
+ # 032581, 032604, 033025 may also be Masha based on timing, subject (interviews/articles), and sequential ID
495
+ EmailCfg(id='032604', author=MASHA_DROKOVA, attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
496
+ EmailCfg(id='032581', author=MASHA_DROKOVA, attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
497
+ EmailCfg(id='033025', author=MASHA_DROKOVA, attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
447
498
  EmailCfg(id='030235', author=MELANIE_WALKER, attribution_reason='In fwd'),
448
499
  EmailCfg(id='032343', author=MELANIE_WALKER, attribution_reason='Name seen in later reply 032346'),
449
500
  EmailCfg(id='032212', author=MIROSLAV_LAJCAK, attribution_reason='signature'),
450
- EmailCfg(id='022193', author=NADIA_MARCINKO, attribution_reason='reply'),
451
501
  EmailCfg(id='021814', author=NADIA_MARCINKO, attribution_reason='reply'),
452
502
  EmailCfg(id='021808', author=NADIA_MARCINKO, attribution_reason='reply'),
453
503
  EmailCfg(id='022190', author=NADIA_MARCINKO, attribution_reason='reply'),
@@ -508,12 +558,14 @@ EMAILS_CONFIG = [
508
558
  author=TERRY_KAFKA,
509
559
  fwded_text_after='From: Mike Cohen',
510
560
  recipients=cast(list[str | None], [JEFFREY_EPSTEIN, MARK_EPSTEIN, MICHAEL_BUCHHOLTZ] + IRAN_DEAL_RECIPIENTS),
561
+ subject='Fw: The Iran Nuclear Deal',
511
562
  duplicate_ids=['028482'],
512
563
  ),
513
564
  EmailCfg(id='029992', author=TERRY_KAFKA, attribution_reason='Quoted reply'),
514
565
  EmailCfg(id='029985', author=TERRY_KAFKA, attribution_reason='Quoted reply in 029992'),
515
566
  EmailCfg(id='020666', author=TERRY_KAFKA, attribution_reason="Ends with 'Terry'"),
516
567
  EmailCfg(id='026014', author=ZUBAIR_KHAN, recipients=[JEFFREY_EPSTEIN], timestamp=parse('2016-11-04 17:46:00')),
568
+ EmailCfg(id='033021', recipients=[ANAS_ALRASHEED], attribution_reason='visible in 033022'),
517
569
  EmailCfg(id='027063', recipients=[ANTHONY_BARRETT]),
518
570
  EmailCfg(id='030764', recipients=[ARIANE_DE_ROTHSCHILD], attribution_reason='Reply'),
519
571
  EmailCfg(id='026431', recipients=[ARIANE_DE_ROTHSCHILD], attribution_reason='Reply'),
@@ -538,6 +590,7 @@ EMAILS_CONFIG = [
538
590
  EmailCfg(id='032780', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
539
591
  EmailCfg(id='029324', recipients=[JEFFREY_EPSTEIN, "Jojo Fontanilla", "Lyn Fontanilla"]), # Bad OCR (nofix)
540
592
  EmailCfg(id='013482', recipients=[JEFFREY_EPSTEIN], is_fwded_article=True), # other recipients redacted. "The view from the US: Stem cell therapy steps up a gear with firs"
593
+ EmailCfg(id='029558', recipients=[JEFFREY_EPSTEIN, KATHERINE_KEATING], attribution_reason='BCC', fwded_text_after='Creativity is central'),
541
594
  EmailCfg(id='033456', recipients=["Joel"], attribution_reason='Reply'),
542
595
  EmailCfg(id='033458', recipients=["Joel"], attribution_reason='Reply'),
543
596
  EmailCfg(id='033460', recipients=["Joel"], attribution_reason='Reply'),
@@ -566,10 +619,13 @@ EMAILS_CONFIG = [
566
619
  EmailCfg(id='033466', recipients=[LAWRANCE_VISOSKI], attribution_reason='Reply signature'),
567
620
  EmailCfg(id='022250', recipients=[LESLEY_GROFF], attribution_reason='Reply'),
568
621
  EmailCfg(id='030242', recipients=[MARIANA_IDZKOWSKA], duplicate_ids=['032048'], dupe_type='redacted'),
622
+ EmailCfg(id='033027', recipients=[MASHA_DROKOVA], attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
569
623
  EmailCfg(id='030368', recipients=[MELANIE_SPINELLA], attribution_reason='Actually a self fwd from jeffrey to jeffrey'),
570
624
  EmailCfg(id='030369', recipients=[MELANIE_SPINELLA], attribution_reason='Actually a self fwd from jeffrey to jeffrey'),
571
625
  EmailCfg(id='030371', recipients=[MELANIE_SPINELLA], attribution_reason='Actually a self fwd from jeffrey to jeffrey'),
572
626
  EmailCfg(id='022258', recipients=[NADIA_MARCINKO], attribution_reason='Reply header'),
627
+ EmailCfg(id='022193', recipients=[NADIA_MARCINKO], attribution_reason='reply'),
628
+ EmailCfg(id='030572', recipients=[PAULA], attribution_reason='quoted in 030482', is_attribution_uncertain=True),
573
629
  EmailCfg(id='030506', recipients=[PAULA], attribution_reason=PAULA_REASON, is_attribution_uncertain=True),
574
630
  EmailCfg(id='030507', recipients=[PAULA], attribution_reason=PAULA_REASON, is_attribution_uncertain=True),
575
631
  EmailCfg(id='030508', recipients=[PAULA], attribution_reason=PAULA_REASON, is_attribution_uncertain=True),
@@ -585,8 +641,9 @@ EMAILS_CONFIG = [
585
641
  EmailCfg(id='032358', actual_text=REDACTED), # Completely redacted
586
642
  EmailCfg(id='033050', actual_text='schwartman'),
587
643
  EmailCfg(id='022219', description="discussion of attempts to clean up Epstein's Google search results"),
588
- EmailCfg(id='031333', is_fwded_article=True, description='looks like a Russian disinfo article'), # Russia Says IMF Chief Jailed For Discovering All US Gold is Gone
589
- EmailCfg(id='031335', is_fwded_article=True, description='looks like a Russian disinfo article'), # DOMINQUE STRAUSS-KAHN ARRESTED, NOT BECAUSE HE RAPED A MAID, BUT BECAUSE HE HAD EVIDENCE US HAS NO GOLD IN FORT KNOX.
644
+ EmailCfg(id='028524', is_fwded_article=True, description='Zach Braff op-ed on Woody Allen in NYT'),
645
+ EmailCfg(id='031333', is_fwded_article=True, description='Fort Knox conspiracy theory, looks like a Russian disinfo article'), # Russia Says IMF Chief Jailed For Discovering All US Gold is Gone
646
+ EmailCfg(id='031335', is_fwded_article=True, description='Fort Knox conspiracy theory, looks like a Russian disinfo article'), # DOMINQUE STRAUSS-KAHN ARRESTED, NOT BECAUSE HE RAPED A MAID, BUT BECAUSE HE HAD EVIDENCE US HAS NO GOLD IN FORT KNOX.
590
647
  EmailCfg(id='023627', is_fwded_article=True, description=MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT),
591
648
  EmailCfg(id='026298', is_fwded_article=True, duplicate_ids=['026499']), # Written by someone else?
592
649
  EmailCfg(id='029692', is_fwded_article=True, duplicate_ids=['029779']), # WaPo article
@@ -640,7 +697,6 @@ EMAILS_CONFIG = [
640
697
  EmailCfg(id='031472', is_fwded_article=True), # WSJ: Lawyers for Imam Wanted by Turkish authorities Fear for Their Client's Life
641
698
  EmailCfg(id='012684', is_fwded_article=True), # Trump in talks to buy socialite Kluge's Charlottesville vineyard
642
699
  EmailCfg(id='028536', is_fwded_article=True), # Palm Beach Post FBI Epstein files say he gave info. Does it explain sweetheart deal?
643
- EmailCfg(id='028524', is_fwded_article=True), # Zach Braff article on Woody Allen in NYT
644
700
  EmailCfg(id='030326', is_fwded_article=True), # NYP Congressional candidate compares Melania Trump to prostitute
645
701
  EmailCfg(id='030519', is_fwded_article=True), # Daily Mail on Prince Andrew
646
702
  EmailCfg(id='030878', is_fwded_article=True), # Steve Bannon almost appeared in Michael Moore's 'Fahrenheit 11/9'
@@ -667,6 +723,7 @@ EMAILS_CONFIG = [
667
723
  EmailCfg(id='029841', duplicate_ids=['012711'], dupe_type='redacted'),
668
724
  EmailCfg(id='030414', duplicate_ids=['030578'], dupe_type='redacted'),
669
725
  EmailCfg(id='031135', duplicate_ids=['030634'], dupe_type='redacted'),
726
+ EmailCfg(id='030620', duplicate_ids=['023067']),
670
727
  EmailCfg(id='029835', duplicate_ids=['028968']),
671
728
  EmailCfg(id='033512', duplicate_ids=['033361']),
672
729
  EmailCfg(id='030299', duplicate_ids=['021794']),
@@ -768,7 +825,6 @@ EMAILS_CONFIG = [
768
825
  # Emails that need a little help determining how to separate the actual text from fwded text
769
826
  EmailCfg(id='013415', fwded_text_after='Darren K. Indyke'),
770
827
  EmailCfg(id='024624', fwded_text_after='On Tue, May 14'),
771
- EmailCfg(id='029558', fwded_text_after='Creativity is central'),
772
828
  EmailCfg(id='025888', fwded_text_after='Jul 24, 2015'),
773
829
  EmailCfg(id='016413', fwded_text_after='In a former warehouse'),
774
830
  EmailCfg(id='025548', fwded_text_after='Edward Jay Epstein'),
@@ -1622,4 +1678,4 @@ REPLY_LINE_ON_NUMERIC_DATE_PATTERN = fr"On \d+/\d+/\d+[, ].*{REPLY_LINE_ENDING_P
1622
1678
  REPLY_LINE_ON_DATE_PATTERN = fr"^On (\d+ )?((Mon|Tues?|Wed(nes)?|Thu(rs)?|Fri|Sat(ur)?|Sun)(day)?|(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*)[, ].*{REPLY_LINE_ENDING_PATTERN}"
1623
1679
  REPLY_LINE_PATTERN = rf"({REPLY_LINE_IN_A_MSG_PATTERN}|{REPLY_LINE_ON_NUMERIC_DATE_PATTERN}|{REPLY_LINE_ON_DATE_PATTERN}|{FORWARDED_LINE_PATTERN})"
1624
1680
  REPLY_REGEX = re.compile(REPLY_LINE_PATTERN, re.IGNORECASE | re.MULTILINE)
1625
- SENT_FROM_REGEX = re.compile(r'^(?:(Please forgive|Sorry for all the) typos.{1,4})?(Sent (from|via).*(and string|AT&T|Droid|iPad|Phone|Mail|BlackBerry(.*(smartphone|device|Handheld|AT&T|T- ?Mobile))?)\.?)', re.M | re.I)
1681
+ SENT_FROM_REGEX = re.compile(r'^(?:(Please forgive|Sorry for all the) typos.{1,4})?((Envoyé de mon|Sent (from|via)).*(and string|AT&T|Droid|iPad|Phone|Mail|BlackBerry(.*(smartphone|device|Handheld|AT&T|T- ?Mobile))?)\.?)', re.M | re.I)
@@ -8,6 +8,7 @@ from dateutil import tz
8
8
  from typing import TypeVar
9
9
 
10
10
  from epstein_files.util.constant import names
11
+ from epstein_files.util.constant.strings import QUESTION_MARKS
11
12
  from epstein_files.util.env import args
12
13
  from epstein_files.util.logging import logger
13
14
 
@@ -41,7 +42,7 @@ def extract_last_name(name: str) -> str:
41
42
  if ' ' not in name:
42
43
  return name
43
44
 
44
- names = name.split()
45
+ names = name.removesuffix(QUESTION_MARKS).strip().split()
45
46
 
46
47
  if names[-1].startswith('Jr') and len(names[-1]) <= 3:
47
48
  return ' '.join(names[-2:])
@@ -49,6 +50,13 @@ def extract_last_name(name: str) -> str:
49
50
  return names[-1]
50
51
 
51
52
 
53
+ def extract_first_name(name: str) -> str:
54
+ if ' ' not in name:
55
+ return name
56
+
57
+ return name.removesuffix(f" {extract_last_name(name)}")
58
+
59
+
52
60
  def flatten(_list: list[list[T]]) -> list[T]:
53
61
  return list(itertools.chain.from_iterable(_list))
54
62
 
@@ -95,9 +95,13 @@ class DocCfg:
95
95
  def complete_description(self) -> str | None:
96
96
  """String that summarizes what is known about this document."""
97
97
  if self.category and not self.description and not self.author:
98
- return self.category
98
+ if self.category == JUNK:
99
+ return None
100
+ else:
101
+ return self.category
99
102
  elif self.category == REPUTATION:
100
- return f"{REPUTATION_MGMT}: {self.description}"
103
+ author_str = f"{self.author} " if self.author else ''
104
+ return f"{REPUTATION_MGMT}: {author_str}{self.description}"
101
105
  elif self.category == SKYPE_LOG:
102
106
  msg = f"{self.category} of conversation with {self.author}" if self.author else self.category
103
107
  return f"{msg} {self.description}" if self.description else msg
@@ -206,11 +210,13 @@ class EmailCfg(CommunicationCfg):
206
210
  fwded_text_after (str | None): If set, any text after this is a fwd of an article or similar
207
211
  is_fwded_article (bool): True if this is a newspaper article someone fwded. Used to exclude articles from word counting.
208
212
  recipients (list[str | None]): Who received the email
213
+ subject (str): Subject line
209
214
  """
210
215
  actual_text: str | None = None
211
216
  fwded_text_after: str | None = None
212
217
  is_fwded_article: bool = False
213
218
  recipients: list[str | None] = field(default_factory=list)
219
+ subject: str | None = None
214
220
 
215
221
  # This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
216
222
  def __repr__(self) -> str:
epstein_files/util/env.py CHANGED
@@ -5,9 +5,11 @@ from pathlib import Path
5
5
 
6
6
  from rich_argparse_plus import RichHelpFormatterPlus
7
7
 
8
+ from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, CHRONOLOGICAL_EMAILS_PATH, TEXT_MSGS_HTML_PATH
8
9
  from epstein_files.util.logging import env_log_level, exit_with_error, logger
9
10
 
10
11
  DEFAULT_WIDTH = 145
12
+ DEFAULT_FILE = 'default_file'
11
13
  EPSTEIN_GENERATE = 'epstein_generate'
12
14
  HTML_SCRIPTS = [EPSTEIN_GENERATE, 'epstein_word_count']
13
15
 
@@ -34,7 +36,7 @@ parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='re-p
34
36
  output = parser.add_argument_group('OUTPUT', 'Options used by epstein_generate.')
35
37
  output.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
36
38
  output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
37
- output.add_argument('--build', '-b', action='store_true', help='write output to an HTML file in docs/')
39
+ parser.add_argument('--build', '-b', nargs="?", default=None, const=DEFAULT_FILE, help='write output to HTML file')
38
40
  output.add_argument('--email-timeline', action='store_true', help='print a table of all emails in chronological order')
39
41
  output.add_argument('--json-files', action='store_true', help='pretty print all the raw JSON data files in the collection and exit')
40
42
  output.add_argument('--json-metadata', action='store_true', help='dump JSON metadata for all files and exit')
@@ -82,6 +84,14 @@ if is_html_script:
82
84
  elif not args.email_timeline:
83
85
  logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
84
86
  args.output_texts = args.output_emails = args.output_other = True
87
+
88
+ if args.build == DEFAULT_FILE:
89
+ if args.all_emails:
90
+ args.build = ALL_EMAILS_PATH
91
+ elif args.email_timeline:
92
+ args.build = CHRONOLOGICAL_EMAILS_PATH
93
+ else:
94
+ args.build = TEXT_MSGS_HTML_PATH
85
95
  elif parser.prog.startswith('epstein_') and not args.positional_args:
86
96
  exit_with_error(f"{parser.prog} requires positional arguments but got none!")
87
97
 
@@ -33,10 +33,13 @@ def coerce_file_stem(filename_or_id: int | str) -> str:
33
33
 
34
34
 
35
35
  def extract_file_id(filename_or_id: int | str | Path) -> str:
36
+ if isinstance(filename_or_id, str):
37
+ filename_or_id = filename_or_id.removesuffix(',')
38
+
36
39
  if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
37
40
  return id_str(filename_or_id)
38
41
 
39
- file_match = FILE_ID_REGEX.match(str(filename_or_id))
42
+ file_match = FILE_ID_REGEX.match(str(filename_or_id).upper())
40
43
 
41
44
  if not file_match:
42
45
  raise RuntimeError(f"Failed to extract file ID from {filename_or_id}")