epstein-files 1.1.2__py3-none-any.whl → 1.1.5__py3-none-any.whl
This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- epstein_files/__init__.py +17 -20
- epstein_files/documents/communication.py +3 -3
- epstein_files/documents/document.py +3 -0
- epstein_files/documents/email.py +75 -64
- epstein_files/documents/imessage/text_message.py +5 -9
- epstein_files/documents/messenger_log.py +2 -2
- epstein_files/epstein_files.py +17 -15
- epstein_files/util/constant/names.py +39 -38
- epstein_files/util/constant/strings.py +1 -0
- epstein_files/util/constants.py +65 -9
- epstein_files/util/data.py +9 -1
- epstein_files/util/doc_cfg.py +8 -2
- epstein_files/util/env.py +11 -1
- epstein_files/util/file_helper.py +4 -1
- epstein_files/util/highlighted_group.py +99 -52
- epstein_files/util/output.py +112 -94
- epstein_files/util/rich.py +28 -35
- epstein_files/util/word_count.py +1 -2
- {epstein_files-1.1.2.dist-info → epstein_files-1.1.5.dist-info}/METADATA +4 -1
- epstein_files-1.1.5.dist-info/RECORD +33 -0
- epstein_files-1.1.2.dist-info/RECORD +0 -33
- {epstein_files-1.1.2.dist-info → epstein_files-1.1.5.dist-info}/LICENSE +0 -0
- {epstein_files-1.1.2.dist-info → epstein_files-1.1.5.dist-info}/WHEEL +0 -0
- {epstein_files-1.1.2.dist-info → epstein_files-1.1.5.dist-info}/entry_points.txt +0 -0
epstein_files/util/constants.py
CHANGED
|
@@ -24,7 +24,7 @@ HEADER_ABBREVIATIONS = {
|
|
|
24
24
|
'Jared': "Jared Kushner",
|
|
25
25
|
'Jagland': 'Thorbjørn Jagland (former Norwegian prime minister)',
|
|
26
26
|
'JEGE': "Epstein's airplane holding company",
|
|
27
|
-
|
|
27
|
+
JEFFREY_WERNICK: 'right wing crypto bro, former COO of Parler',
|
|
28
28
|
'Joi': f"Joi Ito ({MIT_MEDIA_LAB}, MIT Digital Currency Initiative)",
|
|
29
29
|
"Hoffenberg": f"{STEVEN_HOFFENBERG} (Epstein's ponzi scheme partner)",
|
|
30
30
|
'KSA': "Kingdom of Saudi Arabia",
|
|
@@ -89,11 +89,11 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
|
|
|
89
89
|
JESSICA_CADWELL: re.compile(r'Jessica Cadwell?', re.IGNORECASE),
|
|
90
90
|
JOHNNY_EL_HACHEM: re.compile(r'el hachem johnny|johnny el hachem', re.IGNORECASE),
|
|
91
91
|
JOI_ITO: re.compile(r'ji@media.mit.?edu|(joichi|joi)( Ito)?', re.IGNORECASE),
|
|
92
|
-
JONATHAN_FARKAS: re.compile(r'Jonathan
|
|
92
|
+
JONATHAN_FARKAS: re.compile(r'Jonathan Fark(a|u)(s|il)', re.IGNORECASE),
|
|
93
93
|
KATHRYN_RUEMMLER: re.compile(r'Kathr?yn? Ruemmler?', re.IGNORECASE),
|
|
94
94
|
KEN_STARR: re.compile(r'starr, ken|Ken(neth\s*(W.\s*)?)?\s+starr?|starr', re.IGNORECASE),
|
|
95
95
|
LANDON_THOMAS: re.compile(r'lando[nr] thomas( jr)?|thomas jr.?, lando[nr]', re.IGNORECASE),
|
|
96
|
-
LARRY_SUMMERS: re.compile(r'(La(wrence|rry).{1,5})?Summers?|^LH$|LHS|
|
|
96
|
+
LARRY_SUMMERS: re.compile(r'(La(wrence|rry).{1,5})?Summers?|^LH$|LHS|[Il]hsofficel?', re.IGNORECASE),
|
|
97
97
|
LAWRANCE_VISOSKI: re.compile(r'La(rry|wrance) Visoski?|Lvjet', re.IGNORECASE),
|
|
98
98
|
LAWRENCE_KRAUSS: re.compile(r'Lawrence Kraus[es]?|[jl]awkrauss|kruase', re.IGNORECASE),
|
|
99
99
|
LEON_BLACK: re.compile(r'Leon\s*Black?|(?<!Marc )Leon(?! (Botstein|Jaworski|Wieseltier))', re.IGNORECASE),
|
|
@@ -309,8 +309,47 @@ IRAN_DEAL_RECIPIENTS = ['Allen West', 'Rafael Bardaji', 'Philip Kafka', 'Herb Go
|
|
|
309
309
|
FLIGHT_IN_2012_PEOPLE = ['Francis Derby', 'Januiz Banasiak', 'Louella Rabuyo', 'Richard Barnnet']
|
|
310
310
|
|
|
311
311
|
EMAILS_CONFIG = [
|
|
312
|
+
# 026294 and 026296 might also be Ittihadieh based on timing
|
|
312
313
|
EmailCfg(id='032436', author=ALIREZA_ITTIHADIEH, attribution_reason='Signature'),
|
|
314
|
+
# 032542 026078 026080 026083 026086 026090 might also be Anas based on discussion of Dubai and Kuwait
|
|
313
315
|
EmailCfg(id='032543', author=ANAS_ALRASHEED, attribution_reason='Later reply 033000 has quote'),
|
|
316
|
+
EmailCfg(id='026167', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
317
|
+
EmailCfg(id='032571', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
318
|
+
EmailCfg(id='032573', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
319
|
+
EmailCfg(id='032575', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
320
|
+
EmailCfg(id='032577', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
321
|
+
EmailCfg(id='032579', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
322
|
+
EmailCfg(id='032582', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
323
|
+
EmailCfg(id='032585', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
324
|
+
EmailCfg(id='032588', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
325
|
+
EmailCfg(id='032591', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
326
|
+
EmailCfg(id='032595', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
327
|
+
EmailCfg(id='032599', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
328
|
+
EmailCfg(id='032611', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
329
|
+
EmailCfg(id='023661', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
330
|
+
EmailCfg(id='032616', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
331
|
+
EmailCfg(id='032622', author=ANAS_ALRASHEED, attribution_reason='name visible in 033022 reply'),
|
|
332
|
+
EmailCfg(id='032628', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
|
|
333
|
+
EmailCfg(id='032629', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
|
|
334
|
+
EmailCfg(id='032631', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
|
|
335
|
+
EmailCfg(id='026168', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
|
|
336
|
+
EmailCfg(id='026170', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
|
|
337
|
+
EmailCfg(id='026173', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
|
|
338
|
+
EmailCfg(id='026176', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
|
|
339
|
+
EmailCfg(id='026180', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
|
|
340
|
+
EmailCfg(id='026184', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
|
|
341
|
+
EmailCfg(id='026188', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
|
|
342
|
+
EmailCfg(id='026193', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
|
|
343
|
+
EmailCfg(id='026198', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
|
|
344
|
+
EmailCfg(id='026210', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
|
|
345
|
+
EmailCfg(id='026204', author=ANAS_ALRASHEED, attribution_reason='name visible in 026180 reply'),
|
|
346
|
+
EmailCfg(id='032660', author=ANAS_ALRASHEED, attribution_reason='name visible in 029113 reply'),
|
|
347
|
+
EmailCfg(id='032663', author=ANAS_ALRASHEED, attribution_reason='name visible in 029113 reply'),
|
|
348
|
+
EmailCfg(id='032667', author=ANAS_ALRASHEED, attribution_reason='name visible in 029113 reply'),
|
|
349
|
+
EmailCfg(id='032672', author=ANAS_ALRASHEED, attribution_reason='name visible in 029113 reply'),
|
|
350
|
+
EmailCfg(id='032676', author=ANAS_ALRASHEED, attribution_reason='name visible in 029113 reply'),
|
|
351
|
+
EmailCfg(id='026237', author=ANAS_ALRASHEED, attribution_reason='name visible in 029113 reply'),
|
|
352
|
+
EmailCfg(id='032682', author=ANAS_ALRASHEED, attribution_reason='name visible in 029113 reply'),
|
|
314
353
|
EmailCfg(id='026064', author=ARIANE_DE_ROTHSCHILD, attribution_reason='signature'),
|
|
315
354
|
EmailCfg(id='026069', author=ARIANE_DE_ROTHSCHILD, attribution_reason='signature'),
|
|
316
355
|
EmailCfg(id='030741', author=ARIANE_DE_ROTHSCHILD, attribution_reason='signature'),
|
|
@@ -351,6 +390,7 @@ EMAILS_CONFIG = [
|
|
|
351
390
|
actual_text='',
|
|
352
391
|
author=DARREN_INDYKE,
|
|
353
392
|
description=f"heavily redacted email, quoted replies are from {STEVEN_HOFFENBERG} about James Patterson's book",
|
|
393
|
+
recipients=['Charles Michael'],
|
|
354
394
|
timestamp=parse('2016-08-17 11:26:00'),
|
|
355
395
|
attribution_reason='Quoted replies are in 019109',
|
|
356
396
|
),
|
|
@@ -444,10 +484,20 @@ EMAILS_CONFIG = [
|
|
|
444
484
|
EmailCfg(id='017581', author='Lisa Randall', attribution_reason='reply header'),
|
|
445
485
|
EmailCfg(id='026609', author='Mark Green', attribution_reason='Actually a fwd, Mark Green is in signature'),
|
|
446
486
|
EmailCfg(id='030472', author=MARTIN_WEINBERG, attribution_reason='Maybe. in reply', is_attribution_uncertain=True),
|
|
487
|
+
EmailCfg(id='032563', author=MASHA_DROKOVA, attribution_reason='replied to in 033014'),
|
|
488
|
+
EmailCfg(id='032564', author=MASHA_DROKOVA, attribution_reason='follow up to 032563 about huffpo article with link'),
|
|
489
|
+
EmailCfg(id='031544', author=MASHA_DROKOVA, attribution_reason='follow up to 032563 about huffpo article with link'),
|
|
490
|
+
EmailCfg(id='032605', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
|
|
491
|
+
EmailCfg(id='032606', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
|
|
492
|
+
EmailCfg(id='032607', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
|
|
493
|
+
EmailCfg(id='032609', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
|
|
494
|
+
# 032581, 032604, 033025 may also be Masha based on timing, subject (interviews/articles), and sequential ID
|
|
495
|
+
EmailCfg(id='032604', author=MASHA_DROKOVA, attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
|
|
496
|
+
EmailCfg(id='032581', author=MASHA_DROKOVA, attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
|
|
497
|
+
EmailCfg(id='033025', author=MASHA_DROKOVA, attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
|
|
447
498
|
EmailCfg(id='030235', author=MELANIE_WALKER, attribution_reason='In fwd'),
|
|
448
499
|
EmailCfg(id='032343', author=MELANIE_WALKER, attribution_reason='Name seen in later reply 032346'),
|
|
449
500
|
EmailCfg(id='032212', author=MIROSLAV_LAJCAK, attribution_reason='signature'),
|
|
450
|
-
EmailCfg(id='022193', author=NADIA_MARCINKO, attribution_reason='reply'),
|
|
451
501
|
EmailCfg(id='021814', author=NADIA_MARCINKO, attribution_reason='reply'),
|
|
452
502
|
EmailCfg(id='021808', author=NADIA_MARCINKO, attribution_reason='reply'),
|
|
453
503
|
EmailCfg(id='022190', author=NADIA_MARCINKO, attribution_reason='reply'),
|
|
@@ -508,12 +558,14 @@ EMAILS_CONFIG = [
|
|
|
508
558
|
author=TERRY_KAFKA,
|
|
509
559
|
fwded_text_after='From: Mike Cohen',
|
|
510
560
|
recipients=cast(list[str | None], [JEFFREY_EPSTEIN, MARK_EPSTEIN, MICHAEL_BUCHHOLTZ] + IRAN_DEAL_RECIPIENTS),
|
|
561
|
+
subject='Fw: The Iran Nuclear Deal',
|
|
511
562
|
duplicate_ids=['028482'],
|
|
512
563
|
),
|
|
513
564
|
EmailCfg(id='029992', author=TERRY_KAFKA, attribution_reason='Quoted reply'),
|
|
514
565
|
EmailCfg(id='029985', author=TERRY_KAFKA, attribution_reason='Quoted reply in 029992'),
|
|
515
566
|
EmailCfg(id='020666', author=TERRY_KAFKA, attribution_reason="Ends with 'Terry'"),
|
|
516
567
|
EmailCfg(id='026014', author=ZUBAIR_KHAN, recipients=[JEFFREY_EPSTEIN], timestamp=parse('2016-11-04 17:46:00')),
|
|
568
|
+
EmailCfg(id='033021', recipients=[ANAS_ALRASHEED], attribution_reason='visible in 033022'),
|
|
517
569
|
EmailCfg(id='027063', recipients=[ANTHONY_BARRETT]),
|
|
518
570
|
EmailCfg(id='030764', recipients=[ARIANE_DE_ROTHSCHILD], attribution_reason='Reply'),
|
|
519
571
|
EmailCfg(id='026431', recipients=[ARIANE_DE_ROTHSCHILD], attribution_reason='Reply'),
|
|
@@ -538,6 +590,7 @@ EMAILS_CONFIG = [
|
|
|
538
590
|
EmailCfg(id='032780', recipients=[JEFFREY_EPSTEIN]), # Bad OCR (nofix)
|
|
539
591
|
EmailCfg(id='029324', recipients=[JEFFREY_EPSTEIN, "Jojo Fontanilla", "Lyn Fontanilla"]), # Bad OCR (nofix)
|
|
540
592
|
EmailCfg(id='013482', recipients=[JEFFREY_EPSTEIN], is_fwded_article=True), # other recipients redacted. "The view from the US: Stem cell therapy steps up a gear with firs"
|
|
593
|
+
EmailCfg(id='029558', recipients=[JEFFREY_EPSTEIN, KATHERINE_KEATING], attribution_reason='BCC', fwded_text_after='Creativity is central'),
|
|
541
594
|
EmailCfg(id='033456', recipients=["Joel"], attribution_reason='Reply'),
|
|
542
595
|
EmailCfg(id='033458', recipients=["Joel"], attribution_reason='Reply'),
|
|
543
596
|
EmailCfg(id='033460', recipients=["Joel"], attribution_reason='Reply'),
|
|
@@ -566,10 +619,13 @@ EMAILS_CONFIG = [
|
|
|
566
619
|
EmailCfg(id='033466', recipients=[LAWRANCE_VISOSKI], attribution_reason='Reply signature'),
|
|
567
620
|
EmailCfg(id='022250', recipients=[LESLEY_GROFF], attribution_reason='Reply'),
|
|
568
621
|
EmailCfg(id='030242', recipients=[MARIANA_IDZKOWSKA], duplicate_ids=['032048'], dupe_type='redacted'),
|
|
622
|
+
EmailCfg(id='033027', recipients=[MASHA_DROKOVA], attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
|
|
569
623
|
EmailCfg(id='030368', recipients=[MELANIE_SPINELLA], attribution_reason='Actually a self fwd from jeffrey to jeffrey'),
|
|
570
624
|
EmailCfg(id='030369', recipients=[MELANIE_SPINELLA], attribution_reason='Actually a self fwd from jeffrey to jeffrey'),
|
|
571
625
|
EmailCfg(id='030371', recipients=[MELANIE_SPINELLA], attribution_reason='Actually a self fwd from jeffrey to jeffrey'),
|
|
572
626
|
EmailCfg(id='022258', recipients=[NADIA_MARCINKO], attribution_reason='Reply header'),
|
|
627
|
+
EmailCfg(id='022193', recipients=[NADIA_MARCINKO], attribution_reason='reply'),
|
|
628
|
+
EmailCfg(id='030572', recipients=[PAULA], attribution_reason='quoted in 030482', is_attribution_uncertain=True),
|
|
573
629
|
EmailCfg(id='030506', recipients=[PAULA], attribution_reason=PAULA_REASON, is_attribution_uncertain=True),
|
|
574
630
|
EmailCfg(id='030507', recipients=[PAULA], attribution_reason=PAULA_REASON, is_attribution_uncertain=True),
|
|
575
631
|
EmailCfg(id='030508', recipients=[PAULA], attribution_reason=PAULA_REASON, is_attribution_uncertain=True),
|
|
@@ -585,8 +641,9 @@ EMAILS_CONFIG = [
|
|
|
585
641
|
EmailCfg(id='032358', actual_text=REDACTED), # Completely redacted
|
|
586
642
|
EmailCfg(id='033050', actual_text='schwartman'),
|
|
587
643
|
EmailCfg(id='022219', description="discussion of attempts to clean up Epstein's Google search results"),
|
|
588
|
-
EmailCfg(id='
|
|
589
|
-
EmailCfg(id='
|
|
644
|
+
EmailCfg(id='028524', is_fwded_article=True, description='Zach Braff op-ed on Woody Allen in NYT'),
|
|
645
|
+
EmailCfg(id='031333', is_fwded_article=True, description='Fort Knox conspiracy theory, looks like a Russian disinfo article'), # Russia Says IMF Chief Jailed For Discovering All US Gold is Gone
|
|
646
|
+
EmailCfg(id='031335', is_fwded_article=True, description='Fort Knox conspiracy theory, looks like a Russian disinfo article'), # DOMINQUE STRAUSS-KAHN ARRESTED, NOT BECAUSE HE RAPED A MAID, BUT BECAUSE HE HAD EVIDENCE US HAS NO GOLD IN FORT KNOX.
|
|
590
647
|
EmailCfg(id='023627', is_fwded_article=True, description=MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT),
|
|
591
648
|
EmailCfg(id='026298', is_fwded_article=True, duplicate_ids=['026499']), # Written by someone else?
|
|
592
649
|
EmailCfg(id='029692', is_fwded_article=True, duplicate_ids=['029779']), # WaPo article
|
|
@@ -640,7 +697,6 @@ EMAILS_CONFIG = [
|
|
|
640
697
|
EmailCfg(id='031472', is_fwded_article=True), # WSJ: Lawyers for Imam Wanted by Turkish authorities Fear for Their Client's Life
|
|
641
698
|
EmailCfg(id='012684', is_fwded_article=True), # Trump in talks to buy socialite Kluge's Charlottesville vineyard
|
|
642
699
|
EmailCfg(id='028536', is_fwded_article=True), # Palm Beach Post FBI Epstein files say he gave info. Does it explain sweetheart deal?
|
|
643
|
-
EmailCfg(id='028524', is_fwded_article=True), # Zach Braff article on Woody Allen in NYT
|
|
644
700
|
EmailCfg(id='030326', is_fwded_article=True), # NYP Congressional candidate compares Melania Trump to prostitute
|
|
645
701
|
EmailCfg(id='030519', is_fwded_article=True), # Daily Mail on Prince Andrew
|
|
646
702
|
EmailCfg(id='030878', is_fwded_article=True), # Steve Bannon almost appeared in Michael Moore's 'Fahrenheit 11/9'
|
|
@@ -667,6 +723,7 @@ EMAILS_CONFIG = [
|
|
|
667
723
|
EmailCfg(id='029841', duplicate_ids=['012711'], dupe_type='redacted'),
|
|
668
724
|
EmailCfg(id='030414', duplicate_ids=['030578'], dupe_type='redacted'),
|
|
669
725
|
EmailCfg(id='031135', duplicate_ids=['030634'], dupe_type='redacted'),
|
|
726
|
+
EmailCfg(id='030620', duplicate_ids=['023067']),
|
|
670
727
|
EmailCfg(id='029835', duplicate_ids=['028968']),
|
|
671
728
|
EmailCfg(id='033512', duplicate_ids=['033361']),
|
|
672
729
|
EmailCfg(id='030299', duplicate_ids=['021794']),
|
|
@@ -768,7 +825,6 @@ EMAILS_CONFIG = [
|
|
|
768
825
|
# Emails that need a little help determining how to separate the actual text from fwded text
|
|
769
826
|
EmailCfg(id='013415', fwded_text_after='Darren K. Indyke'),
|
|
770
827
|
EmailCfg(id='024624', fwded_text_after='On Tue, May 14'),
|
|
771
|
-
EmailCfg(id='029558', fwded_text_after='Creativity is central'),
|
|
772
828
|
EmailCfg(id='025888', fwded_text_after='Jul 24, 2015'),
|
|
773
829
|
EmailCfg(id='016413', fwded_text_after='In a former warehouse'),
|
|
774
830
|
EmailCfg(id='025548', fwded_text_after='Edward Jay Epstein'),
|
|
@@ -1622,4 +1678,4 @@ REPLY_LINE_ON_NUMERIC_DATE_PATTERN = fr"On \d+/\d+/\d+[, ].*{REPLY_LINE_ENDING_P
|
|
|
1622
1678
|
REPLY_LINE_ON_DATE_PATTERN = fr"^On (\d+ )?((Mon|Tues?|Wed(nes)?|Thu(rs)?|Fri|Sat(ur)?|Sun)(day)?|(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*)[, ].*{REPLY_LINE_ENDING_PATTERN}"
|
|
1623
1679
|
REPLY_LINE_PATTERN = rf"({REPLY_LINE_IN_A_MSG_PATTERN}|{REPLY_LINE_ON_NUMERIC_DATE_PATTERN}|{REPLY_LINE_ON_DATE_PATTERN}|{FORWARDED_LINE_PATTERN})"
|
|
1624
1680
|
REPLY_REGEX = re.compile(REPLY_LINE_PATTERN, re.IGNORECASE | re.MULTILINE)
|
|
1625
|
-
SENT_FROM_REGEX = re.compile(r'^(?:(Please forgive|Sorry for all the) typos.{1,4})?(Sent (from|via).*(and string|AT&T|Droid|iPad|Phone|Mail|BlackBerry(.*(smartphone|device|Handheld|AT&T|T- ?Mobile))?)\.?)', re.M | re.I)
|
|
1681
|
+
SENT_FROM_REGEX = re.compile(r'^(?:(Please forgive|Sorry for all the) typos.{1,4})?((Envoyé de mon|Sent (from|via)).*(and string|AT&T|Droid|iPad|Phone|Mail|BlackBerry(.*(smartphone|device|Handheld|AT&T|T- ?Mobile))?)\.?)', re.M | re.I)
|
epstein_files/util/data.py
CHANGED
|
@@ -8,6 +8,7 @@ from dateutil import tz
|
|
|
8
8
|
from typing import TypeVar
|
|
9
9
|
|
|
10
10
|
from epstein_files.util.constant import names
|
|
11
|
+
from epstein_files.util.constant.strings import QUESTION_MARKS
|
|
11
12
|
from epstein_files.util.env import args
|
|
12
13
|
from epstein_files.util.logging import logger
|
|
13
14
|
|
|
@@ -41,7 +42,7 @@ def extract_last_name(name: str) -> str:
|
|
|
41
42
|
if ' ' not in name:
|
|
42
43
|
return name
|
|
43
44
|
|
|
44
|
-
names = name.split()
|
|
45
|
+
names = name.removesuffix(QUESTION_MARKS).strip().split()
|
|
45
46
|
|
|
46
47
|
if names[-1].startswith('Jr') and len(names[-1]) <= 3:
|
|
47
48
|
return ' '.join(names[-2:])
|
|
@@ -49,6 +50,13 @@ def extract_last_name(name: str) -> str:
|
|
|
49
50
|
return names[-1]
|
|
50
51
|
|
|
51
52
|
|
|
53
|
+
def extract_first_name(name: str) -> str:
|
|
54
|
+
if ' ' not in name:
|
|
55
|
+
return name
|
|
56
|
+
|
|
57
|
+
return name.removesuffix(f" {extract_last_name(name)}")
|
|
58
|
+
|
|
59
|
+
|
|
52
60
|
def flatten(_list: list[list[T]]) -> list[T]:
|
|
53
61
|
return list(itertools.chain.from_iterable(_list))
|
|
54
62
|
|
epstein_files/util/doc_cfg.py
CHANGED
|
@@ -95,9 +95,13 @@ class DocCfg:
|
|
|
95
95
|
def complete_description(self) -> str | None:
|
|
96
96
|
"""String that summarizes what is known about this document."""
|
|
97
97
|
if self.category and not self.description and not self.author:
|
|
98
|
-
|
|
98
|
+
if self.category == JUNK:
|
|
99
|
+
return None
|
|
100
|
+
else:
|
|
101
|
+
return self.category
|
|
99
102
|
elif self.category == REPUTATION:
|
|
100
|
-
|
|
103
|
+
author_str = f"{self.author} " if self.author else ''
|
|
104
|
+
return f"{REPUTATION_MGMT}: {author_str}{self.description}"
|
|
101
105
|
elif self.category == SKYPE_LOG:
|
|
102
106
|
msg = f"{self.category} of conversation with {self.author}" if self.author else self.category
|
|
103
107
|
return f"{msg} {self.description}" if self.description else msg
|
|
@@ -206,11 +210,13 @@ class EmailCfg(CommunicationCfg):
|
|
|
206
210
|
fwded_text_after (str | None): If set, any text after this is a fwd of an article or similar
|
|
207
211
|
is_fwded_article (bool): True if this is a newspaper article someone fwded. Used to exclude articles from word counting.
|
|
208
212
|
recipients (list[str | None]): Who received the email
|
|
213
|
+
subject (str): Subject line
|
|
209
214
|
"""
|
|
210
215
|
actual_text: str | None = None
|
|
211
216
|
fwded_text_after: str | None = None
|
|
212
217
|
is_fwded_article: bool = False
|
|
213
218
|
recipients: list[str | None] = field(default_factory=list)
|
|
219
|
+
subject: str | None = None
|
|
214
220
|
|
|
215
221
|
# This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
|
|
216
222
|
def __repr__(self) -> str:
|
epstein_files/util/env.py
CHANGED
|
@@ -5,9 +5,11 @@ from pathlib import Path
|
|
|
5
5
|
|
|
6
6
|
from rich_argparse_plus import RichHelpFormatterPlus
|
|
7
7
|
|
|
8
|
+
from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, CHRONOLOGICAL_EMAILS_PATH, TEXT_MSGS_HTML_PATH
|
|
8
9
|
from epstein_files.util.logging import env_log_level, exit_with_error, logger
|
|
9
10
|
|
|
10
11
|
DEFAULT_WIDTH = 145
|
|
12
|
+
DEFAULT_FILE = 'default_file'
|
|
11
13
|
EPSTEIN_GENERATE = 'epstein_generate'
|
|
12
14
|
HTML_SCRIPTS = [EPSTEIN_GENERATE, 'epstein_word_count']
|
|
13
15
|
|
|
@@ -34,7 +36,7 @@ parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='re-p
|
|
|
34
36
|
output = parser.add_argument_group('OUTPUT', 'Options used by epstein_generate.')
|
|
35
37
|
output.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
|
|
36
38
|
output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
|
|
37
|
-
|
|
39
|
+
parser.add_argument('--build', '-b', nargs="?", default=None, const=DEFAULT_FILE, help='write output to HTML file')
|
|
38
40
|
output.add_argument('--email-timeline', action='store_true', help='print a table of all emails in chronological order')
|
|
39
41
|
output.add_argument('--json-files', action='store_true', help='pretty print all the raw JSON data files in the collection and exit')
|
|
40
42
|
output.add_argument('--json-metadata', action='store_true', help='dump JSON metadata for all files and exit')
|
|
@@ -82,6 +84,14 @@ if is_html_script:
|
|
|
82
84
|
elif not args.email_timeline:
|
|
83
85
|
logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
|
|
84
86
|
args.output_texts = args.output_emails = args.output_other = True
|
|
87
|
+
|
|
88
|
+
if args.build == DEFAULT_FILE:
|
|
89
|
+
if args.all_emails:
|
|
90
|
+
args.build = ALL_EMAILS_PATH
|
|
91
|
+
elif args.email_timeline:
|
|
92
|
+
args.build = CHRONOLOGICAL_EMAILS_PATH
|
|
93
|
+
else:
|
|
94
|
+
args.build = TEXT_MSGS_HTML_PATH
|
|
85
95
|
elif parser.prog.startswith('epstein_') and not args.positional_args:
|
|
86
96
|
exit_with_error(f"{parser.prog} requires positional arguments but got none!")
|
|
87
97
|
|
|
epstein_files/util/file_helper.py
CHANGED
|
@@ -33,10 +33,13 @@ def coerce_file_stem(filename_or_id: int | str) -> str:
|
|
|
33
33
|
|
|
34
34
|
|
|
35
35
|
def extract_file_id(filename_or_id: int | str | Path) -> str:
|
|
36
|
+
if isinstance(filename_or_id, str):
|
|
37
|
+
filename_or_id = filename_or_id.removesuffix(',')
|
|
38
|
+
|
|
36
39
|
if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
|
|
37
40
|
return id_str(filename_or_id)
|
|
38
41
|
|
|
39
|
-
file_match = FILE_ID_REGEX.match(str(filename_or_id))
|
|
42
|
+
file_match = FILE_ID_REGEX.match(str(filename_or_id).upper())
|
|
40
43
|
|
|
41
44
|
if not file_match:
|
|
42
45
|
raise RuntimeError(f"Failed to extract file ID from {filename_or_id}")
|