PyPI - epstein-files - Versions diffs - 1.1.5__py3-none-any.whl → 1.2.1__py3-none-any.whl - Mend

epstein-files 1.1.5__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

epstein_files/__init__.py +12 -21
epstein_files/documents/communication.py +0 -3
epstein_files/documents/document.py +68 -21
epstein_files/documents/email.py +54 -70
epstein_files/documents/emails/email_header.py +14 -4
epstein_files/documents/imessage/text_message.py +5 -4
epstein_files/documents/messenger_log.py +7 -7
epstein_files/documents/other_file.py +16 -34
epstein_files/epstein_files.py +133 -141
epstein_files/person.py +324 -0
epstein_files/util/constant/names.py +46 -15
epstein_files/util/constant/output_files.py +1 -0
epstein_files/util/constant/strings.py +3 -3
epstein_files/util/constant/urls.py +15 -2
epstein_files/util/constants.py +75 -21
epstein_files/util/data.py +1 -20
epstein_files/util/doc_cfg.py +27 -17
epstein_files/util/env.py +5 -3
epstein_files/util/highlighted_group.py +248 -203
epstein_files/util/logging.py +1 -1
epstein_files/util/output.py +113 -157
epstein_files/util/rich.py +20 -35
epstein_files/util/timer.py +14 -0
epstein_files/util/word_count.py +1 -1
{epstein_files-1.1.5.dist-info → epstein_files-1.2.1.dist-info}/METADATA +6 -2
epstein_files-1.2.1.dist-info/RECORD +34 -0
epstein_files-1.1.5.dist-info/RECORD +0 -33
{epstein_files-1.1.5.dist-info → epstein_files-1.2.1.dist-info}/LICENSE +0 -0
{epstein_files-1.1.5.dist-info → epstein_files-1.2.1.dist-info}/WHEEL +0 -0
{epstein_files-1.1.5.dist-info → epstein_files-1.2.1.dist-info}/entry_points.txt +0 -0

epstein_files/util/constants.py CHANGED Viewed

@@ -19,6 +19,7 @@ HEADER_ABBREVIATIONS = {
     'bgC3': 'Bill Gates Ventures (renamed in 2018)',
     "Brock": 'Brock Pierce (crypto bro with a very sordid past)',
     "DB": "Deutsche Bank (maybe??)",
+    "GRAT": "Grantor Retained Annuity Trust (tax shelter)",
     'HBJ': "Sheikh Hamad bin Jassim (former Qatari prime minister)",
     'Jabor': '"an influential man in Qatar"',
     'Jared': "Jared Kushner",
@@ -62,6 +63,7 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
     BARBRO_C_EHNBOM: re.compile(r'behnbom@aol.com|(Barbro\s.*)?Ehnbom', re.IGNORECASE),
     BARRY_J_COHEN: re.compile(r'barry\s*((j.?|james)\s*)?cohen?', re.IGNORECASE),
     BENNET_MOSKOWITZ: re.compile(r'Moskowitz.*Bennet|Bennet.*Moskowitz', re.IGNORECASE),
+    BOB_CROWE: re.compile(r"[BR]ob Crowe", re.IGNORECASE),
     BORIS_NIKOLIC: re.compile(r'(boris )?nikolic?', re.IGNORECASE),
     BRAD_EDWARDS:  re.compile(r'Brad(ley)?(\s*J(.?|ames))?\s*Edwards', re.IGNORECASE),
     BRAD_KARP: re.compile(r'Brad (S.? )?Karp|Karp, Brad', re.IGNORECASE),
@@ -83,6 +85,7 @@ EMAILER_ID_REGEXES: dict[str, re.Pattern] = {
     JACKIE_PERCZEK:  re.compile(r'jackie percze[kl]?', re.IGNORECASE),
     JABOR_Y: re.compile(r'[ji]abor\s*y?', re.IGNORECASE),
     JAMES_HILL: re.compile(r"hill, james e.|james.e.hill@abc.com", re.IGNORECASE),
+    JANUSZ_BANASIAK: re.compile(r"Janu[is]z Banasiak", re.IGNORECASE),
     JEAN_LUC_BRUNEL: re.compile(r'Jean[- ]Luc Brunel?', re.IGNORECASE),
     JEFF_FULLER: re.compile(r"jeff@mc2mm.com|Jeff Fuller", re.IGNORECASE),
     JEFFREY_EPSTEIN: re.compile(r'[djl]\s?ee[vy]acation[©@]?g?(mail.com)?|Epstine|\bJEE?\b|Jeffrey E((sp|ps)tein?)?( VI Foundation)?|jeeproject@yahoo.com|J Jep|Jeffery Edwards|(?<!(Mark L.|ard Jay) )Epstein', re.IGNORECASE),
@@ -157,6 +160,7 @@ EMAILERS = [
     BILL_GATES,
     BILL_SIEGEL,
     BRAD_WECHSLER,
+    CHRISTINA_GALBRAITH,
     DANIEL_SABBA,
     'Danny Goldberg',
     DAVID_SCHOEN,
@@ -302,11 +306,10 @@ TEXTS_CONFIG = CONFIRMED_TEXTS_CONFIG + UNCONFIRMED_TEXTS_CONFIG
 ################################################ EMAILS ################################################
 ########################################################################################################
-MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT = f"draft of an unpublished article about Epstein by {MICHAEL_WOLFF} written ca. 2014/2015"
 # Some emails have a lot of uninteresting CCs
-IRAN_DEAL_RECIPIENTS = ['Allen West', 'Rafael Bardaji', 'Philip Kafka', 'Herb Goodman', 'Grant Seeger', 'Lisa Albert', 'Janet Kafka', 'James Ramsey', 'ACT for America', 'John Zouzelka', 'Joel Dunn', 'Nate McClain', 'Bennet Greenwald', 'Taal Safdie', 'Uri Fouzailov', 'Neil Anderson', 'Nate White', 'Rita Hortenstine', 'Henry Hortenstine', 'Gary Gross', 'Forrest Miller', 'Bennett Schmidt', 'Val Sherman', 'Marcie Brown', 'Michael Horowitz', 'Marshall Funk']
-FLIGHT_IN_2012_PEOPLE = ['Francis Derby', 'Januiz Banasiak', 'Louella Rabuyo', 'Richard Barnnet']
+FLIGHT_IN_2012_PEOPLE: list[Name] = ['Francis Derby', JANUSZ_BANASIAK, 'Louella Rabuyo', 'Richard Barnnet']
+IRAN_DEAL_RECIPIENTS: list[Name] = ['Allen West', 'Rafael Bardaji', 'Philip Kafka', 'Herb Goodman', 'Grant Seeger', 'Lisa Albert', 'Janet Kafka', 'James Ramsey', 'ACT for America', 'John Zouzelka', 'Joel Dunn', 'Nate McClain', 'Bennet Greenwald', 'Taal Safdie', 'Uri Fouzailov', 'Neil Anderson', 'Nate White', 'Rita Hortenstine', 'Henry Hortenstine', 'Gary Gross', 'Forrest Miller', 'Bennett Schmidt', 'Val Sherman', 'Marcie Brown', 'Michael Horowitz', 'Marshall Funk']
+MICHAEL_WOLFF_EPSTEIN_ARTICLE_DRAFT = f"draft of an unpublished article about Epstein by {MICHAEL_WOLFF} written ca. 2014/2015"
 EMAILS_CONFIG = [
     # 026294 and 026296 might also be Ittihadieh based on timing
@@ -409,11 +412,11 @@ EMAILS_CONFIG = [
         dupe_type='redacted'
     ),
     EmailCfg(id='026547', author=GERALD_BARTON, recipients=[JEFFREY_EPSTEIN]),  # Bad OCR # TODO: email header is really jacked up
-    EmailCfg(id='029969', author=GWENDOLYN_BECK, attribution_reason='Signature'),
-    EmailCfg(id='029968', author=GWENDOLYN_BECK, attribution_reason='Signature', duplicate_ids=['031120']),
+    EmailCfg(id='029969', author=GWENDOLYN_BECK, attribution_reason='signature "Longevity & Successful Aging"'),
+    EmailCfg(id='029968', author=GWENDOLYN_BECK, attribution_reason='signature "beckresearchlabs.com"', duplicate_ids=['031120']),
     EmailCfg(id='029970', author=GWENDOLYN_BECK, attribution_reason='signed "Longevity & Successful Agin"'),
-    EmailCfg(id='029960', author=GWENDOLYN_BECK, attribution_reason='Reply'),
-    EmailCfg(id='029959', author=GWENDOLYN_BECK, attribution_reason='"Longevity & Aging"'),
+    EmailCfg(id='029960', author=GWENDOLYN_BECK, attribution_reason='signature "Beck Center for Longevity & Aging"'),
+    EmailCfg(id='029959', author=GWENDOLYN_BECK, attribution_reason='signature "Beck Center for Longevity & Aging"'),
     EmailCfg(id='033360', author=HENRY_HOLT, attribution_reason='in signature'),  # Henry Holt is a company not a person
     EmailCfg(id='033384', author=JACK_GOLDBERGER, attribution_reason='Might be Paul Prosperi?', is_attribution_uncertain=True),
     EmailCfg(id='026024', author=JEAN_HUGUEN, attribution_reason='Signature'),
@@ -474,7 +477,7 @@ EMAILS_CONFIG = [
     EmailCfg(
         id='029977',
         author=LAWRANCE_VISOSKI,
-        recipients=cast(list[str | None], [JEFFREY_EPSTEIN, DARREN_INDYKE, LESLEY_GROFF, RICHARD_KAHN] + FLIGHT_IN_2012_PEOPLE),
+        recipients=[JEFFREY_EPSTEIN, DARREN_INDYKE, LESLEY_GROFF, RICHARD_KAHN] + FLIGHT_IN_2012_PEOPLE,
         attribution_reason=LARRY_REASON,
         duplicate_ids=['031129'],
     ),
@@ -491,14 +494,12 @@ EMAILS_CONFIG = [
     EmailCfg(id='032606', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
     EmailCfg(id='032607', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
     EmailCfg(id='032609', author=MASHA_DROKOVA, attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
-    # 032581, 032604, 033025 may also be Masha based on timing, subject (interviews/articles), and sequential ID
     EmailCfg(id='032604', author=MASHA_DROKOVA, attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
     EmailCfg(id='032581', author=MASHA_DROKOVA, attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
-    EmailCfg(id='033025', author=MASHA_DROKOVA, attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
     EmailCfg(id='030235', author=MELANIE_WALKER, attribution_reason='In fwd'),
     EmailCfg(id='032343', author=MELANIE_WALKER, attribution_reason='Name seen in later reply 032346'),
     EmailCfg(id='032212', author=MIROSLAV_LAJCAK, attribution_reason='signature'),
-    EmailCfg(id='021814', author=NADIA_MARCINKO, attribution_reason='reply'),
+    EmailCfg(id='021814', author=NADIA_MARCINKO, attribution_reason='reply'), #, actual_text="I'm a pilot...I prefer sex slave to copilot ;)"),
     EmailCfg(id='021808', author=NADIA_MARCINKO, attribution_reason='reply'),
     EmailCfg(id='022190', author=NADIA_MARCINKO, attribution_reason='reply'),
     EmailCfg(id='021818', author=NADIA_MARCINKO, attribution_reason='reply'),
@@ -540,11 +541,12 @@ EMAILS_CONFIG = [
         author=SEAN_BANNON,
         attribution_reason="From protonmail, Bannon wrote 'just sent from my protonmail' in 027067",
     ),
-    EmailCfg(id='029003', author=SOON_YI_PREVIN, attribution_reason="\"Sent from Soon-Yi's iPhone\""),
-    EmailCfg(id='029005', author=SOON_YI_PREVIN, attribution_reason="\"Sent from Soon-Yi's iPhone\""),
-    EmailCfg(id='029007', author=SOON_YI_PREVIN, attribution_reason="\"Sent from Soon-Yi's iPhone\""),
-    EmailCfg(id='029010', author=SOON_YI_PREVIN, attribution_reason="\"Sent from Soon-Yi's iPhone\""),
-    EmailCfg(id='032296', author=SOON_YI_PREVIN, attribution_reason="\"Sent from Soon-Yi's iPhone\""),
+    EmailCfg(id='029003', author=SOON_YI_PREVIN, attribution_reason='"Sent from Soon-Yi\'s iPhone"'),
+    EmailCfg(id='029005', author=SOON_YI_PREVIN, attribution_reason='"Sent from Soon-Yi\'s iPhone"'),
+    EmailCfg(id='029007', author=SOON_YI_PREVIN, attribution_reason='"Sent from Soon-Yi\'s iPhone"'),
+    EmailCfg(id='029010', author=SOON_YI_PREVIN, attribution_reason='"Sent from Soon-Yi\'s iPhone"'),
+    EmailCfg(id='032296', author=SOON_YI_PREVIN, attribution_reason='"Sent from Soon-Yi\'s iPhone"'),
+    EmailCfg(id='033292', author=SOON_YI_PREVIN, attribution_reason='mentions "Woody\'s movie"', is_attribution_uncertain=True),
     EmailCfg(
         id='019109',
         author=STEVEN_HOFFENBERG,
@@ -557,7 +559,7 @@ EMAILS_CONFIG = [
         attribution_reason='ends with "Respectfully, terry"',
         author=TERRY_KAFKA,
         fwded_text_after='From: Mike Cohen',
-        recipients=cast(list[str | None], [JEFFREY_EPSTEIN, MARK_EPSTEIN, MICHAEL_BUCHHOLTZ] + IRAN_DEAL_RECIPIENTS),
+        recipients=[JEFFREY_EPSTEIN, MARK_EPSTEIN, MICHAEL_BUCHHOLTZ] + IRAN_DEAL_RECIPIENTS,
         subject='Fw: The Iran Nuclear Deal',
         duplicate_ids=['028482'],
     ),
@@ -620,6 +622,7 @@ EMAILS_CONFIG = [
     EmailCfg(id='022250', recipients=[LESLEY_GROFF], attribution_reason='Reply'),
     EmailCfg(id='030242', recipients=[MARIANA_IDZKOWSKA], duplicate_ids=['032048'], dupe_type='redacted'),
     EmailCfg(id='033027', recipients=[MASHA_DROKOVA], attribution_reason="re: PR interview, 031544 says she'll be in NY at that time"),
+    EmailCfg(id='033025', recipients=[MASHA_DROKOVA], attribution_reason="timing, subject (interviews/articles), and sequential ID", is_attribution_uncertain=True),
     EmailCfg(id='030368', recipients=[MELANIE_SPINELLA], attribution_reason='Actually a self fwd from jeffrey to jeffrey'),
     EmailCfg(id='030369', recipients=[MELANIE_SPINELLA], attribution_reason='Actually a self fwd from jeffrey to jeffrey'),
     EmailCfg(id='030371', recipients=[MELANIE_SPINELLA], attribution_reason='Actually a self fwd from jeffrey to jeffrey'),
@@ -658,6 +661,7 @@ EMAILS_CONFIG = [
     EmailCfg(id='032023', is_fwded_article=True, duplicate_ids=['032012']),  # American-Israeli Cooperative Enterprise Newsletter
     EmailCfg(id='021758', is_fwded_article=True, duplicate_ids=['030616']),  # Radar Online article about Epstein's early prison release
     EmailCfg(id='033297', is_fwded_article=True, duplicate_ids=['033586']),  # Sultan Sulayem fwding article about Trump and Russia
+    EmailCfg(id='030983', is_fwded_article=True),  # Power Line blog Alex Acosta and Jeffrey Epstein Plea Deal Analysis
     EmailCfg(id='031774', is_fwded_article=True),  # Krassner fwd of Palmer Report article
     EmailCfg(id='033345', is_fwded_article=True),  # Krassner fwd of Palmer Report article
     EmailCfg(id='029903', is_fwded_article=True),  # Krassner fwd of Ann Coulter article about Epstein
@@ -711,6 +715,7 @@ EMAILS_CONFIG = [
     EmailCfg(id='030373', timestamp=parse('2018-10-03 01:49:27')),
     # Configure duplicates
+    EmailCfg(id='026631', duplicate_ids=['026632'], dupe_type='quoted'),
     EmailCfg(id='028768', duplicate_ids=['026563'], dupe_type='redacted'),
     EmailCfg(id='027056', duplicate_ids=['028762'], dupe_type='redacted'),
     EmailCfg(id='032248', duplicate_ids=['032246'], dupe_type='redacted'),
@@ -870,7 +875,7 @@ TWEET = 'tweet'
 # Legal cases
 BRUNEL_V_EPSTEIN = f"{JEAN_LUC_BRUNEL} v. {JEFFREY_EPSTEIN} and Tyler McDonald d/b/a YI.org"
 EDWARDS_V_DERSHOWITZ = f"{BRAD_EDWARDS} & {PAUL_G_CASSELL} v. {ALAN_DERSHOWITZ}"
-EPSTEIN_V_ROTHSTEIN_EDWARDS = f"Epstein v. Scott Rothstein, {BRAD_EDWARDS}, and L.M."
+EPSTEIN_V_ROTHSTEIN_EDWARDS = f"Epstein v. Scott Rothstein, {BRAD_EDWARDS}, & L.M."
 GIUFFRE_V_DERSHOWITZ = f"{VIRGINIA_GIUFFRE} v. {ALAN_DERSHOWITZ}"
 GIUFFRE_V_EPSTEIN = f"{VIRGINIA_GIUFFRE} v. {JEFFREY_EPSTEIN}"
 GIUFFRE_V_MAXWELL = f"{VIRGINIA_GIUFFRE} v. {GHISLAINE_MAXWELL}"
@@ -1379,8 +1384,6 @@ OTHER_FILES_LETTERS = [
         description=f"letter about algorithmic trading",
         date='2016-06-24',  # date is based on Brexit reference but he could be backtesting,
     ),
-    DocCfg(id='029304', author=DONALD_TRUMP, description=f"recommendation letter for recently departed {TRUMP_ORG} lawyer {MICHAEL_J_BOCCIO}"),
-    DocCfg(id='029301', author=MICHAEL_J_BOCCIO, description=f"letter from former lawyer at the {TRUMP_ORG}", date='2011-08-07'),
     DocCfg(id='026134', description=f'letter to someone named George about investment opportunities in the Ukraine banking sector'),
 ]
@@ -1531,13 +1534,27 @@ OTHER_FILES_ACADEMIA = [
 # resumes and application letters
 OTHER_FILES_RESUMES = [
+    DocCfg(
+        id='029304',
+        attached_to_email_id='029299',
+        author=DONALD_TRUMP,
+        description=f"recommendation letter for recently departed {TRUMP_ORG} lawyer {MICHAEL_J_BOCCIO}",
+    ),
     DocCfg(id='022367', author='Jack J Grynberg', description=RESUME_OF, date='2014-07-01'),
     DocCfg(
         id='029302',
+        attached_to_email_id='029299',
         author=MICHAEL_J_BOCCIO,
         description=f"{RESUME_OF} (former lawyer at the {TRUMP_ORG})",
         date='2011-08-07',
     ),
+    DocCfg(
+        id='029301',
+        attached_to_email_id='029299',
+        author=MICHAEL_J_BOCCIO,
+        description=f"letter from former lawyer at the {TRUMP_ORG}",
+        date='2011-08-07',
+    ),
     DocCfg(id='029102', author=NERIO_ALESSANDRI, description=HBS_APPLICATION),
     DocCfg(id='029104', author=NERIO_ALESSANDRI, description=HBS_APPLICATION),
     DocCfg(id='015671', author='Robin Solomon', description=RESUME_OF, date='2015-06-02'),  # She left Mount Sinai at some point in 2015,
@@ -1679,3 +1696,40 @@ REPLY_LINE_ON_DATE_PATTERN = fr"^On (\d+ )?((Mon|Tues?|Wed(nes)?|Thu(rs)?|Fri|Sa
 REPLY_LINE_PATTERN = rf"({REPLY_LINE_IN_A_MSG_PATTERN}|{REPLY_LINE_ON_NUMERIC_DATE_PATTERN}|{REPLY_LINE_ON_DATE_PATTERN}|{FORWARDED_LINE_PATTERN})"
 REPLY_REGEX = re.compile(REPLY_LINE_PATTERN, re.IGNORECASE | re.MULTILINE)
 SENT_FROM_REGEX = re.compile(r'^(?:(Please forgive|Sorry for all the) typos.{1,4})?((Envoyé de mon|Sent (from|via)).*(and string|AT&T|Droid|iPad|Phone|Mail|BlackBerry(.*(smartphone|device|Handheld|AT&T|T- ?Mobile))?)\.?)', re.M | re.I)
+# No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
+UNINTERESTING_EMAILERS = FLIGHT_IN_2012_PEOPLE + IRAN_DEAL_RECIPIENTS + [
+    'Alan Dlugash',                          # CCed with Richard Kahn
+    'Alan Rogers',                           # Random CC
+    'Andrew Friendly',                       # Presumably some relation of Kelly Friendly
+    'BS Stern',                              # A random fwd of email we have
+    'Cheryl Kleen',                          # Single email from Anne Boyles, displayed under Anne Boyles
+    'Connie Zaguirre',                       # Random CC
+    'Dan Fleuette',                          # CC from sean bannon
+    'Danny Goldberg',                        # Random Paul Krassner emails
+    GERALD_LEFCOURT,                         # Single CC
+    GORDON_GETTY,                            # Random CC
+    JEFF_FULLER,                             # Random Jean Luc Brunel CC
+    'Jojo Fontanilla',                       # Random CC
+    'Joseph Vinciguerra',                    # Random CC
+    'Larry Cohen',                           # Random Bill Gates CC
+    'Lyn Fontanilla',                        # Random CC
+    'Mark Albert',                           # Random CC
+    'Matthew Schafer',                       # Random CC
+    MICHAEL_BUCHHOLTZ,                       # Terry Kafka CC
+    'Nancy Dahl',                            # covered by Lawrence Krauss (her husband)
+    'Michael Simmons',                       # Random CC
+    'Nancy Portland',                        # Lawrence Krauss CC
+    'Oliver Goodenough',                     # Robert Trivers CC
+    'Peter Aldhous',                         # Lawrence Krauss CC
+    'Players2',                              # Hoffenberg CC
+    'Police Code Enforcement',               # Kirk Blouin / John Page CC
+    'Sam Harris',                            # Lawrence Krauss CC
+    SAMUEL_LEFF,                             # Random CC
+    'Sean T Lehane',                         # Random CC
+    'Stephen Rubin',                         # Random CC
+    'Tim Kane',                              # Random CC
+    'Travis Pangburn',                       # Random CC
+    'Vahe Stepanian',                        # Random CC
+]

epstein_files/util/data.py CHANGED Viewed

@@ -29,7 +29,7 @@ escape_single_quotes = lambda text: text.replace("'", r"\'")
 iso_timestamp = lambda dt: dt.isoformat().replace('T', ' ')
 days_between = lambda dt1, dt2: (dt2 - dt1).days + 1
 days_between_str = lambda dt1, dt2: f"{days_between(dt1, dt2)} day" + ('s' if days_between(dt1, dt2) > 1 else '')
-remove_zero_time_from_timestamp_str = lambda dt: dt.isoformat().removesuffix('T00:00:00')
+remove_zero_time = lambda dt: dt.isoformat().removesuffix('T00:00:00')
 uniquify = lambda _list: list(set(_list))
 without_falsey = lambda _list: [e for e in _list if e]
@@ -38,25 +38,6 @@ def dict_sets_to_lists(d: dict[str, set]) -> dict[str, list]:
     return {k: sorted(list(v)) for k, v in d.items()}
-def extract_last_name(name: str) -> str:
-    if ' ' not in name:
-        return name
-    names = name.removesuffix(QUESTION_MARKS).strip().split()
-    if names[-1].startswith('Jr') and len(names[-1]) <= 3:
-        return ' '.join(names[-2:])
-    else:
-        return names[-1]
-def extract_first_name(name: str) -> str:
-    if ' ' not in name:
-        return name
-    return name.removesuffix(f" {extract_last_name(name)}")
 def flatten(_list: list[list[T]]) -> list[T]:
     return list(itertools.chain.from_iterable(_list))

epstein_files/util/doc_cfg.py CHANGED Viewed

@@ -8,7 +8,7 @@ from dateutil.parser import parse
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import *
-from epstein_files.util.data import remove_zero_time_from_timestamp_str, without_falsey
+from epstein_files.util.data import remove_zero_time, without_falsey
 DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
 Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]
@@ -62,7 +62,7 @@ class DocCfg:
     Attributes:
         id (str): ID of file
-        author (str | None): Author of the document (if any)
+        author (Name): Author of the document (if any)
         category (str | None): Type of file
         date (str | None): If passed will be immediated parsed into the 'timestamp' field
         dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
@@ -74,13 +74,14 @@ class DocCfg:
     """
     id: str
     attached_to_email_id: str | None = None
-    author: str | None = None
+    author: Name = None
     category: str | None = None
     date: str | None = None
     description: str | None = None
     dupe_type: DuplicateType | None = None
     duplicate_ids: list[str] = field(default_factory=list)
     duplicate_of_id: str | None = None
+    is_attribution_uncertain: bool = False
     is_interesting: bool | None = None
     is_synthetic: bool = False
     timestamp: datetime | None = None
@@ -94,30 +95,40 @@ class DocCfg:
     def complete_description(self) -> str | None:
         """String that summarizes what is known about this document."""
+        description = ''
         if self.category and not self.description and not self.author:
             if self.category == JUNK:
                 return None
             else:
-                return self.category
+                description = self.category
         elif self.category == REPUTATION:
             author_str = f"{self.author} " if self.author else ''
-            return f"{REPUTATION_MGMT}: {author_str}{self.description}"
+            description = f"{REPUTATION_MGMT}: {author_str}{self.description}"
         elif self.category == SKYPE_LOG:
             msg = f"{self.category} of conversation with {self.author}" if self.author else self.category
-            return f"{msg} {self.description}" if self.description else msg
+            description = f"{msg} {self.description}" if self.description else msg
         elif self.author and self.description:
             if self.category in [ACADEMIA, BOOK]:
                 title = self.description if '"' in self.description else f'"{self.description}"'
-                return f"{title} by {self.author}"
+                description = f"{title} by {self.author}"
             elif self.category == FINANCE and self.author in FINANCIAL_REPORTS_AUTHORS:
-                return f'{self.author} report: "{self.description}"'
+                description = f'{self.author} report: "{self.description}"'
             elif self.category == LEGAL and 'v.' in self.author:
-                return f"{self.author}: {self.description}"
-        elif self.category and self.author is None and self.description is None:
-            return self.category
+                description = f"{self.author}: {self.description}"
+        if not description:
+            pieces = without_falsey([self.author, self.description])
+            if pieces:
+                description = ' '.join(pieces)
+            else:
+                return None
-        pieces = without_falsey([self.author, self.description])
-        return ' '.join(pieces) if pieces else None
+        if self.attached_to_email_id:
+            description += f" attached to email {self.attached_to_email_id}"
+        return description
     def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
         """Create synthetic DocCfg objects that set the 'duplicate_of_id' field to point back to this object."""
@@ -152,7 +163,7 @@ class DocCfg:
             elif _field.name == 'timestamp' and self.date is not None:
                 continue  # Don't print both timestamp and date
             elif isinstance(value, datetime):
-                value_str = remove_zero_time_from_timestamp_str(value)
+                value_str = remove_zero_time(value)
                 add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
             elif isinstance(value, str):
                 if "'" in value:
@@ -196,7 +207,6 @@ class CommunicationCfg(DocCfg):
         is_attribution_uncertain (bool): True if we have a good idea of who the author is but are not 100% certain
     """
     attribution_reason: str | None = None
-    is_attribution_uncertain: bool = False
     def __repr__(self) -> str:
         return super().__repr__()
@@ -209,13 +219,13 @@ class EmailCfg(CommunicationCfg):
         actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
         fwded_text_after (str | None): If set, any text after this is a fwd of an article or similar
         is_fwded_article (bool): True if this is a newspaper article someone fwded. Used to exclude articles from word counting.
-        recipients (list[str | None]): Who received the email
+        recipients (list[Name]): Who received the email
         subject (str): Subject line
     """
     actual_text: str | None = None
     fwded_text_after: str | None = None
     is_fwded_article: bool = False
-    recipients: list[str | None] = field(default_factory=list)
+    recipients: list[Name] = field(default_factory=list)
     subject: str | None = None
     # This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it

epstein_files/util/env.py CHANGED Viewed

@@ -8,7 +8,7 @@ from rich_argparse_plus import RichHelpFormatterPlus
 from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, CHRONOLOGICAL_EMAILS_PATH, TEXT_MSGS_HTML_PATH
 from epstein_files.util.logging import env_log_level, exit_with_error, logger
-DEFAULT_WIDTH = 145
+DEFAULT_WIDTH = 155
 DEFAULT_FILE = 'default_file'
 EPSTEIN_GENERATE = 'epstein_generate'
 HTML_SCRIPTS = [EPSTEIN_GENERATE, 'epstein_word_count']
@@ -38,6 +38,7 @@ output.add_argument('--all-emails', '-ae', action='store_true', help='all the em
 output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
 parser.add_argument('--build', '-b', nargs="?", default=None, const=DEFAULT_FILE, help='write output to HTML file')
 output.add_argument('--email-timeline', action='store_true', help='print a table of all emails in chronological order')
+output.add_argument('--emailers-info', action='store_true', help='write a .png of the eeailers info table')
 output.add_argument('--json-files', action='store_true', help='pretty print all the raw JSON data files in the collection and exit')
 output.add_argument('--json-metadata', action='store_true', help='dump JSON metadata for all files and exit')
 output.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
@@ -66,8 +67,9 @@ debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debu
 args = parser.parse_args()
 is_html_script = parser.prog in HTML_SCRIPTS
+args.build = args.build
 args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
-args.names = [None if n == 'None' else n for n in (args.names or [])]
+args.names = [None if n == 'None' else n.strip() for n in (args.names or [])]
 args.output_emails = args.output_emails or args.all_emails
 args.output_other = args.output_other or args.all_other_files or args.uninteresting
 args.overwrite_pickle = args.overwrite_pickle or (is_env_var_set('OVERWRITE_PICKLE') and not is_env_var_set('PICKLED'))
@@ -81,7 +83,7 @@ if is_html_script:
         if any([is_output_arg(arg) and val for arg, val in vars(args).items()]):
             if args.email_timeline:
                 exit_with_error(f"--email-timeline option is mutually exlusive with other output options")
-        elif not args.email_timeline:
+        elif not args.email_timeline and not args.emailers_info:
             logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
             args.output_texts = args.output_emails = args.output_other = True

epstein-files 1.1.5__py3-none-any.whl → 1.2.1__py3-none-any.whl

epstein-files 1.1.5__py3-none-any.whl → 1.2.1__py3-none-any.whl