PyPI - epstein-files - Versions diffs - 1.2.5__py3-none-any.whl → 1.4.1__py3-none-any.whl - Mend

epstein-files: 1.2.5 (py3-none-any.whl) → 1.4.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

epstein_files/__init__.py +32 -13
epstein_files/documents/document.py +8 -1
epstein_files/documents/email.py +179 -97
epstein_files/documents/emails/email_header.py +17 -8
epstein_files/documents/other_file.py +8 -6
epstein_files/epstein_files.py +16 -1
epstein_files/person.py +40 -15
epstein_files/util/constant/names.py +10 -6
epstein_files/util/constant/strings.py +2 -1
epstein_files/util/constants.py +463 -225
epstein_files/util/doc_cfg.py +33 -27
epstein_files/util/env.py +10 -3
epstein_files/util/file_helper.py +2 -0
epstein_files/util/highlighted_group.py +66 -23
epstein_files/util/output.py +17 -31
epstein_files/util/rich.py +2 -1
epstein_files/util/word_count.py +1 -1
{epstein_files-1.2.5.dist-info → epstein_files-1.4.1.dist-info}/METADATA +3 -3
epstein_files-1.4.1.dist-info/RECORD +34 -0
{epstein_files-1.2.5.dist-info → epstein_files-1.4.1.dist-info}/entry_points.txt +1 -1
epstein_files-1.2.5.dist-info/RECORD +0 -34
{epstein_files-1.2.5.dist-info → epstein_files-1.4.1.dist-info}/LICENSE +0 -0
{epstein_files-1.2.5.dist-info → epstein_files-1.4.1.dist-info}/WHEEL +0 -0

epstein_files/documents/email.py CHANGED Viewed

@@ -17,12 +17,12 @@ from rich.text import Text
 from epstein_files.documents.communication import Communication
 from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, INFO_INDENT
 from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAIL_SIMPLE_HEADER_REGEX,
-     EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, TIME_REGEX, EmailHeader)
+     EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, FIELDS_COLON_PATTERN, TIME_REGEX, EmailHeader)
+from epstein_files.documents.other_file import OtherFile
 from epstein_files.util.constant.names import *
 from epstein_files.util.constant.strings import REDACTED
 from epstein_files.util.constants import *
-from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes,
-     flatten, listify, remove_timezone, uniquify)
+from epstein_files.util.data import TIMEZONE_INFO, collapse_newlines, escape_single_quotes, remove_timezone
 from epstein_files.util.doc_cfg import EmailCfg, Metadata
 from epstein_files.util.file_helper import extract_file_id, file_stem_for_id
 from epstein_files.util.highlighted_group import JUNK_EMAILERS, get_style_for_name
@@ -30,9 +30,11 @@ from epstein_files.util.logging import logger
 from epstein_files.util.rich import *
 BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
-BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
+BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|PAGE INTENTIONALLY LEFT BLANK|Classification: External Communication|Hide caption|Importance:?\s*High|[iI,•]|[1i] (_ )?[il]|, [-,]|L\._|_filtered|.*(yiv0232|font-family:|margin-bottom:).*)$')
+BAD_SUBJECT_CONTINUATIONS = ['orwarded', 'Hi ', 'Sent ', 'AmLaw', 'Original Message', 'Privileged', 'Sorry', '---']
 DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
-LINK_LINE_REGEX = re.compile(f"^>? ?htt")
+FIELDS_COLON_REGEX = re.compile(FIELDS_COLON_PATTERN)
+LINK_LINE_REGEX = re.compile(f"^[>• ]*htt")
 LINK_LINE2_REGEX = re.compile(r"^[-\w.%&=/]{5,}$")
 QUOTED_REPLY_LINE_REGEX = re.compile(r'(\nFrom:(.*)|wrote:)\n', re.IGNORECASE)
 REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
@@ -44,13 +46,12 @@ LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
 SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
 REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
-URL_SIGNIFIERS = ['amp?', 'cd=', 'click', 'ft=', 'gclid', 'htm', 'keywords=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'usg=', 'utm']
+URL_SIGNIFIERS = ['?amp', 'amp?', 'cd=', 'click', 'CMP=', 'contentId', 'ft=', 'gclid', 'htm', 'mp=', 'keywords=', 'Id=', 'module=', 'mpweb', 'nlid=', 'ref=', 'smid=', 'sp=', 'usg=', 'utm']
 APPEARS_IN = 'appears in'
 MAX_NUM_HEADER_LINES = 14
-MAX_QUOTED_REPLIES = 2
-MAX_CHARS_TO_PRINT = 4000
-TRUNCATED_CHARS = int(MAX_CHARS_TO_PRINT / 3)
+MAX_QUOTED_REPLIES = 1
+NUM_WORDS_IN_LAST_QUOTE = 6
 REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
     '********************************',
@@ -88,7 +89,13 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
     re.compile(r'^INW$', re.MULTILINE): REDACTED,
     # links
     'Imps ://': 'https://',
+    'on-accusers-rose-\nmcgowan/ ': 'on-accusers-rose-\nmcgowan/\n',
+    'the-truth-\nabout-the-bitcoin-foundation/ )': 'the-truth-about-the-bitcoin-foundation/ )\n',
+    'woody-allen-jeffrey-epsteins-\nsociety-friends-close-ranks/ ---': 'woody-allen-jeffrey-epsteins-society-friends-close_ranks/\n',
+    ' https://www.theguardian.com/world/2017/may/29/close-friend-trump-thomas-barrack-\nalleged-tax-evasion-italy-sardinia?CMP=share btn fb': '\nhttps://www.theguardian.com/world/2017/may/29/close-friend-trump-thomas-barrack-alleged-tax-evasion-italy-sardinia?CMP=share_btn_fb',
     re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
+    re.compile(r" http ?://www. ?dailymail. ?co ?.uk/news/article-\d+/Troub ?led-woman-history-drug-\n?us ?e-\n?.*html"): '\nhttp://www.dailymail.co.uk/news/article-3914012/Troubled-woman-history-drug-use-claimed-assaulted-Donald-Trump-Jeffrey-Epstein-sex-party-age-13-FABRICATED-story.html',
+    re.compile(r"http.*steve-bannon-trump-tower-\n?interview-\n?trumps-\n?strategist-plots-\n?new-political-movement-948747"): "\nhttp://www.hollywoodreporter.com/news/steve-bannon-trump-tower-interview-trumps-strategist-plots-new-political-movement-948747",
     # Subject lines
     "Arrested in\nInauguration Day Riot": "Arrested in Inauguration Day Riot",
     "as Putin Mayhem Tests President's Grip\non GOP": "as Putin Mayhem Tests President's Grip on GOP",
@@ -99,6 +106,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
     "COVER UP SEX ABUSE CRIMES\nBY THE WHITE HOUSE": "COVER UP SEX ABUSE CRIMES BY THE WHITE HOUSE",
     'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
     "War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
+    "Subject; RE": "Subject: RE",
     re.compile(r"deadline re Mr Bradley Edwards vs Mr\s*Jeffrey Epstein", re.I): "deadline re Mr Bradley Edwards vs Mr Jeffrey Epstein",
     re.compile(r"Following Plea That Implicated Trump -\s*https://www.npr.org/676040070", re.I): "Following Plea That Implicated Trump - https://www.npr.org/676040070",
     re.compile(r"for Attorney General -\s+Wikisource, the"): r"for Attorney General - Wikisource, the",
@@ -109,6 +117,8 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
     re.compile(r"Subject:\s*Fwd: Trending Now: Friends for three decades"): "Subject: Fwd: Trending Now: Friends for three decades",
     # Misc
     'AVG°': 'AVGO',
+    'Saw Matt C with DTF at golf': 'Saw Matt C with DJT at golf',
+    re.compile(r"[i. ]*Privileged[- ]*Redacted[i. ]*"): '<PRIVILEGED - REDACTED>',
 }
 EMAIL_SIGNATURE_REGEXES = {
@@ -118,20 +128,28 @@ EMAIL_SIGNATURE_REGEXES = {
     DANIEL_SIAD: re.compile(r"Confidentiality Notice: The information contained in this electronic message is PRIVILEGED and confidential information intended only for the use of the individual entity or entities named as recipient or recipients. If the reader is not the intended recipient, be hereby notified that any dissemination, distribution or copy of this communication is strictly prohibited. If you have received this communication in error, please notify me immediately by electronic mail or by telephone and permanently delete this message from your computer system. Thank you.".replace(' ', r'\s*'), re.IGNORECASE),
     DANNY_FROST: re.compile(r"Danny Frost\nDirector.*\nManhattan District.*\n212.*", re.IGNORECASE),
     DARREN_INDYKE: re.compile(r"DARREN K. INDYKE.*?\**\nThe information contained in this communication.*?Darren K.[\n\s]+?[Il]ndyke(, PLLC)? — All rights reserved\.? ?\n\*{50,120}(\n\**)?", re.DOTALL),
+    DAVID_FISZEL: re.compile(r"This e-mail and any file.*\nmail and/or any file.*\nmail or any.*\nreceived.*\nmisdirected.*"),
     DAVID_INGRAM: re.compile(r"Thank you in advance.*\nDavid Ingram.*\nCorrespondent\nReuters.*\nThomson.*(\n(Office|Mobile|Reuters.com).*)*"),
     DEEPAK_CHOPRA: re.compile(fr"({DEEPAK_CHOPRA}( MD)?\n)?2013 Costa Del Mar Road\nCarlsbad, CA 92009(\n(Chopra Foundation|Super Genes: Unlock.*))?(\nJiyo)?(\nChopra Center for Wellbeing)?(\nHome: Where Everyone is Welcome)?"),
-    EDUARDO_ROBLES: re.compile(fr"(• )?email:.*\n(• )?email:\n(• )?website: www.creativekingdom.com\n(• )?address: 5th Floor Office No:504 Aspect Tower,\nBusiness Bay, Dubai United Arab Emirates."),
+    EDUARDO_ROBLES: re.compile(r"(• )?email:.*\n(• )?email:\n(• )?website: www.creativekingdom.com\n(• )?address: 5th Floor Office No:504 Aspect Tower,\nBusiness Bay, Dubai United Arab Emirates."),
+    ERIC_ROTH: re.compile(r"2221 Smithtown Avenue\nLong Island.*\nRonkonkoma.*\n(.1. )?Phone\nFax\nCell\ne-mail"),
+    GHISLAINE_MAXWELL: re.compile(r"FACEBOOK\nTWITTER\nG\+\nPINTEREST\nINSTAGRAM\nPLEDGE\nTHE DAILY CATCH"),
     JEFFREY_EPSTEIN: re.compile(r"((\*+|please note)\n+)?(> )?(• )?(» )?The information contained in this communication is\n(> )*(» )?confidential.*?all attachments.( copyright -all rights reserved?)?", re.DOTALL),
     JESSICA_CADWELL: re.compile(r"(f.*\n)?Certified Para.*\nFlorida.*\nBURMAN.*\n515.*\nSuite.*\nWest Palm.*(\nTel:.*)?(\nEmail:.*)?", re.IGNORECASE),
     KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
     LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
     LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
+    LEON_BLACK: re.compile(r"This email and any files transmitted with it are confidential and intended solely.*\n(they|whom).*\ndissemination.*\nother.*\nand delete.*"),
+    LISA_NEW: re.compile(r"Elisa New\nPowell M. Cabot.*\n(Director.*\n)?Harvard.*\n148.*\n([1I] )?12.*\nCambridge.*\n([1I] )?02138"),
     MARTIN_WEINBERG: re.compile(r"(Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*?)?(\n.*?([cC]ell|Office))*\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
+    MICHAEL_MILLER: re.compile(r"Michael C. Miller\nPartner\nwww.steptoe.com/mmiller\nSteptoe\n(Privileged.*\n)?(\+1\s+)?direct.*\n(\+1\s+)?(\+1\s+)?fax.*\n(\+1.*)?cell.*\n(www.steptoe.com\n)?This message and any.*\nyou are not.*\nnotify the sender.*"),
     NICHOLAS_RIBIS: re.compile(r"60 Morris Turnpike 2FL\nSummit,? NJ.*\n0:\nF:\n\*{20,}\nCONFIDENTIALITY NOTICE.*\nattachments.*\ncopying.*\nIf you have.*\nthe copy.*\nThank.*\n\*{20,}"),
     PETER_MANDELSON: re.compile(r'Disclaimer This email and any attachments to it may be.*?with[ \n]+number(.*?EC4V[ \n]+6BJ)?', re.DOTALL | re.IGNORECASE),
     PAUL_BARRETT: re.compile(r"Paul Barrett[\n\s]+Alpha Group Capital LLC[\n\s]+(142 W 57th Street, 11th Floor, New York, NY 10019?[\n\s]+)?(al?[\n\s]*)?ALPHA GROUP[\n\s]+CAPITAL"),
+    PETER_ATTIA: re.compile(r"The information contained in this transmission may contain.*\n(laws|patient).*\n(distribution|named).*\n(distribution.*\nplease.*|copies.*)"),
     RICHARD_KAHN: re.compile(fr'Richard Kahn[\n\s]+HBRK Associates Inc.?[\n\s]+((301 East 66th Street, Suite 1OF|575 Lexington Avenue,? 4th Floor,?)[\n\s]+)?New York, (NY|New York) 100(22|65)(\s+(Tel?|Phone)( I|{REDACTED})?\s+Fa[x",]?(_|{REDACTED})*\s+[Ce]el?l?)?', re.IGNORECASE),
     ROSS_GOW: re.compile(r"Ross Gow\nManaging Partner\nACUITY Reputation Limited\n23 Berkeley Square\nLondon.*\nMobile.*\nTel"),
+    STEPHEN_HANSON: re.compile(r"(> )?Confidentiality Notice: This e-mail transmission.*\n(which it is addressed )?and may contain.*\n(applicable law. If you are not the intended )?recipient you are hereby.*\n(information contained in or attached to this transmission is )?STRICTLY PROHIBITED.*"),
     STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
     'Susan Edelman': re.compile(r'Susan Edel.*\nReporter\n1211.*\n917.*\nsedelman.*', re.IGNORECASE),
     TERRY_KAFKA: re.compile(r"((>|I) )?Terry B.? Kafka.*\n(> )?Impact Outdoor.*\n(> )?5454.*\n(> )?Dallas.*\n((> )?c?ell.*\n)?(> )?Impactoutdoor.*(\n(> )?cell.*)?", re.IGNORECASE),
@@ -152,13 +170,19 @@ BCC_LISTS = JUNK_EMAILERS + MAILING_LISTS
 TRUNCATE_EMAILS_FROM_OR_TO = [
     AMANDA_ENS,
     ANTHONY_BARRETT,
+    DANIEL_SABBA,
     DIANE_ZIMAN,
     JOSCHA_BACH,
     KATHERINE_KEATING,
+    LAWRANCE_VISOSKI,
     LAWRENCE_KRAUSS,
     LISA_NEW,
+    MOSHE_HOFFMAN,
     NILI_PRIELL_BARAK,
     PAUL_KRASSNER,
+    PAUL_PROSPERI,
+    'Susan Edelman',
+    TERRY_KAFKA,
 ]
 TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
@@ -170,6 +194,7 @@ TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
     DAVID_HAIG,
     EDWARD_ROD_LARSEN,
     JOHNNY_EL_HACHEM,
+    'Mark Green',
     MELANIE_WALKER,
     'Mitchell Bard',
     PEGGY_SIEGAL,
@@ -182,47 +207,12 @@ TRUNCATE_EMAILS_FROM = BCC_LISTS + TRUNCATE_EMAILS_FROM_OR_TO + [
     TERRY_KAFKA,
 ]
-# These IDs will be appended to INTERESTING_EMAIL_IDS
-INTERESTING_TRUNCATION_LENGTHS = {
-    '023627': 16_800,  # Micheal Wolff article with brock pierce
-    '030245': None,    # Epstein rationalizes his behavior in an open letter to the world
-    '030781': None,    # Bannon email about crypto coin issues
-    '032906': None,    # David Blaine email
-    '026036': 6000,    # Gino Yu blockchain mention
-    '029609': None,    # Joi Ito
-    '025233': None,    # Reputation.com discussion
-    '017827': None,    # Bannon / Peggy Siegal email about netflix doc on Epstein
-    '030222': None,    # Ross Gow / Ghislaine correspondence
-    '026028': None,    # Larry Summers / Karim Wade intro
-    '029545': None,    # Tyler Shears reputation
-    '025812': None,    # Tyler Shears reputation
-    '029914': 4500,    # Lord Mandelson russian investments
-    '033453': None,    # "Just heard you were telling people that you heard I asked Trump for a million dollars"
-    '031320': None,    # Epstein Gratitude foundation
-    '031036': None,    # Barbro Ehnbom talking about Swedish girl
-    '023454': 1878,    # Email invitation sent to tech CEOs + Epstein
-    '029342': 2000,    # Hakeem Jeffries
-}
-TRUNCATION_LENGTHS = {
-    **INTERESTING_TRUNCATION_LENGTHS,
-    '031791': None,    # First email in Jessica Cadwell chain about service of legal documents
-    '023208': None,    # Long discussion about leon black's finances
-    '028589': None,    # Long thread with Reid Weingarten
-    '029433': TRUNCATED_CHARS,  # Kahn taxes
-    '026778': TRUNCATED_CHARS,  # Kahn taxes
-    '033311': TRUNCATED_CHARS,  # Kahn taxes
-    '024251': TRUNCATED_CHARS,  # Kahn taxes
-    '026755': TRUNCATED_CHARS,  # Epstein self fwd
-}
 # These are long forwarded articles so we force a trim to 1,333 chars if these strings exist
 TRUNCATE_TERMS = [
     'The rebuilding of Indonesia',  # Vikcy ward article
-    'Dominique Strauss-Kahn',
-    'THOMAS L. FRIEDMAN',
     'a sleek, briskly paced film whose title suggests a heist movie',  # Inside Job
     'Calendar of Major Events, Openings, and Fundraisers',
+    'sent over from Marshall Heyman at the WSJ',
     "In recent months, China's BAT collapse",
     'President Obama introduces Jim Yong Kim as his nominee',
     'Trump appears with mobster-affiliated felon at New',
@@ -237,9 +227,11 @@ TRUNCATE_TERMS = [
     'co-inventor of the GTX Smart Shoe',
     'my latest Washington Post column',
     # Bannon
+    'As Steve Bannon continues his tour of Europe',
     "Bannon the European: He's opening the populist fort in Brussels",
     "Steve Bannon doesn't do subtle.",
     'The Department of Justice lost its latest battle with Congress',
+    'pedophile Jeffrey Epstein bought his way out',
     # lawyers
     'recuses itself from Jeffrey Epstein case',
     # Misc
@@ -265,11 +257,23 @@ LINE_REPAIR_MERGES = {
     '014397': [[4]] * 2,
     '014860': [[3], [4], [4]],
     '017523': [[4]],
+    '030367': [[1, 4], [2, 4]],
     '019105': [[5]] * 4,
     '019407': [[2, 4]],
+    '022187': [[1, 8], [2, 8], [3, 8], [4, 8]],
     '021729': [[2]],
+    '032896': [[2]],
+    '033050': [[0, 6], [1, 6], [2, 6], [3, 6], [4, 6]],
+    '022949': [[0, 4], [1, 4]],
+    '022197': [[0, 5], [1, 5], [3, 5]],
+    '021814': [[1, 6], [2, 6], [3, 6], [4, 6]],
+    '022190': [[1, 7], [0, 6], [3, 6], [4, 6]],
+    '029582': [[0, 5], [1, 5], [3, 5], [3, 5]],
     '022673': [[9]],
     '022684': [[9]],
+    '026625': [[0, 7], [1, 7], [2, 7], [3, 7], [4, 7], [5, 7]],
+    '026659': [[0, 5], [1, 5]],
+    '026764': [[0, 6], [1, 6]],
     '022695': [[4]],
     '022977': [[9]] * 10,
     '023001': [[5]] * 3,
@@ -278,11 +282,15 @@ LINE_REPAIR_MERGES = {
     '025329': [[2]] * 9,
     '025790': [[2]],
     '025812': [[3]] * 2,
+    '025589': [[3]] * 12,
     '026345': [[3]],
     '026609': [[4]],
+    '028921': [[5, 4], [4, 5]],
+    '026620': ([[20]] * 4) + [[3, 2]] + ([[2]] * 15) + [[2, 4]],
     '026829': [[3]],
     '026924': [[2, 4]],
     '028728': [[3]],
+    '026451': [[3, 5]] * 2,
     '028931': [[3, 6]],
     '029154': [[2, 5]],
     '029163': [[2, 5]],
@@ -302,18 +310,22 @@ LINE_REPAIR_MERGES = {
     '029977': ([[2]] * 4) + [[4], [2, 4]],
     '030299': [[7, 10]],
     '030315': [[3, 5]],
+    '030318': [[3, 5]],
     '030381': [[2, 4]],
     '030384': [[2, 4]],
     '030626': [[2], [4]],
+    '030861': [[3, 8]],
     '030999': [[2, 4]],
     '031384': [[2]],
     '031428': [[2], [2, 4]],
     '031442': [[0]],
+    '031489': [[2, 4], [3, 4], [3, 4], [10]],
+    '031619': [[7], [17], [17]],
     '031748': [[3]] * 2,
-    '031764': [[3]],
+    '031764': [[3], [8]],  # 8 is just for style fix internally, not header
     '031980': [[2, 4]],
     '032063': [[3, 5]],
-    '032272': [[3]],
+    '032272': [[2, 10], [3]],
     '032405': [[4]],
     '032637': [[9]] * 3,
     '033097': [[2]],
@@ -326,6 +338,8 @@ LINE_REPAIR_MERGES = {
     '033357': [[2, 4]],
     '033486': [[7, 9]],
     '033512': [[2]],
+    '026024': [[1, 3], [2, 3]],
+    '024923': [[0, 5], [2]],
     '033568': [[5]] * 5,
     '033575': [[2, 4]],
     '033576': [[3]],
@@ -344,12 +358,14 @@ class Email(Communication):
         sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
         signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
     """
+    attached_docs: list[OtherFile] = field(default_factory=list)
     actual_text: str = field(init=False)
     config: EmailCfg | None = None
     header: EmailHeader = field(init=False)
     recipients: list[Name] = field(default_factory=list)
     sent_from_device: str | None = None
     signature_substitution_counts: dict[str, int] = field(default_factory=dict)  # defaultdict breaks asdict :(
+    _is_first_for_user: bool = False  # Only set when printing
     _line_merge_arguments: list[tuple[int] | tuple[int, int]] = field(default_factory=list)
     # For logging how many headers we prettified while printing, kind of janky
@@ -389,6 +405,7 @@ class Email(Communication):
         self.sent_from_device = self._sent_from_device()
     def attachments(self) -> list[str]:
+        """Returns the string in the header."""
         return (self.header.attachments or '').split(';')
     def info_txt(self) -> Text:
@@ -402,7 +419,12 @@ class Email(Communication):
         return txt.append(highlighter(f" probably sent at {self.timestamp}"))
     def is_fwded_article(self) -> bool:
-        return bool(self.config and self.config.is_fwded_article)
+        if self.config is None:
+            return False
+        elif self.config.fwded_text_after:
+            return self.config.is_fwded_article is not False
+        else:
+            return bool(self.config.is_fwded_article)
     def is_junk_mail(self) -> bool:
         return self.author in JUNK_EMAILERS
@@ -413,9 +435,15 @@ class Email(Communication):
     def is_note_to_self(self) -> bool:
         return self.recipients == [self.author]
-    def is_with(self, name: str) -> bool:
+    def is_from_or_to(self, name: str) -> bool:
         return name in [self.author] + self.recipients
+    def is_word_count_worthy(self) -> bool:
+        if self.is_fwded_article():
+            return bool(self.config.fwded_text_after) or len(self.actual_text) < 150
+        else:
+            return not self.is_mailing_list()
     def metadata(self) -> Metadata:
         local_metadata = asdict(self)
         local_metadata['is_junk_mail'] = self.is_junk_mail()
@@ -462,8 +490,9 @@ class Email(Communication):
         elif self.header.num_header_rows == 0:
             return self.text
+        # import pdb;pdb.set_trace()
         self.log_top_lines(20, "Raw text:", logging.DEBUG)
-        self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
+        self.log(f"With {self.header.num_header_rows} header lines removed:\n{text[0:500]}\n\n", logging.DEBUG)
         reply_text_match = REPLY_TEXT_REGEX.search(text)
         if reply_text_match:
@@ -542,8 +571,8 @@ class Email(Communication):
         logger.debug(f"{self.file_id} extracted header\n\n{self.header}\n")
     def _extract_timestamp(self) -> datetime:
-        if self.config and self.config.timestamp:
-            return self.config.timestamp
+        if self.config and self.config.timestamp():
+            return self.config.timestamp()
         elif self.header.sent_at:
             timestamp = _parse_timestamp(self.header.sent_at)
@@ -572,36 +601,41 @@ class Email(Communication):
                 logger.debug(f"Fell back to timestamp {timestamp} in line '{line}'...")
                 return timestamp
-        raise RuntimeError(f"No timestamp found in '{self.file_path.name}' top lines:\n{searchable_text}")
+        no_timestamp_msg = f"No timestamp found in '{self.file_path.name}'"
-    def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
-        """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
-        if text is None:
-            header_offset = len(self.header.header_chars)
-            text = self.text[header_offset:]
+        if self.is_duplicate():
+            logger.warning(f"{no_timestamp_msg} but timestamp should be copied from {self.duplicate_of_id()}")
         else:
-            header_offset = 0
+            raise RuntimeError(f"{no_timestamp_msg}, top lines:\n{searchable_text}")
+    def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES) -> int | None:
+        """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
+        header_offset = len(self.header.header_chars)
+        text = self.text[header_offset:]
         for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text)):
             if i >= n:
                 return match.end() + header_offset - 1
-    def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
+    def _merge_lines(self, idx1: int, idx2: int | None = None) -> None:
         """Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
         if idx2 is None:
-            self._line_merge_arguments.append((idx,))
-            idx2 = idx + 1
+            self._line_merge_arguments.append((idx1,))
+            idx2 = idx1 + 1
         else:
-            self._line_merge_arguments.append((idx, idx2))
+            self._line_merge_arguments.append((idx1, idx2))
-        lines = self.lines[0:idx]
-        if idx2 <= idx:
-            raise RuntimeError(f"idx2 ({idx2}) must be greater than idx ({idx})")
-        elif idx2 == (idx + 1):
-            lines += [self.lines[idx] + ' ' + self.lines[idx + 1]] + self.lines[idx + 2:]
+        if idx2 < idx1:
+            lines = self.lines[0:idx2] + self.lines[idx2 + 1:idx1] + [self.lines[idx1] + ' ' + self.lines[idx2]] + self.lines[idx1 + 1:]
+        elif idx2 == idx1:
+            raise RuntimeError(f"idx2 ({idx2}) must be greater or less than idx ({idx1})")
         else:
-            lines += [self.lines[idx] + ' ' + self.lines[idx2]] + self.lines[idx + 1:idx2] + self.lines[idx2 + 1:]
+            lines = self.lines[0:idx1]
+            if idx2 == (idx1 + 1):
+                lines += [self.lines[idx1] + ' ' + self.lines[idx1 + 1]] + self.lines[idx1 + 2:]
+            else:
+                lines += [self.lines[idx1] + ' ' + self.lines[idx2]] + self.lines[idx1 + 1:idx2] + self.lines[idx2 + 1:]
         self._set_computed_fields(lines=lines)
@@ -617,6 +651,10 @@ class Email(Communication):
             self.signature_substitution_counts[name] = self.signature_substitution_counts.get(name, 0)
             self.signature_substitution_counts[name] += num_replaced
+        # Share / Tweet lines
+        if self.author == KATHRYN_RUEMMLER:
+            text = '\n'.join([l for l in text.split('\n') if l not in ['Share', 'Tweet', 'Bookmark it']])
         return collapse_newlines(text).strip()
     def _remove_line(self, idx: int) -> None:
@@ -657,17 +695,21 @@ class Email(Communication):
             self.log_top_lines(12, 'Result of modifications')
         lines = self.repair_ocr_text(OCR_REPAIRS, self.text).split('\n')
+        subject_line = next((line for line in lines if line.startswith('Subject:')), None) or ''
+        subject = subject_line.split(':')[1].strip() if subject_line else ''
         new_lines = []
         i = 0
-        # Fix links (remove spaces, merge multiline links to a single line)
+        # Fix links and quoted subjects (remove spaces, merge multiline links to a single line)
         while i < len(lines):
             line = lines[i]
             if LINK_LINE_REGEX.search(line):
                 while i < (len(lines) - 1) \
-                        and 'http' not in lines[i + 1] \
-                        and (lines[i + 1].endswith('/') or any(s in lines[i + 1] for s in URL_SIGNIFIERS) or LINK_LINE2_REGEX.match(lines[i + 1])):
+                        and not lines[i + 1].startswith('htt') \
+                        and (lines[i + 1].endswith('/') \
+                             or any(s in lines[i + 1] for s in URL_SIGNIFIERS) \
+                             or LINK_LINE2_REGEX.match(lines[i + 1])):
                     logger.debug(f"{self.filename}: Joining link lines\n   1. {line}\n   2. {lines[i + 1]}\n")
                     line += lines[i + 1]
                     i += 1
@@ -676,6 +718,17 @@ class Email(Communication):
             elif ' http' in line and line.endswith('html'):
                 pre_link, post_link = line.split(' http', 1)
                 line = f"{pre_link} http{post_link.replace(' ', '')}"
+            elif line.startswith('Subject:') and i < (len(lines) - 2) and len(line) >= 40:
+                next_line = lines[i + 1]
+                next_next = lines[i + 2]
+                if len(next_line) <= 1 or any([cont in next_line for cont in BAD_SUBJECT_CONTINUATIONS]):
+                    pass
+                elif (subject.endswith(next_line) and next_line != subject) \
+                        or (FIELDS_COLON_REGEX.search(next_next) and not FIELDS_COLON_REGEX.search(next_line)):
+                    self.warn(f"Fixing broken subject line\n  line: '{line}'\n    next: '{next_line}'\n    next: '{next_next}'\nsubject='{subject}'\n")
+                    line += f" {next_line}"
+                    i += 1
             new_lines.append(line)
@@ -699,7 +752,7 @@ class Email(Communication):
         """Copy info from original config for file this document was extracted from."""
         if self.file_id in ALL_FILE_CONFIGS:
             self.config = cast(EmailCfg, deepcopy(ALL_FILE_CONFIGS[self.file_id]))
-            self.warn(f"Merging existing cfg for '{self.file_id}' with cfg for extracted document...")
+            self.log(f"Merging existing cfg for '{self.file_id}' with cfg for extracted document...")
         else:
             self.config = EmailCfg(id=self.file_id)
@@ -721,34 +774,58 @@ class Email(Communication):
     def _truncate_to_length(self) -> int:
         """When printing truncate this email to this length."""
-        quote_cutoff = self._idx_of_nth_quoted_reply(text=self.text)  # Trim if there's many quoted replies
+        quote_cutoff = self._idx_of_nth_quoted_reply()  # Trim if there's many quoted replies
         includes_truncate_term = next((term for term in TRUNCATE_TERMS if term in self.text), None)
         if args.whole_file:
             num_chars = len(self.text)
         elif args.truncate:
             num_chars = args.truncate
-        elif self.file_id in TRUNCATION_LENGTHS:
-            num_chars = TRUNCATION_LENGTHS[self.file_id] or self.file_size()
-        elif self.author in TRUNCATE_EMAILS_FROM or any([self.is_with(n) for n in TRUNCATE_EMAILS_FROM_OR_TO]) or includes_truncate_term:
+        elif self.config and self.config.truncate_to is not None:
+            num_chars = len(self.text) if self.config.truncate_to == NO_TRUNCATE else self.config.truncate_to
+        elif self.is_interesting():
+            num_chars = len(self.text)
+        elif self.author in TRUNCATE_EMAILS_FROM \
+                or any([self.is_from_or_to(n) for n in TRUNCATE_EMAILS_FROM_OR_TO]) \
+                or self.is_fwded_article() \
+                or includes_truncate_term:
             num_chars = min(quote_cutoff or MAX_CHARS_TO_PRINT, TRUNCATED_CHARS)
-        elif quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
-            num_chars = quote_cutoff
         else:
-            num_chars = MAX_CHARS_TO_PRINT
-        if num_chars != MAX_CHARS_TO_PRINT and not self.is_duplicate():
-            log_args = {
-                'num_chars': num_chars,
-                'author_truncate': self.author in TRUNCATE_EMAILS_FROM,
-                'is_fwded_article': self.is_fwded_article(),
-                'is_quote_cutoff': quote_cutoff == num_chars,
-                'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
-                'quote_cutoff': quote_cutoff,
-            }
-            logger.debug(f'{self.summary()} truncating: ' + ', '.join([f"{k}={v}" for k, v in log_args.items() if v]) + '\n')
+            if quote_cutoff and quote_cutoff < MAX_CHARS_TO_PRINT:
+                trimmed_words = self.text[quote_cutoff:].split()
+                if '<...snipped' in trimmed_words[:NUM_WORDS_IN_LAST_QUOTE]:
+                    num_trailing_words = 0
+                elif trimmed_words and trimmed_words[0] in ['From:', 'Sent:']:
+                    num_trailing_words = NUM_WORDS_IN_LAST_QUOTE
+                else:
+                    num_trailing_words = NUM_WORDS_IN_LAST_QUOTE
+                if trimmed_words:
+                    last_quoted_text = ' '.join(trimmed_words[:num_trailing_words])
+                    num_chars = quote_cutoff + len(last_quoted_text) + 1 # Give a hint of the next line
+                else:
+                    num_chars = quote_cutoff
+            else:
+                num_chars = min(self.file_size(), MAX_CHARS_TO_PRINT)
+            # Always print whole email for 1st email for user
+            if self._is_first_for_user and num_chars < self.file_size() and not self.is_duplicate():
+                logger.info(f"{self} Overriding cutoff {num_chars} for first email")
+                num_chars = self.file_size()
+        log_args = {
+            'num_chars': num_chars,
+            '_is_first_for_user': self._is_first_for_user,
+            'author_truncate': self.author in TRUNCATE_EMAILS_FROM,
+            'is_fwded_article': self.is_fwded_article(),
+            'is_quote_cutoff': quote_cutoff == num_chars,
+            'includes_truncate_term': json.dumps(includes_truncate_term) if includes_truncate_term else None,
+            'quote_cutoff': quote_cutoff,
+        }
+        log_args_str = ', '.join([f"{k}={v}" for k, v in log_args.items() if v])
+        logger.debug(f"Truncate determination: {log_args_str}")
         return num_chars
     def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
@@ -762,7 +839,7 @@ class Email(Communication):
         if len(text) > num_chars:
             text = text[0:num_chars]
             doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style())
-            trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
+            trim_note = f"<...trimmed to {num_chars:,} characters of {self.length():,}, read the rest at {doc_link_markup}...>"
             trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
         # Rewrite broken headers where the values are on separate lines from the field names
@@ -789,7 +866,7 @@ class Email(Communication):
         text = join_texts(lines, '\n')
         email_txt_panel = Panel(
-            highlighter(text).append('\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
+            highlighter(text).append('...\n\n').append(trim_footer_txt) if trim_footer_txt else highlighter(text),
             border_style=self._border_style(),
             expand=False,
             subtitle=REWRITTEN_HEADER_MSG if should_rewrite_header else None,
@@ -798,6 +875,11 @@ class Email(Communication):
         yield self.file_info_panel()
         yield Padding(email_txt_panel, (0, 0, 1, INFO_INDENT))
+        if self.attached_docs:
+            attachments_table_title = f" {self.url_slug} Email Attachments:"
+            attachments_table = OtherFile.files_preview_table(self.attached_docs, title=attachments_table_title)
+            yield Padding(attachments_table, (0, 0, 1, 12))
         if should_rewrite_header:
             self.log_top_lines(self.header.num_header_rows + 4, f'Original header:')

epstein_files/documents/emails/email_header.py CHANGED Viewed

@@ -2,7 +2,7 @@ import json
 import re
 from dataclasses import asdict, dataclass, field
-from epstein_files.util.constant.strings import AUTHOR, REDACTED
+from epstein_files.util.constant.strings import AUTHOR, REDACTED, indented
 from epstein_files.util.constants import ALL_CONFIGS
 from epstein_files.util.doc_cfg import EmailCfg
 from epstein_files.util.logging import logger
@@ -13,7 +13,10 @@ ON_BEHALF_OF = 'on behalf of'
 TO_FIELDS = ['bcc', 'cc', 'to']
 EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
-HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments|Classification|Flag):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
+FIELD_PATTERNS = ['Date', 'From', 'Sent', 'To', r"C[cC]", r"B[cC][cC]", 'Importance', 'Subject', 'Attachments', 'Classification', 'Flag', 'Reply-To']
+FIELDS_PATTERN = '|'.join(FIELD_PATTERNS)
+FIELDS_COLON_PATTERN = fr"^({FIELDS_PATTERN}):"
+HEADER_REGEX_STR = fr"(((?:(?:{FIELDS_PATTERN}|Bee):|on behalf of ?)(?! +(by |from my|via )).*\n){{3,}})"
 EMAIL_SIMPLE_HEADER_REGEX = re.compile(rf'^{HEADER_REGEX_STR}')
 EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX = re.compile(HEADER_REGEX_STR)
 EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTALL)  # Match up to the next email header section
@@ -53,6 +56,7 @@ class EmailHeader:
     importance: str | None = None
     attachments: str | None = None
     to: list[str] | None = None
+    reply_to: str | None = None
     def __post_init__(self):
         self.num_header_rows = len(self.field_names)
@@ -95,13 +99,10 @@ class EmailHeader:
                     logger.info(f"{log_prefix}, trying next line...")
                     num_headers += 1
                     value = email_lines[i + num_headers]
-                elif BAD_EMAILER_REGEX.match(value):
+                elif BAD_EMAILER_REGEX.match(value) or value.startswith('http'):
                     logger.info(f"{log_prefix}, decrementing num_headers and skipping...")
                     num_headers -= 1
                     continue
-                elif value.startswith('http'):
-                    logger.info(f"{log_prefix}, using empty string instead...")
-                    value = ''
                 value = [v.strip() for v in value.split(';') if len(v.strip()) > 0]
@@ -110,7 +111,12 @@ class EmailHeader:
         self.num_header_rows = len(self.field_names) + num_headers
         self.header_chars = '\n'.join(email_lines[0:self.num_header_rows])
         log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
-        logger.debug(f"{log_msg}{self}\n\nTop lines:\n\n%s", '\n'.join(email_lines[0:(num_headers + 1) * 2]))
+        logger.warning(
+            f"{log_msg}{self}\n\n[top lines]:\n\n%s\n\n[body_lines]:\n\n%s\n\n",
+            indented('\n'.join(email_lines[0:(num_headers + 1) * 2]), prefix='> '),
+            indented('\n'.join(email_lines[self.num_header_rows:self.num_header_rows + 5]), prefix='> '),
+        )
     def rewrite_header(self) -> str:
         header_fields = {}
@@ -151,7 +157,7 @@ class EmailHeader:
             #logger.debug(f"extracting header line: '{line}'")
             key, value = [element.strip() for element in line.split(':', 1)]
             value = value.rstrip('_')
-            key = AUTHOR if key == 'From' else ('sent_at' if key in ['Date', 'Sent'] else key.lower())
+            key = AUTHOR if key == 'From' else ('sent_at' if key in ['Date', 'Sent'] else key.lower().replace('-', '_'))
             key = 'bcc' if key == 'bee' else key
             if kw_args.get(key):
@@ -161,6 +167,9 @@ class EmailHeader:
             field_names.append(key)
+            if key == 'reply_to':
+                logger.warning(f"Found value for Reply-To field: '{value}'")
             if key in TO_FIELDS:
                 recipients = [element.strip() for element in value.split(';')]
                 recipients = [r for r in recipients if len(r) > 0]

epstein-files 1.2.5__py3-none-any.whl → 1.4.1__py3-none-any.whl

epstein-files 1.2.5py3-none-any.whl → 1.4.1py3-none-any.whl