epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. epstein_files/__init__.py +55 -23
  2. epstein_files/documents/communication.py +9 -5
  3. epstein_files/documents/document.py +231 -135
  4. epstein_files/documents/doj_file.py +242 -0
  5. epstein_files/documents/doj_files/full_text.py +166 -0
  6. epstein_files/documents/email.py +289 -232
  7. epstein_files/documents/emails/email_header.py +35 -16
  8. epstein_files/documents/emails/emailers.py +223 -0
  9. epstein_files/documents/imessage/text_message.py +2 -3
  10. epstein_files/documents/json_file.py +18 -14
  11. epstein_files/documents/messenger_log.py +23 -39
  12. epstein_files/documents/other_file.py +54 -48
  13. epstein_files/epstein_files.py +65 -29
  14. epstein_files/person.py +151 -94
  15. epstein_files/util/constant/names.py +37 -10
  16. epstein_files/util/constant/output_files.py +2 -0
  17. epstein_files/util/constant/strings.py +14 -7
  18. epstein_files/util/constant/urls.py +17 -0
  19. epstein_files/util/constants.py +556 -391
  20. epstein_files/util/data.py +2 -0
  21. epstein_files/util/doc_cfg.py +44 -33
  22. epstein_files/util/env.py +34 -19
  23. epstein_files/util/file_helper.py +30 -6
  24. epstein_files/util/helpers/debugging_helper.py +13 -0
  25. epstein_files/util/helpers/env_helpers.py +21 -0
  26. epstein_files/util/highlighted_group.py +121 -37
  27. epstein_files/util/layout/left_bar_panel.py +26 -0
  28. epstein_files/util/logging.py +28 -13
  29. epstein_files/util/output.py +49 -40
  30. epstein_files/util/rich.py +30 -3
  31. epstein_files/util/word_count.py +7 -7
  32. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/METADATA +16 -3
  33. epstein_files-1.5.0.dist-info/RECORD +40 -0
  34. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +1 -1
  35. epstein_files-1.2.5.dist-info/RECORD +0 -34
  36. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
  37. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
@@ -2,7 +2,8 @@ import json
2
2
  import re
3
3
  from dataclasses import asdict, dataclass, field
4
4
 
5
- from epstein_files.util.constant.strings import AUTHOR, REDACTED
5
+ from epstein_files.documents.emails.emailers import BAD_EMAILER_REGEX, TIME_REGEX
6
+ from epstein_files.util.constant.strings import AUTHOR, indented
6
7
  from epstein_files.util.constants import ALL_CONFIGS
7
8
  from epstein_files.util.doc_cfg import EmailCfg
8
9
  from epstein_files.util.logging import logger
@@ -13,14 +14,29 @@ ON_BEHALF_OF = 'on behalf of'
13
14
  TO_FIELDS = ['bcc', 'cc', 'to']
14
15
  EMAILER_FIELDS = [AUTHOR] + TO_FIELDS
15
16
 
16
- HEADER_REGEX_STR = r'(((?:(?:Date|From|Sent|To|C[cC]|Importance|Subject|Bee|B[cC]{2}|Attachments|Classification|Flag):|on behalf of ?)(?! +(by |from my|via )).*\n){3,})'
17
+ FIELD_PATTERNS = [
18
+ 'Date',
19
+ 'From',
20
+ 'Sent',
21
+ 'To',
22
+ r"C[cC]",
23
+ r"B[cC][cC]",
24
+ 'Importance',
25
+ 'Subject',
26
+ 'Attachments',
27
+ 'Classification',
28
+ 'Flag',
29
+ 'Reply-To',
30
+ 'Inline-Images'
31
+ ]
32
+
33
+ DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}(From|Subject):') # IDed 140 emails out of 3777 DOJ files with just 'From:' match
34
+ FIELDS_PATTERN = '|'.join(FIELD_PATTERNS)
35
+ FIELDS_COLON_PATTERN = fr"^({FIELDS_PATTERN}):"
36
+ HEADER_REGEX_STR = fr"(((?:(?:{FIELDS_PATTERN}|Bee):|on behalf of ?)(?! +(by |from my|via )).*\n){{3,}})"
17
37
  EMAIL_SIMPLE_HEADER_REGEX = re.compile(rf'^{HEADER_REGEX_STR}')
18
38
  EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX = re.compile(HEADER_REGEX_STR)
19
39
  EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTALL) # Match up to the next email header section
20
- TIME_REGEX = re.compile(r'^(\d{1,2}/\d{1,2}/\d{2,4}|Thursday|Monday|Tuesday|Wednesday|Friday|Saturday|Sunday).*')
21
-
22
- BAD_NAME_CHARS_REGEX = re.compile(r"[\"'\[\]*><•]")
23
- BAD_EMAILER_REGEX = re.compile(r'^(>|11111111)|agreed|ok|sexy|re:|fwd:|Multiple Senders|((sent|attachments|subject|importance).*|.*(january|201\d|hysterical|i have|image0|so that people|article 1.?|momminnemummin|These conspiracy theories|your state|undisclosed|www\.theguardian|talk in|it was a|what do|cc:|call (back|me)).*)$', re.IGNORECASE)
24
40
 
25
41
  CONFIGURED_ACTUAL_TEXTS = [
26
42
  cfg.actual_text for cfg in ALL_CONFIGS
@@ -51,8 +67,10 @@ class EmailHeader:
51
67
  classification: str | None = None
52
68
  flag: str | None = None
53
69
  importance: str | None = None
70
+ inline_images: str | None = None
54
71
  attachments: str | None = None
55
72
  to: list[str] | None = None
73
+ reply_to: str | None = None
56
74
 
57
75
  def __post_init__(self):
58
76
  self.num_header_rows = len(self.field_names)
@@ -95,13 +113,10 @@ class EmailHeader:
95
113
  logger.info(f"{log_prefix}, trying next line...")
96
114
  num_headers += 1
97
115
  value = email_lines[i + num_headers]
98
- elif BAD_EMAILER_REGEX.match(value):
116
+ elif BAD_EMAILER_REGEX.match(value) or value.startswith('http'):
99
117
  logger.info(f"{log_prefix}, decrementing num_headers and skipping...")
100
118
  num_headers -= 1
101
119
  continue
102
- elif value.startswith('http'):
103
- logger.info(f"{log_prefix}, using empty string instead...")
104
- value = ''
105
120
 
106
121
  value = [v.strip() for v in value.split(';') if len(v.strip()) > 0]
107
122
 
@@ -110,7 +125,12 @@ class EmailHeader:
110
125
  self.num_header_rows = len(self.field_names) + num_headers
111
126
  self.header_chars = '\n'.join(email_lines[0:self.num_header_rows])
112
127
  log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
113
- logger.debug(f"{log_msg}{self}\n\nTop lines:\n\n%s", '\n'.join(email_lines[0:(num_headers + 1) * 2]))
128
+
129
+ logger.info(
130
+ f"{log_msg}{self}\n\n[top lines]:\n\n%s\n\n[body_lines]:\n\n%s\n\n",
131
+ indented('\n'.join(email_lines[0:(num_headers + 1) * 2]), prefix='> '),
132
+ indented('\n'.join(email_lines[self.num_header_rows:self.num_header_rows + 5]), prefix='> '),
133
+ )
114
134
 
115
135
  def rewrite_header(self) -> str:
116
136
  header_fields = {}
@@ -151,7 +171,7 @@ class EmailHeader:
151
171
  #logger.debug(f"extracting header line: '{line}'")
152
172
  key, value = [element.strip() for element in line.split(':', 1)]
153
173
  value = value.rstrip('_')
154
- key = AUTHOR if key == 'From' else ('sent_at' if key in ['Date', 'Sent'] else key.lower())
174
+ key = AUTHOR if key == 'From' else ('sent_at' if key in ['Date', 'Sent'] else key.lower().replace('-', '_'))
155
175
  key = 'bcc' if key == 'bee' else key
156
176
 
157
177
  if kw_args.get(key):
@@ -161,6 +181,9 @@ class EmailHeader:
161
181
 
162
182
  field_names.append(key)
163
183
 
184
+ if key == 'reply_to':
185
+ logger.warning(f"Found value for Reply-To field: '{value}'")
186
+
164
187
  if key in TO_FIELDS:
165
188
  recipients = [element.strip() for element in value.split(';')]
166
189
  recipients = [r for r in recipients if len(r) > 0]
@@ -172,7 +195,3 @@ class EmailHeader:
172
195
  logger.debug(f"Header being parsed was this:\n\n{header}\n")
173
196
 
174
197
  return cls(field_names=field_names, header_chars=header, **kw_args)
175
-
176
- @staticmethod
177
- def cleanup_str(_str: str) -> str:
178
- return BAD_NAME_CHARS_REGEX.sub('', _str.replace(REDACTED, '')).strip().strip('_').strip()
@@ -0,0 +1,223 @@
1
+ """
2
+ Regexes and patterns for identifying people in email headers.
3
+ """
4
+ import re
5
+ from copy import deepcopy
6
+
7
+ from epstein_files.util.constant.names import *
8
+ from epstein_files.util.constant.strings import REDACTED
9
+ from epstein_files.util.data import escape_single_quotes
10
+ from epstein_files.util.logging import logger
11
+
12
+ BAD_EMAILER_REGEX = re.compile(r'^(>|11111111)|agreed|ok|sexy|re:|fwd:|Multiple Senders|((sent|attachments|subject|importance).*|.*(january|201\d|hysterical|i have|image0|so that people|article 1.?|momminnemummin|These conspiracy theories|your state|undisclosed|www\.theguardian|talk in|it was a|what do|cc:|call (back|me)|afiaata|[IM]{4,}).*)$', re.IGNORECASE)
13
+ BAD_NAME_CHARS_REGEX = re.compile(r"[\"'\[\]*><•=()]")
14
+ TIME_REGEX = re.compile(r'^((\d{1,2}/\d{1,2}/\d{2,4}|Thursday|Monday|Tuesday|Wednesday|Friday|Saturday|Sunday)|\d{4} ).*')
15
+
16
+ EMAILER_ID_PATTERNS: dict[str, str] = {
17
+ ALAN_DERSHOWITZ: r'(alan.{1,7})?dershowi(lz?|t?z)|AlanDersh',
18
+ ALIREZA_ITTIHADIEH: r'Alireza.[Il]ttihadieh',
19
+ ALISON_J_NATHAN: r"Alison(\s*J\.?)?\s*Nathan|Nathan NYSD Chambers?",
20
+ AMANDA_ENS: r'ens, amanda?|Amanda.Ens',
21
+ AMIR_TAAKI: r'Amir\s*Taaki|genjix',
22
+ ANAS_ALRASHEED: r'anas\s*al\s*rashee[cd]',
23
+ ANIL_AMBANI: r'Anil.Ambani',
24
+ ANN_MARIE_VILLAFANA: r'Villafana, Ann Marie|(A(\.|nn) Marie )?Villafa(c|n|ri)a',
25
+ ANTHONY_SCARAMUCCI: r"mooch|(Anthony ('The Mooch' )?)?Scaramucci",
26
+ ARIANE_DE_ROTHSCHILD: r'AdeR|((Ariane|Edmond) (de )?)?Rothsh?ch?ild|Ariane(?!\s+Dwyer)',
27
+ BARBRO_C_EHNBOM: r'behnbom@aol.com|(Barbro\s.*)?Ehnbom',
28
+ BARRY_J_COHEN: r'barry\s*((j.?|james)\s*)?cohen?',
29
+ BENNET_MOSKOWITZ: r'Moskowitz.*Bennet|Bennet.*Moskowitz',
30
+ BOB_CROWE: r"[BR]ob Crowe",
31
+ BORIS_NIKOLIC: r'(boris )?nikolic?',
32
+ BRAD_EDWARDS: r'Brad(ley)?(\s*J(.?|ames))?\s*Edwards',
33
+ BRAD_KARP: r'Brad (S.? )?Karp|Karp, Brad',
34
+ CHRISTIAN_EVERDELL: r"C(hristian\s*)?Everdell?",
35
+ CHRISTOPHER_DILORIO: r"Chris\s*Di[lI]o[nr](io)?",
36
+ DANGENE_AND_JENNIE_ENTERPRISE: r'Dangene and Jennie Enterprise?',
37
+ DANNY_FROST: r'Frost, Danny|frostd@dany.nyc.gov|Danny\s*Frost',
38
+ DARREN_INDYKE: r'darren$|Darren\s*(K\.?\s*)?[il]n[dq]_?yke?|dkiesq',
39
+ DAVID_FISZEL: r'David\s*Fis?zel',
40
+ DAVID_HAIG: fr'{DAVID_HAIG}|Haig, David',
41
+ DAVID_STERN: r'David Stern?',
42
+ DOUGLAS_WIGDOR: r'Doug(las)?\s*(H\.?)?\s*Wigdor',
43
+ EDUARDO_ROBLES: r'Ed(uardo)?\s*Robles',
44
+ EDWARD_JAY_EPSTEIN: r'(?<!Jeffrey )Edward (Jay )?Epstein',
45
+ EHUD_BARAK: r'(ehud|e?h)\s*barak|\behud',
46
+ FAITH_KATES: r'faith kates?',
47
+ GERALD_BARTON: r'Gerald.*Barton',
48
+ GERALD_LEFCOURT: r'Gerald\s*(B\.?\s*)?Lefcourt',
49
+ GHISLAINE_MAXWELL: r'g ?max(well)?|Ghislaine|Maxwell',
50
+ HEATHER_MANN: r'Heather Mann?',
51
+ INTELLIGENCE_SQUARED: r'intelligence\s*squared',
52
+ JACKIE_PERCZEK: r'jackie percze[kl]?',
53
+ JABOR_Y: r'[ji]abor\s*y?',
54
+ JAMES_HILL: r"hill, james e.|james.e.hill@abc.com",
55
+ JANUSZ_BANASIAK: r"Janu[is]z Banasiak",
56
+ JEAN_HUGUEN: r"Jean[\s.]Huguen",
57
+ JEAN_LUC_BRUNEL: r'Jean[- ]Luc Brunel?|JeanLuc',
58
+ JEFF_FULLER: r"jeff@mc2mm.com|Jeff Fuller",
59
+ JEFFREY_EPSTEIN: r'[djl]\s?ee[vy]acation[©@]?g?(mail.com)?|Epstine|\bJEE?\b|Jeff(rey)? (Edward )?E((sp|ps)tein?)?( VI Foundation)?|jeeproject@yahoo.com|J Jep|Jeffery Edwards|(?<!(Mark L.|ard Jay) )Epstein',
60
+ JESSICA_CADWELL: r'Jessica Cadwell?',
61
+ JOHNNY_EL_HACHEM: r'el hachem johnny|johnny el hachem',
62
+ JOI_ITO: r'ji@media.mit.?edu|(joichi|joi)( Ito)?',
63
+ JONATHAN_FARKAS: r'Jonathan Fark(a|u)(s|il)',
64
+ KARYNA_SHULIAK: r"Karyna\s*Shuliak?",
65
+ KATHRYN_RUEMMLER: r'Kathr?yn? Ruemmler?',
66
+ KEN_STARR: r'starr, ken|Ken(neth\s*(W.\s*)?)?\s+starr?|starr',
67
+ LANDON_THOMAS: r'lando[nr] thomas( jr)?|thomas jr.?, lando[nr]',
68
+ LARRY_SUMMERS: r'(La(wrence|rry).{1,5})?Summers?|^LH$|LHS|[Il]hsofficel?',
69
+ LAWRANCE_VISOSKI: r'La(rry|wrance) Visoski?|Lvjet',
70
+ LAWRENCE_KRAUSS: r'Lawrence Kraus[es]?|[jl]awkrauss|kruase',
71
+ LEON_BLACK: r'Leon\s*Black?|(?<!Marc )Leon(?! (Botstein|Jaworski|Wieseltier))',
72
+ LILLY_SANCHEZ: r'Lilly.*Sanchez',
73
+ LISA_NEW: r'E?Lisa New?\b',
74
+ MANUELA_MARTINEZ: fr'Manuela (- Mega Partners|Martinez)',
75
+ MARIANA_IDZKOWSKA: r'Mariana [Il]d[źi]kowska?',
76
+ MARK_EPSTEIN: r'Mark (L\. )?(Epstein|Lloyd)',
77
+ MARC_LEON: r'Marc[.\s]+(Kensington|Leon)|Kensington2',
78
+ MARTIN_NOWAK: r'(Martin.*?)?No[vw]ak|Nowak, Martin',
79
+ MARTIN_WEINBERG: r'martin.*?weinberg',
80
+ "Matthew Schafer": r"matthew\.?schafer?",
81
+ MELANIE_SPINELLA: r'M?elanie Spine[Il]{2}a',
82
+ MICHAEL_BUCHHOLTZ: r'Michael.*Buchholtz',
83
+ MICHAEL_MILLER: r'Micha(el)? Miller|Miller, Micha(el)?',
84
+ MICHAEL_SITRICK: r'(Mi(chael|ke).{0,5})?[CS]itrick',
85
+ MICHAEL_WOLFF: r'Michael\s*Wol(f[ef]e?|i)|Wolff',
86
+ MIROSLAV_LAJCAK: r"Miro(slav)?(\s+Laj[cč][aá]k)?",
87
+ MOHAMED_WAHEED_HASSAN: r'Mohamed Waheed(\s+Hassan)?',
88
+ NADIA_MARCINKO: r"Na[dď]i?a\s+Marcinko(v[aá])?",
89
+ NEAL_KASSELL: r'Neal\s*Kassell?',
90
+ NICHOLAS_RIBIS: r'Nic(holas|k)[\s._]Ribi?s?|Ribbis',
91
+ OLIVIER_COLOM: fr'Colom, Olivier|{OLIVIER_COLOM}',
92
+ PAUL_BARRETT: r'Paul Barre(d|tt)',
93
+ PAUL_KRASSNER: r'Pa\s?ul Krassner',
94
+ PAUL_MORRIS: r'morris, paul|Paul Morris',
95
+ PAULA: r'^Paula( Heil Fisher)?$',
96
+ PEGGY_SIEGAL: r'Peggy Siegal?',
97
+ PETER_ATTIA: r'Peter Attia?',
98
+ PETER_MANDELSON: r"((Lord|Peter) )?Mandelson",
99
+ 'pink@mc2mm.com': r"^Pink$|pink@mc2mm\.com",
100
+ PRINCE_ANDREW: r'Prince Andrew|The Duke',
101
+ REID_WEINGARTEN: r'Weingarten, Rei[cdi]|Rei[cdi] Weingarten',
102
+ RICHARD_KAHN: r'rich(ard)? kahn?',
103
+ ROBERT_D_CRITTON_JR: r'Robert D.? Critton,? Jr.?',
104
+ ROBERT_LAWRENCE_KUHN: r'Robert\s*(Lawrence)?\s*Kuhn',
105
+ ROBERT_TRIVERS: r'tri[vy]ersr@gmail|Robert\s*Trivers?',
106
+ ROSS_GOW: fr"Ross(acuity)? Gow|(ross@)?acuity\s*reputation(\.com)?",
107
+ SAMUEL_LEFF: r"Sam(uel)?(/Walli)? Leff",
108
+ SCOTT_J_LINK: r'scott j. link?',
109
+ SEAN_BANNON: r'sean bannon?',
110
+ SHAHER_ABDULHAK_BESHER: r'\bShaher( Abdulhak Besher)?\b',
111
+ SOON_YI_PREVIN: r'Soon[- ]Yi Previn?',
112
+ STACEY_RICHMAN: r"srichmanlaw|Stacey\s*Richman",
113
+ STEPHEN_HANSON: r'ste(phen|ve) hanson?|Shanson900',
114
+ STEVE_BANNON: r'steve banno[nr]?',
115
+ STEVEN_SINOFSKY: r'Steven Sinofsky?',
116
+ SULTAN_BIN_SULAYEM: r'Sultan (Ahmed )?bin Sulaye?m?',
117
+ TERJE_ROD_LARSEN: r"Terje(( (R[øo]e?d[- ])?)?Lars[eo]n)?",
118
+ TERRY_KAFKA: r'Terry Kafka?',
119
+ THANU_BOONYAWATANA: r"Thanu (BOONYAWATANA|Cnx)",
120
+ THORBJORN_JAGLAND: r'(Thor.{3,8})?Jag[il]and?',
121
+ TONJA_HADDAD_COLEMAN: r"To(nj|rl)a Haddad Coleman|haddadfm@aol.com",
122
+ VINCENZO_IOZZO: r"Vincenzo [IL]ozzo",
123
+ }
124
+
125
+ # If found as substring consider them the author
126
+ EMAILERS = [
127
+ 'Anne Boyles',
128
+ AL_SECKEL,
129
+ 'Ariane Dwyer',
130
+ AZIZA_ALAHMADI,
131
+ BILL_GATES,
132
+ BILL_SIEGEL,
133
+ 'Bobbi C Sternheim',
134
+ BRAD_WECHSLER,
135
+ BROCK_PIERCE,
136
+ CHRISTINA_GALBRAITH,
137
+ DANIEL_SABBA,
138
+ 'Danny Goldberg',
139
+ DAVID_SCHOEN,
140
+ DEBBIE_FEIN,
141
+ DEEPAK_CHOPRA,
142
+ GLENN_DUBIN,
143
+ GORDON_GETTY,
144
+ 'Jeff Pagliuca',
145
+ 'Kevin Bright',
146
+ 'Jack Lang',
147
+ JACK_SCAROLA,
148
+ JAY_LEFKOWITZ,
149
+ JES_STALEY,
150
+ JOHN_PAGE,
151
+ 'Jokeland',
152
+ JOSCHA_BACH,
153
+ 'Kathleen Ruderman',
154
+ KENNETH_E_MAPP,
155
+ 'Larry Cohen',
156
+ LESLEY_GROFF,
157
+ 'lorraine@mc2mm.com',
158
+ LINDA_STONE,
159
+ 'Lyn Fontanilla',
160
+ MARK_TRAMO,
161
+ MELANIE_WALKER,
162
+ MERWIN_DELA_CRUZ,
163
+ 'Michael Simmons', # Not the only "To:"
164
+ 'middle.east.update@hotmail.com',
165
+ 'Nancy Cain',
166
+ 'Nancy Dahl',
167
+ 'Nancy Portland',
168
+ 'Nathan NYSD Chambers',
169
+ 'Oliver Goodenough',
170
+ 'Paula Speer',
171
+ 'Peter Aldhous',
172
+ 'Peter Green',
173
+ ROGER_SCHANK,
174
+ 'Roy Black',
175
+ STEVEN_PFEIFFER,
176
+ 'Steven Victor MD',
177
+ 'Susan Edelman',
178
+ TOM_BARRACK,
179
+ 'USANYS',
180
+ 'Vahe Stepanian',
181
+ 'Vladimir Yudashkin',
182
+ ]
183
+
184
+ EMAILER_ID_REGEXES = {name: re.compile(pattern, re.IGNORECASE) for name, pattern in EMAILER_ID_PATTERNS.items()}
185
+ EMAILER_REGEXES = deepcopy(EMAILER_ID_REGEXES) # Keep a copy without the simple EMAILERS regexes
186
+
187
+ # Add simple matching regexes for EMAILERS entries to EMAILER_REGEXES
188
+ for emailer in EMAILERS:
189
+ if emailer in EMAILER_REGEXES:
190
+ raise RuntimeError(f"Can't overwrite emailer regex for '{emailer}'")
191
+
192
+ EMAILER_REGEXES[emailer] = re.compile(emailer + '?', re.IGNORECASE) # Last char optional bc OCR sucks
193
+
194
+ SUPPRESS_LOGS_FOR_AUTHORS = [
195
+ 'Multiple Senders Multiple Senders',
196
+ 'Undisclosed recipients:',
197
+ 'undisclosed-recipients:',
198
+ ]
199
+
200
+
201
+ def cleanup_str(_str: str) -> str:
202
+ return BAD_NAME_CHARS_REGEX.sub('', _str.replace(REDACTED, '')).strip().strip('_').strip()
203
+
204
+
205
+ def extract_emailer_names(emailer_str: str) -> list[str]:
206
+ """Return a list of people's names found in `emailer_str` (email author or recipients field)."""
207
+ emailer_str = cleanup_str(emailer_str)
208
+
209
+ if len(emailer_str) == 0:
210
+ return []
211
+
212
+ names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
213
+
214
+ if len(emailer_str) <= 2 or BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
215
+ if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
216
+ logger.warning(f"No emailer found in '{escape_single_quotes(emailer_str)}'")
217
+ else:
218
+ logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
219
+
220
+ return names_found
221
+
222
+ names_found = names_found or [emailer_str]
223
+ return [reverse_first_and_last_names(name) for name in names_found]
@@ -6,13 +6,12 @@ from rich.text import Text
6
6
 
7
7
  from epstein_files.util.constant.names import ANTHONY_SCARAMUCCI, JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN, Name, extract_last_name
8
8
  from epstein_files.util.constant.strings import TIMESTAMP_DIM
9
- from epstein_files.util.data import iso_timestamp
9
+ from epstein_files.util.data import AMERICAN_DATE_FORMAT, iso_timestamp
10
10
  from epstein_files.util.highlighted_group import get_style_for_name
11
11
  from epstein_files.util.logging import logger
12
12
  from epstein_files.util.rich import TEXT_LINK, highlighter
13
13
 
14
14
  EPSTEIN_TEXTERS = ['e:', 'e:jeeitunes@gmail.com']
15
- MSG_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
16
15
  PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
17
16
  UNCERTAIN_SUFFIX = ' (?)'
18
17
 
@@ -54,7 +53,7 @@ class TextMessage:
54
53
  return self.text.startswith('http')
55
54
 
56
55
  def parse_timestamp(self) -> datetime:
57
- return datetime.strptime(self.timestamp_str, MSG_DATE_FORMAT)
56
+ return datetime.strptime(self.timestamp_str, AMERICAN_DATE_FORMAT)
58
57
 
59
58
  def timestamp_txt(self) -> Text:
60
59
  try:
@@ -28,6 +28,24 @@ class JsonFile(OtherFile):
28
28
  include_description_in_summary_panel: ClassVar[bool] = False
29
29
  strip_whitespace: ClassVar[bool] = False
30
30
 
31
+ @property
32
+ def category(self) -> str:
33
+ return JSON
34
+
35
+ @property
36
+ def is_interesting(self):
37
+ return False
38
+
39
+ @property
40
+ def info_txt(self) -> Text | None:
41
+ return Text(DESCRIPTION, style=INFO_STYLE)
42
+
43
+ @property
44
+ def metadata(self) -> Metadata:
45
+ metadata = super().metadata
46
+ metadata['description'] = DESCRIPTION
47
+ return metadata
48
+
31
49
  def __post_init__(self):
32
50
  super().__post_init__()
33
51
 
@@ -36,23 +54,9 @@ class JsonFile(OtherFile):
36
54
 
37
55
  self._set_computed_fields(text=self.json_str())
38
56
 
39
- def category(self) -> str:
40
- return JSON
41
-
42
- def info_txt(self) -> Text | None:
43
- return Text(DESCRIPTION, style=INFO_STYLE)
44
-
45
- def is_interesting(self):
46
- return False
47
-
48
57
  def json_data(self) -> object:
49
58
  with open(self.file_path, encoding='utf-8-sig') as f:
50
59
  return json.load(f)
51
60
 
52
- def metadata(self) -> Metadata:
53
- metadata = super().metadata()
54
- metadata['description'] = DESCRIPTION
55
- return metadata
56
-
57
61
  def json_str(self) -> str:
58
62
  return json.dumps(self.json_data(), indent=4)
@@ -31,38 +31,30 @@ class MessengerLog(Communication):
31
31
  messages: list[TextMessage] = field(default_factory=list)
32
32
  phone_number: str | None = None
33
33
 
34
- def __post_init__(self):
35
- super().__post_init__()
36
- self.messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
37
-
38
- def first_message_at(self, name: Name) -> datetime:
39
- return self.messages_by(name)[0].parse_timestamp()
34
+ @property
35
+ def border_style(self) -> str:
36
+ return self.author_style
40
37
 
38
+ @property
41
39
  def info_txt(self) -> Text | None:
42
40
  num_days_str = days_between_str(self.timestamp, self.messages[-1].parse_timestamp())
43
41
  txt = Text(f"(Covers {num_days_str} starting ", style='dim')
44
- txt.append(self.date_str(), style=TIMESTAMP_STYLE).append(' ')
42
+ txt.append(self.date_str, style=TIMESTAMP_STYLE).append(' ')
45
43
 
46
44
  if not self.author:
47
45
  txt.append('with unknown counterparty')
48
46
  else:
49
- txt.append(GUESSED_MSG if self.is_attribution_uncertain() else CONFIRMED_MSG).append(' ')
50
- txt.append(Text(self.author, style=self.author_style() + ' bold'))
47
+ txt.append(GUESSED_MSG if self.is_attribution_uncertain else CONFIRMED_MSG).append(' ')
48
+ txt.append(Text(self.author, style=self.author_style + ' bold'))
51
49
 
52
50
  if self.phone_number:
53
51
  txt.append(highlighter(f" using the phone number {self.phone_number}"))
54
52
 
55
53
  return txt.append(')')
56
54
 
57
- def last_message_at(self, name: Name) -> datetime:
58
- return self.messages_by(name)[-1].parse_timestamp()
59
-
60
- def messages_by(self, name: Name) -> list[TextMessage]:
61
- """Return all messages by 'name'."""
62
- return [m for m in self.messages if m.author == name]
63
-
55
+ @property
64
56
  def metadata(self) -> Metadata:
65
- metadata = super().metadata()
57
+ metadata = super().metadata
66
58
  metadata.update({'num_messages': len(self.messages)})
67
59
 
68
60
  if self.phone_number:
@@ -70,8 +62,19 @@ class MessengerLog(Communication):
70
62
 
71
63
  return metadata
72
64
 
73
- def _border_style(self) -> str:
74
- return self.author_style()
65
+ def __post_init__(self):
66
+ super().__post_init__()
67
+ self.messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
68
+
69
+ def first_message_at(self, name: Name) -> datetime:
70
+ return self.messages_by(name)[0].parse_timestamp()
71
+
72
+ def last_message_at(self, name: Name) -> datetime:
73
+ return self.messages_by(name)[-1].parse_timestamp()
74
+
75
+ def messages_by(self, name: Name) -> list[TextMessage]:
76
+ """Return all messages by 'name'."""
77
+ return [m for m in self.messages if m.author == name]
75
78
 
76
79
  def _build_message(self, match: re.Match) -> TextMessage:
77
80
  """Turn a regex match into a TextMessage."""
@@ -86,7 +89,7 @@ class MessengerLog(Communication):
86
89
  return TextMessage(
87
90
  author=self.author if (is_phone_number or not author_str) else author_str,
88
91
  author_str=author_str if is_phone_number else '', # Preserve phone numbers
89
- is_id_confirmed=not self.is_attribution_uncertain(),
92
+ is_id_confirmed=not self.is_attribution_uncertain,
90
93
  text=match.group(4).strip(),
91
94
  timestamp_str=match.group(2).strip(),
92
95
  )
@@ -102,25 +105,6 @@ class MessengerLog(Communication):
102
105
 
103
106
  raise RuntimeError(f"{self}: No timestamp found!")
104
107
 
105
- def _set_message_timestamps(self) -> None:
106
- raise NotImplementedError(f"TextMessage.timestamp no longer exists")
107
- last_message: TextMessage | None = None
108
-
109
- for i, message in enumerate(self.messages):
110
- try:
111
- message.timestamp = message.parse_timestamp()
112
- except Exception as e:
113
- msg = f"Failed to parse timestamp for TextMessage {i + 1}, {message}: {e}"
114
-
115
- if i == 0:
116
- message.timestamp = self.timestamp
117
- self.warn(f"{msg}\nit's the first message so using the MessengerLog timestamp property {self.timestamp}")
118
- else:
119
- message.timestamp = last_message.timestamp + timedelta(milliseconds=1)
120
- self.warn(f"{msg}\nadding 1 millisecond to last timestamp {last_message.timestamp}")
121
-
122
- last_message = message
123
-
124
108
  def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
125
109
  yield self.file_info_panel()
126
110
  yield Text('')