epstein-files 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. epstein_files/__init__.py +75 -135
  2. epstein_files/documents/communication.py +9 -9
  3. epstein_files/documents/document.py +115 -87
  4. epstein_files/documents/email.py +154 -85
  5. epstein_files/documents/emails/email_header.py +7 -6
  6. epstein_files/documents/imessage/text_message.py +3 -2
  7. epstein_files/documents/json_file.py +17 -0
  8. epstein_files/documents/messenger_log.py +62 -3
  9. epstein_files/documents/other_file.py +165 -17
  10. epstein_files/epstein_files.py +128 -169
  11. epstein_files/util/constant/names.py +8 -1
  12. epstein_files/util/constant/output_files.py +29 -0
  13. epstein_files/util/constant/strings.py +27 -0
  14. epstein_files/util/constant/urls.py +25 -9
  15. epstein_files/util/constants.py +1018 -1045
  16. epstein_files/util/data.py +20 -55
  17. epstein_files/util/{file_cfg.py → doc_cfg.py} +121 -43
  18. epstein_files/util/env.py +19 -20
  19. epstein_files/util/file_helper.py +38 -21
  20. epstein_files/util/highlighted_group.py +229 -177
  21. epstein_files/util/logging.py +63 -0
  22. epstein_files/util/output.py +180 -0
  23. epstein_files/util/rich.py +29 -17
  24. epstein_files/util/search_result.py +14 -6
  25. epstein_files/util/timer.py +24 -0
  26. epstein_files/util/word_count.py +2 -1
  27. {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/METADATA +20 -4
  28. epstein_files-1.0.2.dist-info/RECORD +33 -0
  29. epstein_files-1.0.2.dist-info/entry_points.txt +7 -0
  30. epstein_files-1.0.0.dist-info/RECORD +0 -28
  31. {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/LICENSE +0 -0
  32. {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/WHEEL +0 -0
@@ -1,7 +1,6 @@
1
1
  import logging
2
2
  import re
3
- from collections import defaultdict
4
- from dataclasses import dataclass, field
3
+ from dataclasses import asdict, dataclass, field
5
4
  from datetime import datetime
6
5
  from typing import ClassVar, cast
7
6
 
@@ -9,6 +8,7 @@ from dateutil.parser import parse
9
8
  from rich.console import Console, ConsoleOptions, RenderResult
10
9
  from rich.padding import Padding
11
10
  from rich.panel import Panel
11
+ from rich.table import Table
12
12
  from rich.text import Text
13
13
 
14
14
  from epstein_files.documents.communication import Communication
@@ -19,13 +19,14 @@ from epstein_files.util.constant.names import *
19
19
  from epstein_files.util.constant.strings import REDACTED, URL_SIGNIFIERS
20
20
  from epstein_files.util.constants import *
21
21
  from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
22
- remove_timezone, uniquify)
23
- from epstein_files.util.env import logger
22
+ flatten, remove_timezone, uniquify)
23
+ from epstein_files.util.doc_cfg import EmailCfg, Metadata
24
24
  from epstein_files.util.highlighted_group import get_style_for_name
25
+ from epstein_files.util.logging import logger
25
26
  from epstein_files.util.rich import *
26
27
 
27
- BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|L\._|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
28
- BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,])$')
28
+ BAD_FIRST_LINE_REGEX = re.compile(r'^(>>|Grant_Smith066474"eMailContent.htm|LOVE & KISSES)$')
29
+ BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communication|Importance:?\s*High|[iI,•]|i (_ )?i|, [-,]|L\._)$')
29
30
  DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
30
31
  LINK_LINE_REGEX = re.compile(f"^(> )?htt")
31
32
  QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
@@ -39,8 +40,8 @@ TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")
39
40
  SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
40
41
  REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
41
42
  MAX_CHARS_TO_PRINT = 4000
43
+ MAX_NUM_HEADER_LINES = 14
42
44
  MAX_QUOTED_REPLIES = 2
43
- VALID_HEADER_LINES = 14
44
45
 
45
46
  OCR_REPAIRS: dict[str | re.Pattern, str] = {
46
47
  re.compile(r'grnail\.com'): 'gmail.com',
@@ -71,12 +72,15 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
71
72
  'Imps ://': 'https://',
72
73
  re.compile(r'timestopics/people/t/landon jr thomas/inde\n?x\n?\.\n?h\n?tml'): 'timestopics/people/t/landon_jr_thomas/index.html',
73
74
  # Subject lines
74
- r"as Putin Mayhem Tests President's Grip\non GOP": "as Putin Mayhem Tests President's Grip on GOP",
75
- r"avoids testimony from alleged\nvictims": "avoids testimony from alleged victims",
76
- r"but\nwatchdogs say probe is tainted": "watchdogs say probe is tainted",
77
- r"COVER UP SEX ABUSE CRIMES\nBY THE WHITE HOUSE": "COVER UP SEX ABUSE CRIMES BY THE WHITE HOUSE",
78
- r'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
79
- r"War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
75
+ "Arrested in\nInauguration Day Riot": "Arrested in Inauguration Day Riot",
76
+ "as Putin Mayhem Tests President's Grip\non GOP": "as Putin Mayhem Tests President's Grip on GOP",
77
+ "avoids testimony from alleged\nvictims": "avoids testimony from alleged victims",
78
+ "but\nwatchdogs say probe is tainted": "watchdogs say probe is tainted",
79
+ "Christmas comes\nearly for most of macro": "Christmas comes early for most of macro", # 023717
80
+ "but majority still made good\nmoney because": "but majority still made good money because", # 023717
81
+ "COVER UP SEX ABUSE CRIMES\nBY THE WHITE HOUSE": "COVER UP SEX ABUSE CRIMES BY THE WHITE HOUSE",
82
+ 'Priebus, used\nprivate email accounts for': 'Priebus, used private email accounts for',
83
+ "War on the Investigations\nEncircling Him": "War on the Investigations Encircling Him",
80
84
  re.compile(r"deadline re Mr Bradley Edwards vs Mr\s*Jeffrey Epstein", re.I): "deadline re Mr Bradley Edwards vs Mr Jeffrey Epstein",
81
85
  re.compile(r"Following Plea That Implicated Trump -\s*https://www.npr.org/676040070", re.I): "Following Plea That Implicated Trump - https://www.npr.org/676040070",
82
86
  re.compile(r"for Attorney General -\s+Wikisource, the"): r"for Attorney General - Wikisource, the",
@@ -89,9 +93,7 @@ OCR_REPAIRS: dict[str | re.Pattern, str] = {
89
93
  'AVG°': 'AVGO',
90
94
  }
91
95
 
92
- MARTIN_WEINBERG_SIGNATURE_PATTERN = r"Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*)?(\n.*([cC]ell|Office))*"
93
-
94
- EMAIL_SIGNATURES = {
96
+ EMAIL_SIGNATURE_REGEXES = {
95
97
  ARIANE_DE_ROTHSCHILD: re.compile(r"Ensemble.*\nCe.*\ndestinataires.*\nremercions.*\nautorisee.*\nd.*\nLe.*\ncontenues.*\nEdmond.*\nRoth.*\nlo.*\nRoth.*\ninfo.*\nFranc.*\n.2.*", re.I),
96
98
  BARBRO_C_EHNBOM: re.compile(r"Barbro C.? Ehn.*\nChairman, Swedish-American.*\n((Office|Cell|Sweden):.*\n)*(360.*\nNew York.*)?"),
97
99
  DANNY_FROST: re.compile(r"Danny Frost\nDirector.*\nManhattan District.*\n212.*", re.IGNORECASE),
@@ -103,7 +105,7 @@ EMAIL_SIGNATURES = {
103
105
  KEN_JENNE: re.compile(r"Ken Jenne\nRothstein.*\n401 E.*\nFort Lauderdale.*", re.IGNORECASE),
104
106
  LARRY_SUMMERS: re.compile(r"Please direct all scheduling.*\nFollow me on twitter.*\nwww.larrysummers.*", re.IGNORECASE),
105
107
  LAWRENCE_KRAUSS: re.compile(r"Lawrence (M. )?Krauss\n(Director.*\n)?(Co-director.*\n)?Foundation.*\nSchool.*\n(Co-director.*\n)?(and Director.*\n)?Arizona.*(\nResearch.*\nOri.*\n(krauss.*\n)?origins.*)?", re.IGNORECASE),
106
- MARTIN_WEINBERG: re.compile(fr"({MARTIN_WEINBERG_SIGNATURE_PATTERN}\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
108
+ MARTIN_WEINBERG: re.compile(r"(Martin G. Weinberg, Esq.\n20 Park Plaza((, )|\n)Suite 1000\nBoston, MA 02116(\n61.*)?(\n.*([cC]ell|Office))*\n)?This Electronic Message contains.*?contents of this message is.*?prohibited.", re.DOTALL),
107
109
  STEVEN_PFEIFFER: re.compile(r"Steven\nSteven .*\nAssociate.*\nIndependent Filmmaker Project\nMade in NY.*\n30 .*\nBrooklyn.*\n(p:.*\n)?www\.ifp.*", re.IGNORECASE),
108
110
  PETER_MANDELSON: re.compile(r'Disclaimer This email and any attachments to it may be.*?with[ \n]+number(.*?EC4V[ \n]+6BJ)?', re.DOTALL | re.IGNORECASE),
109
111
  PAUL_BARRETT: re.compile(r"Paul Barrett[\n\s]+Alpha Group Capital LLC[\n\s]+(142 W 57th Street, 11th Floor, New York, NY 10019?[\n\s]+)?(al?[\n\s]*)?ALPHA GROUP[\n\s]+CAPITAL"),
@@ -147,7 +149,6 @@ TRUNCATE_TERMS = [
147
149
  'quote from The Colbert Report distinguishes',
148
150
  'co-inventor of the GTX Smart Shoe',
149
151
  'my latest Washington Post column',
150
- 'Whether you donated to Poetry in America through',
151
152
  'supported my humanities work at Harvard',
152
153
  'Calendar of Major Events, Openings, and Fundraisers',
153
154
  'Nuclear Operator Raises Alarm on Crisis',
@@ -181,7 +182,6 @@ TRUNCATE_TERMS = [
181
182
  'We can also discuss single stock and Topix banks',
182
183
  'We are recording unprecedented divergences in falling equity vol',
183
184
  'As previously discussed between you and Ariane',
184
- 'The US trade war against China: The view from Beijing',
185
185
  'no evidence you got the latest so i have sent you just the key message',
186
186
  # Joscha Bach
187
187
  'Cells seem to be mostly indistinguishable (except',
@@ -204,6 +204,8 @@ TRUNCATE_TERMS = [
204
204
  'General Election: Trump vs. Clinton LA Times/USC Tracking',
205
205
  'Location: Quicken Loans Arena in Cleveland, OH',
206
206
  'A friendly discussion about Syria with a former US State Department',
207
+ # Robert Kuhn
208
+ 'The US trade war against China: The view from Beijing',
207
209
  # Tom / Paul Krassner
208
210
  'I forgot to post my cartoon from week before last, about Howard Schultz',
209
211
  # Bannon
@@ -221,23 +223,26 @@ TRUNCATE_TERMS = [
221
223
  'lecture in Heidelberg Oct 14 but they had to cancel',
222
224
  # Nikolic
223
225
  'people from LifeBall',
224
- # Random
225
- 'Little Hodiaki',
226
- "It began with deep worries regarding China's growth path",
227
- 'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
228
226
  # Epstein
229
227
  'David Ben Gurion was asked why he, after 2000',
230
228
  # Lisa New
231
229
  'The raw materials for that period include interviews',
230
+ 'Whether you donated to Poetry in America through',
231
+ # Random
232
+ 'Little Hodiaki',
233
+ "It began with deep worries regarding China's growth path",
234
+ 'https://www.washingtonpost.com/politics/2018/09/04/transcript-phone-call',
232
235
  ]
233
236
 
234
- KRASSNER_RECIPIENTS = uniquify(KRASSNER_MANSON_RECIPIENTS + KRASSNER_024923_RECIPIENTS + KRASSNER_033568_RECIPIENTS)
237
+ # Some Paul Krassner emails have a ton of CCed parties we don't care about
238
+ KRASSNER_RECIPIENTS = uniquify(flatten(ALL_FILE_CONFIGS[id].recipients for id in ['025329', '024923', '033568']))
235
239
 
236
240
  # No point in ever displaying these; their emails show up elsewhere because they're mostly CC recipients
237
241
  USELESS_EMAILERS = IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS + \
238
242
  KRASSNER_RECIPIENTS + \
239
243
  FLIGHT_IN_2012_PEOPLE + [
240
244
  'Alan Rogers', # Random CC
245
+ 'Andrew Friendly', # Presumably some relation of Kelly Friendly
241
246
  'BS Stern', # A random fwd of email we have
242
247
  'Cheryl Kleen', # Single email from Anne Boyles, displayed under Anne Boyles
243
248
  'Connie Zaguirre', # Random CC
@@ -268,24 +273,41 @@ USELESS_EMAILERS = IRAN_NUCLEAR_DEAL_SPAM_EMAIL_RECIPIENTS + \
268
273
  ]
269
274
 
270
275
  # Emails sent by epstein to himself that are just notes
271
- NOTES_TO_SELF = [
276
+ SELF_EMAILS_FILE_IDS = [
272
277
  '026677',
273
- '029752',
278
+ '029752', # TODO: jokeland...
274
279
  '030238',
275
280
  # '033274', # TODO: Epstein's note to self doesn't get printed if we don't set the recipients to [None]
276
281
  ]
277
282
 
283
+ METADATA_FIELDS = [
284
+ 'is_junk_mail',
285
+ 'recipients',
286
+ 'sent_from_device',
287
+ ]
288
+
278
289
 
279
290
  @dataclass
280
291
  class Email(Communication):
292
+ """
293
+ Attributes:
294
+ actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
295
+ config (EmailCfg | None) - manual config for this email (if it exists)
296
+ header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
297
+ is_junk_mail (bool) - True if this is junk mail
298
+ recipients (list[str | None]) - who this email was sent to
299
+ sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
300
+ signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
301
+ """
281
302
  actual_text: str = field(init=False)
303
+ config: EmailCfg | None = None
282
304
  header: EmailHeader = field(init=False)
283
305
  is_junk_mail: bool = False
284
306
  recipients: list[str | None] = field(default_factory=list)
285
307
  sent_from_device: str | None = None
286
- signature_substitution_counts: dict[str, int] = field(default_factory=lambda: defaultdict(int))
308
+ signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
287
309
 
288
- # Just for logging how many headers we rewrote
310
+ # For logging how many headers we prettified while printing, kind of janky
289
311
  rewritten_header_ids: ClassVar[set[str]] = set([])
290
312
 
291
313
  def __post_init__(self):
@@ -298,35 +320,34 @@ class Email(Communication):
298
320
  for recipient in self.header.recipients():
299
321
  self.recipients.extend(self._get_names(recipient))
300
322
 
301
- recipients = [r for r in self.recipients if r != self.author or self.file_id in NOTES_TO_SELF] # Remove self CCs
323
+ # Remove self CCs
324
+ recipients = [r for r in self.recipients if r != self.author or self.file_id in SELF_EMAILS_FILE_IDS]
302
325
  self.recipients = list(set(recipients))
303
- self.text = self._cleaned_up_text()
326
+ self.text = self._prettify_text()
304
327
  self.actual_text = self._actual_text()
305
328
  self.sent_from_device = self._sent_from_device()
306
- logger.debug(f"Constructed {self.description()}")
307
-
308
- def description(self) -> Text:
309
- """One line summary mostly for logging."""
310
- txt = self._description()
311
-
312
- if len(self.recipients) > 0:
313
- txt.append(', ').append(key_value_txt('recipients', self._recipients_txt()))
314
-
315
- return txt.append(CLOSE_PROPERTIES_CHAR)
316
-
317
- def idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
318
- """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
319
- for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
320
- if i >= n:
321
- return match.end() - 1
322
329
 
323
330
  def info_txt(self) -> Text:
324
331
  txt = Text("OCR text of email from ", style='grey46').append(self.author_txt).append(' to ')
325
332
  return txt.append(self._recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))
326
333
 
334
+ def metadata(self) -> Metadata:
335
+ metadata = super().metadata()
336
+ metadata.update({k: v for k, v in asdict(self).items() if v and k in METADATA_FIELDS})
337
+ return metadata
338
+
327
339
  def subject(self) -> str:
328
340
  return self.header.subject or ''
329
341
 
342
+ def summary(self) -> Text:
343
+ """One line summary mostly for logging."""
344
+ txt = self._summary()
345
+
346
+ if len(self.recipients) > 0:
347
+ txt.append(', ').append(key_value_txt('recipients', self._recipients_txt()))
348
+
349
+ return txt.append(CLOSE_PROPERTIES_CHAR)
350
+
330
351
  def _actual_text(self) -> str:
331
352
  """The text that comes before likely quoted replies and forwards etc."""
332
353
  if self.config and self.config.actual_text is not None:
@@ -339,8 +360,8 @@ class Email(Communication):
339
360
  # logger.info(f"Raw text:\n" + self.top_lines(20) + '\n\n')
340
361
  # logger.info(f"With header removed:\n" + text[0:500] + '\n\n')
341
362
 
342
- if self.file_id in ['024624']:
343
- return text
363
+ if self.file_id in ['024624']: # This email starts with "On September 14th"
364
+ return text.split('On Tue, May 14')[0].strip()
344
365
 
345
366
  if reply_text_match:
346
367
  actual_num_chars = len(reply_text_match.group(1))
@@ -355,7 +376,6 @@ class Email(Communication):
355
376
  if field_string not in text:
356
377
  continue
357
378
 
358
- logger.debug(f"'{self.url_slug}': Splitting based on '{field_string.strip()}'")
359
379
  pre_from_text = text.split(field_string)[0]
360
380
  actual_num_chars = len(pre_from_text)
361
381
  actual_text_pct = f"{(100 * float(actual_num_chars) / len(text)):.1f}%"
@@ -377,29 +397,6 @@ class Email(Communication):
377
397
 
378
398
  return style.replace('bold', '').strip()
379
399
 
380
- def _cleaned_up_text(self) -> str:
381
- """Add newline after headers in text if actual header wasn't empty, remove bad lines, etc."""
382
- # Insert line breaks now unless header is broken, in which case we'll do it later after fixing header
383
- text = self.text if self.header.was_initially_empty else _add_line_breaks(self.text)
384
- text = REPLY_REGEX.sub(r'\n\1', text) # Newlines between quoted replies
385
-
386
- for name, signature_regex in EMAIL_SIGNATURES.items():
387
- signature_replacement = f'<...snipped {name.lower()} legal signature...>'
388
- text, num_replaced = signature_regex.subn(signature_replacement, text)
389
- self.signature_substitution_counts[name] += num_replaced
390
-
391
- return collapse_newlines(text).strip()
392
-
393
- def _debug_info(self) -> str:
394
- info = [
395
- f"id={self.file_id}",
396
- f"url_slug={self.url_slug}",
397
- f"file_path='{self.file_path}'",
398
- f"is_local_extract_file={self.is_local_extract_file()}",
399
- ]
400
-
401
- return f" " + "\n ".join(info)
402
-
403
400
  def _extract_author(self) -> None:
404
401
  self._extract_header()
405
402
  super()._extract_author()
@@ -418,9 +415,8 @@ class Email(Communication):
418
415
  if self.header.is_empty():
419
416
  self.header.repair_empty_header(self.lines)
420
417
  else:
421
- msg = f"No header match found in '{self.filename}'! Top lines:\n\n{self.top_lines()}"
422
- log_fxn = logger.info if self.config else logger.warning
423
- log_fxn(msg)
418
+ log_level = logging.INFO if self.config else logging.WARNING
419
+ self.log_top_lines(msg='No email header match found!', level=log_level)
424
420
  self.header = EmailHeader(field_names=[])
425
421
 
426
422
  def _extract_timestamp(self) -> datetime:
@@ -432,7 +428,7 @@ class Email(Communication):
432
428
  if timestamp:
433
429
  return timestamp
434
430
 
435
- searchable_lines = self.lines[0:VALID_HEADER_LINES]
431
+ searchable_lines = self.lines[0:MAX_NUM_HEADER_LINES]
436
432
  searchable_text = '\n'.join(searchable_lines)
437
433
  date_match = DATE_HEADER_REGEX.search(searchable_text)
438
434
 
@@ -442,7 +438,7 @@ class Email(Communication):
442
438
  if timestamp:
443
439
  return timestamp
444
440
 
445
- logger.debug(f"Failed to find timestamp, falling back to parsing {VALID_HEADER_LINES} lines...")
441
+ logger.debug(f"Failed to find timestamp, falling back to parsing {MAX_NUM_HEADER_LINES} lines...")
446
442
 
447
443
  for line in searchable_lines:
448
444
  if not TIMESTAMP_LINE_REGEX.search(line):
@@ -476,6 +472,12 @@ class Email(Communication):
476
472
  names_found = names_found or [emailer_str]
477
473
  return [_reverse_first_and_last_names(name) for name in names_found]
478
474
 
475
+ def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
476
+ """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
477
+ for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
478
+ if i >= n:
479
+ return match.end() - 1
480
+
479
481
  def _merge_lines(self, idx: int, idx2: int | None = None) -> None:
480
482
  """Combine lines numbered 'idx' and 'idx2' into a single line (idx2 defaults to idx + 1)."""
481
483
  idx2 = idx2 if idx2 is not None else (idx + 1)
@@ -490,6 +492,20 @@ class Email(Communication):
490
492
 
491
493
  self._set_computed_fields(lines=lines)
492
494
 
495
+ def _prettify_text(self) -> str:
496
+ """Add newlines before quoted replies and snip signatures."""
497
+ # Insert line breaks now unless header is broken, in which case we'll do it later after fixing header
498
+ text = self.text if self.header.was_initially_empty else _add_line_breaks(self.text)
499
+ text = REPLY_REGEX.sub(r'\n\1', text) # Newlines between quoted replies
500
+
501
+ for name, signature_regex in EMAIL_SIGNATURE_REGEXES.items():
502
+ signature_replacement = f'<...snipped {name.lower()} legal signature...>'
503
+ text, num_replaced = signature_regex.subn(signature_replacement, text)
504
+ self.signature_substitution_counts[name] = self.signature_substitution_counts.get(name, 0)
505
+ self.signature_substitution_counts[name] += num_replaced
506
+
507
+ return collapse_newlines(text).strip()
508
+
493
509
  def _recipients_txt(self) -> Text:
494
510
  """Text object with comma separated colored versions of all recipients."""
495
511
  recipients = [r or UNKNOWN for r in self.recipients] if len(self.recipients) > 0 else [UNKNOWN]
@@ -500,6 +516,14 @@ class Email(Communication):
500
516
  for r in recipients
501
517
  ], join=', ')
502
518
 
519
+ def _remove_line(self, idx: int) -> None:
520
+ """Remove a line from self.lines."""
521
+ num_lines = idx * 2
522
+ self.log_top_lines(num_lines, msg=f'before removal of line {idx}')
523
+ del self.lines[idx]
524
+ self._set_computed_fields(lines=self.lines)
525
+ self.log_top_lines(num_lines, msg=f'after removal of line {idx}')
526
+
503
527
  def _repair(self) -> None:
504
528
  """Repair particularly janky files."""
505
529
  if BAD_FIRST_LINE_REGEX.match(self.lines[0]):
@@ -510,21 +534,37 @@ class Email(Communication):
510
534
 
511
535
  if self.file_id in ['031442']:
512
536
  self._merge_lines(0) # Merge 1st and 2nd rows
513
- elif self.file_id in '021729 029501 029282 030626 031384 033512'.split():
537
+ elif self.file_id in '021729 025790 029282 029501 029889 030626 031384 031428 033097 033512 033583 029498 033583'.split():
514
538
  self._merge_lines(2) # Merge 3rd and 4th rows
515
539
 
516
540
  if self.file_id in ['030626']: # Merge 6th and 7th (now 5th and 6th) rows
517
541
  self._merge_lines(4)
518
- elif self.file_id in ['029976']:
542
+ elif self.file_id == '029889':
543
+ self._merge_lines(2, 5)
544
+ elif self.file_id in ['029498', '031428']:
545
+ self._merge_lines(2, 4)
546
+ elif self.file_id in ['029976', '023067']:
519
547
  self._merge_lines(3) # Merge 4th and 5th rows
520
- elif self.file_id in '026609 029402 032405'.split():
548
+ elif self.file_id in '026609 029402 032405 022695'.split():
521
549
  self._merge_lines(4) # Merge 5th and 6th rows
550
+ elif self.file_id in ['019407', '031980', '030384', '033144', '030999', '033575', '029835', '030381']:
551
+ self._merge_lines(2, 4)
552
+ elif self.file_id in ['029154', '029163']:
553
+ self._merge_lines(2, 5)
554
+ elif self.file_id in ['033228', '032063']:
555
+ self._merge_lines(3, 5)
556
+ elif self.file_id == '028931':
557
+ self._merge_lines(3, 6)
522
558
  elif self.file_id in ['033568']:
523
559
  for _i in range(5):
524
560
  self._merge_lines(5)
525
561
  elif self.file_id in ['025329']:
526
562
  for _i in range(9):
527
563
  self._merge_lines(2)
564
+ elif self.file_id == '033486':
565
+ self._merge_lines(7, 9)
566
+ elif self.file_id == '030299':
567
+ self._merge_lines(7, 10)
528
568
  elif self.file_id == '029977':
529
569
  self._set_computed_fields(text=self.text.replace('Sent 9/28/2012 2:41:02 PM', 'Sent: 9/28/2012 2:41:02 PM'))
530
570
 
@@ -533,6 +573,11 @@ class Email(Communication):
533
573
 
534
574
  self._merge_lines(4)
535
575
  self._merge_lines(2, 4)
576
+ elif self.file_id == '025041':
577
+ self._remove_line(4)
578
+ self._remove_line(4)
579
+ elif self.file_id == '029692':
580
+ self._remove_line(3)
536
581
 
537
582
  if old_text != self.text:
538
583
  self.log(f"Modified text, old:\n\n" + '\n'.join(old_text.split('\n')[0:12]) + '\n', logging.INFO)
@@ -568,21 +613,21 @@ class Email(Communication):
568
613
  self._set_computed_fields(lines=new_lines)
569
614
 
570
615
  def _sent_from_device(self) -> str | None:
571
- """Find any 'Sent from my iPhone' style lines if they exist."""
616
+ """Find any 'Sent from my iPhone' style signature line if it exist in the 'actual_text'."""
572
617
  sent_from_match = SENT_FROM_REGEX.search(self.actual_text)
573
618
 
574
619
  if sent_from_match:
575
620
  sent_from = sent_from_match.group(0)
576
621
  return 'S' + sent_from[1:] if sent_from.startswith('sent') else sent_from
577
622
 
578
- def __rich_console__(self, _console: Console, _options: ConsoleOptions) -> RenderResult:
623
+ def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
579
624
  logger.debug(f"Printing '{self.filename}'...")
580
625
  yield self.file_info_panel()
581
- text = self.text
582
626
  should_rewrite_header = self.header.was_initially_empty and self.header.num_header_rows > 0
583
- quote_cutoff = self.idx_of_nth_quoted_reply(text=text) # Trim if there's many quoted replies
627
+ quote_cutoff = self._idx_of_nth_quoted_reply(text=self.text) # Trim if there's many quoted replies
584
628
  num_chars = MAX_CHARS_TO_PRINT
585
629
  trim_footer_txt = None
630
+ text = self.text
586
631
 
587
632
  if self.file_id in TRUNCATION_LENGTHS:
588
633
  num_chars = TRUNCATION_LENGTHS[self.file_id]
@@ -611,7 +656,7 @@ class Email(Communication):
611
656
 
612
657
  lines += text.split('\n')[num_lines_to_skip:]
613
658
  text = self.header.rewrite_header() + '\n' + '\n'.join(lines)
614
- text = _add_line_breaks(text) # This was skipped when _cleaned_up_text() w/a broken header so we do it now
659
+ text = _add_line_breaks(text) # This was skipped when _prettify_text() w/a broken header so we do it now
615
660
  self.rewritten_header_ids.add(self.file_id)
616
661
 
617
662
  panel_txt = highlighter(text)
@@ -628,6 +673,30 @@ class Email(Communication):
628
673
  if should_rewrite_header:
629
674
  self.log_top_lines(self.header.num_header_rows + 4, f'Original header:', logging.INFO)
630
675
 
676
+ @staticmethod
677
+ def build_table(emails: list['Email'], _author: str | None) -> Table:
678
+ """Turn a set of Email objects into a Table."""
679
+ author = _author or UNKNOWN
680
+
681
+ table = Table(
682
+ title=f"Emails to/from {author} starting {emails[0].timestamp.date()}",
683
+ border_style=get_style_for_name(author, allow_bold=False),
684
+ header_style="bold"
685
+ )
686
+
687
+ table.add_column('From', justify='left')
688
+ table.add_column('Timestamp', justify='center')
689
+ table.add_column('Subject', justify='left', style='honeydew2', min_width=60)
690
+
691
+ for email in emails:
692
+ table.add_row(
693
+ email.author_txt,
694
+ email.epstein_media_link(link_txt=email.timestamp_without_seconds()),
695
+ highlighter(email.subject())
696
+ )
697
+
698
+ return table
699
+
631
700
 
632
701
  def _add_line_breaks(email_text: str) -> str:
633
702
  return EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX.sub(r'\n\1\n', email_text).strip()
@@ -4,8 +4,8 @@ from dataclasses import asdict, dataclass, field
4
4
 
5
5
  from epstein_files.util.constant.strings import AUTHOR, REDACTED
6
6
  from epstein_files.util.constants import ALL_CONFIGS
7
- from epstein_files.util.env import logger
8
- from epstein_files.util.file_cfg import MessageCfg
7
+ from epstein_files.util.doc_cfg import EmailCfg
8
+ from epstein_files.util.logging import logger
9
9
  from epstein_files.util.rich import UNKNOWN
10
10
 
11
11
  FIELD_NAMES = ['From', 'Date', 'Sent', 'Subject']
@@ -21,11 +21,11 @@ EMAIL_PRE_FORWARD_REGEX = re.compile(r"(.{3,2000}?)" + HEADER_REGEX_STR, re.DOTA
21
21
  TIME_REGEX = re.compile(r'^(\d{1,2}/\d{1,2}/\d{2,4}|Thursday|Monday|Tuesday|Wednesday|Friday|Saturday|Sunday).*')
22
22
 
23
23
  BAD_NAME_CHARS_REGEX = re.compile(r"[\"'\[\]*><•]")
24
- BAD_EMAILER_REGEX = re.compile(r'^(>|11111111)|agreed|ok|sexy|rt|re:|fwd:|Multiple Senders|((sent|attachments|subject|importance).*|.*(january|201\d|hysterical|i have|image0|so that people|article 1.?|momminnemummin|These conspiracy theories|your state|undisclosed|www\.theguardian|talk in|it was a|what do|cc:|call (back|me)).*)$', re.IGNORECASE)
24
+ BAD_EMAILER_REGEX = re.compile(r'^(>|11111111)|agreed|ok|sexy|re:|fwd:|Multiple Senders|((sent|attachments|subject|importance).*|.*(january|201\d|hysterical|i have|image0|so that people|article 1.?|momminnemummin|These conspiracy theories|your state|undisclosed|www\.theguardian|talk in|it was a|what do|cc:|call (back|me)).*)$', re.IGNORECASE)
25
25
 
26
26
  CONFIGURED_ACTUAL_TEXTS = [
27
27
  cfg.actual_text for cfg in ALL_CONFIGS
28
- if isinstance(cfg, MessageCfg) and cfg.actual_text is not None
28
+ if isinstance(cfg, EmailCfg) and cfg.actual_text is not None
29
29
  ]
30
30
 
31
31
 
@@ -70,7 +70,7 @@ class EmailHeader:
70
70
  raise RuntimeError(f"Ran out of header rows to check for '{field_name}'")
71
71
 
72
72
  value = email_lines[row_number_to_check]
73
- log_prefix = f"Looks like '{value}' is a mismatch for '{field_name}', "
73
+ log_prefix = f"Looks like '{value}' is a mismatch for '{field_name}'"
74
74
 
75
75
  if field_name == AUTHOR:
76
76
  if value in CONFIGURED_ACTUAL_TEXTS:
@@ -99,7 +99,8 @@ class EmailHeader:
99
99
  setattr(self, field_name, value)
100
100
 
101
101
  self.num_header_rows = len(self.field_names) + num_headers
102
- logger.debug(f"Corrected empty header using {self.num_header_rows} lines to:\n%s\n\nTop lines:\n\n%s", self, '\n'.join(email_lines[0:(num_headers + 1) * 2]))
102
+ log_msg = f"Corrected empty header using {self.num_header_rows} lines to:\n"
103
+ logger.debug(f"{log_msg}{self}\n\nTop lines:\n\n%s", '\n'.join(email_lines[0:(num_headers + 1) * 2]))
103
104
 
104
105
  def rewrite_header(self) -> str:
105
106
  header_fields = {}
@@ -7,7 +7,8 @@ from rich.text import Text
7
7
  from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, STEVE_BANNON, UNKNOWN
8
8
  from epstein_files.util.data import extract_last_name
9
9
  from epstein_files.util.highlighted_group import get_style_for_name
10
- from epstein_files.util.rich import TEXT_LINK, highlighter, logger
10
+ from epstein_files.util.logging import logger
11
+ from epstein_files.util.rich import TEXT_LINK, highlighter
11
12
 
12
13
  MSG_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
13
14
  PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
@@ -55,7 +56,7 @@ class TextMessage:
55
56
  else:
56
57
  self.author_str = self.author
57
58
 
58
- if not self.id_confirmed and self.author is not None:
59
+ if not self.id_confirmed and self.author is not None and self.author != JEFFREY_EPSTEIN:
59
60
  self.author_str = self.author + ' (?)'
60
61
 
61
62
  def timestamp(self) -> datetime:
@@ -1,14 +1,19 @@
1
+ import json
2
+ import logging
1
3
  from dataclasses import dataclass
2
4
  from pathlib import Path
5
+ from typing import ClassVar
3
6
 
4
7
  from rich.text import Text
5
8
 
6
9
  from epstein_files.documents.other_file import OtherFile
10
+ from epstein_files.util.constant.strings import JSON
7
11
 
8
12
 
9
13
  @dataclass
10
14
  class JsonFile(OtherFile):
11
15
  """File containing JSON data."""
16
+ strip_whitespace: ClassVar[bool] = False
12
17
 
13
18
  def __post_init__(self):
14
19
  super().__post_init__()
@@ -16,8 +21,20 @@ class JsonFile(OtherFile):
16
21
  if self.url_slug.endswith('.txt') or self.url_slug.endswith('.json'):
17
22
  self.url_slug = Path(self.url_slug).stem
18
23
 
24
+ self._set_computed_fields(text=self.formatted_json())
25
+
26
+ def category(self) -> str:
27
+ return JSON
28
+
29
+ def formatted_json(self) -> str:
30
+ return json.dumps(self.json_data(), indent=4)
31
+
19
32
  def info_txt(self) -> Text | None:
20
33
  return Text(f"JSON file, possibly iMessage or similar app metadata", style='white dim italic')
21
34
 
22
35
  def is_interesting(self):
23
36
  return False
37
+
38
+ def json_data(self) -> object:
39
+ with open(self.file_path, encoding='utf-8-sig') as f:
40
+ return json.load(f)