epstein-files 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
epstein_files/__init__.py CHANGED
@@ -4,73 +4,30 @@ Reformat Epstein text message files for readability and count email senders.
4
4
  For use with iMessage log files from https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_
5
5
 
6
6
  Install: 'poetry install'
7
- Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT ./generate.py'
7
+ Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT epstein_generate'
8
8
  """
9
9
  from sys import exit
10
10
 
11
11
  from dotenv import load_dotenv
12
12
  load_dotenv()
13
+
14
+ from rich.markup import escape
13
15
  from rich.padding import Padding
16
+ from rich.panel import Panel
14
17
 
18
+ from epstein_files.epstein_files import EpsteinFiles, document_cls
19
+ from epstein_files.documents.document import INFO_PADDING, Document
15
20
  from epstein_files.documents.email import Email
16
- from epstein_files.documents.messenger_log import MessengerLog
17
- from epstein_files.epstein_files import EpsteinFiles, count_by_month
18
21
  from epstein_files.util.constant.html import *
19
22
  from epstein_files.util.constant.names import *
20
- from epstein_files.util.constant.strings import EMAIL_CLASS, MESSENGER_LOG_CLASS
21
- from epstein_files.util.data import dict_sets_to_lists
23
+ from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, TEXT_MSGS_HTML_PATH, make_clean
22
24
  from epstein_files.util.env import args, specified_names
23
- from epstein_files.util.file_helper import GH_PAGES_HTML_PATH, JSON_METADATA_PATH, make_clean
25
+ from epstein_files.util.file_helper import coerce_file_path, extract_file_id
24
26
  from epstein_files.util.logging import logger
25
- from epstein_files.util.rich import *
27
+ from epstein_files.util.output import print_emails, print_json_metadata, print_json_stats, print_text_messages, write_urls
28
+ from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
26
29
  from epstein_files.util.timer import Timer
27
30
 
28
- PRINT_COLOR_KEY_EVERY_N_EMAILS = 150
29
-
30
- # Order matters. Default names to print emails for.
31
- DEFAULT_EMAILERS = [
32
- JEREMY_RUBIN,
33
- AL_SECKEL,
34
- JOI_ITO,
35
- JABOR_Y,
36
- STEVEN_SINOFSKY,
37
- DANIEL_SIAD,
38
- JEAN_LUC_BRUNEL,
39
- STEVEN_HOFFENBERG,
40
- EHUD_BARAK,
41
- MARTIN_NOWAK,
42
- MASHA_DROKOVA,
43
- RENATA_BOLOTOVA,
44
- STEVE_BANNON,
45
- OLIVIER_COLOM,
46
- BORIS_NIKOLIC,
47
- PRINCE_ANDREW,
48
- JIDE_ZEITLIN,
49
- DAVID_STERN,
50
- MOHAMED_WAHEED_HASSAN,
51
- JENNIFER_JACQUET,
52
- None,
53
- ]
54
-
55
- # Order matters. Default names to print tables w/email subject, timestamp, etc for.
56
- # TODO: get rid of this
57
- DEFAULT_EMAILER_TABLES: list[str | None] = [
58
- GHISLAINE_MAXWELL,
59
- LEON_BLACK,
60
- LANDON_THOMAS,
61
- KATHRYN_RUEMMLER,
62
- DARREN_INDYKE,
63
- RICHARD_KAHN,
64
- TYLER_SHEARS,
65
- SULTAN_BIN_SULAYEM,
66
- DEEPAK_CHOPRA,
67
- ARIANE_DE_ROTHSCHILD,
68
- TOM_PRITZKER,
69
- ]
70
-
71
- if len(set(DEFAULT_EMAILERS).intersection(set(DEFAULT_EMAILER_TABLES))) > 0:
72
- raise RuntimeError(f"Some names appear in both DEFAULT_EMAILERS and DEFAULT_EMAILER_TABLES")
73
-
74
31
 
75
32
  def generate_html() -> None:
76
33
  if args.make_clean:
@@ -81,15 +38,7 @@ def generate_html() -> None:
81
38
  epstein_files = EpsteinFiles.get_files(timer)
82
39
 
83
40
  if args.json_metadata:
84
- json_str = epstein_files.json_metadata()
85
-
86
- if args.build:
87
- with open(JSON_METADATA_PATH, 'w') as f:
88
- f.write(json_str)
89
- timer.print_at_checkpoint(f"Wrote {file_size_str(JSON_METADATA_PATH)} to '{JSON_METADATA_PATH}'")
90
- else:
91
- console.print_json(json_str, indent=4, sort_keys=True)
92
-
41
+ print_json_metadata(epstein_files)
93
42
  exit()
94
43
 
95
44
  print_header(epstein_files)
@@ -98,11 +47,11 @@ def generate_html() -> None:
98
47
  exit()
99
48
 
100
49
  if args.output_texts:
101
- _print_text_messages(epstein_files)
50
+ print_text_messages(epstein_files)
102
51
  timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)} text message logs')
103
52
 
104
53
  if args.output_emails:
105
- emails_printed = _print_emails(epstein_files)
54
+ emails_printed = print_emails(epstein_files)
106
55
  timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")
107
56
 
108
57
  if args.output_other_files:
@@ -110,93 +59,79 @@ def generate_html() -> None:
110
59
  timer.print_at_checkpoint(f"Printed {len(files_printed)} other files")
111
60
 
112
61
  # Save output
113
- write_html(GH_PAGES_HTML_PATH)
62
+ write_html(ALL_EMAILS_PATH if args.all_emails else TEXT_MSGS_HTML_PATH)
114
63
  logger.warning(f"Total time: {timer.seconds_since_start_str()}")
115
64
 
116
65
  # JSON stats (mostly used for building pytest checks)
117
66
  if args.json_stats:
118
- console.line(5)
119
- _print_json_stats(epstein_files)
67
+ print_json_stats(epstein_files)
120
68
 
121
69
 
122
- def _print_emails(epstein_files: EpsteinFiles) -> int:
123
- """Returns number of emails printed."""
124
- print_section_header(('Selections from ' if not args.all_emails else '') + 'His Emails')
125
- print_other_site_link(is_header=False)
70
+ def epstein_diff():
71
+ """Diff the cleaned up text of two files."""
72
+ Document.diff_files(args.positional_args)
126
73
 
127
- if len(specified_names) == 0:
128
- epstein_files.print_emailer_counts_table()
129
74
 
130
- emailers_to_print: list[str | None]
131
- emailer_tables: list[str | None] = []
132
- already_printed_emails: list[Email] = []
133
- num_emails_printed_since_last_color_key = 0
75
+ def epstein_search():
76
+ """Search the cleaned up text of the files."""
77
+ _assert_positional_args()
78
+ epstein_files = EpsteinFiles.get_files(use_pickled=True)
134
79
 
135
- if args.all_emails:
136
- console.print('Email conversations are sorted chronologically based on time of the first email.')
137
- emailers_to_print = sorted(epstein_files.all_emailers(), key=lambda e: epstein_files.earliest_email_at(e))
138
- print_numbered_list_of_emailers(emailers_to_print, epstein_files)
139
- else:
140
- emailers_to_print = specified_names if specified_names else DEFAULT_EMAILERS
141
- console.print('Email conversations grouped by counterparty can be found in the order listed below.')
142
- print_numbered_list_of_emailers(emailers_to_print)
143
- console.print("\nAfter that there's tables linking to (but not displaying) all known emails for each of these people:")
144
-
145
- if len(specified_names) > 0:
146
- print_numbered_list_of_emailers(DEFAULT_EMAILER_TABLES)
80
+ for search_term in args.positional_args:
81
+ temp_highlighter = build_highlighter(search_term)
82
+ search_results = epstein_files.docs_matching(search_term, specified_names)
83
+ console.line(2)
84
+ print_panel(f"Found {len(search_results)} documents matching '{search_term}'", padding=(0, 0, 0, 3))
147
85
 
148
- for author in emailers_to_print:
149
- newly_printed_emails = epstein_files.print_emails_for(author)
150
- already_printed_emails.extend(newly_printed_emails)
151
- num_emails_printed_since_last_color_key += len(newly_printed_emails)
86
+ for search_result in search_results:
87
+ console.line()
152
88
 
153
- # Print color key every once in a while
154
- if num_emails_printed_since_last_color_key > PRINT_COLOR_KEY_EVERY_N_EMAILS:
155
- print_color_key()
156
- num_emails_printed_since_last_color_key = 0
89
+ if args.whole_file:
90
+ if isinstance(search_result.document, Email):
91
+ search_result.document.truncation_allowed = False
157
92
 
158
- if not specified_names:
159
- if not args.all_emails:
160
- print_author_header(f"Email Tables for {len(emailer_tables)} Other People", 'white')
93
+ console.print(search_result.document)
94
+ else:
95
+ console.print(search_result.document.description_panel())
161
96
 
162
- for name in DEFAULT_EMAILER_TABLES:
163
- epstein_files.print_emails_table_for(name)
97
+ for matching_line in search_result.lines:
98
+ line_txt = matching_line.__rich__()
99
+ console.print(Padding(temp_highlighter(line_txt), INFO_PADDING), style='gray37')
164
100
 
165
- epstein_files.print_email_device_info()
166
101
 
167
- # Check that all emails were actually printed
168
- if args.all_emails:
169
- email_ids_that_were_printed = set([email.file_id for email in already_printed_emails])
170
- logger.warning(f"Printed {len(already_printed_emails)} emails of {len(email_ids_that_were_printed)} unique file IDs.")
102
+ def epstein_show():
103
+ """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
104
+ _assert_positional_args()
105
+ ids = [extract_file_id(arg) for arg in args.positional_args]
106
+ console.line()
171
107
 
172
- for email in epstein_files.emails:
173
- if email.file_id not in email_ids_that_were_printed and not email.is_duplicate:
174
- logger.warning(f"Failed to print {email.summary()}")
108
+ if args.pickled:
109
+ epstein_files = EpsteinFiles.get_files(use_pickled=True)
110
+ docs = epstein_files.get_documents_by_id(ids)
111
+ else:
112
+ raw_docs = [Document(coerce_file_path(id)) for id in ids]
113
+ docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
175
114
 
176
- logger.warning(f"Rewrote {len(Email.rewritten_header_ids)} headers of {len(epstein_files.emails)} emails")
177
- return len(already_printed_emails)
115
+ for doc in docs:
116
+ console.line()
117
+ console.print(doc)
178
118
 
119
+ if args.raw:
120
+ console.line()
121
+ console.print(Panel(f"*** {doc.url_slug} RAW ***", expand=False, style=doc._border_style()))
122
+ console.print(escape(doc.raw_text()))
179
123
 
180
- def _print_text_messages(epstein_files: EpsteinFiles) -> None:
181
- print_section_header('Text Messages')
182
- print_centered("(conversations are sorted chronologically based on timestamp of first message)\n", style='gray30')
183
- authors: list[str | None] = specified_names if specified_names else [JEFFREY_EPSTEIN]
184
- log_files = epstein_files.imessage_logs_for(authors)
124
+ if isinstance(doc, Email):
125
+ console.line()
126
+ console.print(Panel(f"*** {doc.url_slug} actual_text ***", expand=False, style=doc._border_style()))
127
+ console.print(escape(doc._actual_text()))
185
128
 
186
- for log_file in log_files:
187
- console.print(Padding(log_file))
188
- console.line(2)
189
129
 
190
- epstein_files.print_imessage_summary()
130
+ def epstein_dump_urls() -> None:
131
+ write_urls()
191
132
 
192
133
 
193
- def _print_json_stats(epstein_files: EpsteinFiles) -> None:
194
- console.print(Panel('JSON Stats Dump', expand=True, style='reverse bold'), '\n')
195
- print_json(f"{MESSENGER_LOG_CLASS} Sender Counts", MessengerLog.count_authors(epstein_files.imessage_logs), skip_falsey=True)
196
- print_json(f"{EMAIL_CLASS} Author Counts", epstein_files.email_author_counts, skip_falsey=True)
197
- print_json(f"{EMAIL_CLASS} Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
198
- print_json("Email signature_substitution_countss", epstein_files.email_signature_substitution_counts(), skip_falsey=True)
199
- print_json("email_author_device_signatures", dict_sets_to_lists(epstein_files.email_authors_to_device_signatures))
200
- print_json("email_sent_from_devices", dict_sets_to_lists(epstein_files.email_device_signatures_to_authors))
201
- print_json("email_unknown_recipient_file_ids", epstein_files.email_unknown_recipient_file_ids())
202
- print_json("count_by_month", count_by_month(epstein_files.all_documents()))
134
+ def _assert_positional_args():
135
+ if not args.positional_args:
136
+ console.print(f"\n ERROR: No positional args!\n", style='red1')
137
+ exit(1)
@@ -15,7 +15,7 @@ from epstein_files.util.constant.names import *
15
15
  from epstein_files.util.constant.strings import *
16
16
  from epstein_files.util.constant.urls import *
17
17
  from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
18
- from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_nones
18
+ from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_falsey
19
19
  from epstein_files.util.doc_cfg import EmailCfg, DocCfg, Metadata, TextCfg
20
20
  from epstein_files.util.env import args
21
21
  from epstein_files.util.file_helper import (DOCS_DIR, file_stem_for_id, extract_file_id, file_size,
@@ -159,7 +159,7 @@ class Document:
159
159
  if hint_msg:
160
160
  hints.append(highlighter(Text(hint_msg, style='white dim italic')))
161
161
 
162
- return without_nones(hints)
162
+ return without_falsey(hints)
163
163
 
164
164
  def info_txt(self) -> Text | None:
165
165
  """Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
@@ -255,7 +255,11 @@ class Document:
255
255
  txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
256
256
 
257
257
  txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
258
- txt.append(", ").append(key_value_txt('lines', Text(f"{self.num_lines}", style='cyan')))
258
+ txt.append(", ").append(key_value_txt('lines', self.num_lines))
259
+
260
+ if self.config and self.config.dupe_of_id:
261
+ txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.dupe_of_id, style='magenta')))
262
+
259
263
  return txt
260
264
 
261
265
  def top_lines(self, n: int = 10) -> str:
@@ -352,6 +356,11 @@ class Document:
352
356
  for f in tmpfiles:
353
357
  f.unlink()
354
358
 
359
+ @staticmethod
360
+ def known_author_count(docs: Sequence['Document']) -> int:
361
+ """Count of how many Document objects have an author attribution."""
362
+ return len([doc for doc in docs if doc.author])
363
+
355
364
  @staticmethod
356
365
  def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
357
366
  return sorted(docs, key=lambda doc: doc.sort_key())
@@ -30,7 +30,6 @@ BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communicati
30
30
  DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
31
31
  LINK_LINE_REGEX = re.compile(f"^(> )?htt")
32
32
  QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
33
- REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + ['********************************']
34
33
  REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
35
34
 
36
35
  BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
@@ -39,10 +38,16 @@ TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")
39
38
 
40
39
  SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
41
40
  REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
41
+ IS_JUNK_MAIL = 'is_junk_mail'
42
42
  MAX_CHARS_TO_PRINT = 4000
43
43
  MAX_NUM_HEADER_LINES = 14
44
44
  MAX_QUOTED_REPLIES = 2
45
45
 
46
+ REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
47
+ '********************************',
48
+ 'Begin forwarded message',
49
+ ]
50
+
46
51
  OCR_REPAIRS: dict[str | re.Pattern, str] = {
47
52
  re.compile(r'grnail\.com'): 'gmail.com',
48
53
  re.compile(r"^(From|To)(: )?[_1.]{5,}", re.MULTILINE): rf"\1: {REDACTED}", # Redacted email addresses
@@ -119,6 +124,7 @@ EMAIL_SIGNATURE_REGEXES = {
119
124
  # Invalid for links to EpsteinWeb
120
125
  JUNK_EMAILERS = [
121
126
  'asmallworld@travel.asmallworld.net',
127
+ "digest-noreply@quora.com",
122
128
  'editorialstaff@flipboard.com',
123
129
  'How To Academy',
124
130
  'Jokeland',
@@ -126,9 +132,13 @@ JUNK_EMAILERS = [
126
132
  'Saved by Internet Explorer 11',
127
133
  ]
128
134
 
129
- TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + [
130
- 'Alan S Halperin',
135
+ MAILING_LISTS = [
136
+ INTELLIGENCE_SQUARED,
131
137
  'middle.east.update@hotmail.com',
138
+ ]
139
+
140
+ TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
141
+ 'Alan S Halperin',
132
142
  'Mitchell Bard',
133
143
  'Skip Rimer',
134
144
  ]
@@ -281,7 +291,7 @@ SELF_EMAILS_FILE_IDS = [
281
291
  ]
282
292
 
283
293
  METADATA_FIELDS = [
284
- 'is_junk_mail',
294
+ IS_JUNK_MAIL,
285
295
  'recipients',
286
296
  'sent_from_device',
287
297
  ]
@@ -294,7 +304,6 @@ class Email(Communication):
294
304
  actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
295
305
  config (EmailCfg | None) - manual config for this email (if it exists)
296
306
  header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
297
- is_junk_mail (bool) - True if this is junk mail
298
307
  recipients (list[str | None]) - who this email was sent to
299
308
  sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
300
309
  signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
@@ -302,17 +311,16 @@ class Email(Communication):
302
311
  actual_text: str = field(init=False)
303
312
  config: EmailCfg | None = None
304
313
  header: EmailHeader = field(init=False)
305
- is_junk_mail: bool = False
306
314
  recipients: list[str | None] = field(default_factory=list)
307
315
  sent_from_device: str | None = None
308
316
  signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
317
+ truncation_allowed: bool = True
309
318
 
310
319
  # For logging how many headers we prettified while printing, kind of janky
311
320
  rewritten_header_ids: ClassVar[set[str]] = set([])
312
321
 
313
322
  def __post_init__(self):
314
323
  super().__post_init__()
315
- self.is_junk_mail = self.author in JUNK_EMAILERS
316
324
 
317
325
  if self.config and self.config.recipients:
318
326
  self.recipients = cast(list[str | None], self.config.recipients)
@@ -331,9 +339,17 @@ class Email(Communication):
331
339
  txt = Text("OCR text of email from ", style='grey46').append(self.author_txt).append(' to ')
332
340
  return txt.append(self._recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))
333
341
 
342
+ def is_fwded_article(self) -> bool:
343
+ return bool(self.config and self.config.is_fwded_article)
344
+
345
+ def is_junk_mail(self) -> bool:
346
+ return self.author in JUNK_EMAILERS or self.author in MAILING_LISTS
347
+
334
348
  def metadata(self) -> Metadata:
349
+ local_metadata = asdict(self)
350
+ local_metadata[IS_JUNK_MAIL] = self.is_junk_mail()
335
351
  metadata = super().metadata()
336
- metadata.update({k: v for k, v in asdict(self).items() if v and k in METADATA_FIELDS})
352
+ metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
337
353
  return metadata
338
354
 
339
355
  def subject(self) -> str:
@@ -352,17 +368,18 @@ class Email(Communication):
352
368
  """The text that comes before likely quoted replies and forwards etc."""
353
369
  if self.config and self.config.actual_text is not None:
354
370
  return self.config.actual_text
371
+
372
+ text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
373
+
374
+ if self.config and self.config.fwded_text_after:
375
+ return text.split(self.config.fwded_text_after)[0].strip()
355
376
  elif self.header.num_header_rows == 0:
356
377
  return self.text
357
378
 
358
- text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
359
379
  reply_text_match = REPLY_TEXT_REGEX.search(text)
360
380
  # logger.info(f"Raw text:\n" + self.top_lines(20) + '\n\n')
361
381
  # logger.info(f"With header removed:\n" + text[0:500] + '\n\n')
362
382
 
363
- if self.file_id in ['024624']: # This email starts with "On September 14th"
364
- return text.split('On Tue, May 14')[0].strip()
365
-
366
383
  if reply_text_match:
367
384
  actual_num_chars = len(reply_text_match.group(1))
368
385
  actual_text_pct = f"{(100 * float(actual_num_chars) / len(text)):.1f}%"
@@ -555,6 +572,9 @@ class Email(Communication):
555
572
  self._merge_lines(3, 5)
556
573
  elif self.file_id == '028931':
557
574
  self._merge_lines(3, 6)
575
+ elif self.file_id == '013415':
576
+ for _i in range(2):
577
+ self._merge_lines(4)
558
578
  elif self.file_id in ['033568']:
559
579
  for _i in range(5):
560
580
  self._merge_lines(5)
@@ -637,7 +657,7 @@ class Email(Communication):
637
657
  num_chars = quote_cutoff
638
658
 
639
659
  # Truncate long emails but leave a note explaining what happened w/link to source document
640
- if len(text) > num_chars:
660
+ if len(text) > num_chars and self.truncation_allowed:
641
661
  text = text[0:num_chars]
642
662
  doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
643
663
  trim_note = f"<...trimmed to {num_chars} characters of {self.length}, read the rest at {doc_link_markup}...>"
@@ -4,7 +4,7 @@ from datetime import datetime
4
4
 
5
5
  from rich.text import Text
6
6
 
7
- from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, STEVE_BANNON, UNKNOWN
7
+ from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, CELINA_DUBIN, EVA, STEVE_BANNON, UNKNOWN
8
8
  from epstein_files.util.data import extract_last_name
9
9
  from epstein_files.util.highlighted_group import get_style_for_name
10
10
  from epstein_files.util.logging import logger
@@ -19,17 +19,18 @@ DISPLAY_LAST_NAME_ONLY = [
19
19
  STEVE_BANNON,
20
20
  ]
21
21
 
22
- UNKNOWN_TEXTERS = [
23
- '+16463880059',
24
- '+13108737937',
25
- '+13108802851',
26
- ]
22
+ PHONE_NUMBER_MAPPING = {
23
+ '+19174393646': ANTHONY_SCARAMUCCI,
24
+ '+13109906526': STEVE_BANNON,
25
+ '+16463880059': EVA,
26
+ '+13108737937': CELINA_DUBIN,
27
+ '+13108802851': STEVE_BANNON,
28
+
29
+ }
27
30
 
28
31
  TEXTER_MAPPING = {
29
32
  'e:': JEFFREY_EPSTEIN,
30
33
  'e:jeeitunes@gmail.com': JEFFREY_EPSTEIN,
31
- '+19174393646': ANTHONY_SCARAMUCCI,
32
- '+13109906526': STEVE_BANNON,
33
34
  }
34
35
 
35
36
 
@@ -37,7 +38,7 @@ TEXTER_MAPPING = {
37
38
  class TextMessage:
38
39
  """Class representing a single iMessage text message."""
39
40
  author: str | None
40
- author_str: str = field(init=False)
41
+ author_str: str | None = None
41
42
  id_confirmed: bool = False
42
43
  text: str
43
44
  timestamp_str: str
@@ -47,14 +48,10 @@ class TextMessage:
47
48
 
48
49
  if self.author is None:
49
50
  self.author_str = UNKNOWN
50
- elif self.author in UNKNOWN_TEXTERS:
51
- logger.warning(f"Bad text from '{self.author}': \"{self.text}\"")
52
- self.author_str = self.author
53
- self.author = None # TODO: this shouldn't be happening; we still know the author...
54
51
  elif self.author in DISPLAY_LAST_NAME_ONLY:
55
52
  self.author_str = extract_last_name(self.author)
56
53
  else:
57
- self.author_str = self.author
54
+ self.author_str = self.author_str or self.author
58
55
 
59
56
  if not self.id_confirmed and self.author is not None and self.author != JEFFREY_EPSTEIN:
60
57
  self.author_str = self.author + ' (?)'
@@ -87,7 +84,6 @@ class TextMessage:
87
84
  return msg_txt
88
85
 
89
86
  def __rich__(self) -> Text:
90
- # TODO: Workaround for phone numbers that sucks
91
87
  author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
92
88
  author_txt = Text(self.author_str, style=author_style)
93
89
  timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_STYLE).append(' ')
@@ -44,17 +44,8 @@ class MessengerLog(Communication):
44
44
 
45
45
  def messages(self) -> list[TextMessage]:
46
46
  """Lazily evaluated accessor for self._messages."""
47
- if len(self._messages) == 0:
48
- self._messages = [
49
- TextMessage(
50
- # If the Sender: is redacted that means it's from self.author
51
- author=REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip()) or self.author,
52
- id_confirmed=not self.is_attribution_uncertain(),
53
- text=match.group(4).strip(),
54
- timestamp_str=match.group(2).strip(),
55
- )
56
- for match in MSG_REGEX.finditer(self.text)
57
- ]
47
+ if not self._messages:
48
+ self._messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
58
49
 
59
50
  return self._messages
60
51
 
@@ -70,6 +61,19 @@ class MessengerLog(Communication):
70
61
  def _border_style(self) -> str:
71
62
  return self.author_style
72
63
 
64
+ def _build_message(self, match: re.Match) -> TextMessage:
65
+ """Turn a regex match into a TextMessage."""
66
+ author_str = REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip())
67
+
68
+ # If the Sender: is redacted that means it's from self.author
69
+ return TextMessage(
70
+ author=self.author if (author_str.startswith('+') or not author_str) else author_str,
71
+ author_str=author_str if author_str.startswith('+') else None, # Preserve phone numbers
72
+ id_confirmed=not self.is_attribution_uncertain(),
73
+ text=match.group(4).strip(),
74
+ timestamp_str=match.group(2).strip(),
75
+ )
76
+
73
77
  def _extract_timestamp(self) -> datetime:
74
78
  for match in MSG_REGEX.finditer(self.text):
75
79
  timestamp_str = match.group(2).strip()
@@ -1,7 +1,7 @@
1
1
  import re
2
2
  import logging
3
3
  import warnings
4
- from dataclasses import dataclass
4
+ from dataclasses import asdict, dataclass
5
5
  from datetime import datetime
6
6
 
7
7
  import datefinder
@@ -15,7 +15,7 @@ from rich.text import Text
15
15
  from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, WHITESPACE_REGEX, Document
16
16
  from epstein_files.util.constant.strings import *
17
17
  from epstein_files.util.constants import *
18
- from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg
18
+ from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg, Metadata
19
19
  from epstein_files.util.data import escape_single_quotes, remove_timezone, uniquify
20
20
  from epstein_files.util.file_helper import FILENAME_LENGTH
21
21
  from epstein_files.util.env import args
@@ -83,11 +83,10 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
83
83
  NOBEL_CHARITABLE_TRUST,
84
84
  'Nautilus',
85
85
  'New Yorker',
86
- NYT_ARTICLE,
87
- NYT_COLUMN,
86
+ NYT,
88
87
  PALM_BEACH_CODE_ENFORCEMENT,
89
- PALM_BEACH_DAILY_ARTICLE,
90
- PALM_BEACH_POST_ARTICLE,
88
+ PALM_BEACH_DAILY_NEWS,
89
+ PALM_BEACH_POST,
91
90
  PALM_BEACH_TSV,
92
91
  PALM_BEACH_WATER_COMMITTEE,
93
92
  PAUL_KRASSNER,
@@ -102,6 +101,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
102
101
  SHIMON_POST_ARTICLE,
103
102
  SINGLE_PAGE,
104
103
  STACEY_PLASKETT,
104
+ 'Tatler',
105
105
  TERJE_ROD_LARSEN,
106
106
  TEXT_OF_US_LAW,
107
107
  TRANSLATION,
@@ -113,7 +113,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
113
113
  'U.S. News',
114
114
  'US Office',
115
115
  'Vanity Fair',
116
- VI_DAILY_NEWS_ARTICLE,
116
+ VI_DAILY_NEWS,
117
117
  WAPO,
118
118
  ]
119
119
 
@@ -127,7 +127,7 @@ class OtherFile(Document):
127
127
 
128
128
  if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
129
129
  self.log(f"Creating synthetic config for VI Daily News article...", logging.INFO)
130
- self.config = DocCfg(id=self.file_id, description=VI_DAILY_NEWS_ARTICLE, category=ARTICLE)
130
+ self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
131
131
 
132
132
  def category(self) -> str | None:
133
133
  return self.config and self.config.category
@@ -175,6 +175,11 @@ class OtherFile(Document):
175
175
 
176
176
  return True
177
177
 
178
+ def metadata(self) -> Metadata:
179
+ metadata = super().metadata()
180
+ metadata['is_interesting'] = self.is_interesting()
181
+ return metadata
182
+
178
183
  def preview_text(self) -> str:
179
184
  return WHITESPACE_REGEX.sub(' ', self.text)[0:PREVIEW_CHARS]
180
185