epstein-files 1.0.2.tar.gz → 1.0.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {epstein_files-1.0.2 → epstein_files-1.0.3}/PKG-INFO +14 -4
  2. {epstein_files-1.0.2 → epstein_files-1.0.3}/README.md +12 -3
  3. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/__init__.py +4 -1
  4. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/documents/document.py +7 -2
  5. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/documents/email.py +33 -13
  6. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/documents/imessage/text_message.py +11 -15
  7. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/documents/messenger_log.py +15 -11
  8. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/documents/other_file.py +13 -8
  9. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/epstein_files.py +21 -15
  10. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/constant/names.py +19 -23
  11. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/constant/strings.py +8 -2
  12. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/constant/urls.py +1 -0
  13. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/constants.py +194 -116
  14. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/data.py +1 -1
  15. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/doc_cfg.py +5 -4
  16. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/env.py +3 -2
  17. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/highlighted_group.py +30 -25
  18. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/logging.py +1 -0
  19. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/output.py +8 -9
  20. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/rich.py +6 -1
  21. {epstein_files-1.0.2 → epstein_files-1.0.3}/pyproject.toml +2 -1
  22. {epstein_files-1.0.2 → epstein_files-1.0.3}/LICENSE +0 -0
  23. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/documents/communication.py +0 -0
  24. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/documents/emails/email_header.py +0 -0
  25. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/documents/json_file.py +0 -0
  26. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/constant/common_words.py +0 -0
  27. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/constant/html.py +0 -0
  28. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/constant/output_files.py +0 -0
  29. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/file_helper.py +0 -0
  30. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/search_result.py +0 -0
  31. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/timer.py +0 -0
  32. {epstein_files-1.0.2 → epstein_files-1.0.3}/epstein_files/util/word_count.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: epstein-files
- Version: 1.0.2
+ Version: 1.0.3
  Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
  Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
  License: GPL-3.0-or-later
@@ -25,6 +25,7 @@ Requires-Dist: requests (>=2.32.5,<3.0.0)
  Requires-Dist: rich (>=14.2.0,<15.0.0)
  Project-URL: Emails, https://michelcrypt4d4mus.github.io/epstein_text_messages/all_emails_epstein_files_nov_2025.html
  Project-URL: Metadata, https://michelcrypt4d4mus.github.io/epstein_text_messages/file_metadata_epstein_files_nov_2025.json
+ Project-URL: Repository, https://github.com/michelcrypt4d4mus/epstein_text_messages
  Project-URL: TextMessages, https://michelcrypt4d4mus.github.io/epstein_text_messages
  Project-URL: WordCounts, https://michelcrypt4d4mus.github.io/epstein_text_messages/communication_word_count_epstein_files_nov_2025.html
  Description-Content-Type: text/markdown
@@ -46,11 +47,20 @@ Description-Content-Type: text/markdown
  You need to set the `DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:

  ```bash
- DOCS_DIR=/path/to/epstein/ocr_txt_files ./generate.py
+ # Generate color highlighted texts/emails/other files
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate
+
+ # Search
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_search Bannon
+
+ # Show a color highlighted file
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show 030999
+ # This also works
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show HOUSE_OVERSIGHT_030999
  ```

- Run `./generate.py --help` for command line option assistance. Look in the [scripts](./scripts/) folder for various scripts.
- The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc. Once you've run things once you can run the `./generate.py --pickled` to load the cached fixed up data and things will be quick.
+ Run `epstein_generate --help` for command line option assistance.
+ The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc. Once you've run things once you can run the `epstein_generate --pickled` to load the cached fixed up data and things will be quick.

  #### As A Library
  ```python
@@ -15,11 +15,20 @@
  You need to set the `DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:

  ```bash
- DOCS_DIR=/path/to/epstein/ocr_txt_files ./generate.py
+ # Generate color highlighted texts/emails/other files
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate
+
+ # Search
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_search Bannon
+
+ # Show a color highlighted file
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show 030999
+ # This also works
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show HOUSE_OVERSIGHT_030999
  ```

- Run `./generate.py --help` for command line option assistance. Look in the [scripts](./scripts/) folder for various scripts.
- The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc. Once you've run things once you can run the `./generate.py --pickled` to load the cached fixed up data and things will be quick.
+ Run `epstein_generate --help` for command line option assistance.
+ The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc. Once you've run things once you can run the `epstein_generate --pickled` to load the cached fixed up data and things will be quick.

  #### As A Library
  ```python
@@ -4,7 +4,7 @@ Reformat Epstein text message files for readability and count email senders.
  For use with iMessage log files from https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_

  Install: 'poetry install'
- Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT ./generate.py'
+ Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT epstein_generate'
  """
  from sys import exit

@@ -87,6 +87,9 @@ def epstein_search():
  console.line()

  if args.whole_file:
+ if isinstance(search_result.document, Email):
+ search_result.document.truncation_allowed = False
+
  console.print(search_result.document)
  else:
  console.print(search_result.document.description_panel())
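Note: the `whole_file` branch above works together with the new `truncation_allowed` field this release adds to `Email` (see the email.py hunks below, where the truncation guard becomes `if len(text) > num_chars and self.truncation_allowed:`). A minimal, self-contained sketch of that interaction, using an illustrative stand-in class rather than the package's real `Email`:

```python
# Illustrative stand-in, not the package's Email class: shows how a
# truncation_allowed flag gates output trimming the way the new field does.
from dataclasses import dataclass

MAX_CHARS_TO_PRINT = 4000  # same cutoff constant that appears in email.py


@dataclass
class PrintableDoc:
    text: str
    truncation_allowed: bool = True  # default behavior: long docs get trimmed

    def printable_text(self, num_chars: int = MAX_CHARS_TO_PRINT) -> str:
        # Only trim when the caller has not opted out (epstein_search disables
        # the flag before printing when args.whole_file is set).
        if len(self.text) > num_chars and self.truncation_allowed:
            return self.text[:num_chars] + f"<...trimmed to {num_chars} characters...>"
        return self.text


doc = PrintableDoc(text="x" * 10_000)
print(len(doc.printable_text()))   # trimmed: cutoff plus the trim note
doc.truncation_allowed = False     # what the whole_file branch above does
print(len(doc.printable_text()))   # full 10,000 characters
```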
@@ -15,7 +15,7 @@ from epstein_files.util.constant.names import *
  from epstein_files.util.constant.strings import *
  from epstein_files.util.constant.urls import *
  from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
- from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_nones
+ from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_falsey
  from epstein_files.util.doc_cfg import EmailCfg, DocCfg, Metadata, TextCfg
  from epstein_files.util.env import args
  from epstein_files.util.file_helper import (DOCS_DIR, file_stem_for_id, extract_file_id, file_size,
@@ -159,7 +159,7 @@ class Document:
  if hint_msg:
  hints.append(highlighter(Text(hint_msg, style='white dim italic')))

- return without_nones(hints)
+ return without_falsey(hints)

  def info_txt(self) -> Text | None:
  """Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
@@ -356,6 +356,11 @@ class Document:
  for f in tmpfiles:
  f.unlink()

+ @staticmethod
+ def known_author_count(docs: Sequence['Document']) -> int:
+ """Count of how many Document objects have an author attribution."""
+ return len([doc for doc in docs if doc.author])
+
  @staticmethod
  def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
  return sorted(docs, key=lambda doc: doc.sort_key())
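The new `Document.known_author_count()` helper centralizes a count that epstein_files.py previously computed inline (both `len([d for d in docs if d.author])` in the summary table and `identified_imessage_log_count()` are replaced by it further down). A small sketch with a hypothetical stand-in dataclass, just to show the behavior:

```python
# Hypothetical stand-in for Document, only to illustrate known_author_count().
from dataclasses import dataclass
from typing import Optional, Sequence


@dataclass
class Doc:
    author: Optional[str] = None

    @staticmethod
    def known_author_count(docs: Sequence["Doc"]) -> int:
        """Count how many docs have a non-empty author attribution."""
        return len([doc for doc in docs if doc.author])


docs = [Doc("Steve Bannon"), Doc(None), Doc("Anthony Scaramucci")]
print(Doc.known_author_count(docs))  # 2
```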
@@ -30,7 +30,6 @@ BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communicati
  DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
  LINK_LINE_REGEX = re.compile(f"^(> )?htt")
  QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
- REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + ['********************************']
  REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)

  BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
@@ -39,10 +38,16 @@ TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")

  SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
  REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
+ IS_JUNK_MAIL = 'is_junk_mail'
  MAX_CHARS_TO_PRINT = 4000
  MAX_NUM_HEADER_LINES = 14
  MAX_QUOTED_REPLIES = 2

+ REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
+ '********************************',
+ 'Begin forwarded message',
+ ]
+
  OCR_REPAIRS: dict[str | re.Pattern, str] = {
  re.compile(r'grnail\.com'): 'gmail.com',
  re.compile(r"^(From|To)(: )?[_1.]{5,}", re.MULTILINE): rf"\1: {REDACTED}", # Redacted email addresses
@@ -119,6 +124,7 @@ EMAIL_SIGNATURE_REGEXES = {
  # Invalid for links to EpsteinWeb
  JUNK_EMAILERS = [
  'asmallworld@travel.asmallworld.net',
+ "digest-noreply@quora.com",
  'editorialstaff@flipboard.com',
  'How To Academy',
  'Jokeland',
@@ -126,9 +132,13 @@ JUNK_EMAILERS = [
  'Saved by Internet Explorer 11',
  ]

- TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + [
- 'Alan S Halperin',
+ MAILING_LISTS = [
+ INTELLIGENCE_SQUARED,
  'middle.east.update@hotmail.com',
+ ]
+
+ TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
+ 'Alan S Halperin',
  'Mitchell Bard',
  'Skip Rimer',
  ]
@@ -281,7 +291,7 @@ SELF_EMAILS_FILE_IDS = [
  ]

  METADATA_FIELDS = [
- 'is_junk_mail',
+ IS_JUNK_MAIL,
  'recipients',
  'sent_from_device',
  ]
@@ -294,7 +304,6 @@ class Email(Communication):
  actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
  config (EmailCfg | None) - manual config for this email (if it exists)
  header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
- is_junk_mail (bool) - True if this is junk mail
  recipients (list[str | None]) - who this email was sent to
  sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
  signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
@@ -302,17 +311,16 @@ class Email(Communication):
  actual_text: str = field(init=False)
  config: EmailCfg | None = None
  header: EmailHeader = field(init=False)
- is_junk_mail: bool = False
  recipients: list[str | None] = field(default_factory=list)
  sent_from_device: str | None = None
  signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
+ truncation_allowed: bool = True

  # For logging how many headers we prettified while printing, kind of janky
  rewritten_header_ids: ClassVar[set[str]] = set([])

  def __post_init__(self):
  super().__post_init__()
- self.is_junk_mail = self.author in JUNK_EMAILERS

  if self.config and self.config.recipients:
  self.recipients = cast(list[str | None], self.config.recipients)
@@ -331,9 +339,17 @@ class Email(Communication):
  txt = Text("OCR text of email from ", style='grey46').append(self.author_txt).append(' to ')
  return txt.append(self._recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))

+ def is_fwded_article(self) -> bool:
+ return bool(self.config and self.config.is_fwded_article)
+
+ def is_junk_mail(self) -> bool:
+ return self.author in JUNK_EMAILERS or self.author in MAILING_LISTS
+
  def metadata(self) -> Metadata:
+ local_metadata = asdict(self)
+ local_metadata[IS_JUNK_MAIL] = self.is_junk_mail()
  metadata = super().metadata()
- metadata.update({k: v for k, v in asdict(self).items() if v and k in METADATA_FIELDS})
+ metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
  return metadata

  def subject(self) -> str:
@@ -352,17 +368,18 @@ class Email(Communication):
  """The text that comes before likely quoted replies and forwards etc."""
  if self.config and self.config.actual_text is not None:
  return self.config.actual_text
+
+ text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
+
+ if self.config and self.config.fwded_text_after:
+ return text.split(self.config.fwded_text_after)[0].strip()
  elif self.header.num_header_rows == 0:
  return self.text

- text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
  reply_text_match = REPLY_TEXT_REGEX.search(text)
  # logger.info(f"Raw text:\n" + self.top_lines(20) + '\n\n')
  # logger.info(f"With header removed:\n" + text[0:500] + '\n\n')

- if self.file_id in ['024624']: # This email starts with "On September 14th"
- return text.split('On Tue, May 14')[0].strip()
-
  if reply_text_match:
  actual_num_chars = len(reply_text_match.group(1))
  actual_text_pct = f"{(100 * float(actual_num_chars) / len(text)):.1f}%"
@@ -555,6 +572,9 @@ class Email(Communication):
  self._merge_lines(3, 5)
  elif self.file_id == '028931':
  self._merge_lines(3, 6)
+ elif self.file_id == '013415':
+ for _i in range(2):
+ self._merge_lines(4)
  elif self.file_id in ['033568']:
  for _i in range(5):
  self._merge_lines(5)
@@ -637,7 +657,7 @@ class Email(Communication):
  num_chars = quote_cutoff

  # Truncate long emails but leave a note explaining what happened w/link to source document
- if len(text) > num_chars:
+ if len(text) > num_chars and self.truncation_allowed:
  text = text[0:num_chars]
  doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
  trim_note = f"<...trimmed to {num_chars} characters of {self.length}, read the rest at {doc_link_markup}...>"
@@ -4,7 +4,7 @@ from datetime import datetime

  from rich.text import Text

- from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, STEVE_BANNON, UNKNOWN
+ from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, CELINA_DUBIN, EVA, STEVE_BANNON, UNKNOWN
  from epstein_files.util.data import extract_last_name
  from epstein_files.util.highlighted_group import get_style_for_name
  from epstein_files.util.logging import logger
@@ -19,17 +19,18 @@ DISPLAY_LAST_NAME_ONLY = [
  STEVE_BANNON,
  ]

- UNKNOWN_TEXTERS = [
- '+16463880059',
- '+13108737937',
- '+13108802851',
- ]
+ PHONE_NUMBER_MAPPING = {
+ '+19174393646': ANTHONY_SCARAMUCCI,
+ '+13109906526': STEVE_BANNON,
+ '+16463880059': EVA,
+ '+13108737937': CELINA_DUBIN,
+ '+13108802851': STEVE_BANNON,
+
+ }

  TEXTER_MAPPING = {
  'e:': JEFFREY_EPSTEIN,
  'e:jeeitunes@gmail.com': JEFFREY_EPSTEIN,
- '+19174393646': ANTHONY_SCARAMUCCI,
- '+13109906526': STEVE_BANNON,
  }

@@ -37,7 +38,7 @@ TEXTER_MAPPING = {
  class TextMessage:
  """Class representing a single iMessage text message."""
  author: str | None
- author_str: str = field(init=False)
+ author_str: str | None = None
  id_confirmed: bool = False
  text: str
  timestamp_str: str
@@ -47,14 +48,10 @@

  if self.author is None:
  self.author_str = UNKNOWN
- elif self.author in UNKNOWN_TEXTERS:
- logger.warning(f"Bad text from '{self.author}': \"{self.text}\"")
- self.author_str = self.author
- self.author = None # TODO: this shouldn't be happening; we still know the author...
  elif self.author in DISPLAY_LAST_NAME_ONLY:
  self.author_str = extract_last_name(self.author)
  else:
- self.author_str = self.author
+ self.author_str = self.author_str or self.author

  if not self.id_confirmed and self.author is not None and self.author != JEFFREY_EPSTEIN:
  self.author_str = self.author + ' (?)'
@@ -87,7 +84,6 @@ class TextMessage:
  return msg_txt

  def __rich__(self) -> Text:
- # TODO: Workaround for phone numbers that sucks
  author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
  author_txt = Text(self.author_str, style=author_style)
  timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_STYLE).append(' ')
@@ -44,17 +44,8 @@ class MessengerLog(Communication):

  def messages(self) -> list[TextMessage]:
  """Lazily evaluated accessor for self._messages."""
- if len(self._messages) == 0:
- self._messages = [
- TextMessage(
- # If the Sender: is redacted that means it's from self.author
- author=REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip()) or self.author,
- id_confirmed=not self.is_attribution_uncertain(),
- text=match.group(4).strip(),
- timestamp_str=match.group(2).strip(),
- )
- for match in MSG_REGEX.finditer(self.text)
- ]
+ if not self._messages:
+ self._messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]

  return self._messages

@@ -70,6 +61,19 @@ class MessengerLog(Communication):
  def _border_style(self) -> str:
  return self.author_style

+ def _build_message(self, match: re.Match) -> TextMessage:
+ """Turn a regex match into a TextMessage."""
+ author_str = REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip())
+
+ # If the Sender: is redacted that means it's from self.author
+ return TextMessage(
+ author=self.author if (author_str.startswith('+') or not author_str) else author_str,
+ author_str=author_str if author_str.startswith('+') else None, # Preserve phone numbers
+ id_confirmed=not self.is_attribution_uncertain(),
+ text=match.group(4).strip(),
+ timestamp_str=match.group(2).strip(),
+ )
+
  def _extract_timestamp(self) -> datetime:
  for match in MSG_REGEX.finditer(self.text):
  timestamp_str = match.group(2).strip()
@@ -1,7 +1,7 @@
  import re
  import logging
  import warnings
- from dataclasses import dataclass
+ from dataclasses import asdict, dataclass
  from datetime import datetime

  import datefinder
@@ -15,7 +15,7 @@ from rich.text import Text
  from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, WHITESPACE_REGEX, Document
  from epstein_files.util.constant.strings import *
  from epstein_files.util.constants import *
- from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg
+ from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg, Metadata
  from epstein_files.util.data import escape_single_quotes, remove_timezone, uniquify
  from epstein_files.util.file_helper import FILENAME_LENGTH
  from epstein_files.util.env import args
@@ -83,11 +83,10 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
  NOBEL_CHARITABLE_TRUST,
  'Nautilus',
  'New Yorker',
- NYT_ARTICLE,
- NYT_COLUMN,
+ NYT,
  PALM_BEACH_CODE_ENFORCEMENT,
- PALM_BEACH_DAILY_ARTICLE,
- PALM_BEACH_POST_ARTICLE,
+ PALM_BEACH_DAILY_NEWS,
+ PALM_BEACH_POST,
  PALM_BEACH_TSV,
  PALM_BEACH_WATER_COMMITTEE,
  PAUL_KRASSNER,
@@ -102,6 +101,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
  SHIMON_POST_ARTICLE,
  SINGLE_PAGE,
  STACEY_PLASKETT,
+ 'Tatler',
  TERJE_ROD_LARSEN,
  TEXT_OF_US_LAW,
  TRANSLATION,
@@ -113,7 +113,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
  'U.S. News',
  'US Office',
  'Vanity Fair',
- VI_DAILY_NEWS_ARTICLE,
+ VI_DAILY_NEWS,
  WAPO,
  ]

@@ -127,7 +127,7 @@ class OtherFile(Document):

  if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
  self.log(f"Creating synthetic config for VI Daily News article...", logging.INFO)
- self.config = DocCfg(id=self.file_id, description=VI_DAILY_NEWS_ARTICLE, category=ARTICLE)
+ self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')

  def category(self) -> str | None:
  return self.config and self.config.category
@@ -175,6 +175,11 @@ class OtherFile(Document):

  return True

+ def metadata(self) -> Metadata:
+ metadata = super().metadata()
+ metadata['is_interesting'] = self.is_interesting()
+ return metadata
+
  def preview_text(self) -> str:
  return WHITESPACE_REGEX.sub(' ', self.text)[0:PREVIEW_CHARS]

@@ -21,11 +21,11 @@ from epstein_files.documents.messenger_log import MSG_REGEX, MessengerLog
  from epstein_files.documents.other_file import OtherFile
  from epstein_files.util.constant.output_files import PICKLED_PATH
  from epstein_files.util.constant.strings import *
- from epstein_files.util.constant.urls import (EPSTEIN_WEB, JMAIL, epsteinify_name_url, epstein_web_person_url,
- search_jmail_url, search_twitter_url)
+ from epstein_files.util.constant.urls import (EPSTEIN_MEDIA, EPSTEIN_WEB, JMAIL, epstein_media_person_url,
+ epsteinify_name_url, epstein_web_person_url, search_jmail_url, search_twitter_url)
  from epstein_files.util.constants import *
  from epstein_files.util.data import dict_sets_to_lists, json_safe, sort_dict
- from epstein_files.util.doc_cfg import EmailCfg
+ from epstein_files.util.doc_cfg import EmailCfg, Metadata
  from epstein_files.util.env import args, logger
  from epstein_files.util.file_helper import DOCS_DIR, file_size_str
  from epstein_files.util.highlighted_group import get_info_for_name, get_style_for_name
@@ -68,6 +68,7 @@ class EpsteinFiles:
  """Iterate through files and build appropriate objects."""
  self.all_files = [f for f in DOCS_DIR.iterdir() if f.is_file() and not f.name.startswith('.')]
  documents = []
+ file_type_count = defaultdict(int)

  # Read through and classify all the files
  for file_arg in self.all_files:
@@ -75,12 +76,13 @@
  document = Document(file_arg)

  if document.length == 0:
- logger.warning(f"Skipping empty file: {document}")
+ logger.warning(f"Skipping empty file: {document}]")
  continue

  cls = document_cls(document)
  documents.append(cls(file_arg, text=document.text))
  logger.info(str(documents[-1]))
+ file_type_count[cls.__name__] += 1

  if doc_timer.seconds_since_start() > SLOW_FILE_SECONDS:
  doc_timer.print_at_checkpoint(f"Slow file: {documents[-1]} processed")
@@ -195,15 +197,13 @@ class EpsteinFiles:
  def imessage_logs_for(self, author: str | None | list[str | None]) -> Sequence[MessengerLog]:
  return MessengerLog.logs_for(author, self.imessage_logs)

- def identified_imessage_log_count(self) -> int:
- return len([log for log in self.imessage_logs if log.author])
-
  def json_metadata(self) -> str:
+ """Create a JSON string containing metadata for all the files."""
  metadata = {
- EMAIL_CLASS: [json_safe(d.metadata()) for d in self.emails],
- JSON_FILE_CLASS: [json_safe(d.metadata()) for d in self.json_files],
- MESSENGER_LOG_CLASS: [json_safe(d.metadata()) for d in self.imessage_logs],
- OTHER_FILE_CLASS: [json_safe(d.metadata()) for d in self.other_files if not isinstance(d, JsonFile)],
+ EMAIL_CLASS: _sorted_metadata(self.emails),
+ JSON_FILE_CLASS: _sorted_metadata(self.json_files),
+ MESSENGER_LOG_CLASS: _sorted_metadata(self.imessage_logs),
+ OTHER_FILE_CLASS: _sorted_metadata(self.non_json_other_files()),
  }

  return json.dumps(metadata, indent=4, sort_keys=True)
@@ -216,7 +216,7 @@ class EpsteinFiles:
  add_cols_to_table(table, ['File Type', 'Files', 'Author Known', 'Author Unknown', 'Duplicates'])

  def add_row(label: str, docs: list):
- known = None if isinstance(docs[0], JsonFile) else len([d for d in docs if d.author])
+ known = None if isinstance(docs[0], JsonFile) else Document.known_author_count(docs)

  table.add_row(
  label,
@@ -274,7 +274,7 @@ class EpsteinFiles:
  def print_emailer_counts_table(self) -> None:
  footer = f"Identified authors of {self.attributed_email_count():,} emails out of {len(self.emails):,}."
  counts_table = Table(title=f"Email Counts", caption=footer, header_style="bold")
- add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_WEB, 'Twitter'])
+ add_cols_to_table(counts_table, ['Name', 'Count', 'Sent', "Recv'd", JMAIL, EPSTEIN_MEDIA, EPSTEIN_WEB, 'Twitter'])

  emailer_counts = {
  emailer: self.email_author_counts[emailer] + self.email_recipient_counts[emailer]
@@ -290,7 +290,8 @@ class EpsteinFiles:
  str(self.email_author_counts[p]),
  str(self.email_recipient_counts[p]),
  '' if p is None else link_text_obj(search_jmail_url(p), JMAIL),
- '' if not is_ok_for_epstein_web(p) else link_text_obj(epstein_web_person_url(p), EPSTEIN_WEB.lower()),
+ '' if not is_ok_for_epstein_web(p) else link_text_obj(epstein_media_person_url(p), EPSTEIN_MEDIA),
+ '' if not is_ok_for_epstein_web(p) else link_text_obj(epstein_web_person_url(p), EPSTEIN_WEB),
  '' if p is None else link_text_obj(search_twitter_url(p), 'search X'),
  )

@@ -299,7 +300,7 @@ class EpsteinFiles:
  def print_imessage_summary(self) -> None:
  """Print summary table and stats for text messages."""
  console.print(MessengerLog.summary_table(self.imessage_logs))
- text_summary_msg = f"\nDeanonymized {self.identified_imessage_log_count()} of "
+ text_summary_msg = f"\nDeanonymized {Document.known_author_count(self.imessage_logs)} of "
  text_summary_msg += f"{len(self.imessage_logs)} {TEXT_MESSAGE} logs found in {len(self.all_files):,} files."
  console.print(text_summary_msg)
  imessage_msg_count = sum([len(log.messages()) for log in self.imessage_logs])
@@ -394,3 +395,8 @@ def is_ok_for_epstein_web(name: str | None) -> bool:
  return False

  return True
+
+
+ def _sorted_metadata(docs: Sequence[Document]) -> list[Metadata]:
+ docs_sorted_by_id = sorted(docs, key=lambda d: d.file_id)
+ return [json_safe(d.metadata()) for d in docs_sorted_by_id]
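The `_sorted_metadata()` helper added above sorts each document group by `file_id` before serializing, presumably so the generated metadata JSON is stable across runs. A small sketch of the flow with stand-in types (not the package's real classes or schema):

```python
# Stand-in types to illustrate the sort-then-serialize flow of _sorted_metadata()
# and json_metadata(); field names here are illustrative, not the real schema.
import json
from dataclasses import dataclass
from typing import Optional, Sequence


@dataclass
class Doc:
    file_id: str
    author: Optional[str] = None

    def metadata(self) -> dict:
        return {"file_id": self.file_id, "author": self.author}


def _sorted_metadata(docs: Sequence[Doc]) -> list[dict]:
    # Sort by file_id so the emitted JSON has a deterministic order
    return [doc.metadata() for doc in sorted(docs, key=lambda d: d.file_id)]


docs = [Doc("030999", "Steve Bannon"), Doc("013415")]
print(json.dumps({"Email": _sorted_metadata(docs)}, indent=4, sort_keys=True))
```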
@@ -198,14 +198,10 @@ OSBORNE_LLP = f"{IAN_OSBORNE} & Partners LLP" # Ian Osborne's PR firm
  TRUMP_ORG = 'Trump Organization'
  UBS = 'UBS'

- # Locations
- PALM_BEACH = 'Palm Beach'
- VIRGIN_ISLANDS = 'Virgin Islands'
-
  # First and last names that should be made part of a highlighting regex for emailers
  NAMES_TO_NOT_HIGHLIGHT: list[str] = [name.lower() for name in [
- 'Al', 'Alfredo', 'Allen', 'Alex', 'Alexander', 'Amanda', 'Andres', 'Andrew',
- 'Bard', 'Barry', 'Bill', 'Black', 'Brad', 'Bruce',
+ 'Al', 'Alan', 'Alfredo', 'Allen', 'Alex', 'Alexander', 'Amanda', 'Andres', 'Andrew',
+ 'Bard', 'Barry', 'Bill', 'Black', 'Boris', 'Brad', 'Bruce',
  'Carolyn', 'Chris', 'Christina',
  'Dan', 'Daniel', 'Danny', 'Darren', 'Dave', 'David',
  'Ed', 'Edward', 'Edwards', 'Epstein', 'Eric', 'Erika', 'Etienne',
@@ -215,10 +211,10 @@ NAMES_TO_NOT_HIGHLIGHT: list[str] = [name.lower() for name in [
  'Ian',
  'Jack', 'James', 'Jay', 'Jean', 'Jeff', 'Jeffrey', 'Jennifer', 'Jeremy', 'jessica', 'Joel', 'John', 'Jon', 'Jonathan', 'Joseph', 'Jr',
  'Kahn', 'Katherine', 'Ken', 'Kevin',
- 'Leon', 'Lesley', 'Linda', 'Link', 'Lisa',
+ 'Larry', 'Leon', 'Lesley', 'Linda', 'Link', 'Lisa',
  'Mann', 'Marc', 'Marie', 'Mark', 'Martin', 'Melanie', 'Michael', 'Mike', 'Miller', 'Mitchell', 'Miles', 'Morris', 'Moskowitz',
  'Nancy', 'Neal', 'New',
- 'Paul', 'Paula', 'Pen', 'Peter', 'Philip',
+ 'Paul', 'Paula', 'Pen', 'Peter', 'Philip', 'Prince',
  'Randall', 'Reid', 'Richard', 'Robert', 'Rodriguez', 'Roger', 'Rosenberg', 'Ross', 'Roth', 'Rubin',
  'Scott', 'Sean', 'Stanley', 'Stern', 'Stephen', 'Steve', 'Steven', 'Stone', 'Susan',
  'The', 'Thomas', 'Tim', 'Tom', 'Tyler',
@@ -228,25 +224,25 @@ NAMES_TO_NOT_HIGHLIGHT: list[str] = [name.lower() for name in [
  ]]

  # Names to color white in the word counts
- OTHER_NAMES = """
- aaron albert alberto alec alex alexandra alice allen anderson andre andres ann anna anne ariana arthur
- baldwin barack barbro barry ben benjamin berger bert binant bob bonner boyden brad bradley brady branson bruce bruno burton
- chapman charles charlie chris christopher clint cohen colin collins conway
- dave davis dean debra deborah dennis diana diane diaz dickinson dixon dominique don dylan
- ed edmond elizabeth emily entwistle erik erika etienne evelyn
+ OTHER_NAMES = NAMES_TO_NOT_HIGHLIGHT + """
+ aaron albert alberto alec alexandra alice anderson andre ann anna anne ariana arthur
+ baldwin barack ben benjamin berger bert binant bob bonner boyden bradley brady branson bruno bryant burton
+ chapman charles charlie christopher clint cohen colin collins conway
+ davis dean debra deborah dennis diana diane diaz dickinson dixon dominique don dylan
+ edmond elizabeth emily entwistle erik evelyn
  ferguson flachsbart francis franco frank
- gardner gary geoff geoffrey george gilbert goldberg gonzalez gould graham greene guarino gwyneth
+ gardner gary geoff geoffrey gilbert gloria goldberg gonzalez gould graham greene guarino gwyneth
  hancock harold harrison harry helen hirsch hofstadter horowitz hussein
  isaac isaacson
- jamie james jane janet jason jen jennifer jim joe joel johnson jones julie justin
- kate kathy kelly kevin kim kruger kyle
- leonard lenny lieberman louis lynch lynn
- marcus marianne matt matthew melissa michele michelle mike mitchell moore moscowitz
+ jamie jane janet jason jen jim joe johnson jones josh julie justin
+ kate kathy kelly kim kruger kyle
+ leo leonard lenny leslie lieberman louis lynch lynn
+ marcus marianne matt matthew melissa michele michelle moore moscowitz
  nicole nussbaum
- paul paula paulson philip philippe
- rafael ray richardson rob robin rodriguez ron rudolph ryan
- sara sarah seligman serge sergey silverman sloman smith snowden sorkin stanley steele stevie stewart susan
- ted theresa thompson tiffany tim timothy tom
+ paulson philippe
+ rafael ray richardson rob robin ron rudolph ryan
+ sara sarah seligman serge sergey silverman sloman smith snowden sorkin steele stevie stewart
+ ted theresa thompson tiffany timothy
  valeria
  walter warren weinstein weiss william
  zach zack
@@ -30,6 +30,10 @@ REPUTATION = 'reputation'
  SOCIAL = 'social'
  SPEECH = 'speech'

+ # Locations
+ PALM_BEACH = 'Palm Beach'
+ VIRGIN_ISLANDS = 'Virgin Islands'
+
  # Publications
  BBC = 'BBC'
  BLOOMBERG = 'Bloomberg'
@@ -38,10 +42,12 @@ DAILY_MAIL = 'Daily Mail'
  DAILY_TELEGRAPH = "Daily Telegraph"
  LA_TIMES = 'LA Times'
  MIAMI_HERALD = 'Miami Herald'
- NYT_ARTICLE = 'NYT article about'
- NYT_COLUMN = 'NYT column about'
+ NYT = "New York Times"
+ PALM_BEACH_DAILY_NEWS = f'{PALM_BEACH} Daily News'
+ PALM_BEACH_POST = f'{PALM_BEACH} Post'
  THE_REAL_DEAL = 'The Real Deal'
  WAPO = 'WaPo'
+ VI_DAILY_NEWS = f'{VIRGIN_ISLANDS} Daily News'

  # Site types
  EMAIL = 'email'
@@ -71,6 +71,7 @@ epsteinify_name_url = lambda name: f"{EPSTEINIFY_URL}/?name={urllib.parse.quote(
  epstein_media_doc_url = lambda file_stem: build_doc_url(DOC_LINK_BASE_URLS[EPSTEIN_MEDIA], file_stem, True)
  epstein_media_doc_link_markup = lambda filename_or_id, style = TEXT_LINK: external_doc_link_markup(EPSTEIN_MEDIA, filename_or_id, style)
  epstein_media_doc_link_txt = lambda filename_or_id, style = TEXT_LINK: Text.from_markup(epstein_media_doc_link_markup(filename_or_id, style))
+ epstein_media_person_url = lambda person: f"{EPSTEIN_MEDIA_URL}/people/{parameterize(person)}"

  epstein_web_doc_url = lambda file_stem: f"{DOC_LINK_BASE_URLS[EPSTEIN_WEB]}/{file_stem}.jpg"
  epstein_web_person_url = lambda person: f"{EPSTEIN_WEB_URL}/{parameterize(person)}"