epstein-files 1.0.1__tar.gz → 1.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {epstein_files-1.0.1 → epstein_files-1.0.3}/PKG-INFO +32 -7
  2. {epstein_files-1.0.1 → epstein_files-1.0.3}/README.md +15 -6
  3. epstein_files-1.0.3/epstein_files/__init__.py +137 -0
  4. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/documents/document.py +12 -3
  5. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/documents/email.py +33 -13
  6. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/documents/imessage/text_message.py +11 -15
  7. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/documents/messenger_log.py +15 -11
  8. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/documents/other_file.py +13 -8
  9. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/epstein_files.py +51 -43
  10. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/constant/names.py +21 -24
  11. epstein_files-1.0.3/epstein_files/util/constant/output_files.py +29 -0
  12. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/constant/strings.py +8 -2
  13. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/constant/urls.py +11 -7
  14. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/constants.py +325 -227
  15. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/data.py +12 -33
  16. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/doc_cfg.py +7 -14
  17. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/env.py +5 -3
  18. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/file_helper.py +0 -22
  19. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/highlighted_group.py +31 -26
  20. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/logging.py +7 -0
  21. epstein_files-1.0.3/epstein_files/util/output.py +179 -0
  22. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/rich.py +22 -10
  23. epstein_files-1.0.3/pyproject.toml +68 -0
  24. epstein_files-1.0.1/epstein_files/__init__.py +0 -202
  25. epstein_files-1.0.1/pyproject.toml +0 -31
  26. {epstein_files-1.0.1 → epstein_files-1.0.3}/LICENSE +0 -0
  27. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/documents/communication.py +0 -0
  28. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/documents/emails/email_header.py +0 -0
  29. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/documents/json_file.py +0 -0
  30. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/constant/common_words.py +0 -0
  31. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/constant/html.py +0 -0
  32. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/search_result.py +0 -0
  33. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/timer.py +0 -0
  34. {epstein_files-1.0.1 → epstein_files-1.0.3}/epstein_files/util/word_count.py +0 -0
@@ -1,26 +1,42 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: epstein-files
3
- Version: 1.0.1
3
+ Version: 1.0.3
4
4
  Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
5
+ Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
6
+ License: GPL-3.0-or-later
7
+ Keywords: Epstein,Jeffrey Epstein
5
8
  Author: Michel de Cryptadamus
6
9
  Requires-Python: >=3.11,<4.0
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: Information Technology
13
+ Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
14
+ Classifier: Programming Language :: Python
7
15
  Classifier: Programming Language :: Python :: 3
8
16
  Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
9
20
  Requires-Dist: datefinder (>=0.7.3,<0.8.0)
10
21
  Requires-Dist: inflection (>=0.5.1,<0.6.0)
11
22
  Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
12
23
  Requires-Dist: python-dotenv (>=1.2.1,<2.0.0)
13
24
  Requires-Dist: requests (>=2.32.5,<3.0.0)
14
25
  Requires-Dist: rich (>=14.2.0,<15.0.0)
26
+ Project-URL: Emails, https://michelcrypt4d4mus.github.io/epstein_text_messages/all_emails_epstein_files_nov_2025.html
27
+ Project-URL: Metadata, https://michelcrypt4d4mus.github.io/epstein_text_messages/file_metadata_epstein_files_nov_2025.json
28
+ Project-URL: Repository, https://github.com/michelcrypt4d4mus/epstein_text_messages
29
+ Project-URL: TextMessages, https://michelcrypt4d4mus.github.io/epstein_text_messages
30
+ Project-URL: WordCounts, https://michelcrypt4d4mus.github.io/epstein_text_messages/communication_word_count_epstein_files_nov_2025.html
15
31
  Description-Content-Type: text/markdown
16
32
 
17
33
  # I Made Epstein's Text Messages Great Again
18
34
 
19
35
  * [I Made Epstein's Text Messages Great Again (And You Should Read Them)](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great) post on [Substack](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great)
20
36
  * The Epstein text messages (and some of the emails along with summary counts of sent emails to/from Epstein) generated by this code can be viewed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/).
21
- * All of His Emails can be read at another page also generated by this code [here](https://michelcrypt4d4mus.github.io/epstein_emails_house_oversight/).
22
- * Word counts for the emails and text messages are [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/epstein_emails_word_count.html).
23
- * Metadata containing what I have figured out about who sent or received the communications in a given file (and a brief explanation for how I figured it out for each file) is deployed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/epstein_files_nov_2025_cryptadamus_metadata.json)
37
+ * All of His Emails can be read at another page also generated by this code [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/all_emails_epstein_files_nov_2025.html).
38
+ * Word counts for the emails and text messages are [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/communication_word_count_epstein_files_nov_2025.html).
39
+ * Metadata containing what I have figured out about who sent or received the communications in a given file (and a brief explanation for how I figured it out for each file) is deployed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/file_metadata_epstein_files_nov_2025.json)
24
40
  * Configuration variables assigning specific `HOUSE_OVERSIGHT_XXXXXX.txt` file IDs (the `111111` part) as being emails to or from particular people based on various research and contributions can be found in [constants.py](./epstein_files/util/constants.py). Everything in `constants.py` should also appear in the JSON metadata.
25
41
 
26
42
 
@@ -31,11 +47,20 @@ Description-Content-Type: text/markdown
31
47
  You need to set the `DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
32
48
 
33
49
  ```bash
34
- DOCS_DIR=/path/to/epstein/ocr_txt_files ./generate.py
50
+ # Generate color highlighted texts/emails/other files
51
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate
52
+
53
+ # Search
54
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_search Bannon
55
+
56
+ # Show a color highlighted file
57
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show 030999
58
+ # This also works
59
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show HOUSE_OVERSIGHT_030999
35
60
  ```
36
61
 
37
- Run `./generate.py --help` for command line option assistance. Look in the [scripts](./scripts/) folder for various scripts.
38
- The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc. Once you've run things once you can run the `./generate.py --pickled` to load the cached fixed up data and things will be quick.
62
+ Run `epstein_generate --help` for command line option assistance.
63
+ The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc. Once you've run things once you can run the `epstein_generate --pickled` to load the cached fixed up data and things will be quick.
39
64
 
40
65
  #### As A Library
41
66
  ```python
@@ -2,9 +2,9 @@
2
2
 
3
3
  * [I Made Epstein's Text Messages Great Again (And You Should Read Them)](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great) post on [Substack](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great)
4
4
  * The Epstein text messages (and some of the emails along with summary counts of sent emails to/from Epstein) generated by this code can be viewed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/).
5
- * All of His Emails can be read at another page also generated by this code [here](https://michelcrypt4d4mus.github.io/epstein_emails_house_oversight/).
6
- * Word counts for the emails and text messages are [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/epstein_emails_word_count.html).
7
- * Metadata containing what I have figured out about who sent or received the communications in a given file (and a brief explanation for how I figured it out for each file) is deployed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/epstein_files_nov_2025_cryptadamus_metadata.json)
5
+ * All of His Emails can be read at another page also generated by this code [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/all_emails_epstein_files_nov_2025.html).
6
+ * Word counts for the emails and text messages are [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/communication_word_count_epstein_files_nov_2025.html).
7
+ * Metadata containing what I have figured out about who sent or received the communications in a given file (and a brief explanation for how I figured it out for each file) is deployed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/file_metadata_epstein_files_nov_2025.json)
8
8
  * Configuration variables assigning specific `HOUSE_OVERSIGHT_XXXXXX.txt` file IDs (the `111111` part) as being emails to or from particular people based on various research and contributions can be found in [constants.py](./epstein_files/util/constants.py). Everything in `constants.py` should also appear in the JSON metadata.
9
9
 
10
10
 
@@ -15,11 +15,20 @@
15
15
  You need to set the `DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
16
16
 
17
17
  ```bash
18
- DOCS_DIR=/path/to/epstein/ocr_txt_files ./generate.py
18
+ # Generate color highlighted texts/emails/other files
19
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_generate
20
+
21
+ # Search
22
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_search Bannon
23
+
24
+ # Show a color highlighted file
25
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show 030999
26
+ # This also works
27
+ DOCS_DIR=/path/to/epstein/ocr_txt_files epstein_show HOUSE_OVERSIGHT_030999
19
28
  ```
20
29
 
21
- Run `./generate.py --help` for command line option assistance. Look in the [scripts](./scripts/) folder for various scripts.
22
- The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc. Once you've run things once you can run the `./generate.py --pickled` to load the cached fixed up data and things will be quick.
30
+ Run `epstein_generate --help` for command line option assistance.
31
+ The first time you run anything it will take a few minutes to fix all the data, attribute the redacted emails, etc. Once you've run things once you can run the `epstein_generate --pickled` to load the cached fixed up data and things will be quick.
23
32
 
24
33
  #### As A Library
25
34
  ```python
@@ -0,0 +1,137 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Reformat Epstein text message files for readability and count email senders.
4
+ For use with iMessage log files from https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_
5
+
6
+ Install: 'poetry install'
7
+ Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT epstein_generate'
8
+ """
9
+ from sys import exit
10
+
11
+ from dotenv import load_dotenv
12
+ load_dotenv()
13
+
14
+ from rich.markup import escape
15
+ from rich.padding import Padding
16
+ from rich.panel import Panel
17
+
18
+ from epstein_files.epstein_files import EpsteinFiles, document_cls
19
+ from epstein_files.documents.document import INFO_PADDING, Document
20
+ from epstein_files.documents.email import Email
21
+ from epstein_files.util.constant.html import *
22
+ from epstein_files.util.constant.names import *
23
+ from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, TEXT_MSGS_HTML_PATH, make_clean
24
+ from epstein_files.util.env import args, specified_names
25
+ from epstein_files.util.file_helper import coerce_file_path, extract_file_id
26
+ from epstein_files.util.logging import logger
27
+ from epstein_files.util.output import print_emails, print_json_metadata, print_json_stats, print_text_messages, write_urls
28
+ from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
29
+ from epstein_files.util.timer import Timer
30
+
31
+
32
+ def generate_html() -> None:
33
+ if args.make_clean:
34
+ make_clean()
35
+ exit()
36
+
37
+ timer = Timer()
38
+ epstein_files = EpsteinFiles.get_files(timer)
39
+
40
+ if args.json_metadata:
41
+ print_json_metadata(epstein_files)
42
+ exit()
43
+
44
+ print_header(epstein_files)
45
+
46
+ if args.colors_only:
47
+ exit()
48
+
49
+ if args.output_texts:
50
+ print_text_messages(epstein_files)
51
+ timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)} text message logs')
52
+
53
+ if args.output_emails:
54
+ emails_printed = print_emails(epstein_files)
55
+ timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")
56
+
57
+ if args.output_other_files:
58
+ files_printed = epstein_files.print_other_files_table()
59
+ timer.print_at_checkpoint(f"Printed {len(files_printed)} other files")
60
+
61
+ # Save output
62
+ write_html(ALL_EMAILS_PATH if args.all_emails else TEXT_MSGS_HTML_PATH)
63
+ logger.warning(f"Total time: {timer.seconds_since_start_str()}")
64
+
65
+ # JSON stats (mostly used for building pytest checks)
66
+ if args.json_stats:
67
+ print_json_stats(epstein_files)
68
+
69
+
70
+ def epstein_diff():
71
+ """Diff the cleaned up text of two files."""
72
+ Document.diff_files(args.positional_args)
73
+
74
+
75
+ def epstein_search():
76
+ """Search the cleaned up text of the files."""
77
+ _assert_positional_args()
78
+ epstein_files = EpsteinFiles.get_files(use_pickled=True)
79
+
80
+ for search_term in args.positional_args:
81
+ temp_highlighter = build_highlighter(search_term)
82
+ search_results = epstein_files.docs_matching(search_term, specified_names)
83
+ console.line(2)
84
+ print_panel(f"Found {len(search_results)} documents matching '{search_term}'", padding=(0, 0, 0, 3))
85
+
86
+ for search_result in search_results:
87
+ console.line()
88
+
89
+ if args.whole_file:
90
+ if isinstance(search_result.document, Email):
91
+ search_result.document.truncation_allowed = False
92
+
93
+ console.print(search_result.document)
94
+ else:
95
+ console.print(search_result.document.description_panel())
96
+
97
+ for matching_line in search_result.lines:
98
+ line_txt = matching_line.__rich__()
99
+ console.print(Padding(temp_highlighter(line_txt), INFO_PADDING), style='gray37')
100
+
101
+
102
+ def epstein_show():
103
+ """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
104
+ _assert_positional_args()
105
+ ids = [extract_file_id(arg) for arg in args.positional_args]
106
+ console.line()
107
+
108
+ if args.pickled:
109
+ epstein_files = EpsteinFiles.get_files(use_pickled=True)
110
+ docs = epstein_files.get_documents_by_id(ids)
111
+ else:
112
+ raw_docs = [Document(coerce_file_path(id)) for id in ids]
113
+ docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
114
+
115
+ for doc in docs:
116
+ console.line()
117
+ console.print(doc)
118
+
119
+ if args.raw:
120
+ console.line()
121
+ console.print(Panel(f"*** {doc.url_slug} RAW ***", expand=False, style=doc._border_style()))
122
+ console.print(escape(doc.raw_text()))
123
+
124
+ if isinstance(doc, Email):
125
+ console.line()
126
+ console.print(Panel(f"*** {doc.url_slug} actual_text ***", expand=False, style=doc._border_style()))
127
+ console.print(escape(doc._actual_text()))
128
+
129
+
130
+ def epstein_dump_urls() -> None:
131
+ write_urls()
132
+
133
+
134
+ def _assert_positional_args():
135
+ if not args.positional_args:
136
+ console.print(f"\n ERROR: No positional args!\n", style='red1')
137
+ exit(1)
@@ -15,7 +15,7 @@ from epstein_files.util.constant.names import *
15
15
  from epstein_files.util.constant.strings import *
16
16
  from epstein_files.util.constant.urls import *
17
17
  from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
18
- from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_nones
18
+ from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_falsey
19
19
  from epstein_files.util.doc_cfg import EmailCfg, DocCfg, Metadata, TextCfg
20
20
  from epstein_files.util.env import args
21
21
  from epstein_files.util.file_helper import (DOCS_DIR, file_stem_for_id, extract_file_id, file_size,
@@ -159,7 +159,7 @@ class Document:
159
159
  if hint_msg:
160
160
  hints.append(highlighter(Text(hint_msg, style='white dim italic')))
161
161
 
162
- return without_nones(hints)
162
+ return without_falsey(hints)
163
163
 
164
164
  def info_txt(self) -> Text | None:
165
165
  """Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
@@ -255,7 +255,11 @@ class Document:
255
255
  txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
256
256
 
257
257
  txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
258
- txt.append(", ").append(key_value_txt('lines', Text(f"{self.num_lines}", style='cyan')))
258
+ txt.append(", ").append(key_value_txt('lines', self.num_lines))
259
+
260
+ if self.config and self.config.dupe_of_id:
261
+ txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.dupe_of_id, style='magenta')))
262
+
259
263
  return txt
260
264
 
261
265
  def top_lines(self, n: int = 10) -> str:
@@ -352,6 +356,11 @@ class Document:
352
356
  for f in tmpfiles:
353
357
  f.unlink()
354
358
 
359
+ @staticmethod
360
+ def known_author_count(docs: Sequence['Document']) -> int:
361
+ """Count of how many Document objects have an author attribution."""
362
+ return len([doc for doc in docs if doc.author])
363
+
355
364
  @staticmethod
356
365
  def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
357
366
  return sorted(docs, key=lambda doc: doc.sort_key())
@@ -30,7 +30,6 @@ BAD_LINE_REGEX = re.compile(r'^(>;?|\d{1,2}|Classification: External Communicati
30
30
  DETECT_EMAIL_REGEX = re.compile(r'^(.*\n){0,2}From:')
31
31
  LINK_LINE_REGEX = re.compile(f"^(> )?htt")
32
32
  QUOTED_REPLY_LINE_REGEX = re.compile(r'wrote:\n', re.IGNORECASE)
33
- REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + ['********************************']
34
33
  REPLY_TEXT_REGEX = re.compile(rf"^(.*?){REPLY_LINE_PATTERN}", re.DOTALL | re.IGNORECASE | re.MULTILINE)
35
34
 
36
35
  BAD_TIMEZONE_REGEX = re.compile(fr'\((UTC|GMT\+\d\d:\d\d)\)|{REDACTED}')
@@ -39,10 +38,16 @@ TIMESTAMP_LINE_REGEX = re.compile(r"\d+:\d+")
39
38
 
40
39
  SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
41
40
  REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
41
+ IS_JUNK_MAIL = 'is_junk_mail'
42
42
  MAX_CHARS_TO_PRINT = 4000
43
43
  MAX_NUM_HEADER_LINES = 14
44
44
  MAX_QUOTED_REPLIES = 2
45
45
 
46
+ REPLY_SPLITTERS = [f"{field}:" for field in FIELD_NAMES] + [
47
+ '********************************',
48
+ 'Begin forwarded message',
49
+ ]
50
+
46
51
  OCR_REPAIRS: dict[str | re.Pattern, str] = {
47
52
  re.compile(r'grnail\.com'): 'gmail.com',
48
53
  re.compile(r"^(From|To)(: )?[_1.]{5,}", re.MULTILINE): rf"\1: {REDACTED}", # Redacted email addresses
@@ -119,6 +124,7 @@ EMAIL_SIGNATURE_REGEXES = {
119
124
  # Invalid for links to EpsteinWeb
120
125
  JUNK_EMAILERS = [
121
126
  'asmallworld@travel.asmallworld.net',
127
+ "digest-noreply@quora.com",
122
128
  'editorialstaff@flipboard.com',
123
129
  'How To Academy',
124
130
  'Jokeland',
@@ -126,9 +132,13 @@ JUNK_EMAILERS = [
126
132
  'Saved by Internet Explorer 11',
127
133
  ]
128
134
 
129
- TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + [
130
- 'Alan S Halperin',
135
+ MAILING_LISTS = [
136
+ INTELLIGENCE_SQUARED,
131
137
  'middle.east.update@hotmail.com',
138
+ ]
139
+
140
+ TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
141
+ 'Alan S Halperin',
132
142
  'Mitchell Bard',
133
143
  'Skip Rimer',
134
144
  ]
@@ -281,7 +291,7 @@ SELF_EMAILS_FILE_IDS = [
281
291
  ]
282
292
 
283
293
  METADATA_FIELDS = [
284
- 'is_junk_mail',
294
+ IS_JUNK_MAIL,
285
295
  'recipients',
286
296
  'sent_from_device',
287
297
  ]
@@ -294,7 +304,6 @@ class Email(Communication):
294
304
  actual_text (str) - best effort at the text actually sent in this email, excluding quoted replies and forwards
295
305
  config (EmailCfg | None) - manual config for this email (if it exists)
296
306
  header (EmailHeader) - header data extracted from the text (from/to/sent/subject etc)
297
- is_junk_mail (bool) - True if this is junk mail
298
307
  recipients (list[str | None]) - who this email was sent to
299
308
  sent_from_device (str | None) - "Sent from my iPhone" style signature (if it exists)
300
309
  signature_substitution_counts (dict[str, int]) - count of how many times a signature was replaced with <...snipped...> for each participant
@@ -302,17 +311,16 @@ class Email(Communication):
302
311
  actual_text: str = field(init=False)
303
312
  config: EmailCfg | None = None
304
313
  header: EmailHeader = field(init=False)
305
- is_junk_mail: bool = False
306
314
  recipients: list[str | None] = field(default_factory=list)
307
315
  sent_from_device: str | None = None
308
316
  signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
317
+ truncation_allowed: bool = True
309
318
 
310
319
  # For logging how many headers we prettified while printing, kind of janky
311
320
  rewritten_header_ids: ClassVar[set[str]] = set([])
312
321
 
313
322
  def __post_init__(self):
314
323
  super().__post_init__()
315
- self.is_junk_mail = self.author in JUNK_EMAILERS
316
324
 
317
325
  if self.config and self.config.recipients:
318
326
  self.recipients = cast(list[str | None], self.config.recipients)
@@ -331,9 +339,17 @@ class Email(Communication):
331
339
  txt = Text("OCR text of email from ", style='grey46').append(self.author_txt).append(' to ')
332
340
  return txt.append(self._recipients_txt()).append(highlighter(f" probably sent at {self.timestamp}"))
333
341
 
342
+ def is_fwded_article(self) -> bool:
343
+ return bool(self.config and self.config.is_fwded_article)
344
+
345
+ def is_junk_mail(self) -> bool:
346
+ return self.author in JUNK_EMAILERS or self.author in MAILING_LISTS
347
+
334
348
  def metadata(self) -> Metadata:
349
+ local_metadata = asdict(self)
350
+ local_metadata[IS_JUNK_MAIL] = self.is_junk_mail()
335
351
  metadata = super().metadata()
336
- metadata.update({k: v for k, v in asdict(self).items() if v and k in METADATA_FIELDS})
352
+ metadata.update({k: v for k, v in local_metadata.items() if v and k in METADATA_FIELDS})
337
353
  return metadata
338
354
 
339
355
  def subject(self) -> str:
@@ -352,17 +368,18 @@ class Email(Communication):
352
368
  """The text that comes before likely quoted replies and forwards etc."""
353
369
  if self.config and self.config.actual_text is not None:
354
370
  return self.config.actual_text
371
+
372
+ text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
373
+
374
+ if self.config and self.config.fwded_text_after:
375
+ return text.split(self.config.fwded_text_after)[0].strip()
355
376
  elif self.header.num_header_rows == 0:
356
377
  return self.text
357
378
 
358
- text = '\n'.join(self.text.split('\n')[self.header.num_header_rows:]).strip()
359
379
  reply_text_match = REPLY_TEXT_REGEX.search(text)
360
380
  # logger.info(f"Raw text:\n" + self.top_lines(20) + '\n\n')
361
381
  # logger.info(f"With header removed:\n" + text[0:500] + '\n\n')
362
382
 
363
- if self.file_id in ['024624']: # This email starts with "On September 14th"
364
- return text.split('On Tue, May 14')[0].strip()
365
-
366
383
  if reply_text_match:
367
384
  actual_num_chars = len(reply_text_match.group(1))
368
385
  actual_text_pct = f"{(100 * float(actual_num_chars) / len(text)):.1f}%"
@@ -555,6 +572,9 @@ class Email(Communication):
555
572
  self._merge_lines(3, 5)
556
573
  elif self.file_id == '028931':
557
574
  self._merge_lines(3, 6)
575
+ elif self.file_id == '013415':
576
+ for _i in range(2):
577
+ self._merge_lines(4)
558
578
  elif self.file_id in ['033568']:
559
579
  for _i in range(5):
560
580
  self._merge_lines(5)
@@ -637,7 +657,7 @@ class Email(Communication):
637
657
  num_chars = quote_cutoff
638
658
 
639
659
  # Truncate long emails but leave a note explaining what happened w/link to source document
640
- if len(text) > num_chars:
660
+ if len(text) > num_chars and self.truncation_allowed:
641
661
  text = text[0:num_chars]
642
662
  doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
643
663
  trim_note = f"<...trimmed to {num_chars} characters of {self.length}, read the rest at {doc_link_markup}...>"
@@ -4,7 +4,7 @@ from datetime import datetime
4
4
 
5
5
  from rich.text import Text
6
6
 
7
- from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, STEVE_BANNON, UNKNOWN
7
+ from epstein_files.util.constant.names import JEFFREY_EPSTEIN, ANTHONY_SCARAMUCCI, CELINA_DUBIN, EVA, STEVE_BANNON, UNKNOWN
8
8
  from epstein_files.util.data import extract_last_name
9
9
  from epstein_files.util.highlighted_group import get_style_for_name
10
10
  from epstein_files.util.logging import logger
@@ -19,17 +19,18 @@ DISPLAY_LAST_NAME_ONLY = [
19
19
  STEVE_BANNON,
20
20
  ]
21
21
 
22
- UNKNOWN_TEXTERS = [
23
- '+16463880059',
24
- '+13108737937',
25
- '+13108802851',
26
- ]
22
+ PHONE_NUMBER_MAPPING = {
23
+ '+19174393646': ANTHONY_SCARAMUCCI,
24
+ '+13109906526': STEVE_BANNON,
25
+ '+16463880059': EVA,
26
+ '+13108737937': CELINA_DUBIN,
27
+ '+13108802851': STEVE_BANNON,
28
+
29
+ }
27
30
 
28
31
  TEXTER_MAPPING = {
29
32
  'e:': JEFFREY_EPSTEIN,
30
33
  'e:jeeitunes@gmail.com': JEFFREY_EPSTEIN,
31
- '+19174393646': ANTHONY_SCARAMUCCI,
32
- '+13109906526': STEVE_BANNON,
33
34
  }
34
35
 
35
36
 
@@ -37,7 +38,7 @@ TEXTER_MAPPING = {
37
38
  class TextMessage:
38
39
  """Class representing a single iMessage text message."""
39
40
  author: str | None
40
- author_str: str = field(init=False)
41
+ author_str: str | None = None
41
42
  id_confirmed: bool = False
42
43
  text: str
43
44
  timestamp_str: str
@@ -47,14 +48,10 @@ class TextMessage:
47
48
 
48
49
  if self.author is None:
49
50
  self.author_str = UNKNOWN
50
- elif self.author in UNKNOWN_TEXTERS:
51
- logger.warning(f"Bad text from '{self.author}': \"{self.text}\"")
52
- self.author_str = self.author
53
- self.author = None # TODO: this shouldn't be happening; we still know the author...
54
51
  elif self.author in DISPLAY_LAST_NAME_ONLY:
55
52
  self.author_str = extract_last_name(self.author)
56
53
  else:
57
- self.author_str = self.author
54
+ self.author_str = self.author_str or self.author
58
55
 
59
56
  if not self.id_confirmed and self.author is not None and self.author != JEFFREY_EPSTEIN:
60
57
  self.author_str = self.author + ' (?)'
@@ -87,7 +84,6 @@ class TextMessage:
87
84
  return msg_txt
88
85
 
89
86
  def __rich__(self) -> Text:
90
- # TODO: Workaround for phone numbers that sucks
91
87
  author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
92
88
  author_txt = Text(self.author_str, style=author_style)
93
89
  timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_STYLE).append(' ')
@@ -44,17 +44,8 @@ class MessengerLog(Communication):
44
44
 
45
45
  def messages(self) -> list[TextMessage]:
46
46
  """Lazily evaluated accessor for self._messages."""
47
- if len(self._messages) == 0:
48
- self._messages = [
49
- TextMessage(
50
- # If the Sender: is redacted that means it's from self.author
51
- author=REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip()) or self.author,
52
- id_confirmed=not self.is_attribution_uncertain(),
53
- text=match.group(4).strip(),
54
- timestamp_str=match.group(2).strip(),
55
- )
56
- for match in MSG_REGEX.finditer(self.text)
57
- ]
47
+ if not self._messages:
48
+ self._messages = [self._build_message(match) for match in MSG_REGEX.finditer(self.text)]
58
49
 
59
50
  return self._messages
60
51
 
@@ -70,6 +61,19 @@ class MessengerLog(Communication):
70
61
  def _border_style(self) -> str:
71
62
  return self.author_style
72
63
 
64
+ def _build_message(self, match: re.Match) -> TextMessage:
65
+ """Turn a regex match into a TextMessage."""
66
+ author_str = REDACTED_AUTHOR_REGEX.sub('', match.group(1).strip())
67
+
68
+ # If the Sender: is redacted that means it's from self.author
69
+ return TextMessage(
70
+ author=self.author if (author_str.startswith('+') or not author_str) else author_str,
71
+ author_str=author_str if author_str.startswith('+') else None, # Preserve phone numbers
72
+ id_confirmed=not self.is_attribution_uncertain(),
73
+ text=match.group(4).strip(),
74
+ timestamp_str=match.group(2).strip(),
75
+ )
76
+
73
77
  def _extract_timestamp(self) -> datetime:
74
78
  for match in MSG_REGEX.finditer(self.text):
75
79
  timestamp_str = match.group(2).strip()
@@ -1,7 +1,7 @@
1
1
  import re
2
2
  import logging
3
3
  import warnings
4
- from dataclasses import dataclass
4
+ from dataclasses import asdict, dataclass
5
5
  from datetime import datetime
6
6
 
7
7
  import datefinder
@@ -15,7 +15,7 @@ from rich.text import Text
15
15
  from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, WHITESPACE_REGEX, Document
16
16
  from epstein_files.util.constant.strings import *
17
17
  from epstein_files.util.constants import *
18
- from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg
18
+ from epstein_files.util.doc_cfg import FINANCIAL_REPORTS_AUTHORS, DocCfg, Metadata
19
19
  from epstein_files.util.data import escape_single_quotes, remove_timezone, uniquify
20
20
  from epstein_files.util.file_helper import FILENAME_LENGTH
21
21
  from epstein_files.util.env import args
@@ -83,11 +83,10 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
83
83
  NOBEL_CHARITABLE_TRUST,
84
84
  'Nautilus',
85
85
  'New Yorker',
86
- NYT_ARTICLE,
87
- NYT_COLUMN,
86
+ NYT,
88
87
  PALM_BEACH_CODE_ENFORCEMENT,
89
- PALM_BEACH_DAILY_ARTICLE,
90
- PALM_BEACH_POST_ARTICLE,
88
+ PALM_BEACH_DAILY_NEWS,
89
+ PALM_BEACH_POST,
91
90
  PALM_BEACH_TSV,
92
91
  PALM_BEACH_WATER_COMMITTEE,
93
92
  PAUL_KRASSNER,
@@ -102,6 +101,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
102
101
  SHIMON_POST_ARTICLE,
103
102
  SINGLE_PAGE,
104
103
  STACEY_PLASKETT,
104
+ 'Tatler',
105
105
  TERJE_ROD_LARSEN,
106
106
  TEXT_OF_US_LAW,
107
107
  TRANSLATION,
@@ -113,7 +113,7 @@ UNINTERESTING_PREFIXES = FINANCIAL_REPORTS_AUTHORS + [
113
113
  'U.S. News',
114
114
  'US Office',
115
115
  'Vanity Fair',
116
- VI_DAILY_NEWS_ARTICLE,
116
+ VI_DAILY_NEWS,
117
117
  WAPO,
118
118
  ]
119
119
 
@@ -127,7 +127,7 @@ class OtherFile(Document):
127
127
 
128
128
  if self.config is None and VI_DAILY_NEWS_REGEX.search(self.text):
129
129
  self.log(f"Creating synthetic config for VI Daily News article...", logging.INFO)
130
- self.config = DocCfg(id=self.file_id, description=VI_DAILY_NEWS_ARTICLE, category=ARTICLE)
130
+ self.config = DocCfg(id=self.file_id, author=VI_DAILY_NEWS, category=ARTICLE, description='article')
131
131
 
132
132
  def category(self) -> str | None:
133
133
  return self.config and self.config.category
@@ -175,6 +175,11 @@ class OtherFile(Document):
175
175
 
176
176
  return True
177
177
 
178
+ def metadata(self) -> Metadata:
179
+ metadata = super().metadata()
180
+ metadata['is_interesting'] = self.is_interesting()
181
+ return metadata
182
+
178
183
  def preview_text(self) -> str:
179
184
  return WHITESPACE_REGEX.sub(' ', self.text)[0:PREVIEW_CHARS]
180
185