epstein-files 1.0.13.tar.gz → 1.0.15.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {epstein_files-1.0.13 → epstein_files-1.0.15}/PKG-INFO +10 -3
  2. {epstein_files-1.0.13 → epstein_files-1.0.15}/README.md +9 -2
  3. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/__init__.py +16 -11
  4. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/communication.py +2 -2
  5. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/document.py +59 -51
  6. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/email.py +34 -30
  7. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/imessage/text_message.py +4 -4
  8. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/json_file.py +9 -3
  9. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/messenger_log.py +29 -27
  10. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/other_file.py +80 -100
  11. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/epstein_files.py +50 -69
  12. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/constant/names.py +3 -1
  13. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/constant/strings.py +1 -3
  14. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/constant/urls.py +1 -7
  15. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/constants.py +126 -114
  16. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/data.py +2 -0
  17. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/doc_cfg.py +11 -10
  18. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/env.py +12 -13
  19. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/file_helper.py +8 -4
  20. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/highlighted_group.py +8 -16
  21. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/output.py +56 -36
  22. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/rich.py +29 -29
  23. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/word_count.py +7 -9
  24. {epstein_files-1.0.13 → epstein_files-1.0.15}/pyproject.toml +1 -1
  25. {epstein_files-1.0.13 → epstein_files-1.0.15}/LICENSE +0 -0
  26. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/documents/emails/email_header.py +0 -0
  27. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/constant/common_words.py +0 -0
  28. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/constant/html.py +0 -0
  29. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/constant/output_files.py +0 -0
  30. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/logging.py +0 -0
  31. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/search_result.py +0 -0
  32. {epstein_files-1.0.13 → epstein_files-1.0.15}/epstein_files/util/timer.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: epstein-files
- Version: 1.0.13
+ Version: 1.0.15
  Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
  Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
  License: GPL-3.0-or-later
@@ -43,11 +43,12 @@ Description-Content-Type: text/markdown
 
 
  ## Usage
-
  #### Installation
  1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8) (make sure you grab both the `001/` and `002/` folders).
  1. Use `poetry install` for easiest time installing. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
 
+
+ #### Command Line Tools
  You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
 
  ```bash
@@ -63,7 +64,7 @@ epstein_generate
  # Search for a string:
  epstein_search Bannon
  # Or a regex:
- epstein_search '\bSteve\s*Bannon\b'
+ epstein_search '\bSteve\s*Bannon|Jeffrey\s*Epstein\b'
 
  # Show a file with color highlighting of keywords:
  epstein_show 030999
@@ -82,6 +83,12 @@ epstein_diff 030999 020442
  The first time you run anything it will take a few minutes to fix all the janky OCR text, attribute the redacted emails, etc. After that things will be quick.
  Run `epstein_generate --help` for command line option assistance.
 
+ **Optional:** There are a handful of emails that I extracted from the legal filings they were contained in. If you want to include these files in your local analysis you'll need to copy those files from the repo into your local document directory. Something like:
+
+ ```bash
+ cp ./emails_extracted_from_legal_filings/*.txt "$EPSTEIN_DOCS_DIR"
+ ```
+
 
  #### As A Library
  ```python
@@ -10,11 +10,12 @@
 
 
  ## Usage
-
  #### Installation
  1. Requires you have a local copy of the OCR text files from the House Oversight document release in a directory `/path/to/epstein/ocr_txt_files`. You can download those OCR text files from [the Congressional Google Drive folder](https://drive.google.com/drive/folders/1ldncvdqIf6miiskDp_EDuGSDAaI_fJx8) (make sure you grab both the `001/` and `002/` folders).
  1. Use `poetry install` for easiest time installing. `pip install epstein-files` should also work, though `pipx install epstein-files` is usually better.
 
+
+ #### Command Line Tools
  You need to set the `EPSTEIN_DOCS_DIR` environment variable with the path to the folder of files you just downloaded when running. You can either create a `.env` file modeled on [`.env.example`](./.env.example) (which will set it permanently) or you can run with:
 
  ```bash
@@ -30,7 +31,7 @@ epstein_generate
  # Search for a string:
  epstein_search Bannon
  # Or a regex:
- epstein_search '\bSteve\s*Bannon\b'
+ epstein_search '\bSteve\s*Bannon|Jeffrey\s*Epstein\b'
 
  # Show a file with color highlighting of keywords:
  epstein_show 030999
@@ -49,6 +50,12 @@ epstein_diff 030999 020442
  The first time you run anything it will take a few minutes to fix all the janky OCR text, attribute the redacted emails, etc. After that things will be quick.
  Run `epstein_generate --help` for command line option assistance.
 
+ **Optional:** There are a handful of emails that I extracted from the legal filings they were contained in. If you want to include these files in your local analysis you'll need to copy those files from the repo into your local document directory. Something like:
+
+ ```bash
+ cp ./emails_extracted_from_legal_filings/*.txt "$EPSTEIN_DOCS_DIR"
+ ```
+
 
  #### As A Library
  ```python
@@ -17,11 +17,11 @@ from epstein_files.epstein_files import EpsteinFiles, document_cls
  from epstein_files.documents.document import INFO_PADDING, Document
  from epstein_files.documents.email import Email
  from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, TEXT_MSGS_HTML_PATH, make_clean
- from epstein_files.util.env import args, specified_names
+ from epstein_files.util.env import args
  from epstein_files.util.file_helper import coerce_file_path, extract_file_id
  from epstein_files.util.logging import logger
- from epstein_files.util.output import (print_emails, print_json_files, print_json_stats,
- print_text_messages, write_json_metadata, write_urls)
+ from epstein_files.util.output import (print_emails_section, print_json_files, print_json_stats,
+ print_other_files_section, print_text_messages_section, write_json_metadata, write_urls)
  from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
  from epstein_files.util.timer import Timer
  from epstein_files.util.word_count import write_word_counts_html
@@ -49,16 +49,21 @@ def generate_html() -> None:
  exit()
 
  if args.output_texts:
- print_text_messages(epstein_files)
+ print_text_messages_section(epstein_files)
  timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)} text message logs')
 
  if args.output_emails:
- emails_printed = print_emails(epstein_files)
- timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")
+ emails_that_were_printed = print_emails_section(epstein_files)
+ timer.print_at_checkpoint(f"Printed {len(emails_that_were_printed):,} emails")
 
  if args.output_other:
- files_printed = epstein_files.print_other_files_table()
- timer.print_at_checkpoint(f"Printed {len(files_printed)} other files")
+ if args.uninteresting:
+ files = [f for f in epstein_files.other_files if not f.is_interesting()]
+ else:
+ files = [f for f in epstein_files.other_files if args.all_other_files or f.is_interesting()]
+
+ print_other_files_section(files, epstein_files)
+ timer.print_at_checkpoint(f"Printed {len(files)} other files (skipped {len(epstein_files.other_files) - len(files)})")
 
  # Save output
  write_html(ALL_EMAILS_PATH if args.all_emails else TEXT_MSGS_HTML_PATH)
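In plain terms, the new `--output_other` branch above decides which "other" files get printed: `uninteresting` inverts the filter, while `all_other_files` disables it. A minimal sketch of that selection logic, assuming only the `is_interesting()` predicate and the two flags shown in the diff (the standalone helper itself is illustrative, not part of the package):

```python
def select_other_files(other_files: list, uninteresting: bool, all_other_files: bool) -> list:
    """Illustrative helper mirroring the filtering added to generate_html()."""
    if uninteresting:
        # --uninteresting: show only the files NOT flagged as interesting
        return [f for f in other_files if not f.is_interesting()]

    # Default: interesting files only, unless --all_other_files asks for everything
    return [f for f in other_files if all_other_files or f.is_interesting()]
```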
@@ -81,7 +86,7 @@ def epstein_search():
 
  for search_term in args.positional_args:
  temp_highlighter = build_highlighter(search_term)
- search_results = epstein_files.docs_matching(search_term, specified_names)
+ search_results = epstein_files.docs_matching(search_term, args.names)
  console.line(2)
  print_panel(f"Found {len(search_results)} documents matching '{search_term}'", padding=(0, 0, 0, 3))
 
@@ -90,7 +95,7 @@
 
  if args.whole_file:
  if isinstance(search_result.document, Email):
- search_result.document.truncation_allowed = False
+ search_result.document._truncation_allowed = False
 
  console.print(search_result.document)
  else:
@@ -111,7 +116,7 @@ def epstein_show():
 
  for doc in docs:
  if isinstance(doc, Email):
- doc.truncation_allowed = False
+ doc._truncation_allowed = False
  console.print('\n', doc, '\n')
 
 
@@ -34,9 +34,9 @@ class Communication(Document):
  def is_attribution_uncertain(self) -> bool:
  return bool(self.config and self.config.is_attribution_uncertain)
 
- def external_links(self, _style: str = '', include_alt_links: bool = True) -> Text:
+ def external_links_txt(self, _style: str = '', include_alt_links: bool = True) -> Text:
  """Overrides super() method to apply self.author_style."""
- return super().external_links(self.author_style, include_alt_links=include_alt_links)
+ return super().external_links_txt(self.author_style, include_alt_links=include_alt_links)
 
  def summary(self) -> Text:
  return self._summary().append(CLOSE_PROPERTIES_CHAR)
@@ -19,12 +19,12 @@ from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
  from epstein_files.util.data import collapse_newlines, date_str, patternize, remove_zero_time_from_timestamp_str, without_falsey
  from epstein_files.util.doc_cfg import DUPE_TYPE_STRS, EmailCfg, DocCfg, Metadata, TextCfg
  from epstein_files.util.env import DOCS_DIR, args
- from epstein_files.util.file_helper import (file_stem_for_id, extract_file_id, file_size,
- file_size_str, is_local_extract_file)
+ from epstein_files.util.file_helper import extract_file_id, file_size, file_size_str, is_local_extract_file
  from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
- from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, key_value_txt, link_text_obj
+ from epstein_files.util.rich import INFO_STYLE, SYMBOL_STYLE, console, highlighter, join_texts, key_value_txt, link_text_obj, parenthesize
  from epstein_files.util.search_result import MatchedLine
 
+ ALT_LINK_STYLE = 'white dim'
  CLOSE_PROPERTIES_CHAR = ']'
  HOUSE_OVERSIGHT = HOUSE_OVERSIGHT_PREFIX.replace('_', ' ').strip()
  INFO_INDENT = 2
@@ -46,7 +46,6 @@ FILENAME_MATCH_STYLES = [
  METADATA_FIELDS = [
  'author',
  'file_id',
- 'num_lines',
  'timestamp'
  ]
 
@@ -68,7 +67,6 @@ class Document:
  config (DocCfg): Information about this fil
  file_id (str): 6 digit (or 8 digits if it's a local extract file) string ID
  filename (str): File's basename
- length (int): Number of characters in the file after all the cleanup
  lines (str): Number of lines in the file after all the cleanup
  text (str): Contents of the file
  timestamp (datetime | None): When the file was originally created
@@ -80,12 +78,10 @@
  config: EmailCfg | DocCfg | TextCfg | None = None
  file_id: str = field(init=False)
  filename: str = field(init=False)
- length: int = field(init=False)
- lines: list[str] = field(init=False)
- num_lines: int = field(init=False)
+ lines: list[str] = field(default_factory=list)
  text: str = ''
  timestamp: datetime | None = None
- url_slug: str = field(init=False) # e.g. 'HOUSE_OVERSIGHT_123456
+ url_slug: str = ''
 
  # Class variables
  include_description_in_summary_panel: ClassVar[bool] = False
@@ -94,12 +90,13 @@
  def __post_init__(self):
  self.filename = self.file_path.name
  self.file_id = extract_file_id(self.filename)
+ # config and url_slug could have been pre-set in Email
  self.config = self.config or deepcopy(ALL_FILE_CONFIGS.get(self.file_id))
+ self.url_slug = self.url_slug or self.filename.split('.')[0]
 
- if 'url_slug' not in vars(self):
- self.url_slug = self.file_path.stem
+ if not self.text:
+ self._load_file()
 
- self._set_computed_fields(text=self.text or self._load_file())
  self._repair()
  self._extract_author()
  self.timestamp = self._extract_timestamp()
@@ -114,59 +111,57 @@
 
  def duplicate_file_txt(self) -> Text:
  """If the file is a dupe make a nice message to explain what file it's a duplicate of."""
- if not self.config or not self.config.dupe_of_id or self.config.dupe_type is None:
+ if not self.is_duplicate():
  raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")
 
  txt = Text(f"Not showing ", style=INFO_STYLE).append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
  txt.append(f" because it's {DUPE_TYPE_STRS[self.config.dupe_type]} ")
- return txt.append(epstein_media_doc_link_txt(self.config.dupe_of_id, style='royal_blue1'))
+ return txt.append(epstein_media_doc_link_txt(self.config.duplicate_of_id, style='royal_blue1'))
 
  def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
- return self.external_url(epsteinify_doc_url, style, link_txt)
+ return self.external_link(epsteinify_doc_url, style, link_txt)
 
  def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
- return self.external_url(epstein_media_doc_url, style, link_txt)
+ return self.external_link(epstein_media_doc_url, style, link_txt)
 
  def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
- return self.external_url(epstein_web_doc_url, style, link_txt)
+ return self.external_link(epstein_web_doc_url, style, link_txt)
 
  def rollcall_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
- return self.external_url(rollcall_doc_url, style, link_txt)
+ return self.external_link(rollcall_doc_url, style, link_txt)
 
- def external_url(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
+ def external_link(self, fxn: Callable[[str], str], style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
  return link_text_obj(fxn(self.url_slug), link_txt or self.file_path.stem, style)
 
- def external_links(self, style: str = '', include_alt_links: bool = False) -> Text:
- """Returns colored links to epstein.media and and epsteinweb in a Text object."""
- txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
-
- if args.use_epstein_web:
- txt.append(self.epstein_web_link(style=style))
- alt_link = self.epstein_media_link(style='white dim', link_txt=EPSTEIN_MEDIA)
- else:
- txt.append(self.epstein_media_link(style=style))
- alt_link = self.epstein_web_link(style='white dim', link_txt=EPSTEIN_WEB)
+ def external_links_txt(self, style: str = '', include_alt_links: bool = False) -> Text:
+ """Returns colored links to epstein.media and alternates in a Text object."""
+ links = [self.epstein_media_link(style=style)]
 
  if include_alt_links:
- txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
- txt.append(' (').append(alt_link).append(')')
+ links.append(self.epsteinify_link(style=ALT_LINK_STYLE, link_txt=EPSTEINIFY))
+ links.append(self.epstein_web_link(style=ALT_LINK_STYLE, link_txt=EPSTEIN_WEB))
 
  if self._class_name() == 'Email':
- txt.append(' (').append(self.rollcall_link(style='white dim', link_txt=ROLLCALL)).append(')')
+ links.append(self.rollcall_link(style=ALT_LINK_STYLE, link_txt=ROLLCALL))
 
- return txt
+ links = [links[0]] + [parenthesize(link) for link in links[1:]]
+ base_txt = Text('', style='white' if include_alt_links else ARCHIVE_LINK_COLOR)
+ return base_txt.append(join_texts(links))
+
+ def file_id_debug_info(self) -> str:
+ return ', '.join([f"{prop}={getattr(self, prop)}" for prop in ['file_id', 'filename', 'url_slug']])
 
  def file_info_panel(self) -> Group:
  """Panel with filename linking to raw file plus any additional info about the file."""
- panel = Panel(self.external_links(include_alt_links=True), border_style=self._border_style(), expand=False)
+ panel = Panel(self.external_links_txt(include_alt_links=True), border_style=self._border_style(), expand=False)
  padded_info = [Padding(sentence, INFO_PADDING) for sentence in self.info()]
  return Group(*([panel] + padded_info))
 
  def file_size(self) -> int:
  return file_size(self.file_path)
 
- def file_size_str(self) -> str:
- return file_size_str(self.file_path)
+ def file_size_str(self, decimal_places: int | None = None) -> str:
+ return file_size_str(self.file_path, decimal_places)
 
  def info(self) -> list[Text]:
  """0 to 2 sentences containing the info_txt() as well as any configured description."""
@@ -176,16 +171,19 @@
  ])
 
  def info_txt(self) -> Text | None:
- """Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
+ """Secondary info about this file (description recipients, etc). Overload in subclasses."""
  return None
 
  def is_duplicate(self) -> bool:
- return bool(self.config and self.config.dupe_of_id)
+ return bool(self.config and self.config.duplicate_of_id)
 
  def is_local_extract_file(self) -> bool:
- """True if file created by extracting text from a court doc (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
+ """True if extracted from other file (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
  return is_local_extract_file(self.filename)
 
+ def length(self) -> int:
+ return len(self.text)
+
  def log(self, msg: str, level: int = logging.INFO):
  """Log with filename as a prefix."""
  logger.log(level, f"{self.file_path.stem} {msg}")
@@ -206,17 +204,21 @@
  metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
  metadata['bytes'] = self.file_size()
  metadata['filename'] = f"{self.url_slug}.txt"
+ metadata['num_lines'] = self.num_lines()
  metadata['type'] = self._class_name()
 
  if self.is_local_extract_file():
  metadata['extracted_file'] = {
- 'explanation': 'Manually extracted from one of the court filings.',
+ 'explanation': 'manually extracted from one of the other files',
  'extracted_from': self.url_slug + '.txt',
  'url': extracted_file_url(self.filename),
  }
 
  return metadata
 
+ def num_lines(self) -> int:
+ return len(self.lines)
+
  def raw_text(self) -> str:
  with open(self.file_path) as f:
  return f.read()
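Taken together with the removal of the `length` and `num_lines` fields earlier in this file, these hunks turn both values into methods, which is a breaking change for library users. A hedged before/after sketch (the constructor call and file path are illustrative; only the attribute-to-method change comes from the diff):

```python
from pathlib import Path

from epstein_files.documents.document import Document

doc = Document(Path("HOUSE_OVERSIGHT_030999.txt"))  # illustrative path; constructor args assumed

# 1.0.13: doc.length and doc.num_lines were dataclass fields
# 1.0.15: both are methods
char_count = doc.length()
line_count = doc.num_lines()
```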
@@ -232,8 +234,9 @@
  return text
 
  def sort_key(self) -> tuple[datetime, str, int]:
+ """Sort by timestamp, file_id, then whether or not it's a duplicate file."""
  if self.is_duplicate():
- sort_id = self.config.dupe_of_id
+ sort_id = self.config.duplicate_of_id
  dupe_idx = 1
  else:
  sort_id = self.file_id
@@ -251,11 +254,11 @@
  txt.append(' (', style=SYMBOL_STYLE)
  txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
 
- txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
- txt.append(", ").append(key_value_txt('lines', self.num_lines))
+ txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(0), style='aquamarine1')))
+ txt.append(", ").append(key_value_txt('lines', self.num_lines()))
 
- if self.config and self.config.dupe_of_id:
- txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.dupe_of_id, style='magenta')))
+ if self.config and self.config.duplicate_of_id:
+ txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.duplicate_of_id, style='magenta')))
 
  return txt
 
@@ -269,6 +272,7 @@
  return Panel(Group(*sentences), border_style=self._class_style(), expand=False)
 
  def top_lines(self, n: int = 10) -> str:
+ """First n lines."""
  return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
 
  def warn(self, msg: str) -> None:
@@ -294,13 +298,19 @@
  """Should be implemented in subclasses."""
  pass
 
- def _load_file(self) -> str:
+ def _load_file(self) -> None:
  """Remove BOM and HOUSE OVERSIGHT lines, strip whitespace."""
  text = self.raw_text()
  text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text # remove BOM
  text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
- lines = [l.strip() for l in text.split('\n') if not l.startswith(HOUSE_OVERSIGHT)]
- return collapse_newlines('\n'.join(lines))
+
+ lines = [
+ line.strip() if self.strip_whitespace else line for line in text.split('\n')
+ if not line.startswith(HOUSE_OVERSIGHT)
+ ]
+
+ self.text = collapse_newlines('\n'.join(lines))
+ self.lines = self.text.split('\n')
 
  def _repair(self) -> None:
  """Can optionally be overloaded in subclasses to further improve self.text."""
@@ -317,9 +327,7 @@
  else:
  raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (neither was)")
 
- self.length = len(self.text)
  self.lines = [line.strip() if self.strip_whitespace else line for line in self.text.split('\n')]
- self.num_lines = len(self.lines)
 
  def _write_clean_text(self, output_path: Path) -> None:
  """Write self.text to 'output_path'. Used only for diffing files."""
@@ -332,7 +340,7 @@
  with open(output_path, 'w') as f:
  f.write(self.text)
 
- logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
+ logger.warning(f"Wrote {self.length()} chars of cleaned {self.filename} to {output_path}.")
 
  def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
  yield self.file_info_panel()
@@ -17,7 +17,7 @@ from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, INFO_INDENT
  from epstein_files.documents.emails.email_header import (BAD_EMAILER_REGEX, EMAIL_SIMPLE_HEADER_REGEX,
  EMAIL_SIMPLE_HEADER_LINE_BREAK_REGEX, FIELD_NAMES, TIME_REGEX, EmailHeader)
  from epstein_files.util.constant.names import *
- from epstein_files.util.constant.strings import REDACTED, URL_SIGNIFIERS
+ from epstein_files.util.constant.strings import REDACTED
  from epstein_files.util.constants import *
  from epstein_files.util.data import (TIMEZONE_INFO, collapse_newlines, escape_single_quotes, extract_last_name,
  flatten, remove_timezone, uniquify)
@@ -41,6 +41,7 @@ LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
 
  SUPPRESS_LOGS_FOR_AUTHORS = ['Undisclosed recipients:', 'undisclosed-recipients:', 'Multiple Senders Multiple Senders']
  REWRITTEN_HEADER_MSG = "(janky OCR header fields were prettified, check source if something seems off)"
+ URL_SIGNIFIERS = ['gclid', 'htm', 'ref=', 'utm']
  APPEARS_IN = 'Appears in'
  MAX_CHARS_TO_PRINT = 4000
  MAX_NUM_HEADER_LINES = 14
@@ -131,12 +132,12 @@ JUNK_EMAILERS = [
  'editorialstaff@flipboard.com',
  'How To Academy',
  'Jokeland',
- JP_MORGAN_USGIO,
  ]
 
  MAILING_LISTS = [
  INTELLIGENCE_SQUARED,
  'middle.east.update@hotmail.com',
+ JP_MORGAN_USGIO,
  ]
 
  TRUNCATE_ALL_EMAILS_FROM = JUNK_EMAILERS + MAILING_LISTS + [
@@ -316,7 +317,7 @@ class Email(Communication):
  recipients: list[str | None] = field(default_factory=list)
  sent_from_device: str | None = None
  signature_substitution_counts: dict[str, int] = field(default_factory=dict) # defaultdict breaks asdict :(
- truncation_allowed: bool = True
+ _truncation_allowed: bool = True # Hacky way to get __rich_console__() not to truncate in epstein_show script
 
  # For logging how many headers we prettified while printing, kind of janky
  rewritten_header_ids: ClassVar[set[str]] = set([])
@@ -337,10 +338,10 @@
 
  try:
  if self.config and self.config.recipients:
- self.recipients = cast(list[str | None], self.config.recipients)
+ self.recipients = self.config.recipients
  else:
  for recipient in self.header.recipients():
- self.recipients.extend(self._get_names(recipient))
+ self.recipients.extend(self._emailer_names(recipient))
  except Exception as e:
  console.print_exception()
  console.line(2)
@@ -402,8 +403,8 @@
  return self.text
 
  reply_text_match = REPLY_TEXT_REGEX.search(text)
- # logger.info(f"Raw text:\n" + self.top_lines(20) + '\n\n')
- # logger.info(f"With header removed:\n" + text[0:500] + '\n\n')
+ self.log_top_lines(20, "Raw text:", logging.DEBUG)
+ self.log(f"With header removed:\n{text[0:500]}\n\n", logging.DEBUG)
 
  if reply_text_match:
  actual_num_chars = len(reply_text_match.group(1))
@@ -439,12 +440,32 @@
 
  return style.replace('bold', '').strip()
 
+ def _emailer_names(self, emailer_str: str) -> list[str]:
+ """Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
+ emailer_str = EmailHeader.cleanup_str(emailer_str)
+
+ if len(emailer_str) == 0:
+ return []
+
+ names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
+
+ if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
+ if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
+ logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
+ else:
+ logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
+
+ return names_found
+
+ names_found = names_found or [emailer_str]
+ return [_reverse_first_and_last_names(name) for name in names_found]
+
  def _extract_author(self) -> None:
  self._extract_header()
  super()._extract_author()
 
  if not self.author and self.header.author:
- authors = self._get_names(self.header.author)
+ authors = self._emailer_names(self.header.author)
  self.author = authors[0] if (len(authors) > 0 and authors[0]) else None
 
  def _extract_header(self) -> None:
@@ -494,26 +515,6 @@
 
  raise RuntimeError(f"No timestamp found in '{self.file_path.name}' top lines:\n{searchable_text}")
 
- def _get_names(self, emailer_str: str) -> list[str]:
- """Return a list of people's names found in 'emailer_str' (email author or recipients field)."""
- emailer_str = EmailHeader.cleanup_str(emailer_str)
-
- if len(emailer_str) == 0:
- return []
-
- names_found = [name for name, regex in EMAILER_REGEXES.items() if regex.search(emailer_str)]
-
- if BAD_EMAILER_REGEX.match(emailer_str) or TIME_REGEX.match(emailer_str):
- if len(names_found) == 0 and emailer_str not in SUPPRESS_LOGS_FOR_AUTHORS:
- logger.warning(f"'{self.filename}': No emailer found in '{escape_single_quotes(emailer_str)}'")
- else:
- logger.info(f"Extracted {len(names_found)} names from semi-invalid '{emailer_str}': {names_found}...")
-
- return names_found
-
- names_found = names_found or [emailer_str]
- return [_reverse_first_and_last_names(name) for name in names_found]
-
  def _idx_of_nth_quoted_reply(self, n: int = MAX_QUOTED_REPLIES, text: str | None = None) -> int | None:
  """Get position of the nth 'On June 12th, 1985 [SOMEONE] wrote:' style line in self.text."""
  for i, match in enumerate(QUOTED_REPLY_LINE_REGEX.finditer(text or self.text)):
@@ -683,6 +684,9 @@
  if extracted_from_description:
  extracted_description = f"{APPEARS_IN} {extracted_from_description}"
 
+ if isinstance(extracted_from_doc_cfg, EmailCfg):
+ extracted_description += ' email'
+
  if self.config.description:
  self.warn(f"Overwriting description '{self.config.description}' with extract description '{self.config.description}'")
 
@@ -708,10 +712,10 @@
  num_chars = quote_cutoff
 
  # Truncate long emails but leave a note explaining what happened w/link to source document
- if len(text) > num_chars and self.truncation_allowed:
+ if len(text) > num_chars and self._truncation_allowed:
  text = text[0:num_chars]
  doc_link_markup = epstein_media_doc_link_markup(self.url_slug, self.author_style)
- trim_note = f"<...trimmed to {num_chars} characters of {self.length}, read the rest at {doc_link_markup}...>"
+ trim_note = f"<...trimmed to {num_chars} characters of {self.length()}, read the rest at {doc_link_markup}...>"
  trim_footer_txt = Text.from_markup(wrap_in_markup_style(trim_note, 'dim'))
 
  # Rewrite broken headers where the values are on separate lines from the field names
@@ -5,6 +5,7 @@ from datetime import datetime
  from rich.text import Text
 
  from epstein_files.util.constant.names import JEFFREY_EPSTEIN, STEVE_BANNON, UNKNOWN
+ from epstein_files.util.constant.strings import TIMESTAMP_DIM
  from epstein_files.util.data import extract_last_name
  from epstein_files.util.highlighted_group import get_style_for_name
  from epstein_files.util.logging import logger
@@ -12,7 +13,6 @@ from epstein_files.util.rich import TEXT_LINK, highlighter
 
  MSG_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
  PHONE_NUMBER_REGEX = re.compile(r'^[\d+]+.*')
- TIMESTAMP_STYLE = 'turquoise4 dim'
 
  DISPLAY_LAST_NAME_ONLY = [
  JEFFREY_EPSTEIN,
@@ -29,7 +29,7 @@ TEXTER_MAPPING = {
  class TextMessage:
  """Class representing a single iMessage text message."""
  author: str | None
- author_str: str | None = None
+ author_str: str = ''
  id_confirmed: bool = False
  text: str
  timestamp_str: str
@@ -37,7 +37,7 @@
  def __post_init__(self):
  self.author = TEXTER_MAPPING.get(self.author or UNKNOWN, self.author)
 
- if self.author is None:
+ if not self.author:
  self.author_str = UNKNOWN
  elif self.author in DISPLAY_LAST_NAME_ONLY and not self.author_str:
  self.author_str = extract_last_name(self.author)
@@ -77,5 +77,5 @@
  def __rich__(self) -> Text:
  author_style = get_style_for_name(self.author_str if self.author_str.startswith('+') else self.author)
  author_txt = Text(self.author_str, style=author_style)
- timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_STYLE).append(' ')
+ timestamp_txt = Text(f"[{self.timestamp_str}]", style=TIMESTAMP_DIM).append(' ')
  return Text('').append(timestamp_txt).append(author_txt).append(': ', style='dim').append(self._message())
@@ -6,10 +6,12 @@ from typing import ClassVar
 
  from rich.text import Text
 
- from epstein_files.documents.other_file import OtherFile
+ from epstein_files.documents.other_file import Metadata, OtherFile
  from epstein_files.util.constant.strings import JSON
  from epstein_files.util.rich import INFO_STYLE
 
+ DESCRIPTION = "JSON data containing preview info for links sent in a messaging app like iMessage"
+
  TEXT_FIELDS = [
  'caption',
  'standard',
@@ -23,7 +25,6 @@ TEXT_FIELDS = [
  @dataclass
  class JsonFile(OtherFile):
  """File containing JSON data."""
-
  include_description_in_summary_panel: ClassVar[bool] = False
  strip_whitespace: ClassVar[bool] = False
 
@@ -39,7 +40,7 @@
  return JSON
 
  def info_txt(self) -> Text | None:
- return Text(f"JSON file, contains preview data for links sent a messaging app", style=INFO_STYLE)
+ return Text(DESCRIPTION, style=INFO_STYLE)
 
  def is_interesting(self):
  return False
@@ -48,5 +49,10 @@
  with open(self.file_path, encoding='utf-8-sig') as f:
  return json.load(f)
 
+ def metadata(self) -> Metadata:
+ metadata = super().metadata()
+ metadata['description'] = DESCRIPTION
+ return metadata
+
  def json_str(self) -> str:
  return json.dumps(self.json_data(), indent=4)
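With the new `metadata()` override, every JsonFile's metadata dict now carries the shared `DESCRIPTION` string alongside the keys set by `Document.metadata()`. A hedged sketch of how a consumer might read it (only the 'filename' and 'description' keys are taken from the diff; the helper is illustrative):

```python
from epstein_files.documents.json_file import JsonFile


def describe(json_file: JsonFile) -> str:
    """Illustrative: pull the 'description' key added to JsonFile metadata in 1.0.15."""
    meta = json_file.metadata()
    return f"{meta['filename']}: {meta['description']}"
```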