epstein-files 1.0.0__tar.gz → 1.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. {epstein_files-1.0.0 → epstein_files-1.0.2}/PKG-INFO +20 -4
  2. {epstein_files-1.0.0 → epstein_files-1.0.2}/README.md +4 -3
  3. epstein_files-1.0.2/epstein_files/__init__.py +134 -0
  4. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/documents/communication.py +9 -9
  5. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/documents/document.py +115 -87
  6. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/documents/email.py +154 -85
  7. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/documents/emails/email_header.py +7 -6
  8. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/documents/imessage/text_message.py +3 -2
  9. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/documents/json_file.py +17 -0
  10. epstein_files-1.0.2/epstein_files/documents/messenger_log.py +132 -0
  11. epstein_files-1.0.2/epstein_files/documents/other_file.py +265 -0
  12. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/epstein_files.py +128 -169
  13. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/util/constant/names.py +8 -1
  14. epstein_files-1.0.2/epstein_files/util/constant/output_files.py +29 -0
  15. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/util/constant/strings.py +27 -0
  16. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/util/constant/urls.py +25 -9
  17. epstein_files-1.0.2/epstein_files/util/constants.py +1525 -0
  18. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/util/data.py +20 -55
  19. epstein_files-1.0.0/epstein_files/util/file_cfg.py → epstein_files-1.0.2/epstein_files/util/doc_cfg.py +121 -43
  20. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/util/env.py +19 -20
  21. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/util/file_helper.py +38 -21
  22. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/util/highlighted_group.py +229 -177
  23. epstein_files-1.0.2/epstein_files/util/logging.py +63 -0
  24. epstein_files-1.0.2/epstein_files/util/output.py +180 -0
  25. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/util/rich.py +29 -17
  26. epstein_files-1.0.2/epstein_files/util/search_result.py +23 -0
  27. epstein_files-1.0.2/epstein_files/util/timer.py +24 -0
  28. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/util/word_count.py +2 -1
  29. epstein_files-1.0.2/pyproject.toml +67 -0
  30. epstein_files-1.0.0/epstein_files/__init__.py +0 -194
  31. epstein_files-1.0.0/epstein_files/documents/messenger_log.py +0 -73
  32. epstein_files-1.0.0/epstein_files/documents/other_file.py +0 -117
  33. epstein_files-1.0.0/epstein_files/util/constants.py +0 -1552
  34. epstein_files-1.0.0/epstein_files/util/search_result.py +0 -15
  35. epstein_files-1.0.0/pyproject.toml +0 -31
  36. {epstein_files-1.0.0 → epstein_files-1.0.2}/LICENSE +0 -0
  37. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/util/constant/common_words.py +0 -0
  38. {epstein_files-1.0.0 → epstein_files-1.0.2}/epstein_files/util/constant/html.py +0 -0
@@ -1,26 +1,42 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: epstein-files
3
- Version: 1.0.0
3
+ Version: 1.0.2
4
4
  Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
5
+ Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
6
+ License: GPL-3.0-or-later
7
+ Keywords: Epstein,Jeffrey Epstein
5
8
  Author: Michel de Cryptadamus
6
9
  Requires-Python: >=3.11,<4.0
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: Information Technology
13
+ Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
14
+ Classifier: Programming Language :: Python
7
15
  Classifier: Programming Language :: Python :: 3
8
16
  Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
9
20
  Requires-Dist: datefinder (>=0.7.3,<0.8.0)
10
21
  Requires-Dist: inflection (>=0.5.1,<0.6.0)
11
22
  Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
12
23
  Requires-Dist: python-dotenv (>=1.2.1,<2.0.0)
13
24
  Requires-Dist: requests (>=2.32.5,<3.0.0)
14
25
  Requires-Dist: rich (>=14.2.0,<15.0.0)
26
+ Project-URL: Emails, https://michelcrypt4d4mus.github.io/epstein_text_messages/all_emails_epstein_files_nov_2025.html
27
+ Project-URL: Metadata, https://michelcrypt4d4mus.github.io/epstein_text_messages/file_metadata_epstein_files_nov_2025.json
28
+ Project-URL: TextMessages, https://michelcrypt4d4mus.github.io/epstein_text_messages
29
+ Project-URL: WordCounts, https://michelcrypt4d4mus.github.io/epstein_text_messages/communication_word_count_epstein_files_nov_2025.html
15
30
  Description-Content-Type: text/markdown
16
31
 
17
32
  # I Made Epstein's Text Messages Great Again
18
33
 
19
34
  * [I Made Epstein's Text Messages Great Again (And You Should Read Them)](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great) post on [Substack](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great)
20
35
  * The Epstein text messages (and some of the emails along with summary counts of sent emails to/from Epstein) generated by this code can be viewed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/).
21
- * All of His Emails can be read at another page also generated by this code [here](https://michelcrypt4d4mus.github.io/epstein_emails_house_oversight/).
22
- * Word counts for the emails and text messages are [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/epstein_emails_word_count.html).
23
- * Configuration variables assigning specific `HOUSE_OVERSIGHT_XXXXXX.txt` file IDs (the `111111` part) as being emails to or from particular people based on various research and contributions can be found in the [constants.py](./epstein_files/util/constants.py) file in this repo.
36
+ * All of His Emails can be read at another page also generated by this code [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/all_emails_epstein_files_nov_2025.html).
37
+ * Word counts for the emails and text messages are [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/communication_word_count_epstein_files_nov_2025.html).
38
+ * Metadata containing what I have figured out about who sent or received the communications in a given file (and a brief explanation for how I figured it out for each file) is deployed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/file_metadata_epstein_files_nov_2025.json)
39
+ * Configuration variables assigning specific `HOUSE_OVERSIGHT_XXXXXX.txt` file IDs (the `111111` part) as being emails to or from particular people based on various research and contributions can be found in [constants.py](./epstein_files/util/constants.py). Everything in `constants.py` should also appear in the JSON metadata.
24
40
 
25
41
 
26
42
  ### Usage
@@ -2,9 +2,10 @@
2
2
 
3
3
  * [I Made Epstein's Text Messages Great Again (And You Should Read Them)](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great) post on [Substack](https://cryptadamus.substack.com/p/i-made-epsteins-text-messages-great)
4
4
  * The Epstein text messages (and some of the emails along with summary counts of sent emails to/from Epstein) generated by this code can be viewed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/).
5
- * All of His Emails can be read at another page also generated by this code [here](https://michelcrypt4d4mus.github.io/epstein_emails_house_oversight/).
6
- * Word counts for the emails and text messages are [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/epstein_emails_word_count.html).
7
- * Configuration variables assigning specific `HOUSE_OVERSIGHT_XXXXXX.txt` file IDs (the `111111` part) as being emails to or from particular people based on various research and contributions can be found in the [constants.py](./epstein_files/util/constants.py) file in this repo.
5
+ * All of His Emails can be read at another page also generated by this code [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/all_emails_epstein_files_nov_2025.html).
6
+ * Word counts for the emails and text messages are [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/communication_word_count_epstein_files_nov_2025.html).
7
+ * Metadata containing what I have figured out about who sent or received the communications in a given file (and a brief explanation for how I figured it out for each file) is deployed [here](https://michelcrypt4d4mus.github.io/epstein_text_messages/file_metadata_epstein_files_nov_2025.json)
8
+ * Configuration variables assigning specific `HOUSE_OVERSIGHT_XXXXXX.txt` file IDs (the `111111` part) as being emails to or from particular people based on various research and contributions can be found in [constants.py](./epstein_files/util/constants.py). Everything in `constants.py` should also appear in the JSON metadata.
8
9
 
9
10
 
10
11
  ### Usage
@@ -0,0 +1,134 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Reformat Epstein text message files for readability and count email senders.
4
+ For use with iMessage log files from https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_
5
+
6
+ Install: 'poetry install'
7
+ Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT ./generate.py'
8
+ """
9
+ from sys import exit
10
+
11
+ from dotenv import load_dotenv
12
+ load_dotenv()
13
+
14
+ from rich.markup import escape
15
+ from rich.padding import Padding
16
+ from rich.panel import Panel
17
+
18
+ from epstein_files.epstein_files import EpsteinFiles, document_cls
19
+ from epstein_files.documents.document import INFO_PADDING, Document
20
+ from epstein_files.documents.email import Email
21
+ from epstein_files.util.constant.html import *
22
+ from epstein_files.util.constant.names import *
23
+ from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, TEXT_MSGS_HTML_PATH, make_clean
24
+ from epstein_files.util.env import args, specified_names
25
+ from epstein_files.util.file_helper import coerce_file_path, extract_file_id
26
+ from epstein_files.util.logging import logger
27
+ from epstein_files.util.output import print_emails, print_json_metadata, print_json_stats, print_text_messages, write_urls
28
+ from epstein_files.util.rich import build_highlighter, console, print_header, print_panel, write_html
29
+ from epstein_files.util.timer import Timer
30
+
31
+
32
+ def generate_html() -> None:
33
+ if args.make_clean:
34
+ make_clean()
35
+ exit()
36
+
37
+ timer = Timer()
38
+ epstein_files = EpsteinFiles.get_files(timer)
39
+
40
+ if args.json_metadata:
41
+ print_json_metadata(epstein_files)
42
+ exit()
43
+
44
+ print_header(epstein_files)
45
+
46
+ if args.colors_only:
47
+ exit()
48
+
49
+ if args.output_texts:
50
+ print_text_messages(epstein_files)
51
+ timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)} text message logs')
52
+
53
+ if args.output_emails:
54
+ emails_printed = print_emails(epstein_files)
55
+ timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")
56
+
57
+ if args.output_other_files:
58
+ files_printed = epstein_files.print_other_files_table()
59
+ timer.print_at_checkpoint(f"Printed {len(files_printed)} other files")
60
+
61
+ # Save output
62
+ write_html(ALL_EMAILS_PATH if args.all_emails else TEXT_MSGS_HTML_PATH)
63
+ logger.warning(f"Total time: {timer.seconds_since_start_str()}")
64
+
65
+ # JSON stats (mostly used for building pytest checks)
66
+ if args.json_stats:
67
+ print_json_stats(epstein_files)
68
+
69
+
70
+ def epstein_diff():
71
+ """Diff the cleaned up text of two files."""
72
+ Document.diff_files(args.positional_args)
73
+
74
+
75
+ def epstein_search():
76
+ """Search the cleaned up text of the files."""
77
+ _assert_positional_args()
78
+ epstein_files = EpsteinFiles.get_files(use_pickled=True)
79
+
80
+ for search_term in args.positional_args:
81
+ temp_highlighter = build_highlighter(search_term)
82
+ search_results = epstein_files.docs_matching(search_term, specified_names)
83
+ console.line(2)
84
+ print_panel(f"Found {len(search_results)} documents matching '{search_term}'", padding=(0, 0, 0, 3))
85
+
86
+ for search_result in search_results:
87
+ console.line()
88
+
89
+ if args.whole_file:
90
+ console.print(search_result.document)
91
+ else:
92
+ console.print(search_result.document.description_panel())
93
+
94
+ for matching_line in search_result.lines:
95
+ line_txt = matching_line.__rich__()
96
+ console.print(Padding(temp_highlighter(line_txt), INFO_PADDING), style='gray37')
97
+
98
+
99
+ def epstein_show():
100
+ """Show the color highlighted file. If --raw arg is passed, show the raw text of the file as well."""
101
+ _assert_positional_args()
102
+ ids = [extract_file_id(arg) for arg in args.positional_args]
103
+ console.line()
104
+
105
+ if args.pickled:
106
+ epstein_files = EpsteinFiles.get_files(use_pickled=True)
107
+ docs = epstein_files.get_documents_by_id(ids)
108
+ else:
109
+ raw_docs = [Document(coerce_file_path(id)) for id in ids]
110
+ docs = [document_cls(doc)(doc.file_path) for doc in raw_docs]
111
+
112
+ for doc in docs:
113
+ console.line()
114
+ console.print(doc)
115
+
116
+ if args.raw:
117
+ console.line()
118
+ console.print(Panel(f"*** {doc.url_slug} RAW ***", expand=False, style=doc._border_style()))
119
+ console.print(escape(doc.raw_text()))
120
+
121
+ if isinstance(doc, Email):
122
+ console.line()
123
+ console.print(Panel(f"*** {doc.url_slug} actual_text ***", expand=False, style=doc._border_style()))
124
+ console.print(escape(doc._actual_text()))
125
+
126
+
127
+ def epstein_dump_urls() -> None:
128
+ write_urls()
129
+
130
+
131
+ def _assert_positional_args():
132
+ if not args.positional_args:
133
+ console.print(f"\n ERROR: No positional args!\n", style='red1')
134
+ exit(1)
@@ -8,7 +8,7 @@ from rich.text import Text
8
8
  from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, Document
9
9
  from epstein_files.util.constant.names import UNKNOWN
10
10
  from epstein_files.util.constants import FALLBACK_TIMESTAMP
11
- from epstein_files.util.file_cfg import MessageCfg
11
+ from epstein_files.util.doc_cfg import CommunicationCfg
12
12
  from epstein_files.util.highlighted_group import get_style_for_name
13
13
  from epstein_files.util.rich import key_value_txt
14
14
 
@@ -20,7 +20,7 @@ class Communication(Document):
20
20
  """Superclass for Email and MessengerLog."""
21
21
  author_style: str = 'white'
22
22
  author_txt: Text = field(init=False)
23
- config: MessageCfg | None = None
23
+ config: CommunicationCfg | None = None
24
24
  timestamp: datetime = FALLBACK_TIMESTAMP # TODO this default sucks (though it never happens)
25
25
 
26
26
  def __post_init__(self):
@@ -31,22 +31,22 @@ class Communication(Document):
31
31
  def author_or_unknown(self) -> str:
32
32
  return self.author or UNKNOWN
33
33
 
34
- def description(self) -> Text:
35
- return self._description().append(CLOSE_PROPERTIES_CHAR)
36
-
37
- def is_attribution_uncertain(self) -> bool | None:
38
- return self.config and self.config.is_attribution_uncertain
34
+ def is_attribution_uncertain(self) -> bool:
35
+ return bool(self.config and self.config.is_attribution_uncertain)
39
36
 
40
37
  def raw_document_link_txt(self, _style: str = '', include_alt_link: bool = True) -> Text:
41
38
  """Overrides super() method to apply self.author_style."""
42
39
  return super().raw_document_link_txt(self.author_style, include_alt_link=include_alt_link)
43
40
 
41
+ def summary(self) -> Text:
42
+ return self._summary().append(CLOSE_PROPERTIES_CHAR)
43
+
44
44
  def timestamp_without_seconds(self) -> str:
45
45
  return TIMESTAMP_SECONDS_REGEX.sub('', str(self.timestamp))
46
46
 
47
- def _description(self) -> Text:
47
+ def _summary(self) -> Text:
48
48
  """One line summary mostly for logging."""
49
- txt = super().description().append(', ')
49
+ txt = super().summary().append(', ')
50
50
  return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style)))
51
51
 
52
52
 
@@ -1,6 +1,6 @@
1
1
  import logging
2
2
  import re
3
- from dataclasses import dataclass, field
3
+ from dataclasses import asdict, dataclass, field
4
4
  from datetime import datetime
5
5
  from pathlib import Path
6
6
  from subprocess import run
@@ -14,33 +14,28 @@ from rich.text import Text
14
14
  from epstein_files.util.constant.names import *
15
15
  from epstein_files.util.constant.strings import *
16
16
  from epstein_files.util.constant.urls import *
17
- from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP, VI_DAILY_NEWS_ARTICLE
18
- from epstein_files.util.file_cfg import FileCfg, MessageCfg
19
- from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize
20
- from epstein_files.util.env import args, logger
21
- from epstein_files.util.file_helper import DOCS_DIR, file_stem_for_id, extract_file_id, file_size_str, is_local_extract_file
22
- from epstein_files.util.rich import SYMBOL_STYLE, console, highlighter, key_value_txt, logger, link_text_obj
17
+ from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
18
+ from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_nones
19
+ from epstein_files.util.doc_cfg import EmailCfg, DocCfg, Metadata, TextCfg
20
+ from epstein_files.util.env import args
21
+ from epstein_files.util.file_helper import (DOCS_DIR, file_stem_for_id, extract_file_id, file_size,
22
+ file_size_str, is_local_extract_file)
23
+ from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
24
+ from epstein_files.util.rich import SYMBOL_STYLE, console, highlighter, key_value_txt, link_text_obj
25
+ from epstein_files.util.search_result import MatchedLine
23
26
 
24
- WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
27
+ CLOSE_PROPERTIES_CHAR = ']'
25
28
  HOUSE_OVERSIGHT = HOUSE_OVERSIGHT_PREFIX.replace('_', ' ').strip()
26
- MIN_DOCUMENT_ID = 10477
27
29
  INFO_INDENT = 2
28
30
  INFO_PADDING = (0, 0, 0, INFO_INDENT)
31
+ MAX_TOP_LINES_LEN = 4000 # Only for logging
32
+ MIN_DOCUMENT_ID = 10477
33
+ LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
34
+ WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
29
35
 
30
- CLOSE_PROPERTIES_CHAR = ']'
31
- MAX_EXTRACTED_TIMESTAMPS = 6
32
36
  MIN_TIMESTAMP = datetime(1991, 1, 1)
33
37
  MID_TIMESTAMP = datetime(2007, 1, 1)
34
38
  MAX_TIMESTAMP = datetime(2020, 1, 1)
35
- VI_DAILY_NEWS_REGEX = re.compile(r'virgin\s*is[kl][ai]nds\s*daily\s*news', re.IGNORECASE)
36
-
37
- DOC_TYPE_STYLES = {
38
- DOCUMENT_CLASS: 'grey69',
39
- EMAIL_CLASS: 'sea_green2',
40
- JSON_FILE_CLASS: 'sandy_brown',
41
- MESSENGER_LOG_CLASS: 'cyan',
42
- OTHER_FILE_CLASS: 'grey69',
43
- }
44
39
 
45
40
  FILENAME_MATCH_STYLES = [
46
41
  'dark_green',
@@ -48,6 +43,13 @@ FILENAME_MATCH_STYLES = [
48
43
  'spring_green4',
49
44
  ]
50
45
 
46
+ METADATA_FIELDS = [
47
+ 'author',
48
+ 'file_id',
49
+ 'num_lines',
50
+ 'timestamp'
51
+ ]
52
+
51
53
  OCR_REPAIRS = {
52
54
  re.compile(r'\.corn\b'): '.com',
53
55
  re.compile('ln(adequate|dyke)'): r'In\1',
@@ -61,7 +63,7 @@ class Document:
61
63
  file_path: Path
62
64
  # Optional fields
63
65
  author: str | None = None
64
- config: FileCfg | MessageCfg | None = None
66
+ config: EmailCfg | DocCfg | TextCfg | None = None
65
67
  file_id: str = field(init=False)
66
68
  filename: str = field(init=False)
67
69
  is_duplicate: bool = False
@@ -72,8 +74,8 @@ class Document:
72
74
  timestamp: datetime | None = None
73
75
  url_slug: str = field(init=False) # e.g. 'HOUSE_OVERSIGHT_123456
74
76
 
75
- # Class variable; only used to cycle color of output when using lines_match()
76
- file_matching_idx: ClassVar[int] = 0
77
+ # Class variable overridden in JsonFile
78
+ strip_whitespace: ClassVar[bool] = True
77
79
 
78
80
  def __post_init__(self):
79
81
  self.filename = self.file_path.name
@@ -82,12 +84,12 @@ class Document:
82
84
  self.is_duplicate = bool(self.config.dupe_of_id) if self.config else False
83
85
 
84
86
  if self.is_local_extract_file():
85
- self.url_slug = file_stem_for_id(self.file_id)
87
+ self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
86
88
  cfg_type = type(self.config).__name__ if self.config else None
87
89
 
88
90
  # Coerce FileConfig for court docs etc. to MessageCfg for email files extracted from that document
89
- if self.document_type() == EMAIL_CLASS and self.config and cfg_type != MessageCfg.__name__:
90
- self.config = MessageCfg.from_file_cfg(self.config)
91
+ if self.class_name() == EMAIL_CLASS and self.config and cfg_type != EmailCfg.__name__:
92
+ self.config = EmailCfg.from_doc_cfg(self.config)
91
93
  else:
92
94
  self.url_slug = self.file_path.stem
93
95
 
@@ -96,41 +98,30 @@ class Document:
96
98
  self._extract_author()
97
99
  self.timestamp = self._extract_timestamp()
98
100
 
101
+ def class_name(self) -> str:
102
+ """Annoying workaround for circular import issues and isinstance()."""
103
+ return str(type(self).__name__)
104
+
99
105
  def configured_description(self) -> str | None:
100
- return self.config.description if self.config else None
106
+ """Overloaded in OtherFile."""
107
+ if self.config and self.config.description:
108
+ return f"({self.config.description})"
101
109
 
102
110
  def date_str(self) -> str | None:
103
111
  return date_str(self.timestamp)
104
112
 
105
- def description(self) -> Text:
106
- """Mostly for logging. Brackets are left open for subclasses to add stuff."""
107
- txt = Text('').append(self.url_slug, style='magenta')
108
- txt.append(f' {self.document_type()}', style=self.document_type_style())
109
-
110
- if self.timestamp:
111
- txt.append(' (', style=SYMBOL_STYLE)
112
- txt.append(f"{iso_timestamp(self.timestamp)}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
113
-
114
- txt.append(" [").append(key_value_txt('num_lines', Text(f"{self.num_lines}", style='cyan')))
115
- txt.append(', ').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
116
- return txt
117
-
118
113
  def description_panel(self, include_hints: bool = False) -> Panel:
119
114
  """Panelized description() with info_txt(), used in search results."""
120
115
  hints = [Text('', style='italic').append(h) for h in (self.hints() if include_hints else [])]
121
- return Panel(Group(*([self.description()] + hints)), border_style=self.document_type_style(), expand=False)
122
-
123
- def document_type(self) -> str:
124
- """Annoying workaround for circular import issues and isinstance()."""
125
- return str(type(self).__name__)
116
+ return Panel(Group(*([self.summary()] + hints)), border_style=self.document_type_style(), expand=False)
126
117
 
127
118
  def document_type_style(self) -> str:
128
- return DOC_TYPE_STYLES[self.document_type()]
119
+ return DOC_TYPE_STYLES[self.class_name()]
129
120
 
130
121
  def duplicate_file_txt(self) -> Text:
131
122
  """If the file is a dupe make a nice message to explain what file it's a duplicate of."""
132
123
  if not self.config or not self.config.dupe_of_id:
133
- raise RuntimeError(f"duplicate_file_txt() called on {self.description()} but not a dupe! config:\n\n{self.config}")
124
+ raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")
134
125
 
135
126
  txt = Text(f"Not showing ", style='white dim italic').append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
136
127
  txt.append(f" because it's {self.config.duplicate_reason()} ")
@@ -154,6 +145,9 @@ class Document:
154
145
  hints = [Padding(hint, INFO_PADDING) for hint in self.hints()]
155
146
  return Group(*([panel] + hints))
156
147
 
148
+ def file_size(self) -> int:
149
+ return file_size(self.file_path)
150
+
157
151
  def file_size_str(self) -> str:
158
152
  return file_size_str(self.file_path)
159
153
 
@@ -162,16 +156,10 @@ class Document:
162
156
  hints = listify(self.info_txt())
163
157
  hint_msg = self.configured_description()
164
158
 
165
- if self.document_type() == OTHER_FILE_CLASS:
166
- if not hint_msg and VI_DAILY_NEWS_REGEX.search(self.text):
167
- hint_msg = VI_DAILY_NEWS_ARTICLE
168
- elif hint_msg:
169
- hint_msg = f"({hint_msg})"
170
-
171
159
  if hint_msg:
172
160
  hints.append(highlighter(Text(hint_msg, style='white dim italic')))
173
161
 
174
- return hints
162
+ return without_nones(hints)
175
163
 
176
164
  def info_txt(self) -> Text | None:
177
165
  """Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
@@ -181,32 +169,42 @@ class Document:
181
169
  """True if file created by extracting text from a court doc (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
182
170
  return is_local_extract_file(self.filename)
183
171
 
184
- def lines_matching_txt(self, _pattern: re.Pattern | str) -> list[Text]:
185
- """Return lines matching a regex as colored list[Text]."""
186
- pattern = patternize(_pattern)
187
- matched_lines = [line for line in self.lines if pattern.search(line)]
188
-
189
- if len(matched_lines) == 0:
190
- return []
191
-
192
- file_style = FILENAME_MATCH_STYLES[type(self).file_matching_idx % len(FILENAME_MATCH_STYLES)]
193
- type(self).file_matching_idx += 1
194
-
195
- return [
196
- Text('').append(self.file_path.name, style=file_style).append(':').append(line)
197
- for line in matched_lines
198
- ]
199
-
200
172
  def log(self, msg: str, level: int = logging.WARNING):
201
- """Log with [file_id] as a prefix."""
202
- logger.log(level, f"[{self.file_id}] {msg}")
173
+ """Log with filename as a prefix."""
174
+ logger.log(level, f"{self.url_slug} {msg}")
203
175
 
204
176
  def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
205
177
  """Log first 'n' lines of self.text at 'level'. 'msg' can be optionally provided."""
206
178
  separator = '\n\n' if '\n' in msg else '. '
207
- msg = f"{msg + separator if msg else ''}Top lines of '{self.filename}' ({self.num_lines} lines):"
179
+ msg = (msg + separator) if msg else ''
180
+ msg = f"{self.filename}: {msg}First {n} lines:"
208
181
  logger.log(level, f"{msg}\n\n{self.top_lines(n)}\n")
209
182
 
183
+ def matching_lines(self, _pattern: re.Pattern | str) -> list[MatchedLine]:
184
+ """Return lines matching a regex as colored list[Text]."""
185
+ pattern = patternize(_pattern)
186
+ return [MatchedLine(line, i) for i, line in enumerate(self.lines) if pattern.search(line)]
187
+
188
+ def metadata(self) -> Metadata:
189
+ metadata = self.config.metadata() if self.config else {}
190
+ metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
191
+ metadata['bytes'] = self.file_size()
192
+ metadata['filename'] = f"{self.url_slug}.txt"
193
+ metadata['type'] = self.class_name()
194
+
195
+ if self.is_local_extract_file():
196
+ metadata['extracted_file'] = {
197
+ 'explanation': 'This file was extracted from a court filing, not distributed directly. A copy can be found on github.',
198
+ 'extracted_from_file': self.url_slug + '.txt',
199
+ 'extracted_file_url': extracted_file_url(self.filename),
200
+ }
201
+
202
+ return metadata
203
+
204
+ def raw_text(self) -> str:
205
+ with open(self.file_path) as f:
206
+ return f.read()
207
+
210
208
  def raw_document_link_txt(self, style: str = '', include_alt_link: bool = False) -> Text:
211
209
  """Returns colored links to epstein.media and and epsteinweb in a Text object."""
212
210
  txt = Text('', style='white' if include_alt_link else ARCHIVE_LINK_COLOR)
@@ -215,11 +213,13 @@ class Document:
215
213
  txt.append(self.epstein_web_link(style=style))
216
214
 
217
215
  if include_alt_link:
216
+ txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
218
217
  txt.append(' (').append(self.epstein_media_link(style='white dim', link_txt=EPSTEIN_MEDIA)).append(')')
219
218
  else:
220
219
  txt.append(self.epstein_media_link(style=style))
221
220
 
222
221
  if include_alt_link:
222
+ txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
223
223
  txt.append(' (').append(self.epstein_web_link(style='white dim', link_txt=EPSTEIN_WEB)).append(')')
224
224
 
225
225
  return txt
@@ -234,8 +234,36 @@ class Document:
234
234
 
235
235
  return text
236
236
 
237
+ def sort_key(self) -> tuple[datetime, str, int]:
238
+ if self.config and self.config.dupe_of_id:
239
+ sort_id = self.config.dupe_of_id
240
+ dupe_idx = 1
241
+ else:
242
+ sort_id = self.file_id
243
+ dupe_idx = 0
244
+
245
+ return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
246
+
247
+ def summary(self) -> Text:
248
+ """Summary of this file for logging. Brackets are left open for subclasses to add stuff."""
249
+ txt = Text('').append(self.class_name(), style=self.document_type_style())
250
+ txt.append(f" {self.url_slug}", style=FILENAME_STYLE)
251
+
252
+ if self.timestamp:
253
+ timestamp_str = iso_timestamp(self.timestamp).removesuffix(' 00:00:00')
254
+ txt.append(' (', style=SYMBOL_STYLE)
255
+ txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
256
+
257
+ txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
258
+ txt.append(", ").append(key_value_txt('lines', self.num_lines))
259
+
260
+ if self.config and self.config.dupe_of_id:
261
+ txt.append(", ").append(key_value_txt('dupe_of', Text(self.config.dupe_of_id, style='magenta')))
262
+
263
+ return txt
264
+
237
265
  def top_lines(self, n: int = 10) -> str:
238
- return '\n'.join(self.lines[0:n])
266
+ return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
239
267
 
240
268
  def _border_style(self) -> str:
241
269
  """Should be overloaded in subclasses."""
@@ -250,21 +278,20 @@ class Document:
250
278
  """Should be implemented in subclasses."""
251
279
  pass
252
280
 
253
- def _load_file(self):
281
+ def _load_file(self) -> str:
254
282
  """Remove BOM and HOUSE OVERSIGHT lines, strip whitespace."""
255
- with open(self.file_path) as f:
256
- text = f.read()
257
- text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text # remove BOM
258
- text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
259
- lines = [l.strip() for l in text.split('\n') if not l.startswith(HOUSE_OVERSIGHT)]
260
- lines = lines[1:] if (len(lines) > 1 and lines[0] == '>>') else lines
261
- return collapse_newlines('\n'.join(lines))
283
+ text = self.raw_text()
284
+ text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text # remove BOM
285
+ text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
286
+ lines = [l.strip() for l in text.split('\n') if not l.startswith(HOUSE_OVERSIGHT)]
287
+ return collapse_newlines('\n'.join(lines))
262
288
 
263
289
  def _repair(self) -> None:
264
- """Can optionally be overloaded in subclasses."""
290
+ """Can optionally be overloaded in subclasses to further improve self.text."""
265
291
  pass
266
292
 
267
293
  def _set_computed_fields(self, lines: list[str] | None = None, text: str | None = None) -> None:
294
+ """Sets all fields derived from self.text based on either 'lines' or 'text' arg."""
268
295
  if (lines and text):
269
296
  raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (got both)")
270
297
  elif lines is not None:
@@ -275,7 +302,7 @@ class Document:
275
302
  raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (neither was)")
276
303
 
277
304
  self.length = len(self.text)
278
- self.lines = [line.strip() for line in self.text.split('\n')]
305
+ self.lines = [line.strip() if self.strip_whitespace else line for line in self.text.split('\n')]
279
306
  self.num_lines = len(self.lines)
280
307
 
281
308
  def _write_clean_text(self, output_path: Path) -> None:
@@ -291,16 +318,17 @@ class Document:
291
318
 
292
319
  logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
293
320
 
294
- def __rich_console__(self, _console: Console, _options: ConsoleOptions) -> RenderResult:
321
+ def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
295
322
  yield self.file_info_panel()
296
323
  text_panel = Panel(highlighter(self.text), border_style=self._border_style(), expand=False)
297
324
  yield Padding(text_panel, (0, 0, 1, INFO_INDENT))
298
325
 
299
326
  def __str__(self) -> str:
300
- return self.description().plain
327
+ return self.summary().plain
301
328
 
302
329
  @staticmethod
303
330
  def diff_files(files: list[str]) -> None:
331
+ """Diff the contents of two Documents after all cleanup, BOM removal, etc."""
304
332
  if len(files) != 2:
305
333
  raise RuntimeError('Need 2 files')
306
334
  elif files[0] == files[1]:
@@ -330,7 +358,7 @@ class Document:
330
358
 
331
359
  @staticmethod
332
360
  def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
333
- return sorted(docs, key=lambda doc: [doc.timestamp or FALLBACK_TIMESTAMP, doc.file_id])
361
+ return sorted(docs, key=lambda doc: doc.sort_key())
334
362
 
335
363
  @classmethod
336
364
  def uniquify(cls, documents: Sequence['DocumentType']) -> Sequence['DocumentType']: