epstein-files 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,194 @@
+ #!/usr/bin/env python
+ """
+ Reformat Epstein text message files for readability and count email senders.
+ For use with iMessage log files from https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_
+
+ Install: 'poetry install'
+ Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT ./generate.py'
+ """
+ from sys import exit
+
+ from dotenv import load_dotenv
+ load_dotenv()
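+ # load_dotenv() runs before the epstein_files imports below, presumably so .env values like EPSTEIN_DOCS_DIR are set when those modules load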
+ from rich.padding import Padding
+
+ from epstein_files.documents.email import Email
+ from epstein_files.epstein_files import EpsteinFiles, count_by_month
+ from epstein_files.util.constant.html import *
+ from epstein_files.util.constant.names import *
+ from epstein_files.util.constant.strings import EMAIL_CLASS, MESSENGER_LOG_CLASS
+ from epstein_files.util.data import Timer, dict_sets_to_lists, flatten
+ from epstein_files.util.env import specified_names, args
+ from epstein_files.util.file_helper import GH_PAGES_HTML_PATH
+ from epstein_files.util.rich import *
+
+ PRINT_COLOR_KEY_EVERY_N_EMAILS = 150
+
+ # Order matters (will be order of output)
+ PEOPLE_WHOSE_EMAILS_SHOULD_BE_PRINTED: list[str | None] = [
+     JEREMY_RUBIN,
+     AL_SECKEL,
+     JOI_ITO,
+     JABOR_Y,
+     STEVEN_SINOFSKY,
+     DANIEL_SIAD,
+     JEAN_LUC_BRUNEL,
+     STEVEN_HOFFENBERG,
+     EHUD_BARAK,
+     MARTIN_NOWAK,
+     MASHA_DROKOVA,
+     RENATA_BOLOTOVA,
+     STEVE_BANNON,
+     OLIVIER_COLOM,
+     BORIS_NIKOLIC,
+     PRINCE_ANDREW,
+     JIDE_ZEITLIN,
+     DAVID_STERN,
+     MOHAMED_WAHEED_HASSAN,
+     JENNIFER_JACQUET,
+     None,
+ ]
+
+ # Order matters (will be order of output)
+ PEOPLE_WHOSE_EMAILS_SHOULD_BE_TABLES: list[str | None] = [
+     GHISLAINE_MAXWELL,
+     LEON_BLACK,
+     LANDON_THOMAS,
+     KATHRYN_RUEMMLER,
+     DARREN_INDYKE,
+     RICHARD_KAHN,
+     TYLER_SHEARS,
+     SULTAN_BIN_SULAYEM,
+     DEEPAK_CHOPRA,
+     ARIANE_DE_ROTHSCHILD,
+     TOM_PRITZKER,
+ ]
+
+
+ def generate_html() -> None:
+     timer = Timer()
+     epstein_files = EpsteinFiles.get_files(timer)
+     print_header(epstein_files)
+
+     if args.colors_only:
+         exit()
+
+     # Text messages section
+     if args.output_texts:
+         print_text_messages(epstein_files)
+         timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs):,} text message logs')
+
+     # Emails section
+     if args.output_emails:
+         emails_printed = print_emails(epstein_files)
+         timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")
+
+     if args.output_other_files:
+         epstein_files.print_other_files_table()
+         timer.print_at_checkpoint(f"Printed {len(epstein_files.other_files):,} other files")
+     else:
+         logger.warning(f"Skipping other files section...")
+
+     # Save output
+     write_html(GH_PAGES_HTML_PATH)
+     logger.warning(f"Total time: {timer.seconds_since_start()}")
+
+     # JSON stats (mostly used for building pytest checks)
+     if args.json_stats:
+         console.line(5)
+         print_json_stats(epstein_files)
+
+
+ def print_emails(epstein_files: EpsteinFiles) -> int:
+     """Returns number of emails printed."""
+     print_section_header(('Selections from ' if not args.all_emails else '') + 'His Emails')
+     print_other_site_link(is_header=False)
+
+     if len(specified_names) == 0:
+         epstein_files.print_emailer_counts_table()
+
+     emailers_to_print: list[str | None]
+     emailer_tables: list[str | None] = []
+     emails_that_were_printed: list[Email] = []
+     num_emails_printed_since_last_color_key = 0
+
+     if args.all_emails:
+         console.print('Email conversations are sorted chronologically based on time of the first email.')
+         emailers_to_print = sorted(epstein_files.all_emailers(), key=lambda e: epstein_files.earliest_email_at(e))
+         print_numbered_list_of_emailers(emailers_to_print, epstein_files)
+     else:
+         if len(specified_names) > 0:
+             emailers_to_print = specified_names
+         else:
+             emailers_to_print = PEOPLE_WHOSE_EMAILS_SHOULD_BE_PRINTED
+
+         console.print('Email conversations grouped by counterparty can be found in the order listed below.')
+         print_numbered_list_of_emailers(emailers_to_print)
+         console.print("\nAfter that there are tables linking to (but not displaying) all known emails for each of these people:")
+
+         if len(specified_names) > 0:
+             if args.all_email_tables:
+                 emailer_tables = sorted(epstein_files.all_emailers(), key=lambda e: epstein_files.earliest_email_at(e))
+         else:
+             emailer_tables = PEOPLE_WHOSE_EMAILS_SHOULD_BE_TABLES
+
+         print_numbered_list_of_emailers(emailer_tables)
+
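+     # Print the full conversation with each selected counterparty (see print_emails_for())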
+     for author in emailers_to_print:
+         newly_printed_emails = epstein_files.print_emails_for(author)
+         emails_that_were_printed.extend(newly_printed_emails)
+         num_emails_printed_since_last_color_key += len(newly_printed_emails)
+
+         # Print color key every once in a while
+         if num_emails_printed_since_last_color_key > PRINT_COLOR_KEY_EVERY_N_EMAILS:
+             print_color_key()
+             num_emails_printed_since_last_color_key = 0
+
+     if len(emailer_tables) > 0 and len(specified_names) == 0:
+         print_author_header(f"Email Tables for {len(emailer_tables)} Other People", 'white')
+
+     for name in emailer_tables:
+         epstein_files.print_emails_table_for(name)
+
+     if len(specified_names) == 0:
+         epstein_files.print_email_device_info()
+
+     logger.warning(f"Rewrote {len(Email.rewritten_header_ids)} headers of {len(epstein_files.emails)} emails")
+
+     if args.all_emails:
+         email_ids_that_were_printed = set([email.file_id for email in emails_that_were_printed])
+         logger.warning(f"Printed {len(emails_that_were_printed)} emails of {len(email_ids_that_were_printed)} unique file IDs.")
+
+         for email in epstein_files.emails:
+             if email.file_id not in email_ids_that_were_printed and not email.is_duplicate:
+                 logger.warning(f"Failed to print {email.description()}")
+
+     return len(emails_that_were_printed)
+
+
+ def print_text_messages(epstein_files: EpsteinFiles) -> None:
+     print_section_header('Text Messages')
+     print_centered("(conversations are sorted chronologically based on timestamp of first message)\n", style='gray30')
+
+     if len(specified_names) == 0:
+         log_files = epstein_files.imessage_logs
+     else:
+         log_files = flatten([epstein_files.imessage_logs_for(name) for name in specified_names])
+
+     for log_file in log_files:
+         console.print(Padding(log_file))
+         console.line(2)
+
+     epstein_files.print_imessage_summary()
+
+
+ def print_json_stats(epstein_files: EpsteinFiles) -> None:
+     console.print(Panel('JSON Stats Dump', expand=True, style='reverse bold'), '\n')
+     print_json(f"{MESSENGER_LOG_CLASS} Sender Counts", epstein_files.imessage_sender_counts(), skip_falsey=True)
+     print_json(f"{EMAIL_CLASS} Author Counts", epstein_files.email_author_counts, skip_falsey=True)
+     print_json(f"{EMAIL_CLASS} Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
+     print_json("Email signature_substitution_counts", epstein_files.email_signature_substitution_counts(), skip_falsey=True)
+     print_json("email_author_device_signatures", dict_sets_to_lists(epstein_files.email_authors_to_device_signatures))
+     print_json("email_sent_from_devices", dict_sets_to_lists(epstein_files.email_device_signatures_to_authors))
+     print_json("email_unknown_recipient_file_ids", epstein_files.email_unknown_recipient_file_ids())
+     print_json("count_by_month", count_by_month(epstein_files.all_documents()))
@@ -0,0 +1,53 @@
+ import re
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from typing import TypeVar
+
+ from rich.text import Text
+
+ from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, Document
+ from epstein_files.util.constant.names import UNKNOWN
+ from epstein_files.util.constants import FALLBACK_TIMESTAMP
+ from epstein_files.util.file_cfg import MessageCfg
+ from epstein_files.util.highlighted_group import get_style_for_name
+ from epstein_files.util.rich import key_value_txt
+
+ TIMESTAMP_SECONDS_REGEX = re.compile(r":\d{2}$")
+
+
+ @dataclass
+ class Communication(Document):
+     """Superclass for Email and MessengerLog."""
+     author_style: str = 'white'
+     author_txt: Text = field(init=False)
+     config: MessageCfg | None = None
+     timestamp: datetime = FALLBACK_TIMESTAMP  # TODO this default sucks (though it never happens)
+
+     def __post_init__(self):
+         super().__post_init__()
+         self.author_style = get_style_for_name(self.author_or_unknown())
+         self.author_txt = Text(self.author_or_unknown(), style=self.author_style)
+
+     def author_or_unknown(self) -> str:
+         return self.author or UNKNOWN
+
+     def description(self) -> Text:
+         return self._description().append(CLOSE_PROPERTIES_CHAR)
+
+     def is_attribution_uncertain(self) -> bool | None:
+         return self.config and self.config.is_attribution_uncertain
+
+     def raw_document_link_txt(self, _style: str = '', include_alt_link: bool = True) -> Text:
+         """Overrides super() method to apply self.author_style."""
+         return super().raw_document_link_txt(self.author_style, include_alt_link=include_alt_link)
+
+     def timestamp_without_seconds(self) -> str:
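+         # e.g. '2025-01-02 03:45:06' -> '2025-01-02 03:45'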
+         return TIMESTAMP_SECONDS_REGEX.sub('', str(self.timestamp))
+
+     def _description(self) -> Text:
+         """One line summary mostly for logging."""
+         txt = super().description().append(', ')
+         return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style)))
+
+
+ CommunicationType = TypeVar('CommunicationType', bound=Document)
@@ -0,0 +1,357 @@
+ import logging
+ import re
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from pathlib import Path
+ from subprocess import run
+ from typing import ClassVar, Sequence, TypeVar
+
+ from rich.console import Console, ConsoleOptions, Group, RenderResult
+ from rich.padding import Padding
+ from rich.panel import Panel
+ from rich.text import Text
+
+ from epstein_files.util.constant.names import *
+ from epstein_files.util.constant.strings import *
+ from epstein_files.util.constant.urls import *
+ from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP, VI_DAILY_NEWS_ARTICLE
+ from epstein_files.util.file_cfg import FileCfg, MessageCfg
+ from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize
+ from epstein_files.util.env import args, logger
+ from epstein_files.util.file_helper import DOCS_DIR, file_stem_for_id, extract_file_id, file_size_str, is_local_extract_file
+ from epstein_files.util.rich import SYMBOL_STYLE, console, highlighter, key_value_txt, logger, link_text_obj
+
+ WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
+ HOUSE_OVERSIGHT = HOUSE_OVERSIGHT_PREFIX.replace('_', ' ').strip()
+ MIN_DOCUMENT_ID = 10477
+ INFO_INDENT = 2
+ INFO_PADDING = (0, 0, 0, INFO_INDENT)
+
+ CLOSE_PROPERTIES_CHAR = ']'
+ MAX_EXTRACTED_TIMESTAMPS = 6
+ MIN_TIMESTAMP = datetime(1991, 1, 1)
+ MID_TIMESTAMP = datetime(2007, 1, 1)
+ MAX_TIMESTAMP = datetime(2020, 1, 1)
+ VI_DAILY_NEWS_REGEX = re.compile(r'virgin\s*is[kl][ai]nds\s*daily\s*news', re.IGNORECASE)
+
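+ # Rich color styles keyed by document class name (looked up via document_type_style())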
+ DOC_TYPE_STYLES = {
+     DOCUMENT_CLASS: 'grey69',
+     EMAIL_CLASS: 'sea_green2',
+     JSON_FILE_CLASS: 'sandy_brown',
+     MESSENGER_LOG_CLASS: 'cyan',
+     OTHER_FILE_CLASS: 'grey69',
+ }
+
+ FILENAME_MATCH_STYLES = [
+     'dark_green',
+     'green',
+     'spring_green4',
+ ]
+
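+ # Known OCR errors and their corrections; keys may be compiled regexes or plain strings (applied by repair_ocr_text() in _load_file())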
+ OCR_REPAIRS = {
+     re.compile(r'\.corn\b'): '.com',
+     re.compile('ln(adequate|dyke)'): r'In\1',
+     'Nil Priell': 'Nili Priell',
+ }
+
+
+ @dataclass
+ class Document:
+     """Base class for all Epstein Files documents."""
+     file_path: Path
+     # Optional fields
+     author: str | None = None
+     config: FileCfg | MessageCfg | None = None
+     file_id: str = field(init=False)
+     filename: str = field(init=False)
+     is_duplicate: bool = False
+     length: int = field(init=False)
+     lines: list[str] = field(init=False)
+     num_lines: int = field(init=False)
+     text: str = ''
+     timestamp: datetime | None = None
+     url_slug: str = field(init=False)  # e.g. 'HOUSE_OVERSIGHT_123456'
+
+     # Class variable; only used to cycle color of output when using lines_match()
+     file_matching_idx: ClassVar[int] = 0
+
+     def __post_init__(self):
+         self.filename = self.file_path.name
+         self.file_id = extract_file_id(self.filename)
+         self.config = ALL_FILE_CONFIGS.get(self.file_id)
+         self.is_duplicate = bool(self.config.dupe_of_id) if self.config else False
+
+         if self.is_local_extract_file():
+             self.url_slug = file_stem_for_id(self.file_id)
+             cfg_type = type(self.config).__name__ if self.config else None
+
+             # Coerce FileCfg for court docs etc. to MessageCfg for email files extracted from that document
+             if self.document_type() == EMAIL_CLASS and self.config and cfg_type != MessageCfg.__name__:
+                 self.config = MessageCfg.from_file_cfg(self.config)
+         else:
+             self.url_slug = self.file_path.stem
+
+         self._set_computed_fields(text=self.text or self._load_file())
+         self._repair()
+         self._extract_author()
+         self.timestamp = self._extract_timestamp()
+
+     def configured_description(self) -> str | None:
+         return self.config.description if self.config else None
+
+     def date_str(self) -> str | None:
+         return date_str(self.timestamp)
+
+     def description(self) -> Text:
+         """Mostly for logging. Brackets are left open for subclasses to add stuff."""
+         txt = Text('').append(self.url_slug, style='magenta')
+         txt.append(f' {self.document_type()}', style=self.document_type_style())
+
+         if self.timestamp:
+             txt.append(' (', style=SYMBOL_STYLE)
+             txt.append(f"{iso_timestamp(self.timestamp)}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
+
+         txt.append(" [").append(key_value_txt('num_lines', Text(f"{self.num_lines}", style='cyan')))
+         txt.append(', ').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
+         return txt
+
+     def description_panel(self, include_hints: bool = False) -> Panel:
+         """Panelized description() with info_txt(), used in search results."""
+         hints = [Text('', style='italic').append(h) for h in (self.hints() if include_hints else [])]
+         return Panel(Group(*([self.description()] + hints)), border_style=self.document_type_style(), expand=False)
+
+     def document_type(self) -> str:
+         """Annoying workaround for circular import issues and isinstance()."""
+         return str(type(self).__name__)
+
+     def document_type_style(self) -> str:
+         return DOC_TYPE_STYLES[self.document_type()]
+
+     def duplicate_file_txt(self) -> Text:
+         """If the file is a dupe, make a nice message to explain what file it's a duplicate of."""
+         if not self.config or not self.config.dupe_of_id:
+             raise RuntimeError(f"duplicate_file_txt() called on {self.description()} but not a dupe! config:\n\n{self.config}")
+
+         txt = Text(f"Not showing ", style='white dim italic').append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
+         txt.append(f" because it's {self.config.duplicate_reason()} ")
+         return txt.append(epstein_media_doc_link_txt(self.config.dupe_of_id, style='royal_blue1'))
+
+     def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
+         """Create a Text obj link to this document on epsteinify.com."""
+         return link_text_obj(epsteinify_doc_url(self.url_slug), link_txt or self.url_slug, style)
+
+     def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
+         """Create a Text obj link to this document on epstein.media."""
+         return link_text_obj(epstein_media_doc_url(self.url_slug), link_txt or self.url_slug, style)
+
+     def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
+         """Create a Text obj link to this document on EpsteinWeb."""
+         return link_text_obj(epstein_web_doc_url(self.url_slug), link_txt or self.url_slug, style)
+
+     def file_info_panel(self) -> Group:
+         """Panel with filename linking to raw file plus any hints/info about the file."""
+         panel = Panel(self.raw_document_link_txt(include_alt_link=True), border_style=self._border_style(), expand=False)
+         hints = [Padding(hint, INFO_PADDING) for hint in self.hints()]
+         return Group(*([panel] + hints))
+
+     def file_size_str(self) -> str:
+         return file_size_str(self.file_path)
+
+     def hints(self) -> list[Text]:
+         """Additional info about the Document (author, description, and so on) to be displayed in the doc header."""
+         hints = listify(self.info_txt())
+         hint_msg = self.configured_description()
+
+         if self.document_type() == OTHER_FILE_CLASS:
+             if not hint_msg and VI_DAILY_NEWS_REGEX.search(self.text):
+                 hint_msg = VI_DAILY_NEWS_ARTICLE
+             elif hint_msg:
+                 hint_msg = f"({hint_msg})"
+
+         if hint_msg:
+             hints.append(highlighter(Text(hint_msg, style='white dim italic')))
+
+         return hints
+
+     def info_txt(self) -> Text | None:
+         """Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
+         return None
+
+     def is_local_extract_file(self) -> bool:
+         """True if the file was created by extracting text from a court doc (identifiable from filename, e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
+         return is_local_extract_file(self.filename)
+
+     def lines_matching_txt(self, _pattern: re.Pattern | str) -> list[Text]:
+         """Return lines matching a regex as colored list[Text]."""
+         pattern = patternize(_pattern)
+         matched_lines = [line for line in self.lines if pattern.search(line)]
+
+         if len(matched_lines) == 0:
+             return []
+
+         file_style = FILENAME_MATCH_STYLES[type(self).file_matching_idx % len(FILENAME_MATCH_STYLES)]
+         type(self).file_matching_idx += 1
+
+         return [
+             Text('').append(self.file_path.name, style=file_style).append(':').append(line)
+             for line in matched_lines
+         ]
+
+     def log(self, msg: str, level: int = logging.WARNING):
+         """Log with [file_id] as a prefix."""
+         logger.log(level, f"[{self.file_id}] {msg}")
+
+     def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
+         """Log first 'n' lines of self.text at 'level'. 'msg' can optionally be provided."""
+         separator = '\n\n' if '\n' in msg else '. '
+         msg = f"{msg + separator if msg else ''}Top lines of '{self.filename}' ({self.num_lines} lines):"
+         logger.log(level, f"{msg}\n\n{self.top_lines(n)}\n")
+
+     def raw_document_link_txt(self, style: str = '', include_alt_link: bool = False) -> Text:
+         """Returns colored links to epstein.media and EpsteinWeb in a Text object."""
+         txt = Text('', style='white' if include_alt_link else ARCHIVE_LINK_COLOR)
+
+         if args.use_epstein_web_links:
+             txt.append(self.epstein_web_link(style=style))
+
+             if include_alt_link:
+                 txt.append(' (').append(self.epstein_media_link(style='white dim', link_txt=EPSTEIN_MEDIA)).append(')')
+         else:
+             txt.append(self.epstein_media_link(style=style))
+
+             if include_alt_link:
+                 txt.append(' (').append(self.epstein_web_link(style='white dim', link_txt=EPSTEIN_WEB)).append(')')
+
+         return txt
+
+     def repair_ocr_text(self, repairs: dict[str | re.Pattern, str], text: str) -> str:
+         """Apply a dict of repairs (key is pattern or string, value is replacement string) to text."""
+         for k, v in repairs.items():
+             if isinstance(k, re.Pattern):
+                 text = k.sub(v, text)
+             else:
+                 text = text.replace(k, v)
+
+         return text
+
+     def top_lines(self, n: int = 10) -> str:
+         return '\n'.join(self.lines[0:n])
+
+     def _border_style(self) -> str:
+         """Should be overloaded in subclasses."""
+         return 'white'
+
+     def _extract_author(self) -> None:
+         """Get author from config. Extended in Email subclass to also check headers."""
+         if self.config and self.config.author:
+             self.author = self.config.author
+
+     def _extract_timestamp(self) -> datetime | None:
+         """Should be implemented in subclasses."""
+         pass
+
+     def _load_file(self):
+         """Remove BOM and HOUSE OVERSIGHT lines, strip whitespace."""
+         with open(self.file_path) as f:
+             text = f.read()
+             text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text  # remove BOM
+             text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
+             lines = [l.strip() for l in text.split('\n') if not l.startswith(HOUSE_OVERSIGHT)]
+             lines = lines[1:] if (len(lines) > 1 and lines[0] == '>>') else lines
+             return collapse_newlines('\n'.join(lines))
+
+     def _repair(self) -> None:
+         """Can optionally be overloaded in subclasses."""
+         pass
+
+     def _set_computed_fields(self, lines: list[str] | None = None, text: str | None = None) -> None:
+         if (lines and text):
+             raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (got both)")
+         elif lines is not None:
+             self.text = '\n'.join(lines).strip()
+         elif text is not None:
+             self.text = text.strip()
+         else:
+             raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (neither was)")
+
+         self.length = len(self.text)
+         self.lines = [line.strip() for line in self.text.split('\n')]
+         self.num_lines = len(self.lines)
+
+     def _write_clean_text(self, output_path: Path) -> None:
+         """Write self.text to 'output_path'. Used only for diffing files."""
+         if output_path.exists():
+             if str(output_path.name).startswith(HOUSE_OVERSIGHT_PREFIX):
+                 raise RuntimeError(f"'{output_path}' already exists! Not overwriting.")
+             else:
+                 logger.warning(f"Overwriting '{output_path}'...")
+
+         with open(output_path, 'w') as f:
+             f.write(self.text)
+
+         logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
+
+     def __rich_console__(self, _console: Console, _options: ConsoleOptions) -> RenderResult:
+         yield self.file_info_panel()
+         text_panel = Panel(highlighter(self.text), border_style=self._border_style(), expand=False)
+         yield Padding(text_panel, (0, 0, 1, INFO_INDENT))
+
+     def __str__(self) -> str:
+         return self.description().plain
+
+     @staticmethod
+     def diff_files(files: list[str]) -> None:
+         if len(files) != 2:
+             raise RuntimeError('Need 2 files')
+         elif files[0] == files[1]:
+             raise RuntimeError(f"Filenames are the same!")
+
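+         # Bare 6-character IDs get the HOUSE_OVERSIGHT_ prefix; a '.txt' extension is added if missing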
+         files = [f"{HOUSE_OVERSIGHT_PREFIX}{f}" if len(f) == 6 else f for f in files]
+         files = [f if f.endswith('.txt') else f"{f}.txt" for f in files]
+         tmpfiles = [Path(f"tmp_{f}") for f in files]
+         docs = [Document(DOCS_DIR.joinpath(f)) for f in files]
+
+         for i, doc in enumerate(docs):
+             doc._write_clean_text(tmpfiles[i])
+
+         cmd = f"diff {tmpfiles[0]} {tmpfiles[1]}"
+         console.print(f"Running '{cmd}'...")
+         results = run(cmd, shell=True, capture_output=True, text=True).stdout
+
+         for line in _color_diff_output(results):
+             console.print(line, highlight=True)
+
+         console.print(f"Possible suppression with: ")
+         console.print(Text(' suppress left: ').append(f" '{extract_file_id(files[0])}': 'the same as {extract_file_id(files[1])}',", style='cyan'))
+         console.print(Text(' suppress right: ').append(f" '{extract_file_id(files[1])}': 'the same as {extract_file_id(files[0])}',", style='cyan'))
+
+         for f in tmpfiles:
+             f.unlink()
+
+     @staticmethod
+     def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
+         return sorted(docs, key=lambda doc: [doc.timestamp or FALLBACK_TIMESTAMP, doc.file_id])
+
+     @classmethod
+     def uniquify(cls, documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
+         """Uniquify by file_id."""
+         id_map = {doc.file_id: doc for doc in documents}
+         return [doc for doc in id_map.values()]
+
+
+ DocumentType = TypeVar('DocumentType', bound=Document)
+
+
+ def _color_diff_output(diff_result: str) -> list[Text]:
+     txts = [Text('diff output:')]
+     style = 'dim'
+
+     for line in diff_result.split('\n'):
+         if line.startswith('>'):
+             style = 'spring_green4'
+         elif line.startswith('<'):
+             style = 'sea_green1'
+
+         txts.append(Text(line, style=style))
+
+     return txts