epstein_files-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +194 -0
- epstein_files/documents/communication.py +53 -0
- epstein_files/documents/document.py +357 -0
- epstein_files/documents/email.py +655 -0
- epstein_files/documents/emails/email_header.py +167 -0
- epstein_files/documents/imessage/text_message.py +93 -0
- epstein_files/documents/json_file.py +23 -0
- epstein_files/documents/messenger_log.py +73 -0
- epstein_files/documents/other_file.py +117 -0
- epstein_files/epstein_files.py +437 -0
- epstein_files/util/constant/common_words.py +94 -0
- epstein_files/util/constant/html.py +57 -0
- epstein_files/util/constant/names.py +261 -0
- epstein_files/util/constant/strings.py +47 -0
- epstein_files/util/constant/urls.py +103 -0
- epstein_files/util/constants.py +1552 -0
- epstein_files/util/data.py +131 -0
- epstein_files/util/env.py +80 -0
- epstein_files/util/file_cfg.py +172 -0
- epstein_files/util/file_helper.py +81 -0
- epstein_files/util/highlighted_group.py +620 -0
- epstein_files/util/rich.py +324 -0
- epstein_files/util/search_result.py +15 -0
- epstein_files/util/word_count.py +191 -0
- epstein_files-1.0.0.dist-info/LICENSE +674 -0
- epstein_files-1.0.0.dist-info/METADATA +60 -0
- epstein_files-1.0.0.dist-info/RECORD +28 -0
- epstein_files-1.0.0.dist-info/WHEEL +4 -0
epstein_files/__init__.py
@@ -0,0 +1,194 @@
#!/usr/bin/env python
"""
Reformat Epstein text message files for readability and count email senders.
For use with iMessage log files from https://drive.google.com/drive/folders/1hTNH5woIRio578onLGElkTWofUSWRoH_

Install: 'poetry install'
Run: 'EPSTEIN_DOCS_DIR=/path/to/TXT ./generate.py'
"""
from sys import exit

from dotenv import load_dotenv
load_dotenv()
from rich.padding import Padding

from epstein_files.documents.email import Email
from epstein_files.epstein_files import EpsteinFiles, count_by_month
from epstein_files.util.constant.html import *
from epstein_files.util.constant.names import *
from epstein_files.util.constant.strings import EMAIL_CLASS, MESSENGER_LOG_CLASS
from epstein_files.util.data import Timer, dict_sets_to_lists, flatten
from epstein_files.util.env import specified_names, args
from epstein_files.util.file_helper import GH_PAGES_HTML_PATH
from epstein_files.util.rich import *

PRINT_COLOR_KEY_EVERY_N_EMAILS = 150

# Order matters (will be order of output)
PEOPLE_WHOSE_EMAILS_SHOULD_BE_PRINTED: list[str | None] = [
    JEREMY_RUBIN,
    AL_SECKEL,
    JOI_ITO,
    JABOR_Y,
    STEVEN_SINOFSKY,
    DANIEL_SIAD,
    JEAN_LUC_BRUNEL,
    STEVEN_HOFFENBERG,
    EHUD_BARAK,
    MARTIN_NOWAK,
    MASHA_DROKOVA,
    RENATA_BOLOTOVA,
    STEVE_BANNON,
    OLIVIER_COLOM,
    BORIS_NIKOLIC,
    PRINCE_ANDREW,
    JIDE_ZEITLIN,
    DAVID_STERN,
    MOHAMED_WAHEED_HASSAN,
    JENNIFER_JACQUET,
    None,
]

# Order matters (will be order of output)
PEOPLE_WHOSE_EMAILS_SHOULD_BE_TABLES: list[str | None] = [
    GHISLAINE_MAXWELL,
    LEON_BLACK,
    LANDON_THOMAS,
    KATHRYN_RUEMMLER,
    DARREN_INDYKE,
    RICHARD_KAHN,
    TYLER_SHEARS,
    SULTAN_BIN_SULAYEM,
    DEEPAK_CHOPRA,
    ARIANE_DE_ROTHSCHILD,
    TOM_PRITZKER,
]


def generate_html() -> None:
    timer = Timer()
    epstein_files = EpsteinFiles.get_files(timer)
    print_header(epstein_files)

    if args.colors_only:
        exit()

    # Text messages section
    if args.output_texts:
        print_text_messages(epstein_files)
        timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs):,} text message logs')

    # Emails section
    if args.output_emails:
        emails_printed = print_emails(epstein_files)
        timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")

    if args.output_other_files:
        epstein_files.print_other_files_table()
        timer.print_at_checkpoint(f"Printed {len(epstein_files.other_files):,} other files")
    else:
        logger.warning(f"Skipping other files section...")

    # Save output
    write_html(GH_PAGES_HTML_PATH)
    logger.warning(f"Total time: {timer.seconds_since_start()}")

    # JSON stats (mostly used for building pytest checks)
    if args.json_stats:
        console.line(5)
        print_json_stats(epstein_files)


def print_emails(epstein_files: EpsteinFiles) -> int:
    """Returns number of emails printed."""
    print_section_header(('Selections from ' if not args.all_emails else '') + 'His Emails')
    print_other_site_link(is_header=False)

    if len(specified_names) == 0:
        epstein_files.print_emailer_counts_table()

    emailers_to_print: list[str | None]
    emailer_tables: list[str | None] = []
    emails_that_were_printed: list[Email] = []
    num_emails_printed_since_last_color_key = 0

    if args.all_emails:
        console.print('Email conversations are sorted chronologically based on time of the first email.')
        emailers_to_print = sorted(epstein_files.all_emailers(), key=lambda e: epstein_files.earliest_email_at(e))
        print_numbered_list_of_emailers(emailers_to_print, epstein_files)
    else:
        if len(specified_names) > 0:
            emailers_to_print = specified_names
        else:
            emailers_to_print = PEOPLE_WHOSE_EMAILS_SHOULD_BE_PRINTED

        console.print('Email conversations grouped by counterparty can be found in the order listed below.')
        print_numbered_list_of_emailers(emailers_to_print)
        console.print("\nAfter that there's tables linking to (but not displaying) all known emails for each of these people:")

    if len(specified_names) > 0:
        if args.all_email_tables:
            emailer_tables = sorted(epstein_files.all_emailers(), key=lambda e: epstein_files.earliest_email_at(e))
        else:
            emailer_tables = PEOPLE_WHOSE_EMAILS_SHOULD_BE_TABLES

        print_numbered_list_of_emailers(emailer_tables)

    for author in emailers_to_print:
        newly_printed_emails = epstein_files.print_emails_for(author)
        emails_that_were_printed.extend(newly_printed_emails)
        num_emails_printed_since_last_color_key += len(newly_printed_emails)

        # Print color key every once in a while
        if num_emails_printed_since_last_color_key > PRINT_COLOR_KEY_EVERY_N_EMAILS:
            print_color_key()
            num_emails_printed_since_last_color_key = 0

    if len(emailer_tables) > 0 and len(specified_names) == 0:
        print_author_header(f"Email Tables for {len(emailer_tables)} Other People", 'white')

    for name in emailer_tables:
        epstein_files.print_emails_table_for(name)

    if len(specified_names) == 0:
        epstein_files.print_email_device_info()

    logger.warning(f"Rewrote {len(Email.rewritten_header_ids)} headers of {len(epstein_files.emails)} emails")

    if args.all_emails:
        email_ids_that_were_printed = set([email.file_id for email in emails_that_were_printed])
        logger.warning(f"Printed {len(emails_that_were_printed)} emails of {len(email_ids_that_were_printed)} unique file IDs.")

        for email in epstein_files.emails:
            if email.file_id not in email_ids_that_were_printed and not email.is_duplicate:
                logger.warning(f"Failed to print {email.description()}")

    return len(emails_that_were_printed)


def print_text_messages(epstein_files: EpsteinFiles) -> None:
    print_section_header('Text Messages')
    print_centered("(conversations are sorted chronologically based on timestamp of first message)\n", style='gray30')

    if len(specified_names) == 0:
        log_files = epstein_files.imessage_logs
    else:
        log_files = flatten([epstein_files.imessage_logs_for(name) for name in specified_names])

    for log_file in log_files:
        console.print(Padding(log_file))
        console.line(2)

    epstein_files.print_imessage_summary()


def print_json_stats(epstein_files: EpsteinFiles) -> None:
    console.print(Panel('JSON Stats Dump', expand=True, style='reverse bold'), '\n')
    print_json(f"{MESSENGER_LOG_CLASS} Sender Counts", epstein_files.imessage_sender_counts(), skip_falsey=True)
    print_json(f"{EMAIL_CLASS} Author Counts", epstein_files.email_author_counts, skip_falsey=True)
    print_json(f"{EMAIL_CLASS} Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
    print_json("Email signature_substitution_countss", epstein_files.email_signature_substitution_counts(), skip_falsey=True)
    print_json("email_author_device_signatures", dict_sets_to_lists(epstein_files.email_authors_to_device_signatures))
    print_json("email_sent_from_devices", dict_sets_to_lists(epstein_files.email_device_signatures_to_authors))
    print_json("email_unknown_recipient_file_ids", epstein_files.email_unknown_recipient_file_ids())
    print_json("count_by_month", count_by_month(epstein_files.all_documents()))
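
For orientation, a minimal sketch of driving the generate_html() entry point above from Python rather than via the './generate.py' wrapper mentioned in the module docstring. The os.environ call and the placeholder path are assumptions for the example, not part of the package:

# Minimal sketch, not part of the package: set the docs directory the module
# docstring asks for, then call the generator. '/path/to/TXT' is a placeholder.
import os

os.environ.setdefault("EPSTEIN_DOCS_DIR", "/path/to/TXT")

from epstein_files import generate_html

generate_html()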
epstein_files/documents/communication.py
@@ -0,0 +1,53 @@
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import TypeVar

from rich.text import Text

from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, Document
from epstein_files.util.constant.names import UNKNOWN
from epstein_files.util.constants import FALLBACK_TIMESTAMP
from epstein_files.util.file_cfg import MessageCfg
from epstein_files.util.highlighted_group import get_style_for_name
from epstein_files.util.rich import key_value_txt

TIMESTAMP_SECONDS_REGEX = re.compile(r":\d{2}$")


@dataclass
class Communication(Document):
    """Superclass for Email and MessengerLog."""
    author_style: str = 'white'
    author_txt: Text = field(init=False)
    config: MessageCfg | None = None
    timestamp: datetime = FALLBACK_TIMESTAMP  # TODO this default sucks (though it never happens)

    def __post_init__(self):
        super().__post_init__()
        self.author_style = get_style_for_name(self.author_or_unknown())
        self.author_txt = Text(self.author_or_unknown(), style=self.author_style)

    def author_or_unknown(self) -> str:
        return self.author or UNKNOWN

    def description(self) -> Text:
        return self._description().append(CLOSE_PROPERTIES_CHAR)

    def is_attribution_uncertain(self) -> bool | None:
        return self.config and self.config.is_attribution_uncertain

    def raw_document_link_txt(self, _style: str = '', include_alt_link: bool = True) -> Text:
        """Overrides super() method to apply self.author_style."""
        return super().raw_document_link_txt(self.author_style, include_alt_link=include_alt_link)

    def timestamp_without_seconds(self) -> str:
        return TIMESTAMP_SECONDS_REGEX.sub('', str(self.timestamp))

    def _description(self) -> Text:
        """One line summary mostly for logging."""
        txt = super().description().append(', ')
        return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style)))


CommunicationType = TypeVar('CommunicationType', bound=Document)
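
The timestamp_without_seconds() helper above simply strips a trailing ':SS' from the timestamp's string form. A standalone illustration using the same regex; the sample datetime is invented for the example:

# Standalone illustration of the regex used by Communication.timestamp_without_seconds().
import re
from datetime import datetime

TIMESTAMP_SECONDS_REGEX = re.compile(r":\d{2}$")

ts = datetime(2015, 7, 4, 12, 30, 59)
print(TIMESTAMP_SECONDS_REGEX.sub('', str(ts)))  # -> 2015-07-04 12:30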
@@ -0,0 +1,357 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from subprocess import run
|
|
7
|
+
from typing import ClassVar, Sequence, TypeVar
|
|
8
|
+
|
|
9
|
+
from rich.console import Console, ConsoleOptions, Group, RenderResult
|
|
10
|
+
from rich.padding import Padding
|
|
11
|
+
from rich.panel import Panel
|
|
12
|
+
from rich.text import Text
|
|
13
|
+
|
|
14
|
+
from epstein_files.util.constant.names import *
|
|
15
|
+
from epstein_files.util.constant.strings import *
|
|
16
|
+
from epstein_files.util.constant.urls import *
|
|
17
|
+
from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP, VI_DAILY_NEWS_ARTICLE
|
|
18
|
+
from epstein_files.util.file_cfg import FileCfg, MessageCfg
|
|
19
|
+
from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize
|
|
20
|
+
from epstein_files.util.env import args, logger
|
|
21
|
+
from epstein_files.util.file_helper import DOCS_DIR, file_stem_for_id, extract_file_id, file_size_str, is_local_extract_file
|
|
22
|
+
from epstein_files.util.rich import SYMBOL_STYLE, console, highlighter, key_value_txt, logger, link_text_obj
|
|
23
|
+
|
|
24
|
+
WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
|
|
25
|
+
HOUSE_OVERSIGHT = HOUSE_OVERSIGHT_PREFIX.replace('_', ' ').strip()
|
|
26
|
+
MIN_DOCUMENT_ID = 10477
|
|
27
|
+
INFO_INDENT = 2
|
|
28
|
+
INFO_PADDING = (0, 0, 0, INFO_INDENT)
|
|
29
|
+
|
|
30
|
+
CLOSE_PROPERTIES_CHAR = ']'
|
|
31
|
+
MAX_EXTRACTED_TIMESTAMPS = 6
|
|
32
|
+
MIN_TIMESTAMP = datetime(1991, 1, 1)
|
|
33
|
+
MID_TIMESTAMP = datetime(2007, 1, 1)
|
|
34
|
+
MAX_TIMESTAMP = datetime(2020, 1, 1)
|
|
35
|
+
VI_DAILY_NEWS_REGEX = re.compile(r'virgin\s*is[kl][ai]nds\s*daily\s*news', re.IGNORECASE)
|
|
36
|
+
|
|
37
|
+
DOC_TYPE_STYLES = {
|
|
38
|
+
DOCUMENT_CLASS: 'grey69',
|
|
39
|
+
EMAIL_CLASS: 'sea_green2',
|
|
40
|
+
JSON_FILE_CLASS: 'sandy_brown',
|
|
41
|
+
MESSENGER_LOG_CLASS: 'cyan',
|
|
42
|
+
OTHER_FILE_CLASS: 'grey69',
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
FILENAME_MATCH_STYLES = [
|
|
46
|
+
'dark_green',
|
|
47
|
+
'green',
|
|
48
|
+
'spring_green4',
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
OCR_REPAIRS = {
|
|
52
|
+
re.compile(r'\.corn\b'): '.com',
|
|
53
|
+
re.compile('ln(adequate|dyke)'): r'In\1',
|
|
54
|
+
'Nil Priell': 'Nili Priell',
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
|
|
59
|
+
class Document:
|
|
60
|
+
"""Base class for all Epstein Files documents."""
|
|
61
|
+
file_path: Path
|
|
62
|
+
# Optional fields
|
|
63
|
+
author: str | None = None
|
|
64
|
+
config: FileCfg | MessageCfg | None = None
|
|
65
|
+
file_id: str = field(init=False)
|
|
66
|
+
filename: str = field(init=False)
|
|
67
|
+
is_duplicate: bool = False
|
|
68
|
+
length: int = field(init=False)
|
|
69
|
+
lines: list[str] = field(init=False)
|
|
70
|
+
num_lines: int = field(init=False)
|
|
71
|
+
text: str = ''
|
|
72
|
+
timestamp: datetime | None = None
|
|
73
|
+
url_slug: str = field(init=False) # e.g. 'HOUSE_OVERSIGHT_123456
|
|
74
|
+
|
|
75
|
+
# Class variable; only used to cycle color of output when using lines_match()
|
|
76
|
+
file_matching_idx: ClassVar[int] = 0
|
|
77
|
+
|
|
78
|
+
def __post_init__(self):
|
|
79
|
+
self.filename = self.file_path.name
|
|
80
|
+
self.file_id = extract_file_id(self.filename)
|
|
81
|
+
self.config = ALL_FILE_CONFIGS.get(self.file_id)
|
|
82
|
+
self.is_duplicate = bool(self.config.dupe_of_id) if self.config else False
|
|
83
|
+
|
|
84
|
+
if self.is_local_extract_file():
|
|
85
|
+
self.url_slug = file_stem_for_id(self.file_id)
|
|
86
|
+
cfg_type = type(self.config).__name__ if self.config else None
|
|
87
|
+
|
|
88
|
+
# Coerce FileConfig for court docs etc. to MessageCfg for email files extracted from that document
|
|
89
|
+
if self.document_type() == EMAIL_CLASS and self.config and cfg_type != MessageCfg.__name__:
|
|
90
|
+
self.config = MessageCfg.from_file_cfg(self.config)
|
|
91
|
+
else:
|
|
92
|
+
self.url_slug = self.file_path.stem
|
|
93
|
+
|
|
94
|
+
self._set_computed_fields(text=self.text or self._load_file())
|
|
95
|
+
self._repair()
|
|
96
|
+
self._extract_author()
|
|
97
|
+
self.timestamp = self._extract_timestamp()
|
|
98
|
+
|
|
99
|
+
def configured_description(self) -> str | None:
|
|
100
|
+
return self.config.description if self.config else None
|
|
101
|
+
|
|
102
|
+
def date_str(self) -> str | None:
|
|
103
|
+
return date_str(self.timestamp)
|
|
104
|
+
|
|
105
|
+
def description(self) -> Text:
|
|
106
|
+
"""Mostly for logging. Brackets are left open for subclasses to add stuff."""
|
|
107
|
+
txt = Text('').append(self.url_slug, style='magenta')
|
|
108
|
+
txt.append(f' {self.document_type()}', style=self.document_type_style())
|
|
109
|
+
|
|
110
|
+
if self.timestamp:
|
|
111
|
+
txt.append(' (', style=SYMBOL_STYLE)
|
|
112
|
+
txt.append(f"{iso_timestamp(self.timestamp)}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
|
|
113
|
+
|
|
114
|
+
txt.append(" [").append(key_value_txt('num_lines', Text(f"{self.num_lines}", style='cyan')))
|
|
115
|
+
txt.append(', ').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
|
|
116
|
+
return txt
|
|
117
|
+
|
|
118
|
+
def description_panel(self, include_hints: bool = False) -> Panel:
|
|
119
|
+
"""Panelized description() with info_txt(), used in search results."""
|
|
120
|
+
hints = [Text('', style='italic').append(h) for h in (self.hints() if include_hints else [])]
|
|
121
|
+
return Panel(Group(*([self.description()] + hints)), border_style=self.document_type_style(), expand=False)
|
|
122
|
+
|
|
123
|
+
def document_type(self) -> str:
|
|
124
|
+
"""Annoying workaround for circular import issues and isinstance()."""
|
|
125
|
+
return str(type(self).__name__)
|
|
126
|
+
|
|
127
|
+
def document_type_style(self) -> str:
|
|
128
|
+
return DOC_TYPE_STYLES[self.document_type()]
|
|
129
|
+
|
|
130
|
+
def duplicate_file_txt(self) -> Text:
|
|
131
|
+
"""If the file is a dupe make a nice message to explain what file it's a duplicate of."""
|
|
132
|
+
if not self.config or not self.config.dupe_of_id:
|
|
133
|
+
raise RuntimeError(f"duplicate_file_txt() called on {self.description()} but not a dupe! config:\n\n{self.config}")
|
|
134
|
+
|
|
135
|
+
txt = Text(f"Not showing ", style='white dim italic').append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
|
|
136
|
+
txt.append(f" because it's {self.config.duplicate_reason()} ")
|
|
137
|
+
return txt.append(epstein_media_doc_link_txt(self.config.dupe_of_id, style='royal_blue1'))
|
|
138
|
+
|
|
139
|
+
def epsteinify_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
140
|
+
"""Create a Text obj link to this document on epsteinify.com."""
|
|
141
|
+
return link_text_obj(epsteinify_doc_url(self.url_slug), link_txt or self.url_slug, style)
|
|
142
|
+
|
|
143
|
+
def epstein_media_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
144
|
+
"""Create a Text obj link to this document on epstein.media."""
|
|
145
|
+
return link_text_obj(epstein_media_doc_url(self.url_slug), link_txt or self.url_slug, style)
|
|
146
|
+
|
|
147
|
+
def epstein_web_link(self, style: str = ARCHIVE_LINK_COLOR, link_txt: str | None = None) -> Text:
|
|
148
|
+
"""Create a Text obj link to this document on EpsteinWeb."""
|
|
149
|
+
return link_text_obj(epstein_web_doc_url(self.url_slug), link_txt or self.url_slug, style)
|
|
150
|
+
|
|
151
|
+
def file_info_panel(self) -> Group:
|
|
152
|
+
"""Panel with filename linking to raw file plus any hints/info about the file."""
|
|
153
|
+
panel = Panel(self.raw_document_link_txt(include_alt_link=True), border_style=self._border_style(), expand=False)
|
|
154
|
+
hints = [Padding(hint, INFO_PADDING) for hint in self.hints()]
|
|
155
|
+
return Group(*([panel] + hints))
|
|
156
|
+
|
|
157
|
+
def file_size_str(self) -> str:
|
|
158
|
+
return file_size_str(self.file_path)
|
|
159
|
+
|
|
160
|
+
def hints(self) -> list[Text]:
|
|
161
|
+
"""Additional info about the Document (author, description, and so on) to be desplayed in doc header."""
|
|
162
|
+
hints = listify(self.info_txt())
|
|
163
|
+
hint_msg = self.configured_description()
|
|
164
|
+
|
|
165
|
+
if self.document_type() == OTHER_FILE_CLASS:
|
|
166
|
+
if not hint_msg and VI_DAILY_NEWS_REGEX.search(self.text):
|
|
167
|
+
hint_msg = VI_DAILY_NEWS_ARTICLE
|
|
168
|
+
elif hint_msg:
|
|
169
|
+
hint_msg = f"({hint_msg})"
|
|
170
|
+
|
|
171
|
+
if hint_msg:
|
|
172
|
+
hints.append(highlighter(Text(hint_msg, style='white dim italic')))
|
|
173
|
+
|
|
174
|
+
return hints
|
|
175
|
+
|
|
176
|
+
def info_txt(self) -> Text | None:
|
|
177
|
+
"""Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
|
|
178
|
+
return None
|
|
179
|
+
|
|
180
|
+
def is_local_extract_file(self) -> bool:
|
|
181
|
+
"""True if file created by extracting text from a court doc (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
|
|
182
|
+
return is_local_extract_file(self.filename)
|
|
183
|
+
|
|
184
|
+
def lines_matching_txt(self, _pattern: re.Pattern | str) -> list[Text]:
|
|
185
|
+
"""Return lines matching a regex as colored list[Text]."""
|
|
186
|
+
pattern = patternize(_pattern)
|
|
187
|
+
matched_lines = [line for line in self.lines if pattern.search(line)]
|
|
188
|
+
|
|
189
|
+
if len(matched_lines) == 0:
|
|
190
|
+
return []
|
|
191
|
+
|
|
192
|
+
file_style = FILENAME_MATCH_STYLES[type(self).file_matching_idx % len(FILENAME_MATCH_STYLES)]
|
|
193
|
+
type(self).file_matching_idx += 1
|
|
194
|
+
|
|
195
|
+
return [
|
|
196
|
+
Text('').append(self.file_path.name, style=file_style).append(':').append(line)
|
|
197
|
+
for line in matched_lines
|
|
198
|
+
]
|
|
199
|
+
|
|
200
|
+
def log(self, msg: str, level: int = logging.WARNING):
|
|
201
|
+
"""Log with [file_id] as a prefix."""
|
|
202
|
+
logger.log(level, f"[{self.file_id}] {msg}")
|
|
203
|
+
|
|
204
|
+
def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
|
|
205
|
+
"""Log first 'n' lines of self.text at 'level'. 'msg' can be optionally provided."""
|
|
206
|
+
separator = '\n\n' if '\n' in msg else '. '
|
|
207
|
+
msg = f"{msg + separator if msg else ''}Top lines of '{self.filename}' ({self.num_lines} lines):"
|
|
208
|
+
logger.log(level, f"{msg}\n\n{self.top_lines(n)}\n")
|
|
209
|
+
|
|
210
|
+
def raw_document_link_txt(self, style: str = '', include_alt_link: bool = False) -> Text:
|
|
211
|
+
"""Returns colored links to epstein.media and and epsteinweb in a Text object."""
|
|
212
|
+
txt = Text('', style='white' if include_alt_link else ARCHIVE_LINK_COLOR)
|
|
213
|
+
|
|
214
|
+
if args.use_epstein_web_links:
|
|
215
|
+
txt.append(self.epstein_web_link(style=style))
|
|
216
|
+
|
|
217
|
+
if include_alt_link:
|
|
218
|
+
txt.append(' (').append(self.epstein_media_link(style='white dim', link_txt=EPSTEIN_MEDIA)).append(')')
|
|
219
|
+
else:
|
|
220
|
+
txt.append(self.epstein_media_link(style=style))
|
|
221
|
+
|
|
222
|
+
if include_alt_link:
|
|
223
|
+
txt.append(' (').append(self.epstein_web_link(style='white dim', link_txt=EPSTEIN_WEB)).append(')')
|
|
224
|
+
|
|
225
|
+
return txt
|
|
226
|
+
|
|
227
|
+
def repair_ocr_text(self, repairs: dict[str | re.Pattern, str], text: str) -> str:
|
|
228
|
+
"""Apply a dict of repairs (key is pattern or string, value is replacement string) to text."""
|
|
229
|
+
for k, v in repairs.items():
|
|
230
|
+
if isinstance(k, re.Pattern):
|
|
231
|
+
text = k.sub(v, text)
|
|
232
|
+
else:
|
|
233
|
+
text = text.replace(k, v)
|
|
234
|
+
|
|
235
|
+
return text
|
|
236
|
+
|
|
237
|
+
def top_lines(self, n: int = 10) -> str:
|
|
238
|
+
return '\n'.join(self.lines[0:n])
|
|
239
|
+
|
|
240
|
+
def _border_style(self) -> str:
|
|
241
|
+
"""Should be overloaded in subclasses."""
|
|
242
|
+
return 'white'
|
|
243
|
+
|
|
244
|
+
def _extract_author(self) -> None:
|
|
245
|
+
"""Get author from config. Extended in Email subclass to also check headers."""
|
|
246
|
+
if self.config and self.config.author:
|
|
247
|
+
self.author = self.config.author
|
|
248
|
+
|
|
249
|
+
def _extract_timestamp(self) -> datetime | None:
|
|
250
|
+
"""Should be implemented in subclasses."""
|
|
251
|
+
pass
|
|
252
|
+
|
|
253
|
+
def _load_file(self):
|
|
254
|
+
"""Remove BOM and HOUSE OVERSIGHT lines, strip whitespace."""
|
|
255
|
+
with open(self.file_path) as f:
|
|
256
|
+
text = f.read()
|
|
257
|
+
text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text # remove BOM
|
|
258
|
+
text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
|
|
259
|
+
lines = [l.strip() for l in text.split('\n') if not l.startswith(HOUSE_OVERSIGHT)]
|
|
260
|
+
lines = lines[1:] if (len(lines) > 1 and lines[0] == '>>') else lines
|
|
261
|
+
return collapse_newlines('\n'.join(lines))
|
|
262
|
+
|
|
263
|
+
def _repair(self) -> None:
|
|
264
|
+
"""Can optionally be overloaded in subclasses."""
|
|
265
|
+
pass
|
|
266
|
+
|
|
267
|
+
def _set_computed_fields(self, lines: list[str] | None = None, text: str | None = None) -> None:
|
|
268
|
+
if (lines and text):
|
|
269
|
+
raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (got both)")
|
|
270
|
+
elif lines is not None:
|
|
271
|
+
self.text = '\n'.join(lines).strip()
|
|
272
|
+
elif text is not None:
|
|
273
|
+
self.text = text.strip()
|
|
274
|
+
else:
|
|
275
|
+
raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (neither was)")
|
|
276
|
+
|
|
277
|
+
self.length = len(self.text)
|
|
278
|
+
self.lines = [line.strip() for line in self.text.split('\n')]
|
|
279
|
+
self.num_lines = len(self.lines)
|
|
280
|
+
|
|
281
|
+
def _write_clean_text(self, output_path: Path) -> None:
|
|
282
|
+
"""Write self.text to 'output_path'. Used only for diffing files."""
|
|
283
|
+
if output_path.exists():
|
|
284
|
+
if str(output_path.name).startswith(HOUSE_OVERSIGHT_PREFIX):
|
|
285
|
+
raise RuntimeError(f"'{output_path}' already exists! Not overwriting.")
|
|
286
|
+
else:
|
|
287
|
+
logger.warning(f"Overwriting '{output_path}'...")
|
|
288
|
+
|
|
289
|
+
with open(output_path, 'w') as f:
|
|
290
|
+
f.write(self.text)
|
|
291
|
+
|
|
292
|
+
logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
|
|
293
|
+
|
|
294
|
+
def __rich_console__(self, _console: Console, _options: ConsoleOptions) -> RenderResult:
|
|
295
|
+
yield self.file_info_panel()
|
|
296
|
+
text_panel = Panel(highlighter(self.text), border_style=self._border_style(), expand=False)
|
|
297
|
+
yield Padding(text_panel, (0, 0, 1, INFO_INDENT))
|
|
298
|
+
|
|
299
|
+
def __str__(self) -> str:
|
|
300
|
+
return self.description().plain
|
|
301
|
+
|
|
302
|
+
@staticmethod
|
|
303
|
+
def diff_files(files: list[str]) -> None:
|
|
304
|
+
if len(files) != 2:
|
|
305
|
+
raise RuntimeError('Need 2 files')
|
|
306
|
+
elif files[0] == files[1]:
|
|
307
|
+
raise RuntimeError(f"Filenames are the same!")
|
|
308
|
+
|
|
309
|
+
files = [f"{HOUSE_OVERSIGHT_PREFIX}{f}" if len(f) == 6 else f for f in files]
|
|
310
|
+
files = [f if f.endswith('.txt') else f"{f}.txt" for f in files]
|
|
311
|
+
tmpfiles = [Path(f"tmp_{f}") for f in files]
|
|
312
|
+
docs = [Document(DOCS_DIR.joinpath(f)) for f in files]
|
|
313
|
+
|
|
314
|
+
for i, doc in enumerate(docs):
|
|
315
|
+
doc._write_clean_text(tmpfiles[i])
|
|
316
|
+
|
|
317
|
+
cmd = f"diff {tmpfiles[0]} {tmpfiles[1]}"
|
|
318
|
+
console.print(f"Running '{cmd}'...")
|
|
319
|
+
results = run(cmd, shell=True, capture_output=True, text=True).stdout
|
|
320
|
+
|
|
321
|
+
for line in _color_diff_output(results):
|
|
322
|
+
console.print(line, highlight=True)
|
|
323
|
+
|
|
324
|
+
console.print(f"Possible suppression with: ")
|
|
325
|
+
console.print(Text(' suppress left: ').append(f" '{extract_file_id(files[0])}': 'the same as {extract_file_id(files[1])}',", style='cyan'))
|
|
326
|
+
console.print(Text(' suppress right: ').append(f" '{extract_file_id(files[1])}': 'the same as {extract_file_id(files[0])}',", style='cyan'))
|
|
327
|
+
|
|
328
|
+
for f in tmpfiles:
|
|
329
|
+
f.unlink()
|
|
330
|
+
|
|
331
|
+
@staticmethod
|
|
332
|
+
def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
|
|
333
|
+
return sorted(docs, key=lambda doc: [doc.timestamp or FALLBACK_TIMESTAMP, doc.file_id])
|
|
334
|
+
|
|
335
|
+
@classmethod
|
|
336
|
+
def uniquify(cls, documents: Sequence['DocumentType']) -> Sequence['DocumentType']:
|
|
337
|
+
"""Uniquify by file_id."""
|
|
338
|
+
id_map = {doc.file_id: doc for doc in documents}
|
|
339
|
+
return [doc for doc in id_map.values()]
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
DocumentType = TypeVar('DocumentType', bound=Document)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def _color_diff_output(diff_result: str) -> list[Text]:
|
|
346
|
+
txts = [Text('diff output:')]
|
|
347
|
+
style = 'dim'
|
|
348
|
+
|
|
349
|
+
for line in diff_result.split('\n'):
|
|
350
|
+
if line.startswith('>'):
|
|
351
|
+
style='spring_green4'
|
|
352
|
+
elif line.startswith('<'):
|
|
353
|
+
style='sea_green1'
|
|
354
|
+
|
|
355
|
+
txts.append(Text(line, style=style))
|
|
356
|
+
|
|
357
|
+
return txts
|
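
The OCR-repair approach in Document.repair_ocr_text() applies regex keys with .sub() and plain-string keys with str.replace(). A standalone sketch of the same idea, reusing two entries from OCR_REPAIRS above; the input string is invented for illustration:

# Standalone sketch of the dict-of-repairs approach used by Document.repair_ocr_text().
import re

repairs: dict[str | re.Pattern, str] = {
    re.compile(r'\.corn\b'): '.com',  # common OCR misread of '.com'
    'Nil Priell': 'Nili Priell',      # plain string fix
}

def repair_ocr_text(repairs: dict[str | re.Pattern, str], text: str) -> str:
    for k, v in repairs.items():
        text = k.sub(v, text) if isinstance(k, re.Pattern) else text.replace(k, v)
    return text

print(repair_ocr_text(repairs, "mail@example.corn via Nil Priell"))  # -> mail@example.com via Nili Priell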