epstein-files 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
epstein_files/__init__.py CHANGED
@@ -13,19 +13,22 @@ load_dotenv()
13
13
  from rich.padding import Padding
14
14
 
15
15
  from epstein_files.documents.email import Email
16
+ from epstein_files.documents.messenger_log import MessengerLog
16
17
  from epstein_files.epstein_files import EpsteinFiles, count_by_month
17
18
  from epstein_files.util.constant.html import *
18
19
  from epstein_files.util.constant.names import *
19
20
  from epstein_files.util.constant.strings import EMAIL_CLASS, MESSENGER_LOG_CLASS
20
- from epstein_files.util.data import Timer, dict_sets_to_lists, flatten
21
- from epstein_files.util.env import specified_names, args
22
- from epstein_files.util.file_helper import GH_PAGES_HTML_PATH
21
+ from epstein_files.util.data import dict_sets_to_lists
22
+ from epstein_files.util.env import args, specified_names
23
+ from epstein_files.util.file_helper import GH_PAGES_HTML_PATH, JSON_METADATA_PATH, make_clean
24
+ from epstein_files.util.logging import logger
23
25
  from epstein_files.util.rich import *
26
+ from epstein_files.util.timer import Timer
24
27
 
25
28
  PRINT_COLOR_KEY_EVERY_N_EMAILS = 150
26
29
 
27
- # Order matters (will be order of output)
28
- PEOPLE_WHOSE_EMAILS_SHOULD_BE_PRINTED: list[str | None] = [
30
+ # Order matters. Default names to print emails for.
31
+ DEFAULT_EMAILERS = [
29
32
  JEREMY_RUBIN,
30
33
  AL_SECKEL,
31
34
  JOI_ITO,
@@ -49,8 +52,9 @@ PEOPLE_WHOSE_EMAILS_SHOULD_BE_PRINTED: list[str | None] = [
49
52
  None,
50
53
  ]
51
54
 
52
- # Order matters (will be order of output)
53
- PEOPLE_WHOSE_EMAILS_SHOULD_BE_TABLES: list[str | None] = [
55
+ # Order matters. Default names to print tables w/email subject, timestamp, etc for.
56
+ # TODO: get rid of this
57
+ DEFAULT_EMAILER_TABLES: list[str | None] = [
54
58
  GHISLAINE_MAXWELL,
55
59
  LEON_BLACK,
56
60
  LANDON_THOMAS,
@@ -64,42 +68,58 @@ PEOPLE_WHOSE_EMAILS_SHOULD_BE_TABLES: list[str | None] = [
64
68
  TOM_PRITZKER,
65
69
  ]
66
70
 
71
+ if len(set(DEFAULT_EMAILERS).intersection(set(DEFAULT_EMAILER_TABLES))) > 0:
72
+ raise RuntimeError(f"Some names appear in both DEFAULT_EMAILERS and DEFAULT_EMAILER_TABLES")
73
+
67
74
 
68
75
  def generate_html() -> None:
76
+ if args.make_clean:
77
+ make_clean()
78
+ exit()
79
+
69
80
  timer = Timer()
70
81
  epstein_files = EpsteinFiles.get_files(timer)
82
+
83
+ if args.json_metadata:
84
+ json_str = epstein_files.json_metadata()
85
+
86
+ if args.build:
87
+ with open(JSON_METADATA_PATH, 'w') as f:
88
+ f.write(json_str)
89
+ timer.print_at_checkpoint(f"Wrote {file_size_str(JSON_METADATA_PATH)} to '{JSON_METADATA_PATH}'")
90
+ else:
91
+ console.print_json(json_str, indent=4, sort_keys=True)
92
+
93
+ exit()
94
+
71
95
  print_header(epstein_files)
72
96
 
73
97
  if args.colors_only:
74
98
  exit()
75
99
 
76
- # Text messages section
77
100
  if args.output_texts:
78
- print_text_messages(epstein_files)
79
- timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs):,} text message logs')
101
+ _print_text_messages(epstein_files)
102
+ timer.print_at_checkpoint(f'Printed {len(epstein_files.imessage_logs)} text message logs')
80
103
 
81
- # Emails section
82
104
  if args.output_emails:
83
- emails_printed = print_emails(epstein_files)
105
+ emails_printed = _print_emails(epstein_files)
84
106
  timer.print_at_checkpoint(f"Printed {emails_printed:,} emails")
85
107
 
86
108
  if args.output_other_files:
87
- epstein_files.print_other_files_table()
88
- timer.print_at_checkpoint(f"Printed {len(epstein_files.other_files):,} other files")
89
- else:
90
- logger.warning(f"Skipping other files section...")
109
+ files_printed = epstein_files.print_other_files_table()
110
+ timer.print_at_checkpoint(f"Printed {len(files_printed)} other files")
91
111
 
92
112
  # Save output
93
113
  write_html(GH_PAGES_HTML_PATH)
94
- logger.warning(f"Total time: {timer.seconds_since_start()}")
114
+ logger.warning(f"Total time: {timer.seconds_since_start_str()}")
95
115
 
96
116
  # JSON stats (mostly used for building pytest checks)
97
117
  if args.json_stats:
98
118
  console.line(5)
99
- print_json_stats(epstein_files)
119
+ _print_json_stats(epstein_files)
100
120
 
101
121
 
102
- def print_emails(epstein_files: EpsteinFiles) -> int:
122
+ def _print_emails(epstein_files: EpsteinFiles) -> int:
103
123
  """Returns number of emails printed."""
104
124
  print_section_header(('Selections from ' if not args.all_emails else '') + 'His Emails')
105
125
  print_other_site_link(is_header=False)
@@ -109,7 +129,7 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
109
129
 
110
130
  emailers_to_print: list[str | None]
111
131
  emailer_tables: list[str | None] = []
112
- emails_that_were_printed: list[Email] = []
132
+ already_printed_emails: list[Email] = []
113
133
  num_emails_printed_since_last_color_key = 0
114
134
 
115
135
  if args.all_emails:
@@ -117,26 +137,17 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
117
137
  emailers_to_print = sorted(epstein_files.all_emailers(), key=lambda e: epstein_files.earliest_email_at(e))
118
138
  print_numbered_list_of_emailers(emailers_to_print, epstein_files)
119
139
  else:
120
- if len(specified_names) > 0:
121
- emailers_to_print = specified_names
122
- else:
123
- emailers_to_print = PEOPLE_WHOSE_EMAILS_SHOULD_BE_PRINTED
124
-
140
+ emailers_to_print = specified_names if specified_names else DEFAULT_EMAILERS
125
141
  console.print('Email conversations grouped by counterparty can be found in the order listed below.')
126
142
  print_numbered_list_of_emailers(emailers_to_print)
127
143
  console.print("\nAfter that there's tables linking to (but not displaying) all known emails for each of these people:")
128
144
 
129
145
  if len(specified_names) > 0:
130
- if args.all_email_tables:
131
- emailer_tables = sorted(epstein_files.all_emailers(), key=lambda e: epstein_files.earliest_email_at(e))
132
- else:
133
- emailer_tables = PEOPLE_WHOSE_EMAILS_SHOULD_BE_TABLES
134
-
135
- print_numbered_list_of_emailers(emailer_tables)
146
+ print_numbered_list_of_emailers(DEFAULT_EMAILER_TABLES)
136
147
 
137
148
  for author in emailers_to_print:
138
149
  newly_printed_emails = epstein_files.print_emails_for(author)
139
- emails_that_were_printed.extend(newly_printed_emails)
150
+ already_printed_emails.extend(newly_printed_emails)
140
151
  num_emails_printed_since_last_color_key += len(newly_printed_emails)
141
152
 
142
153
  # Print color key every once in a while
@@ -144,36 +155,33 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
144
155
  print_color_key()
145
156
  num_emails_printed_since_last_color_key = 0
146
157
 
147
- if len(emailer_tables) > 0 and len(specified_names) == 0:
148
- print_author_header(f"Email Tables for {len(emailer_tables)} Other People", 'white')
158
+ if not specified_names:
159
+ if not args.all_emails:
160
+ print_author_header(f"Email Tables for {len(emailer_tables)} Other People", 'white')
149
161
 
150
- for name in emailer_tables:
151
- epstein_files.print_emails_table_for(name)
162
+ for name in DEFAULT_EMAILER_TABLES:
163
+ epstein_files.print_emails_table_for(name)
152
164
 
153
- if len(specified_names) == 0:
154
165
  epstein_files.print_email_device_info()
155
166
 
156
- logger.warning(f"Rewrote {len(Email.rewritten_header_ids)} headers of {len(epstein_files.emails)} emails")
157
-
167
+ # Check that all emails were actually printed
158
168
  if args.all_emails:
159
- email_ids_that_were_printed = set([email.file_id for email in emails_that_were_printed])
160
- logger.warning(f"Printed {len(emails_that_were_printed)} emails of {len(email_ids_that_were_printed)} unique file IDs.")
169
+ email_ids_that_were_printed = set([email.file_id for email in already_printed_emails])
170
+ logger.warning(f"Printed {len(already_printed_emails)} emails of {len(email_ids_that_were_printed)} unique file IDs.")
161
171
 
162
172
  for email in epstein_files.emails:
163
173
  if email.file_id not in email_ids_that_were_printed and not email.is_duplicate:
164
- logger.warning(f"Failed to print {email.description()}")
174
+ logger.warning(f"Failed to print {email.summary()}")
165
175
 
166
- return len(emails_that_were_printed)
176
+ logger.warning(f"Rewrote {len(Email.rewritten_header_ids)} headers of {len(epstein_files.emails)} emails")
177
+ return len(already_printed_emails)
167
178
 
168
179
 
169
- def print_text_messages(epstein_files: EpsteinFiles) -> None:
180
+ def _print_text_messages(epstein_files: EpsteinFiles) -> None:
170
181
  print_section_header('Text Messages')
171
182
  print_centered("(conversations are sorted chronologically based on timestamp of first message)\n", style='gray30')
172
-
173
- if len(specified_names) == 0:
174
- log_files = epstein_files.imessage_logs
175
- else:
176
- log_files = flatten([epstein_files.imessage_logs_for(name) for name in specified_names])
183
+ authors: list[str | None] = specified_names if specified_names else [JEFFREY_EPSTEIN]
184
+ log_files = epstein_files.imessage_logs_for(authors)
177
185
 
178
186
  for log_file in log_files:
179
187
  console.print(Padding(log_file))
@@ -182,9 +190,9 @@ def print_text_messages(epstein_files: EpsteinFiles) -> None:
182
190
  epstein_files.print_imessage_summary()
183
191
 
184
192
 
185
- def print_json_stats(epstein_files: EpsteinFiles) -> None:
193
+ def _print_json_stats(epstein_files: EpsteinFiles) -> None:
186
194
  console.print(Panel('JSON Stats Dump', expand=True, style='reverse bold'), '\n')
187
- print_json(f"{MESSENGER_LOG_CLASS} Sender Counts", epstein_files.imessage_sender_counts(), skip_falsey=True)
195
+ print_json(f"{MESSENGER_LOG_CLASS} Sender Counts", MessengerLog.count_authors(epstein_files.imessage_logs), skip_falsey=True)
188
196
  print_json(f"{EMAIL_CLASS} Author Counts", epstein_files.email_author_counts, skip_falsey=True)
189
197
  print_json(f"{EMAIL_CLASS} Recipient Counts", epstein_files.email_recipient_counts, skip_falsey=True)
190
198
  print_json("Email signature_substitution_countss", epstein_files.email_signature_substitution_counts(), skip_falsey=True)
@@ -8,7 +8,7 @@ from rich.text import Text
8
8
  from epstein_files.documents.document import CLOSE_PROPERTIES_CHAR, Document
9
9
  from epstein_files.util.constant.names import UNKNOWN
10
10
  from epstein_files.util.constants import FALLBACK_TIMESTAMP
11
- from epstein_files.util.file_cfg import MessageCfg
11
+ from epstein_files.util.doc_cfg import CommunicationCfg
12
12
  from epstein_files.util.highlighted_group import get_style_for_name
13
13
  from epstein_files.util.rich import key_value_txt
14
14
 
@@ -20,7 +20,7 @@ class Communication(Document):
20
20
  """Superclass for Email and MessengerLog."""
21
21
  author_style: str = 'white'
22
22
  author_txt: Text = field(init=False)
23
- config: MessageCfg | None = None
23
+ config: CommunicationCfg | None = None
24
24
  timestamp: datetime = FALLBACK_TIMESTAMP # TODO this default sucks (though it never happens)
25
25
 
26
26
  def __post_init__(self):
@@ -31,22 +31,22 @@ class Communication(Document):
31
31
  def author_or_unknown(self) -> str:
32
32
  return self.author or UNKNOWN
33
33
 
34
- def description(self) -> Text:
35
- return self._description().append(CLOSE_PROPERTIES_CHAR)
36
-
37
- def is_attribution_uncertain(self) -> bool | None:
38
- return self.config and self.config.is_attribution_uncertain
34
+ def is_attribution_uncertain(self) -> bool:
35
+ return bool(self.config and self.config.is_attribution_uncertain)
39
36
 
40
37
  def raw_document_link_txt(self, _style: str = '', include_alt_link: bool = True) -> Text:
41
38
  """Overrides super() method to apply self.author_style."""
42
39
  return super().raw_document_link_txt(self.author_style, include_alt_link=include_alt_link)
43
40
 
41
+ def summary(self) -> Text:
42
+ return self._summary().append(CLOSE_PROPERTIES_CHAR)
43
+
44
44
  def timestamp_without_seconds(self) -> str:
45
45
  return TIMESTAMP_SECONDS_REGEX.sub('', str(self.timestamp))
46
46
 
47
- def _description(self) -> Text:
47
+ def _summary(self) -> Text:
48
48
  """One line summary mostly for logging."""
49
- txt = super().description().append(', ')
49
+ txt = super().summary().append(', ')
50
50
  return txt.append(key_value_txt('author', Text(f"'{self.author_or_unknown()}'", style=self.author_style)))
51
51
 
52
52
 
@@ -1,6 +1,6 @@
1
1
  import logging
2
2
  import re
3
- from dataclasses import dataclass, field
3
+ from dataclasses import asdict, dataclass, field
4
4
  from datetime import datetime
5
5
  from pathlib import Path
6
6
  from subprocess import run
@@ -14,33 +14,28 @@ from rich.text import Text
14
14
  from epstein_files.util.constant.names import *
15
15
  from epstein_files.util.constant.strings import *
16
16
  from epstein_files.util.constant.urls import *
17
- from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP, VI_DAILY_NEWS_ARTICLE
18
- from epstein_files.util.file_cfg import FileCfg, MessageCfg
19
- from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize
20
- from epstein_files.util.env import args, logger
21
- from epstein_files.util.file_helper import DOCS_DIR, file_stem_for_id, extract_file_id, file_size_str, is_local_extract_file
22
- from epstein_files.util.rich import SYMBOL_STYLE, console, highlighter, key_value_txt, logger, link_text_obj
17
+ from epstein_files.util.constants import ALL_FILE_CONFIGS, FALLBACK_TIMESTAMP
18
+ from epstein_files.util.data import collapse_newlines, date_str, iso_timestamp, listify, patternize, without_nones
19
+ from epstein_files.util.doc_cfg import EmailCfg, DocCfg, Metadata, TextCfg
20
+ from epstein_files.util.env import args
21
+ from epstein_files.util.file_helper import (DOCS_DIR, file_stem_for_id, extract_file_id, file_size,
22
+ file_size_str, is_local_extract_file)
23
+ from epstein_files.util.logging import DOC_TYPE_STYLES, FILENAME_STYLE, logger
24
+ from epstein_files.util.rich import SYMBOL_STYLE, console, highlighter, key_value_txt, link_text_obj
25
+ from epstein_files.util.search_result import MatchedLine
23
26
 
24
- WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
27
+ CLOSE_PROPERTIES_CHAR = ']'
25
28
  HOUSE_OVERSIGHT = HOUSE_OVERSIGHT_PREFIX.replace('_', ' ').strip()
26
- MIN_DOCUMENT_ID = 10477
27
29
  INFO_INDENT = 2
28
30
  INFO_PADDING = (0, 0, 0, INFO_INDENT)
31
+ MAX_TOP_LINES_LEN = 4000 # Only for logging
32
+ MIN_DOCUMENT_ID = 10477
33
+ LOCAL_EXTRACT_REGEX = re.compile(r"_\d$")
34
+ WHITESPACE_REGEX = re.compile(r"\s{2,}|\t|\n", re.MULTILINE)
29
35
 
30
- CLOSE_PROPERTIES_CHAR = ']'
31
- MAX_EXTRACTED_TIMESTAMPS = 6
32
36
  MIN_TIMESTAMP = datetime(1991, 1, 1)
33
37
  MID_TIMESTAMP = datetime(2007, 1, 1)
34
38
  MAX_TIMESTAMP = datetime(2020, 1, 1)
35
- VI_DAILY_NEWS_REGEX = re.compile(r'virgin\s*is[kl][ai]nds\s*daily\s*news', re.IGNORECASE)
36
-
37
- DOC_TYPE_STYLES = {
38
- DOCUMENT_CLASS: 'grey69',
39
- EMAIL_CLASS: 'sea_green2',
40
- JSON_FILE_CLASS: 'sandy_brown',
41
- MESSENGER_LOG_CLASS: 'cyan',
42
- OTHER_FILE_CLASS: 'grey69',
43
- }
44
39
 
45
40
  FILENAME_MATCH_STYLES = [
46
41
  'dark_green',
@@ -48,6 +43,13 @@ FILENAME_MATCH_STYLES = [
48
43
  'spring_green4',
49
44
  ]
50
45
 
46
+ METADATA_FIELDS = [
47
+ 'author',
48
+ 'file_id',
49
+ 'num_lines',
50
+ 'timestamp'
51
+ ]
52
+
51
53
  OCR_REPAIRS = {
52
54
  re.compile(r'\.corn\b'): '.com',
53
55
  re.compile('ln(adequate|dyke)'): r'In\1',
@@ -61,7 +63,7 @@ class Document:
61
63
  file_path: Path
62
64
  # Optional fields
63
65
  author: str | None = None
64
- config: FileCfg | MessageCfg | None = None
66
+ config: EmailCfg | DocCfg | TextCfg | None = None
65
67
  file_id: str = field(init=False)
66
68
  filename: str = field(init=False)
67
69
  is_duplicate: bool = False
@@ -72,8 +74,8 @@ class Document:
72
74
  timestamp: datetime | None = None
73
75
  url_slug: str = field(init=False) # e.g. 'HOUSE_OVERSIGHT_123456
74
76
 
75
- # Class variable; only used to cycle color of output when using lines_match()
76
- file_matching_idx: ClassVar[int] = 0
77
+ # Class variable overridden in JsonFile
78
+ strip_whitespace: ClassVar[bool] = True
77
79
 
78
80
  def __post_init__(self):
79
81
  self.filename = self.file_path.name
@@ -82,12 +84,12 @@ class Document:
82
84
  self.is_duplicate = bool(self.config.dupe_of_id) if self.config else False
83
85
 
84
86
  if self.is_local_extract_file():
85
- self.url_slug = file_stem_for_id(self.file_id)
87
+ self.url_slug = LOCAL_EXTRACT_REGEX.sub('', file_stem_for_id(self.file_id))
86
88
  cfg_type = type(self.config).__name__ if self.config else None
87
89
 
88
90
  # Coerce DocCfg for court docs etc. to EmailCfg for email files extracted from that document
89
- if self.document_type() == EMAIL_CLASS and self.config and cfg_type != MessageCfg.__name__:
90
- self.config = MessageCfg.from_file_cfg(self.config)
91
+ if self.class_name() == EMAIL_CLASS and self.config and cfg_type != EmailCfg.__name__:
92
+ self.config = EmailCfg.from_doc_cfg(self.config)
91
93
  else:
92
94
  self.url_slug = self.file_path.stem
93
95
 
@@ -96,41 +98,30 @@ class Document:
96
98
  self._extract_author()
97
99
  self.timestamp = self._extract_timestamp()
98
100
 
101
+ def class_name(self) -> str:
102
+ """Annoying workaround for circular import issues and isinstance()."""
103
+ return str(type(self).__name__)
104
+
99
105
  def configured_description(self) -> str | None:
100
- return self.config.description if self.config else None
106
+ """Overloaded in OtherFile."""
107
+ if self.config and self.config.description:
108
+ return f"({self.config.description})"
101
109
 
102
110
  def date_str(self) -> str | None:
103
111
  return date_str(self.timestamp)
104
112
 
105
- def description(self) -> Text:
106
- """Mostly for logging. Brackets are left open for subclasses to add stuff."""
107
- txt = Text('').append(self.url_slug, style='magenta')
108
- txt.append(f' {self.document_type()}', style=self.document_type_style())
109
-
110
- if self.timestamp:
111
- txt.append(' (', style=SYMBOL_STYLE)
112
- txt.append(f"{iso_timestamp(self.timestamp)}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
113
-
114
- txt.append(" [").append(key_value_txt('num_lines', Text(f"{self.num_lines}", style='cyan')))
115
- txt.append(', ').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
116
- return txt
117
-
118
113
  def description_panel(self, include_hints: bool = False) -> Panel:
119
114
  """Panelized summary() plus optional hints(), used in search results."""
120
115
  hints = [Text('', style='italic').append(h) for h in (self.hints() if include_hints else [])]
121
- return Panel(Group(*([self.description()] + hints)), border_style=self.document_type_style(), expand=False)
122
-
123
- def document_type(self) -> str:
124
- """Annoying workaround for circular import issues and isinstance()."""
125
- return str(type(self).__name__)
116
+ return Panel(Group(*([self.summary()] + hints)), border_style=self.document_type_style(), expand=False)
126
117
 
127
118
  def document_type_style(self) -> str:
128
- return DOC_TYPE_STYLES[self.document_type()]
119
+ return DOC_TYPE_STYLES[self.class_name()]
129
120
 
130
121
  def duplicate_file_txt(self) -> Text:
131
122
  """If the file is a dupe make a nice message to explain what file it's a duplicate of."""
132
123
  if not self.config or not self.config.dupe_of_id:
133
- raise RuntimeError(f"duplicate_file_txt() called on {self.description()} but not a dupe! config:\n\n{self.config}")
124
+ raise RuntimeError(f"duplicate_file_txt() called on {self.summary()} but not a dupe! config:\n\n{self.config}")
134
125
 
135
126
  txt = Text(f"Not showing ", style='white dim italic').append(epstein_media_doc_link_txt(self.file_id, style='cyan'))
136
127
  txt.append(f" because it's {self.config.duplicate_reason()} ")
@@ -154,6 +145,9 @@ class Document:
154
145
  hints = [Padding(hint, INFO_PADDING) for hint in self.hints()]
155
146
  return Group(*([panel] + hints))
156
147
 
148
+ def file_size(self) -> int:
149
+ return file_size(self.file_path)
150
+
157
151
  def file_size_str(self) -> str:
158
152
  return file_size_str(self.file_path)
159
153
 
@@ -162,16 +156,10 @@ class Document:
162
156
  hints = listify(self.info_txt())
163
157
  hint_msg = self.configured_description()
164
158
 
165
- if self.document_type() == OTHER_FILE_CLASS:
166
- if not hint_msg and VI_DAILY_NEWS_REGEX.search(self.text):
167
- hint_msg = VI_DAILY_NEWS_ARTICLE
168
- elif hint_msg:
169
- hint_msg = f"({hint_msg})"
170
-
171
159
  if hint_msg:
172
160
  hints.append(highlighter(Text(hint_msg, style='white dim italic')))
173
161
 
174
- return hints
162
+ return without_nones(hints)
175
163
 
176
164
  def info_txt(self) -> Text | None:
177
165
  """Secondary info about this file (recipients, level of certainty, etc). Overload in subclasses."""
@@ -181,32 +169,42 @@ class Document:
181
169
  """True if file created by extracting text from a court doc (identifiable from filename e.g. HOUSE_OVERSIGHT_012345_1.txt)."""
182
170
  return is_local_extract_file(self.filename)
183
171
 
184
- def lines_matching_txt(self, _pattern: re.Pattern | str) -> list[Text]:
185
- """Return lines matching a regex as colored list[Text]."""
186
- pattern = patternize(_pattern)
187
- matched_lines = [line for line in self.lines if pattern.search(line)]
188
-
189
- if len(matched_lines) == 0:
190
- return []
191
-
192
- file_style = FILENAME_MATCH_STYLES[type(self).file_matching_idx % len(FILENAME_MATCH_STYLES)]
193
- type(self).file_matching_idx += 1
194
-
195
- return [
196
- Text('').append(self.file_path.name, style=file_style).append(':').append(line)
197
- for line in matched_lines
198
- ]
199
-
200
172
  def log(self, msg: str, level: int = logging.WARNING):
201
- """Log with [file_id] as a prefix."""
202
- logger.log(level, f"[{self.file_id}] {msg}")
173
+ """Log with the document's url_slug as a prefix."""
174
+ logger.log(level, f"{self.url_slug} {msg}")
203
175
 
204
176
  def log_top_lines(self, n: int = 10, msg: str = '', level: int = logging.INFO) -> None:
205
177
  """Log first 'n' lines of self.text at 'level'. 'msg' can be optionally provided."""
206
178
  separator = '\n\n' if '\n' in msg else '. '
207
- msg = f"{msg + separator if msg else ''}Top lines of '{self.filename}' ({self.num_lines} lines):"
179
+ msg = (msg + separator) if msg else ''
180
+ msg = f"{self.filename}: {msg}First {n} lines:"
208
181
  logger.log(level, f"{msg}\n\n{self.top_lines(n)}\n")
209
182
 
183
+ def matching_lines(self, _pattern: re.Pattern | str) -> list[MatchedLine]:
184
+ """Return lines matching a regex as colored list[Text]."""
185
+ pattern = patternize(_pattern)
186
+ return [MatchedLine(line, i) for i, line in enumerate(self.lines) if pattern.search(line)]
187
+
188
+ def metadata(self) -> Metadata:
189
+ metadata = self.config.metadata() if self.config else {}
190
+ metadata.update({k: v for k, v in asdict(self).items() if k in METADATA_FIELDS and v is not None})
191
+ metadata['bytes'] = self.file_size()
192
+ metadata['filename'] = f"{self.url_slug}.txt"
193
+ metadata['type'] = self.class_name()
194
+
195
+ if self.is_local_extract_file():
196
+ metadata['extracted_file'] = {
197
+ 'explanation': 'This file was extracted from a court filing, not distributed directly. A copy can be found on github.',
198
+ 'extracted_from_file': self.url_slug + '.txt',
199
+ 'extracted_file_url': extracted_file_url(self.filename),
200
+ }
201
+
202
+ return metadata
203
+
204
+ def raw_text(self) -> str:
205
+ with open(self.file_path) as f:
206
+ return f.read()
207
+
210
208
  def raw_document_link_txt(self, style: str = '', include_alt_link: bool = False) -> Text:
211
209
  """Returns colored links to epstein.media and epsteinweb in a Text object."""
212
210
  txt = Text('', style='white' if include_alt_link else ARCHIVE_LINK_COLOR)
@@ -215,11 +213,13 @@ class Document:
215
213
  txt.append(self.epstein_web_link(style=style))
216
214
 
217
215
  if include_alt_link:
216
+ txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
218
217
  txt.append(' (').append(self.epstein_media_link(style='white dim', link_txt=EPSTEIN_MEDIA)).append(')')
219
218
  else:
220
219
  txt.append(self.epstein_media_link(style=style))
221
220
 
222
221
  if include_alt_link:
222
+ txt.append(' (').append(self.epsteinify_link(style='white dim', link_txt=EPSTEINIFY)).append(')')
223
223
  txt.append(' (').append(self.epstein_web_link(style='white dim', link_txt=EPSTEIN_WEB)).append(')')
224
224
 
225
225
  return txt
@@ -234,8 +234,32 @@ class Document:
234
234
 
235
235
  return text
236
236
 
237
+ def sort_key(self) -> tuple[datetime, str, int]:
238
+ if self.config and self.config.dupe_of_id:
239
+ sort_id = self.config.dupe_of_id
240
+ dupe_idx = 1
241
+ else:
242
+ sort_id = self.file_id
243
+ dupe_idx = 0
244
+
245
+ return (self.timestamp or FALLBACK_TIMESTAMP, sort_id, dupe_idx)
246
+
247
+ def summary(self) -> Text:
248
+ """Summary of this file for logging. Brackets are left open for subclasses to add stuff."""
249
+ txt = Text('').append(self.class_name(), style=self.document_type_style())
250
+ txt.append(f" {self.url_slug}", style=FILENAME_STYLE)
251
+
252
+ if self.timestamp:
253
+ timestamp_str = iso_timestamp(self.timestamp).removesuffix(' 00:00:00')
254
+ txt.append(' (', style=SYMBOL_STYLE)
255
+ txt.append(f"{timestamp_str}", style=TIMESTAMP_DIM).append(')', style=SYMBOL_STYLE)
256
+
257
+ txt.append(' [').append(key_value_txt('size', Text(self.file_size_str(), style='aquamarine1')))
258
+ txt.append(", ").append(key_value_txt('lines', Text(f"{self.num_lines}", style='cyan')))
259
+ return txt
260
+
237
261
  def top_lines(self, n: int = 10) -> str:
238
- return '\n'.join(self.lines[0:n])
262
+ return '\n'.join(self.lines[0:n])[:MAX_TOP_LINES_LEN]
239
263
 
240
264
  def _border_style(self) -> str:
241
265
  """Should be overloaded in subclasses."""
@@ -250,21 +274,20 @@ class Document:
250
274
  """Should be implemented in subclasses."""
251
275
  pass
252
276
 
253
- def _load_file(self):
277
+ def _load_file(self) -> str:
254
278
  """Remove BOM and HOUSE OVERSIGHT lines, strip whitespace."""
255
- with open(self.file_path) as f:
256
- text = f.read()
257
- text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text # remove BOM
258
- text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
259
- lines = [l.strip() for l in text.split('\n') if not l.startswith(HOUSE_OVERSIGHT)]
260
- lines = lines[1:] if (len(lines) > 1 and lines[0] == '>>') else lines
261
- return collapse_newlines('\n'.join(lines))
279
+ text = self.raw_text()
280
+ text = text[1:] if (len(text) > 0 and text[0] == '\ufeff') else text # remove BOM
281
+ text = self.repair_ocr_text(OCR_REPAIRS, text.strip())
282
+ lines = [l.strip() for l in text.split('\n') if not l.startswith(HOUSE_OVERSIGHT)]
283
+ return collapse_newlines('\n'.join(lines))
262
284
 
263
285
  def _repair(self) -> None:
264
- """Can optionally be overloaded in subclasses."""
286
+ """Can optionally be overloaded in subclasses to further improve self.text."""
265
287
  pass
266
288
 
267
289
  def _set_computed_fields(self, lines: list[str] | None = None, text: str | None = None) -> None:
290
+ """Sets all fields derived from self.text based on either 'lines' or 'text' arg."""
268
291
  if (lines and text):
269
292
  raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (got both)")
270
293
  elif lines is not None:
@@ -275,7 +298,7 @@ class Document:
275
298
  raise RuntimeError(f"[{self.filename}] Either 'lines' or 'text' arg must be provided (neither was)")
276
299
 
277
300
  self.length = len(self.text)
278
- self.lines = [line.strip() for line in self.text.split('\n')]
301
+ self.lines = [line.strip() if self.strip_whitespace else line for line in self.text.split('\n')]
279
302
  self.num_lines = len(self.lines)
280
303
 
281
304
  def _write_clean_text(self, output_path: Path) -> None:
@@ -291,16 +314,17 @@ class Document:
291
314
 
292
315
  logger.warning(f"Wrote {self.length} chars of cleaned {self.filename} to {output_path}.")
293
316
 
294
- def __rich_console__(self, _console: Console, _options: ConsoleOptions) -> RenderResult:
317
+ def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
295
318
  yield self.file_info_panel()
296
319
  text_panel = Panel(highlighter(self.text), border_style=self._border_style(), expand=False)
297
320
  yield Padding(text_panel, (0, 0, 1, INFO_INDENT))
298
321
 
299
322
  def __str__(self) -> str:
300
- return self.description().plain
323
+ return self.summary().plain
301
324
 
302
325
  @staticmethod
303
326
  def diff_files(files: list[str]) -> None:
327
+ """Diff the contents of two Documents after all cleanup, BOM removal, etc."""
304
328
  if len(files) != 2:
305
329
  raise RuntimeError('Need 2 files')
306
330
  elif files[0] == files[1]:
@@ -330,7 +354,7 @@ class Document:
330
354
 
331
355
  @staticmethod
332
356
  def sort_by_timestamp(docs: Sequence['DocumentType']) -> list['DocumentType']:
333
- return sorted(docs, key=lambda doc: [doc.timestamp or FALLBACK_TIMESTAMP, doc.file_id])
357
+ return sorted(docs, key=lambda doc: doc.sort_key())
334
358
 
335
359
  @classmethod
336
360
  def uniquify(cls, documents: Sequence['DocumentType']) -> Sequence['DocumentType']: