epstein-files 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,7 +13,8 @@ from dateutil.parser import parse
  from rich.text import Text

  from epstein_files.util.constant import names
- from epstein_files.util.env import args, logger
+ from epstein_files.util.env import args
+ from epstein_files.util.logging import logger

  T = TypeVar('T')

@@ -26,14 +27,6 @@ PACIFIC_TZ = tz.gettz("America/Los_Angeles")
  TIMEZONE_INFO = {"PST": PACIFIC_TZ, "PDT": PACIFIC_TZ} # Suppresses annoying warnings from parse() calls


- def collapse_newlines(text: str) -> str:
-     return MULTINEWLINE_REGEX.sub('\n\n', text)
-
-
- def date_str(timestamp: datetime | None) -> str | None:
-     return timestamp.isoformat()[0:10] if timestamp else None
-
-
  def dict_sets_to_lists(d: dict[str, set]) -> dict[str, list]:
      return {k: sorted(list(v)) for k, v in d.items()}

@@ -70,14 +63,19 @@ def flatten(_list: list[list[T]]) -> list[T]:
      return list(itertools.chain.from_iterable(_list))


- def iso_timestamp(dt: datetime) -> str:
-     return dt.isoformat().replace('T', ' ')
+ def json_safe(d: dict) -> dict:
+     return {
+         'None' if k is None else k: v.isoformat() if isinstance(v, datetime) else v
+         for k,v in d.items()
+     }


- def listify(listlike: list | str | Text | None) -> list:
+ def listify(listlike) -> list:
      """Create a list of 'listlike'. Returns empty list if 'listlike' is None or empty string."""
      if isinstance(listlike, list):
          return listlike
+     elif listlike is None:
+         return [None]
      elif listlike:
          return [listlike]
      else:
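
For illustration, a sketch of how the two new helpers behave, assuming the definitions above. Note that the new listify(None) branch returns [None], while the unchanged docstring still promises an empty list:

    from datetime import datetime

    # json_safe() makes a dict JSON-serializable: None keys become the string
    # 'None' and datetime values become ISO-8601 strings; other values pass through.
    json_safe({None: datetime(2019, 7, 6, 12, 30), 'count': 3})
    # -> {'None': '2019-07-06T12:30:00', 'count': 3}

    listify(['a'])  # -> ['a']
    listify('a')    # -> ['a']
    listify(None)   # -> [None] (new in 1.0.1; previously [])
    listify('')     # -> [] (per the docstring; the else branch falls outside this hunk)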
@@ -110,22 +108,10 @@ def sort_dict(d: dict[str | None, int] | dict[str, int]) -> list[tuple[str | Non
      return sorted(d.items(), key=sort_key)


- @dataclass
- class Timer:
-     started_at: float = field(default_factory=lambda: time.perf_counter())
-     checkpoint_at: float = field(default_factory=lambda: time.perf_counter())
-
-     def print_at_checkpoint(self, msg: str) -> None:
-         logger.warning(f"{msg} in {self.seconds_since_checkpoint()}")
-         self.checkpoint_at = time.perf_counter()
-
-     def seconds_since_checkpoint(self) -> str:
-         return f"{(time.perf_counter() - self.checkpoint_at):.2f} seconds"
-
-     def seconds_since_start(self) -> str:
-         return f"{(time.perf_counter() - self.started_at):.2f} seconds"
-
-
+ collapse_newlines = lambda text: MULTINEWLINE_REGEX.sub('\n\n', text)
+ date_str = lambda dt: dt.isoformat()[0:10] if dt else None
  escape_double_quotes = lambda text: text.replace('"', r'\"')
  escape_single_quotes = lambda text: text.replace("'", r"\'")
+ iso_timestamp = lambda dt: dt.isoformat().replace('T', ' ')
  uniquify = lambda _list: list(set(_list))
+ without_nones = lambda _list: [e for e in _list if e]
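
The Timer class is gone and the string/date helpers are now module-level one-liners. A quick sketch of their behavior, assuming the lambdas above:

    from datetime import datetime

    date_str(datetime(2019, 7, 6))           # -> '2019-07-06'
    date_str(None)                           # -> None
    iso_timestamp(datetime(2019, 7, 6, 12))  # -> '2019-07-06 12:00:00'
    without_nones(['a', None, '', 'b'])      # -> ['a', 'b'] (drops every falsy element, not just None)
    uniquify(['a', 'a', 'b'])                # -> e.g. ['a', 'b'] (set-based, so order is not guaranteed)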
@@ -6,22 +6,27 @@ from typing import Generator, Literal
  from dateutil.parser import parse

- from epstein_files.util.constant.names import constantize_name
- from epstein_files.util.constant.strings import AUTHOR
+ from epstein_files.util.constant.names import *
+ from epstein_files.util.constant.strings import *
+ from epstein_files.util.data import without_nones

- DuplicateType = Literal['same', 'earlier', 'quoted', 'redacted']
+ DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
+ Metadata = dict[str, bool | datetime | int | str | list[str | None] | dict[str, bool | str]]

+ # Misc
+ CONSTANTIZE_NAMES = False # A flag set to True that causes repr() of these classes to return strings of usable code
  INDENT = ' '
  INDENT_NEWLINE = f'\n{INDENT}'
  INDENTED_JOIN = f',{INDENT_NEWLINE}'
- CONSTANTIZE_NAMES = False # A flag set to True that causes repr() of these classes to return strings of usable code
- MAX_LINE_LENGTH = 250
+ MAX_LINE_LENGTH = 150
+ REPUTATION_MGMT = f'{REPUTATION} management'
+ SAME = 'same'

- REASON_MAPPING: dict[DuplicateType, str] = {
-     'earlier': 'earlier draft of',
+ DUPE_TYPE_STRS: dict[DuplicateType, str] = {
+     'earlier': 'an earlier draft of',
      'quoted': 'quoted in full in',
-     'redacted': 'redacted version of',
-     'same': 'the same as',
+     'redacted': 'a redacted version of',
+     SAME: 'the same as',
  }

  FIELD_SORT_KEY = {
@@ -30,57 +35,114 @@ FIELD_SORT_KEY = {
      'attribution_reason': 'zz',
  }

+ FINANCIAL_REPORTS_AUTHORS = [
+     BOFA,
+     DEUTSCHE_BANK,
+     ELECTRON_CAPITAL_PARTNERS,
+     GOLDMAN_INVESTMENT_MGMT,
+     'Invesco',
+     JP_MORGAN,
+     'Morgan Stanley',
+     'S&P',
+ ]
+
+ # Fields like timestamp and author are better added from the Document object
+ INVALID_FOR_METADATA = [
+     'actual_text',
+     'date',
+     'id',
+     'timestamp',
+     'was_generated',
+ ]
+

  @dataclass(kw_only=True)
- class FileCfg:
-     """Convenience class that encapsulates configuring info about files that need to be manually configured.
+ class DocCfg:
+     """
+     Encapsulates info about files that needs to be manually configured because it cannot be programmatically inferred.

      Attributes:
          id (str): ID of file
          author (str | None): Author of the document (if any)
+         category (str | None): Type of file
          date (str | None): If passed will be immediately parsed into the 'timestamp' field
          dupe_of_id (str | None): If this is a dupe the ID of the duplicated file. This file will be suppressed
-         dupe_type (DuplicateType | None): The type of duplicate this file is (redacted, quoted, etc.)
-         duplicate_ids (list[str]): Inverse of 'dupe_of_id' - this file will NOT be suppressed but 'duplicate_ids' will be.
+         dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
+         duplicate_ids (list[str]): Inverse of 'dupe_of_id' - this file will NOT be suppressed but 'duplicate_ids' will be
+         is_interesting (bool): Override other considerations and always consider this file interesting
          timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
          was_generated (bool): True if this object was generated by the duplicate_cfgs() method
      """
      id: str
      author: str | None = None
+     category: str | None = None
      date: str | None = None
      description: str | None = None
      dupe_of_id: str | None = None
      dupe_type: DuplicateType | None = None
      duplicate_ids: list[str] = field(default_factory=list)
+     is_interesting: bool = False
      timestamp: datetime | None = None
-     was_generated: bool = False # True if this object was generated by duplicate_cfgs()
+     was_generated: bool = False

      def __post_init__(self):
-         if self.dupe_of_id:
-             self.dupe_type = self.dupe_type or 'same'
-
          if self.date:
              self.timestamp = parse(self.date)

+         if self.dupe_of_id or self.duplicate_ids:
+             self.dupe_type = self.dupe_type or SAME
+
      def duplicate_reason(self) -> str | None:
          if self.dupe_type is not None:
-             return REASON_MAPPING[self.dupe_type]
+             return DUPE_TYPE_STRS[self.dupe_type]

-     def duplicate_cfgs(self) -> Generator['FileCfg', None, None]:
+     def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
+         """Create synthetic DocCfg objects that set the 'dupe_of_id' field to point back to this object."""
          for id in self.duplicate_ids:
              dupe_cfg = deepcopy(self)
              dupe_cfg.id = id
              dupe_cfg.dupe_of_id = self.id
-             dupe_cfg.dupe_type = self.dupe_type or 'same'
+             dupe_cfg.duplicate_ids = []
+             dupe_cfg.dupe_type = self.dupe_type
              dupe_cfg.was_generated = True
              yield dupe_cfg

+     def info_str(self) -> str | None:
+         """String that summarizes what is known about this document."""
+         if self.category == REPUTATION:
+             return f"{REPUTATION_MGMT}: {self.description}"
+         elif self.author and self.description:
+             if self.category in [ACADEMIA, BOOK]:
+                 return self.title_by_author()
+             elif self.category == FINANCE and self.author in FINANCIAL_REPORTS_AUTHORS:
+                 return f"{self.author} report: '{self.description}'"
+         elif self.category and self.author is None and self.description is None:
+             return self.category
+
+         pieces = without_nones([self.author, self.description])
+         return ' '.join(pieces) if pieces else None
+
+     def metadata(self) -> Metadata:
+         non_null_fields = {k: v for k, v in asdict(self).items() if v and k not in INVALID_FOR_METADATA}
+
+         if self.category in [EMAIL, TEXT_MESSAGE]:
+             del non_null_fields['category']
+
+         return non_null_fields
+
      def non_null_field_names(self) -> list[str]:
          return [f.name for f in self.sorted_fields() if getattr(self, f.name)]

      def sorted_fields(self) -> list[Field]:
          return sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name))

+     def title_by_author(self) -> str:
+         if not (self.author and self.description):
+             raise RuntimeError(f"Can't call title_by_author() without author and description!")
+
+         title = self.description if '"' in self.description else f"'{self.description}'"
+         return f"{title} by {self.author}"
+
      def _props_strs(self) -> list[str]:
          props = []
          add_prop = lambda f, value: props.append(f"{f.name}={value}")
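
A minimal sketch of the reworked duplicate handling, using hypothetical IDs. Per __post_init__ above, configuring either dupe_of_id or duplicate_ids now defaults dupe_type to SAME, and the generated configs no longer carry duplicate_ids of their own:

    cfg = DocCfg(id='012345', duplicate_ids=['012346', '012347'], dupe_type='redacted')

    for dupe in cfg.duplicate_cfgs():
        # Each synthetic config points back at the original and is flagged as generated.
        print(dupe.id, dupe.dupe_of_id, dupe.dupe_type, dupe.was_generated)
        # 012346 012345 redacted True
        # 012347 012345 redacted True

    cfg.duplicate_reason()  # -> 'a redacted version of'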
@@ -92,14 +154,16 @@ class FileCfg:
                  continue
              elif _field.name == AUTHOR:
                  add_prop(_field, constantize_name(str(value)) if CONSTANTIZE_NAMES else f"'{value}'")
+             elif _field.name == 'category' and value in [EMAIL, TEXT_MESSAGE]:
+                 continue
              elif _field.name == 'recipients' and isinstance(value, list):
                  recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
                  add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
+             elif _field.name == 'timestamp' and self.date is not None:
+                 continue # Don't print both timestamp and date
              elif isinstance(value, datetime):
                  value_str = re.sub(' 00:00:00', '', str(value))
                  add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
-             elif _field.name == 'description':
-                 add_prop(_field, value.strip())
              elif isinstance(value, str):
                  if "'" in value:
                      value = '"' + value.replace('"', r'\"') + '"'
@@ -112,7 +176,7 @@ class FileCfg:

          return props

-     def __eq__(self, other: 'FileCfg') -> bool:
+     def __eq__(self, other: 'DocCfg') -> bool:
          """Return True if everything matches other than the two 'dupe_' fields ('duplicate_ids' is compared)."""
          for _field in self.sorted_fields():
              if _field.name == 'id' or _field.name.startswith('dupe'):
@@ -127,7 +191,7 @@ class FileCfg:
          type_str = f"{type(self).__name__}("
          single_line_repr = type_str + ', '.join(props) + f')'

-         if (len(single_line_repr) < MAX_LINE_LENGTH or self.non_null_field_names() == ['id', 'description']) and '#' not in (self.description or ''):
+         if len(single_line_repr) < MAX_LINE_LENGTH:
              repr_str = single_line_repr
          else:
              repr_str = f"{type_str}{INDENT_NEWLINE}" + INDENTED_JOIN.join(props)
@@ -142,31 +206,53 @@ class FileCfg:


  @dataclass(kw_only=True)
- class MessageCfg(FileCfg):
+ class CommunicationCfg(DocCfg):
      """
-     Convenience class to unite various configured properties for a given Communication file.
      Manual config is always required for MessengerLog author attribution. It's also often needed for Email
      files to handle the terrible OCR text that Congress provided which messes up a lot of the email headers.

      Attributes:
-         actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
          attribution_reason (str | None): Optional explanation of why this email was attributed to this author.
          is_attribution_uncertain (bool): True if we have a good idea of who the author is but are not 100% certain
+     """
+     attribution_reason: str | None = None
+     is_attribution_uncertain: bool = False
+
+     def __repr__(self) -> str:
+         return super().__repr__()
+
+
+ @dataclass(kw_only=True)
+ class EmailCfg(CommunicationCfg):
+     """
+     Attributes:
+         actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
          is_fwded_article (bool): True if this is a newspaper article someone fwded. Used to exclude articles from word counting.
          recipients (list[str | None]): Who received the email
      """
      actual_text: str | None = None # Override for the Email._actual_text() method for particularly broken emails
-     attribution_reason: str | None = None
-     is_attribution_uncertain: bool = False
      is_fwded_article: bool = False
      recipients: list[str | None] = field(default_factory=list)

-     def __eq__(self, other: 'FileCfg') -> bool:
-         return super().__eq__(other)
+     def __post_init__(self):
+         super().__post_init__()
+         self.category = EMAIL

+     @classmethod
+     def from_doc_cfg(cls, cfg: DocCfg) -> 'EmailCfg':
+         return cls(**asdict(cfg))
+
+     # This is necessary for some dumb reason. @dataclass(repr=False) doesn't cut it
      def __repr__(self) -> str:
          return super().__repr__()

-     @classmethod
-     def from_file_cfg(cls, cfg: FileCfg) -> 'MessageCfg':
-         return cls(**asdict(cfg))
+
+ @dataclass(kw_only=True)
+ class TextCfg(CommunicationCfg):
+     def __post_init__(self):
+         super().__post_init__()
+         self.category = TEXT_MESSAGE
+
+     # This is necessary for some dumb reason. @dataclass(repr=False) doesn't cut it
+     def __repr__(self) -> str:
+         return super().__repr__()
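
A sketch of the new hierarchy with hypothetical values: MessageCfg is split into CommunicationCfg and its EmailCfg / TextCfg subclasses, whose __post_init__ pins the category field so callers no longer set it by hand:

    email = EmailCfg(id='012345', author='A. Sender', recipients=['B. Recipient'])
    email.category   # -> EMAIL, set automatically in __post_init__

    text = TextCfg(id='012346')
    text.category    # -> TEXT_MESSAGE

    # Promote a plain DocCfg: this works because every DocCfg field is also an
    # EmailCfg field, and asdict(cfg) is re-passed as keyword arguments.
    email = EmailCfg.from_doc_cfg(DocCfg(id='012347', author='A. Sender'))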
epstein_files/util/env.py CHANGED
@@ -4,7 +4,7 @@ from os import environ
  from pathlib import Path
  from sys import argv

- from rich.logging import RichHandler
+ from epstein_files.util.logging import datefinder_logger, env_log_level, logger

  DEFAULT_WIDTH = 154
  HTML_SCRIPTS = ['generate_html.py', 'count_words.py']
@@ -12,9 +12,8 @@ HTML_SCRIPTS = ['generate_html.py', 'count_words.py']

  parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML page.")
  parser.add_argument('--build', '-b', action='store_true', help='write HTML to docs/index.html')
- parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails')
- parser.add_argument('--all-email-tables', '-aet', action='store_true', help='all email tables (except Epstein)')
- parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of a limited selection')
+ parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
+ parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just interesting ones')
  parser.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
  parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
  parser.add_argument('--output-emails', '-oe', action='store_true', help='generate other files section')
@@ -22,24 +21,25 @@ parser.add_argument('--output-other-files', '-oo', action='store_true', help='ge
  parser.add_argument('--output-texts', '-ot', action='store_true', help='generate other files section')
  parser.add_argument('--pickled', '-p', action='store_true', help='use pickled EpsteinFiles object')
  parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='generate new pickled EpsteinFiles object')
+ parser.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (only used by scripts)')
  parser.add_argument('--sort-alphabetical', '-alpha', action='store_true', help='sort emailers alphabetically in counts table')
  parser.add_argument('--suppress-output', '-s', action='store_true', help='no output to terminal (use with --build)')
- parser.add_argument('--use-epstein-web-links', '-use', action='store_true', help='use epsteinweb.org links instead of epsteinify.com')
- parser.add_argument('--search-other', '-so', action='store_true', help='search for string in non email/text files (only used by search script)')
+ parser.add_argument('--use-epstein-web-links', '-use', action='store_true', help='use epsteinweb.org links instead of epstein.media')
  parser.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use')
  parser.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (only used by search script)')
  parser.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
  parser.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
+ parser.add_argument('--make-clean', '-mc', action='store_true', help='delete all build artifact HTML and JSON files')
  parser.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
+ parser.add_argument('--json-metadata', '-jm', action='store_true', help='dump JSON metadata for all files')
  parser.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats at the end')
  parser.add_argument('positional_args', nargs='*', help='Optional args (only used by helper scripts)')
  args = parser.parse_args()

- is_env_var_set = lambda s: len(environ.get(s) or '') > 0
  current_script = Path(argv[0]).name
+ is_env_var_set = lambda s: len(environ.get(s) or '') > 0
  is_html_script = current_script in HTML_SCRIPTS

- args.deep_debug = args.deep_debug or is_env_var_set('DEEP_DEBUG')
  args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
  args.output_emails = args.output_emails or args.all_emails
  args.output_other_files = args.output_other_files or args.all_other_files
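
A small sketch of the environment handling above (illustrative; note that 1.0.1 drops the DEEP_DEBUG env var fallback that 1.0.0 had):

    # is_env_var_set() treats an empty string the same as an unset variable.
    environ['DEBUG'] = '1'
    is_env_var_set('DEBUG')   # -> True
    environ['DEBUG'] = ''
    is_env_var_set('DEBUG')   # -> False

    # --all-emails / --all-other-files imply their --output-* counterparts,
    # so parse_args(['--all-emails']) ends up with args.output_emails == True.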
@@ -48,27 +48,25 @@ args.width = args.width if is_html_script else None
  specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]


- # Setup logging
- logging.basicConfig(level="NOTSET", format="%(message)s", datefmt="[%X]", handlers=[RichHandler()])
- # logging.basicConfig(level="DEBUG", handlers=[RichHandler()])
- logger = logging.getLogger("rich")
-
+ # Log level args
  if args.deep_debug:
      logger.setLevel(logging.DEBUG)
  elif args.debug:
      logger.setLevel(logging.INFO)
  elif args.suppress_logs:
      logger.setLevel(logging.FATAL)
- else:
+ elif not env_log_level:
      logger.setLevel(logging.WARNING)

- datefinder_logger = logging.getLogger('datefinder') # Suppress annoying output
+ logger.info(f'Log level set to {logger.level}...')
  datefinder_logger.setLevel(logger.level)


  # Massage args that depend on other args to the appropriate state
- if not (args.output_texts or args.output_emails or args.output_other_files):
-     logger.warning(f"No output section chosen; outputting default of texts, selected emails, and other files...")
+ if not (args.json_metadata or args.output_texts or args.output_emails or args.output_other_files):
+     if is_html_script:
+         logger.warning(f"No output section chosen; outputting default of texts, selected emails, and other files...")
+
      args.output_texts = True
      args.output_emails = True
      args.output_other_files = True
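
Logging setup now lives in epstein_files.util.logging; the chain above only overrides it. A hypothetical distillation of the resulting precedence (env_log_level is assumed to reflect a log-level env var read in that module, which this diff doesn't show):

    import logging

    def resolve_level(deep_debug: bool, debug: bool, suppress: bool, env_level) -> int:
        if deep_debug:
            return logging.DEBUG     # --deep-debug wins
        elif debug:
            return logging.INFO      # then --debug
        elif suppress:
            return logging.FATAL     # then --suppress-logs
        elif not env_level:
            return logging.WARNING   # default when no env-configured level
        return env_level             # otherwise the env-configured level stands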
@@ -77,4 +75,4 @@ if args.use_epstein_web_links:
      logger.warning(f"Using epsteinweb.org links instead of epsteinify.com...")

  if args.debug:
-     logger.warning(f"is_html_script={is_html_script}, specified_names={specified_names}, args={args}")
+     logger.warning(f"Invocation args:\nis_html_script={is_html_script},\nspecified_names={specified_names},\nargs={args}")
@@ -3,11 +3,12 @@ from os import environ
  from pathlib import Path
  from sys import exit

- from epstein_files.util.constant.strings import HOUSE_OVERSIGHT_PREFIX
+ from epstein_files.util.constant.strings import FILE_NAME_REGEX, FILE_STEM_REGEX, HOUSE_OVERSIGHT_PREFIX

  EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
  DOCS_DIR_ENV = environ[EPSTEIN_DOCS_DIR_ENV_VAR_NAME]
  DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
+ PICKLED_PATH = Path("the_epstein_files.pkl.gz")

  if not DOCS_DIR_ENV:
      print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!")
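
Setup note, as a sketch (the path is hypothetical): this module resolves the document corpus at import time.

    import os
    os.environ['EPSTEIN_DOCS_DIR'] = '/data/epstein_docs'  # must name an existing directory

    # Caveat: environ[EPSTEIN_DOCS_DIR_ENV_VAR_NAME] raises KeyError when the variable
    # is missing entirely, so the "env var not set!" branch is only reachable when the
    # variable is set to an empty string.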
@@ -16,30 +17,36 @@ elif not DOCS_DIR.exists():
      print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!")
      exit(1)

- JSON_DIR = DOCS_DIR.joinpath('json_files')
  HTML_DIR = Path('docs')
  EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
+ EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
  GH_PAGES_HTML_PATH = HTML_DIR.joinpath('index.html')
+ JSON_METADATA_PATH = HTML_DIR.joinpath('epstein_files_nov_2025_cryptadamus_metadata.json')
  WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_emails_word_count.html')
- EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
- PICKLED_PATH = Path("the_epstein_files.pkl.gz")

- FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}(\d{{6}})")
- FILE_ID_REGEX = re.compile(fr".*{FILE_STEM_REGEX.pattern}(_\d{{1,2}})?(\.txt(\.json)?)?")
+ BUILD_ARTIFACTS = [
+     EPSTEIN_WORD_COUNT_HTML_PATH,
+     GH_PAGES_HTML_PATH,
+     JSON_METADATA_PATH,
+     WORD_COUNT_HTML_PATH,
+ ]
+
+ FILE_ID_REGEX = re.compile(fr".*{FILE_NAME_REGEX.pattern}")
  FILENAME_LENGTH = len(HOUSE_OVERSIGHT_PREFIX) + 6
  KB = 1024
  MB = KB * KB


  # Handles both string and int 'id' args.
- file_stem_for_id = lambda id: f"{HOUSE_OVERSIGHT_PREFIX}{int(id):06d}"
+ id_str = lambda id: f"{int(id):06d}"
  filename_for_id = lambda id: file_stem_for_id(id) + '.txt'


  def coerce_file_stem(filename_or_id: int | str) -> str:
      """Generate a valid file_stem no matter what form the argument comes in."""
      if isinstance(filename_or_id, str) and filename_or_id.startswith(HOUSE_OVERSIGHT_PREFIX):
-         file_stem = file_stem_for_id(extract_file_id(filename_or_id))
+         file_id = extract_file_id(filename_or_id)
+         file_stem = file_stem_for_id(file_id)
      else:
          file_stem = file_stem_for_id(filename_or_id)
@@ -49,33 +56,65 @@ def coerce_file_stem(filename_or_id: int | str) -> str:
      return file_stem


- def extract_file_id(filename: str | Path) -> str:
-     file_match = FILE_ID_REGEX.match(str(filename))
+ def coerce_file_name(filename_or_id: int | str) -> str:
+     return coerce_file_stem(filename_or_id) + '.txt'
+
+
+ def coerce_file_path(filename_or_id: int | str) -> Path:
+     return DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
+
+
+ def extract_file_id(filename_or_id: int | str | Path) -> str:
+     if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
+         return id_str(filename_or_id)
+
+     file_match = FILE_ID_REGEX.match(str(filename_or_id))

      if not file_match:
-         raise RuntimeError(f"Failed to extract file ID from {filename}")
+         raise RuntimeError(f"Failed to extract file ID from {filename_or_id}")

      return file_match.group(1)


+ def file_size(file_path: str | Path) -> int:
+     return Path(file_path).stat().st_size
+
+
  def file_size_str(file_path: str | Path) -> str:
-     file_size = float(Path(file_path).stat().st_size)
+     size = file_size(file_path)
      digits = 2

-     if file_size > MB:
-         size_num = file_size / MB
+     if size > MB:
+         size_num = float(size) / MB
          size_str = 'MB'
-     elif file_size > KB:
-         size_num = file_size / KB
+     elif size > KB:
+         size_num = float(size) / KB
          size_str = 'kb'
          digits = 1
      else:
-         return f"{int(file_size)} b"
+         return f"{size} b"

      return f"{size_num:,.{digits}f} {size_str}"


+ def file_stem_for_id(id: int | str) -> str:
+     if isinstance(id, int) or (isinstance(id, str) and len(id) <= 6):
+         return f"{HOUSE_OVERSIGHT_PREFIX}{id_str(id)}"
+     elif len(id) == 8:
+         return f"{HOUSE_OVERSIGHT_PREFIX}{id}"
+     else:
+         raise RuntimeError(f"Unknown kind of file id {id}")
+
+
  def is_local_extract_file(filename) -> bool:
      """Return true if filename is of form 'HOUSE_OVERSIGHT_029835_1.txt'."""
      file_match = FILE_ID_REGEX.match(str(filename))
      return True if file_match and file_match.group(2) else False
+
+
+ def make_clean() -> None:
+     """Delete all build artifacts."""
+     for build_file in BUILD_ARTIFACTS:
+         if build_file.exists():
+             print(f"Removing build file '{build_file}'...")
+             build_file.unlink()
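
A round-trip sketch of the reshuffled ID helpers, using a hypothetical document ID. This assumes FILE_NAME_REGEX (now imported from constant.strings and not shown in this diff) still captures the six-digit ID as group 1, as the old FILE_STEM_REGEX did, and that HOUSE_OVERSIGHT_PREFIX is 'HOUSE_OVERSIGHT_' per the is_local_extract_file() docstring:

    id_str(29835)                                  # -> '029835'
    file_stem_for_id(29835)                        # -> 'HOUSE_OVERSIGHT_029835'
    coerce_file_name('29835')                      # -> 'HOUSE_OVERSIGHT_029835.txt'
    extract_file_id('HOUSE_OVERSIGHT_029835.txt')  # -> '029835'
    extract_file_id(29835)                         # -> '029835' (ints and short strings now bypass the regex)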