epstein-files 1.2.5__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. epstein_files/__init__.py +55 -23
  2. epstein_files/documents/communication.py +9 -5
  3. epstein_files/documents/document.py +231 -135
  4. epstein_files/documents/doj_file.py +242 -0
  5. epstein_files/documents/doj_files/full_text.py +166 -0
  6. epstein_files/documents/email.py +289 -232
  7. epstein_files/documents/emails/email_header.py +35 -16
  8. epstein_files/documents/emails/emailers.py +223 -0
  9. epstein_files/documents/imessage/text_message.py +2 -3
  10. epstein_files/documents/json_file.py +18 -14
  11. epstein_files/documents/messenger_log.py +23 -39
  12. epstein_files/documents/other_file.py +54 -48
  13. epstein_files/epstein_files.py +65 -29
  14. epstein_files/person.py +151 -94
  15. epstein_files/util/constant/names.py +37 -10
  16. epstein_files/util/constant/output_files.py +2 -0
  17. epstein_files/util/constant/strings.py +14 -7
  18. epstein_files/util/constant/urls.py +17 -0
  19. epstein_files/util/constants.py +556 -391
  20. epstein_files/util/data.py +2 -0
  21. epstein_files/util/doc_cfg.py +44 -33
  22. epstein_files/util/env.py +34 -19
  23. epstein_files/util/file_helper.py +30 -6
  24. epstein_files/util/helpers/debugging_helper.py +13 -0
  25. epstein_files/util/helpers/env_helpers.py +21 -0
  26. epstein_files/util/highlighted_group.py +121 -37
  27. epstein_files/util/layout/left_bar_panel.py +26 -0
  28. epstein_files/util/logging.py +28 -13
  29. epstein_files/util/output.py +49 -40
  30. epstein_files/util/rich.py +30 -3
  31. epstein_files/util/word_count.py +7 -7
  32. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/METADATA +16 -3
  33. epstein_files-1.5.0.dist-info/RECORD +40 -0
  34. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/entry_points.txt +1 -1
  35. epstein_files-1.2.5.dist-info/RECORD +0 -34
  36. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/LICENSE +0 -0
  37. {epstein_files-1.2.5.dist-info → epstein_files-1.5.0.dist-info}/WHEEL +0 -0
@@ -19,6 +19,8 @@ MULTINEWLINE_REGEX = re.compile(r"\n{2,}")
19
19
  CONSTANT_VAR_REGEX = re.compile(r"^[A-Z_]+$")
20
20
  ALL_NAMES = [v for k, v in vars(names).items() if isinstance(v, str) and CONSTANT_VAR_REGEX.match(k)]
21
21
 
22
+ AMERICAN_DATE_FORMAT = r"%m/%d/%y %I:%M:%S %p"
23
+ AMERICAN_TIME_REGEX = re.compile(r"(\d{1,2}/\d{1,2}/\d{2,4}\s+\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)?)")
22
24
  PACIFIC_TZ = tz.gettz("America/Los_Angeles")
23
25
  TIMEZONE_INFO = {"PDT": PACIFIC_TZ, "PST": PACIFIC_TZ} # Suppresses annoying warnings from parse() calls
24
26
 
@@ -1,3 +1,4 @@
1
+ import json
1
2
  import re
2
3
  from copy import deepcopy
3
4
  from dataclasses import Field, asdict, dataclass, field, fields
@@ -9,20 +10,21 @@ from dateutil.parser import parse
9
10
  from epstein_files.util.constant.names import *
10
11
  from epstein_files.util.constant.strings import *
11
12
  from epstein_files.util.data import remove_zero_time, without_falsey
13
+ from epstein_files.util.env import args
12
14
 
13
- DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
15
+ DuplicateType = Literal['bounced', 'earlier', 'quoted', 'redacted', 'same']
14
16
  Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]
15
17
 
16
18
  # Misc
17
- CONSTANTIZE_NAMES = False # A flag set to True that causes repr() of these classes to return strings of usable code
18
19
  INDENT = ' '
19
20
  INDENT_NEWLINE = f'\n{INDENT}'
20
21
  INDENTED_JOIN = f',{INDENT_NEWLINE}'
21
- MAX_LINE_LENGTH = 150
22
+ MAX_LINE_LENGTH = 135
22
23
  REPUTATION_MGMT = f'{REPUTATION} management'
23
24
  SAME = 'same'
24
25
 
25
26
  DUPE_TYPE_STRS: dict[DuplicateType, str] = {
27
+ 'bounced': 'a bounced copy of',
26
28
  'earlier': 'an earlier draft of',
27
29
  'quoted': 'quoted in full in',
28
30
  'redacted': 'a redacted version of',
@@ -32,7 +34,10 @@ DUPE_TYPE_STRS: dict[DuplicateType, str] = {
32
34
  FIELD_SORT_KEY = {
33
35
  'id': 'a',
34
36
  'author': 'aa',
35
- 'attribution_reason': 'zz',
37
+ 'comment': 'zz',
38
+ 'duplicate_ids': 'dup',
39
+ 'duplicate_of_id': 'dupe',
40
+ 'recipients': 'aaa',
36
41
  }
37
42
 
38
43
  FINANCIAL_REPORTS_AUTHORS = [
@@ -49,7 +54,6 @@ FINANCIAL_REPORTS_AUTHORS = [
49
54
  # Fields like timestamp and author are better added from the Document object
50
55
  NON_METADATA_FIELDS = [
51
56
  'actual_text',
52
- 'date',
53
57
  'id',
54
58
  'is_synthetic',
55
59
  ]
@@ -64,18 +68,19 @@ class DocCfg:
64
68
  id (str): ID of file
65
69
  author (Name): Author of the document (if any)
66
70
  category (str | None): Type of file
67
- date (str | None): If passed will be immediated parsed into the 'timestamp' field
71
+ date (str | None): Parsed to a datetime by timestamp() if it exists
68
72
  dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
69
73
  duplicate_ids (list[str]): IDs of *other* documents that are dupes of this document
70
74
  duplicate_of_id (str | None): If this is a dupe the ID of the duplicated file. This file will be suppressed
71
75
  is_interesting (bool | None): Override other considerations and always consider this file interesting (or not)
72
- timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
73
76
  is_synthetic (bool): True if this config was generated by the duplicate_cfgs() method
77
+ replace_text_with (bool): True if `description` should replace body of the document when printing.
74
78
  """
75
79
  id: str
76
80
  attached_to_email_id: str | None = None
77
81
  author: Name = None
78
82
  category: str | None = None
83
+ comment: str = ''
79
84
  date: str | None = None
80
85
  description: str | None = None
81
86
  dupe_type: DuplicateType | None = None
@@ -84,15 +89,9 @@ class DocCfg:
84
89
  is_attribution_uncertain: bool = False
85
90
  is_interesting: bool | None = None
86
91
  is_synthetic: bool = False
87
- timestamp: datetime | None = None
88
-
89
- def __post_init__(self):
90
- if self.date:
91
- self.timestamp = parse(self.date)
92
-
93
- if self.duplicate_of_id or self.duplicate_ids:
94
- self.dupe_type = self.dupe_type or SAME
92
+ replace_text_with: str = ''
95
93
 
94
+ @property
96
95
  def complete_description(self) -> str | None:
97
96
  """String that summarizes what is known about this document."""
98
97
  description = ''
@@ -130,6 +129,24 @@ class DocCfg:
130
129
 
131
130
  return description
132
131
 
132
+ @property
133
+ def metadata(self) -> Metadata:
134
+ metadata = {k: v for k, v in asdict(self).items() if k not in NON_METADATA_FIELDS and v}
135
+
136
+ if self.is_interesting is False:
137
+ metadata['is_interesting'] = False
138
+
139
+ return metadata
140
+
141
+ @property
142
+ def timestamp(self) -> datetime | None:
143
+ if self.date:
144
+ return parse(self.date)
145
+
146
+ def __post_init__(self):
147
+ if self.duplicate_of_id or self.duplicate_ids:
148
+ self.dupe_type = self.dupe_type or SAME
149
+
133
150
  def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
134
151
  """Create synthetic DocCfg objects that set the 'duplicate_of_id' field to point back to this object."""
135
152
  for id in self.duplicate_ids:
@@ -141,9 +158,6 @@ class DocCfg:
141
158
  dupe_cfg.is_synthetic = True
142
159
  yield dupe_cfg
143
160
 
144
- def metadata(self) -> Metadata:
145
- return {k: v for k, v in asdict(self).items() if k not in NON_METADATA_FIELDS and v}
146
-
147
161
  def _props_strs(self) -> list[str]:
148
162
  props = []
149
163
  add_prop = lambda f, value: props.append(f"{f.name}={value}")
@@ -151,20 +165,16 @@ class DocCfg:
151
165
  for _field in sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name)):
152
166
  value = getattr(self, _field.name)
153
167
 
154
- if value is None or value is False or (isinstance(value, list) and len(value) == 0):
168
+ if _field.name in ['actual_text', 'is_fwded_article', 'is_interesting']: # fields can be False or None or ''
169
+ if value is not None:
170
+ add_prop(_field, str(value))
171
+ elif not value or _field.name == 'dupe_type' and value == 'same':
155
172
  continue
156
173
  elif _field.name == AUTHOR:
157
- add_prop(_field, constantize_name(str(value)) if CONSTANTIZE_NAMES else f"'{value}'")
158
- elif _field.name == 'category' and value in [EMAIL, TEXT_MESSAGE]:
159
- continue
160
- elif _field.name == 'recipients' and value:
161
- recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
162
- add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
163
- elif _field.name == 'timestamp' and self.date is not None:
164
- continue # Don't print both timestamp and date
165
- elif isinstance(value, datetime):
166
- value_str = remove_zero_time(value)
167
- add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
174
+ add_prop(_field, constantize_name(str(value)) if args.constantize else f"'{value}'")
175
+ elif _field.name == 'recipients':
176
+ recipients_str = str([constantize_name(r) if (args.constantize and r) else r for r in value])
177
+ add_prop(_field, recipients_str.replace("'", '') if args.constantize else recipients_str)
168
178
  elif isinstance(value, str):
169
179
  if "'" in value:
170
180
  value = '"' + value.replace('"', r'\"') + '"'
@@ -182,14 +192,14 @@ class DocCfg:
182
192
  type_str = f"{type(self).__name__}("
183
193
  single_line_repr = type_str + ', '.join(props) + f')'
184
194
 
185
- if len(single_line_repr) < MAX_LINE_LENGTH:
195
+ if len(single_line_repr) < MAX_LINE_LENGTH or (self.comment and getattr(self, 'is_fwded_article')):
186
196
  repr_str = single_line_repr
187
197
  else:
188
198
  repr_str = f"{type_str}{INDENT_NEWLINE}" + INDENTED_JOIN.join(props)
189
199
  repr_str += ',' if props else ''
190
200
  repr_str += '\n)'
191
201
 
192
- if CONSTANTIZE_NAMES:
202
+ if args.constantize:
193
203
  repr_str = INDENT + INDENT_NEWLINE.join(repr_str.split('\n'))
194
204
  return repr_str.replace(',,', ',').replace(',),', '),').replace(',),', '),')
195
205
  else:
@@ -224,9 +234,10 @@ class EmailCfg(CommunicationCfg):
224
234
  """
225
235
  actual_text: str | None = None
226
236
  fwded_text_after: str | None = None
227
- is_fwded_article: bool = False
237
+ is_fwded_article: bool | None = None
228
238
  recipients: list[Name] = field(default_factory=list)
229
239
  subject: str | None = None
240
+ truncate_to: int | None = None
230
241
 
231
242
  # This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
232
243
  def __repr__(self) -> str:
epstein_files/util/env.py CHANGED
@@ -5,23 +5,24 @@ from pathlib import Path
5
5
 
6
6
  from rich_argparse_plus import RichHelpFormatterPlus
7
7
 
8
- from epstein_files.util.constant.output_files import ALL_EMAILS_PATH, CHRONOLOGICAL_EMAILS_PATH, TEXT_MSGS_HTML_PATH
9
- from epstein_files.util.logging import env_log_level, exit_with_error, logger
8
+ from epstein_files.util.constant.output_files import (ALL_EMAILS_PATH, CHRONOLOGICAL_EMAILS_PATH,
9
+ DOJ_2026_HTML_PATH, TEXT_MSGS_HTML_PATH)
10
+ from epstein_files.util.helpers.env_helpers import get_env_dir
11
+ from epstein_files.util.logging import env_log_level, exit_with_error, logger, set_log_level
10
12
 
11
13
  DEFAULT_WIDTH = 155
12
14
  DEFAULT_FILE = 'default_file'
13
15
  EPSTEIN_GENERATE = 'epstein_generate'
14
16
  HTML_SCRIPTS = [EPSTEIN_GENERATE, 'epstein_word_count']
15
17
 
16
- # Verify Epstein docs dir exists
17
- EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
18
- DOCS_DIR_ENV = environ.get(EPSTEIN_DOCS_DIR_ENV_VAR_NAME)
19
- DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
18
+ # Get dirs from Env vars
19
+ DOCS_DIR_ENV_VAR = 'EPSTEIN_DOCS_DIR'
20
+ DOJ_PDFS_20260130_DIR_ENV_VAR = 'EPSTEIN_DOJ_PDFS_20260130_DIR'
21
+ DOJ_TXTS_20260130_DIR_ENV_VAR = 'EPSTEIN_DOJ_TXTS_20260130_DIR'
20
22
 
21
- if not DOCS_DIR_ENV:
22
- exit_with_error(f"{EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!\n")
23
- elif not DOCS_DIR.exists():
24
- exit_with_error(f"{EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!\n")
23
+ DOCS_DIR: Path = get_env_dir(DOCS_DIR_ENV_VAR, must_exist=True)
24
+ DOJ_PDFS_20260130_DIR: Path = get_env_dir(DOJ_PDFS_20260130_DIR_ENV_VAR, must_exist=False)
25
+ DOJ_TXTS_20260130_DIR: Path = get_env_dir(DOJ_TXTS_20260130_DIR_ENV_VAR, must_exist=False)
25
26
 
26
27
  is_env_var_set = lambda s: len(environ.get(s) or '') > 0
27
28
  is_output_arg = lambda arg: any([arg.startswith(pfx) for pfx in ['colors_only', 'json', 'make_clean', 'output']])
@@ -41,6 +42,7 @@ output.add_argument('--email-timeline', action='store_true', help='print a table
41
42
  output.add_argument('--emailers-info', '-ei', action='store_true', help='write a .png of the eeailers info table')
42
43
  output.add_argument('--json-files', action='store_true', help='pretty print all the raw JSON data files in the collection and exit')
43
44
  output.add_argument('--json-metadata', action='store_true', help='dump JSON metadata for all files and exit')
45
+ output.add_argument('--output-doj-files', '-od', action='store_true', help='generate the DOJ files from 2026-01-30')
44
46
  output.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
45
47
  output.add_argument('--output-other', '-oo', action='store_true', help='generate other files section')
46
48
  output.add_argument('--output-texts', '-ot', action='store_true', help='generate text messages section')
@@ -49,16 +51,19 @@ output.add_argument('--suppress-output', action='store_true', help='no output to
49
51
  output.add_argument('--uninteresting', action='store_true', help='only output uninteresting other files')
50
52
  output.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use (in characters)')
51
53
 
52
- scripts = parser.add_argument_group('SCRIPTS', 'Options used by epstein_search, epstein_show, and epstein_diff.')
54
+ scripts = parser.add_argument_group('SCRIPTS', 'Options used by epstein_grep, epstein_show, and epstein_diff.')
53
55
  scripts.add_argument('positional_args', nargs='*', help='strings to searchs for, file IDs to show or diff, etc.')
56
+ scripts.add_argument('--email-body', action='store_true', help='epstein_grep but only for the body of the email')
57
+ scripts.add_argument('--min-line-length', type=int, help='epstein_grep minimum length of a matched line')
54
58
  scripts.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (used by epstein_show)')
55
59
  scripts.add_argument('--whole-file', '-wf', action='store_true', help='print whole files')
56
60
 
57
61
  debug = parser.add_argument_group('DEBUG')
58
62
  debug.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
63
+ debug.add_argument('--constantize', action='store_true', help='constantize names when printing repr() of objects')
59
64
  debug.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
60
65
  debug.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
61
- debug.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats about the files')
66
+ debug.add_argument('--stats', '-j', action='store_true', help='print JSON formatted stats about the files')
62
67
  debug.add_argument('--skip-other-files', '-sof', action='store_true', help='skip parsing non email/text files')
63
68
  debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
64
69
  debug.add_argument('--truncate', '-t', type=int, help='truncate emails to this many characters')
@@ -66,7 +71,11 @@ debug.add_argument('--write-txt', '-wt', action='store_true', help='write a plai
66
71
 
67
72
 
68
73
  # Parse args
69
- args = parser.parse_args()
74
+ if environ.get('INVOKED_BY_PYTEST'):
75
+ args = parser.parse_args([EPSTEIN_GENERATE])
76
+ else:
77
+ args = parser.parse_args()
78
+
70
79
  is_html_script = parser.prog in HTML_SCRIPTS
71
80
 
72
81
  args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
@@ -78,7 +87,9 @@ args.width = args.width if is_html_script else None
78
87
  args.any_output_selected = any([is_output_arg(arg) and val for arg, val in vars(args).items()])
79
88
 
80
89
  if not (args.any_output_selected or args.email_timeline or args.emailers_info):
81
- logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
90
+ if is_html_script:
91
+ logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
92
+
82
93
  args.output_emails = args.output_other = args.output_texts = True
83
94
 
84
95
  if is_html_script:
@@ -95,25 +106,29 @@ if is_html_script:
95
106
  args.build = ALL_EMAILS_PATH
96
107
  elif args.email_timeline:
97
108
  args.build = CHRONOLOGICAL_EMAILS_PATH
109
+ elif args.output_doj_files:
110
+ args.build = DOJ_2026_HTML_PATH
98
111
  else:
99
112
  args.build = TEXT_MSGS_HTML_PATH
100
- elif parser.prog.startswith('epstein_') and not args.positional_args:
113
+ elif parser.prog.startswith('epstein_') and not args.positional_args and not args.names:
101
114
  exit_with_error(f"{parser.prog} requires positional arguments but got none!")
102
115
 
103
116
  if args.names:
104
117
  logger.warning(f"Output restricted to {args.names}")
105
118
  args.output_other = False
106
119
 
120
+ if args.truncate and args.whole_file:
121
+ exit_with_error(f"--whole-file and --truncate are incompatible")
107
122
 
108
123
  # Log level args
109
124
  if args.deep_debug:
110
- logger.setLevel(logging.DEBUG)
125
+ set_log_level(logging.DEBUG)
111
126
  elif args.debug:
112
- logger.setLevel(logging.INFO)
127
+ set_log_level(logging.INFO)
113
128
  elif args.suppress_logs:
114
- logger.setLevel(logging.FATAL)
129
+ set_log_level(logging.FATAL)
115
130
  elif not env_log_level:
116
- logger.setLevel(logging.WARNING)
131
+ set_log_level(logging.WARNING)
117
132
 
118
133
  logger.debug(f'Log level set to {logger.level}...')
119
134
  args_str = ',\n'.join([f"{k}={v}" for k, v in vars(args).items() if v])
@@ -1,43 +1,67 @@
1
1
  import re
2
2
  from pathlib import Path
3
3
 
4
- from epstein_files.util.constant.strings import FILE_NAME_REGEX, FILE_STEM_REGEX, HOUSE_OVERSIGHT_PREFIX
5
- from epstein_files.util.env import DOCS_DIR
4
+ from epstein_files.util.constant.strings import (DOJ_FILE_NAME_REGEX, EFTA_PREFIX,
5
+ HOUSE_OVERSIGHT_NOV_2025_FILE_NAME_REGEX, HOUSE_OVERSIGHT_NOV_2025_FILE_STEM_REGEX,
6
+ HOUSE_OVERSIGHT_PREFIX)
7
+ from epstein_files.util.env import DOCS_DIR, DOJ_TXTS_20260130_DIR
6
8
  from epstein_files.util.logging import logger
7
9
 
8
10
  EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
9
- FILE_ID_REGEX = re.compile(fr".*{FILE_NAME_REGEX.pattern}")
11
+ FILE_ID_REGEX = re.compile(fr".*{HOUSE_OVERSIGHT_NOV_2025_FILE_NAME_REGEX.pattern}")
10
12
  FILENAME_LENGTH = len(HOUSE_OVERSIGHT_PREFIX) + 6
11
13
  KB = 1024
12
14
  MB = KB * KB
13
15
 
14
16
  # Coerce methods handle both string and int arguments.
15
17
  coerce_file_name = lambda filename_or_id: coerce_file_stem(filename_or_id) + '.txt'
16
- coerce_file_path = lambda filename_or_id: DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
17
18
  file_size = lambda file_path: Path(file_path).stat().st_size
18
19
  id_str = lambda id: f"{int(id):06d}"
19
20
 
20
21
 
22
def coerce_file_path(filename_or_id: int | str) -> Path:
    """Return the `Path` of the text file identified by `filename_or_id`.

    String arguments matching `DOJ_FILE_NAME_REGEX` are looked up by file name anywhere
    beneath `DOJ_TXTS_20260130_DIR`; everything else is resolved directly inside `DOCS_DIR`.

    Args:
        filename_or_id: A file name, file stem, or numeric file ID.

    Returns:
        Path to the matching `.txt` file.

    Raises:
        RuntimeError: If the argument looks like a DOJ file but the DOJ text directory is not
            configured, or no file with the expected name exists beneath it.
    """
    filename = coerce_file_name(filename_or_id)

    if not (isinstance(filename_or_id, str) and DOJ_FILE_NAME_REGEX.match(filename_or_id)):
        return DOCS_DIR.joinpath(filename)

    # The DOJ directory is optional, so it is None when its env var is unset or invalid.
    # Fail with a clear message instead of an AttributeError on `None.glob`.
    if DOJ_TXTS_20260130_DIR is None:
        raise RuntimeError(
            f"'{filename_or_id}' looks like DOJ file but the DOJ text files directory is not configured "
            f"(check the EPSTEIN_DOJ_TXTS_20260130_DIR env var)"
        )

    # DOJ text files may live in nested subdirectories, so search recursively and take the first hit.
    candidates = (txt_file for txt_file in DOJ_TXTS_20260130_DIR.glob('**/*.txt') if txt_file.name == filename)
    found = next(candidates, None)

    if found is None:
        raise RuntimeError(f"'{filename_or_id}' looks like DOJ file but no file named {filename} in '{DOJ_TXTS_20260130_DIR}'")

    return found
34
+
35
+
21
36
  def coerce_file_stem(filename_or_id: int | str) -> str:
22
- """Generate a valid file_stem no matter what form the argument comes in."""
37
+ """Generate a valid file stem no matter what form the argument comes in."""
38
+ if isinstance(filename_or_id, str) and DOJ_FILE_NAME_REGEX.search(filename_or_id):
39
+ return Path(filename_or_id).stem
40
+
23
41
  if isinstance(filename_or_id, str) and filename_or_id.startswith(HOUSE_OVERSIGHT_PREFIX):
24
42
  file_id = extract_file_id(filename_or_id)
25
43
  file_stem = file_stem_for_id(file_id)
26
44
  else:
27
45
  file_stem = file_stem_for_id(filename_or_id)
28
46
 
29
- if not FILE_STEM_REGEX.match(file_stem):
47
+ if not HOUSE_OVERSIGHT_NOV_2025_FILE_STEM_REGEX.match(file_stem):
30
48
  raise RuntimeError(f"Invalid stem '{file_stem}' from '{filename_or_id}'")
31
49
 
32
50
  return file_stem
33
51
 
34
52
 
35
53
  def extract_file_id(filename_or_id: int | str | Path) -> str:
54
+ # DOJ 2026-01 files have different pattern
55
+ if isinstance(filename_or_id, str) and filename_or_id.startswith(EFTA_PREFIX):
56
+ return Path(filename_or_id).stem
57
+
36
58
  if isinstance(filename_or_id, str):
37
59
  filename_or_id = filename_or_id.removesuffix(',')
38
60
 
39
61
  if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
40
62
  return id_str(filename_or_id)
63
+ elif isinstance(filename_or_id, str) and len(filename_or_id) == 8:
64
+ return f"{HOUSE_OVERSIGHT_PREFIX}{filename_or_id}"
41
65
 
42
66
  file_match = FILE_ID_REGEX.match(str(filename_or_id).upper())
43
67
 
@@ -0,0 +1,13 @@
1
+
2
+
3
def _show_timestamps(epstein_files):
    """Debugging aid: report every DOJ file's timestamp through the document's own `warn()`."""
    doj_documents = epstein_files.doj_files

    for doj_document in doj_documents:
        message = f"timestamp: {doj_document.timestamp}"
        doj_document.warn(message)
6
+
7
+
8
def _verify_filenames(epstein_files):
    """Debugging aid: print every path in `all_files` whose name no Document object uses."""
    known_names = {document.file_path.name for document in epstein_files.all_documents}
    known_count = len(known_names)

    for path in epstein_files.all_files:
        if path.name in known_names:
            continue

        print(f"'{path}' is not in list of {known_count} Document obj filenames!")
@@ -0,0 +1,21 @@
1
+ """Helpers for dealing with environment variables."""
2
+ from os import environ
3
+ from pathlib import Path
4
+
5
+ from epstein_files.util.logging import exit_with_error, logger
6
+
7
+
8
+ def get_env_dir(env_var_name: str, must_exist: bool = True) -> Path | None:
9
+ if (dir := environ.get(env_var_name)):
10
+ dir = Path(dir)
11
+ error_msg = f"env var {env_var_name} set to '{dir}' but that's not a directory"
12
+
13
+ if dir.is_dir():
14
+ return dir.resolve()
15
+ elif must_exist:
16
+ exit_with_error(f"Required {error_msg}.\n")
17
+ else:
18
+ logger.warning(f"Optional {error_msg}. Some features will be unavailable.")
19
+ return None
20
+ else:
21
+ logger.warning(f"Optional env var {env_var_name} not set. Some features will be unavailable.")