epstein-files 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. epstein_files/__init__.py +75 -135
  2. epstein_files/documents/communication.py +9 -9
  3. epstein_files/documents/document.py +115 -87
  4. epstein_files/documents/email.py +154 -85
  5. epstein_files/documents/emails/email_header.py +7 -6
  6. epstein_files/documents/imessage/text_message.py +3 -2
  7. epstein_files/documents/json_file.py +17 -0
  8. epstein_files/documents/messenger_log.py +62 -3
  9. epstein_files/documents/other_file.py +165 -17
  10. epstein_files/epstein_files.py +128 -169
  11. epstein_files/util/constant/names.py +8 -1
  12. epstein_files/util/constant/output_files.py +29 -0
  13. epstein_files/util/constant/strings.py +27 -0
  14. epstein_files/util/constant/urls.py +25 -9
  15. epstein_files/util/constants.py +1018 -1045
  16. epstein_files/util/data.py +20 -55
  17. epstein_files/util/{file_cfg.py → doc_cfg.py} +121 -43
  18. epstein_files/util/env.py +19 -20
  19. epstein_files/util/file_helper.py +38 -21
  20. epstein_files/util/highlighted_group.py +229 -177
  21. epstein_files/util/logging.py +63 -0
  22. epstein_files/util/output.py +180 -0
  23. epstein_files/util/rich.py +29 -17
  24. epstein_files/util/search_result.py +14 -6
  25. epstein_files/util/timer.py +24 -0
  26. epstein_files/util/word_count.py +2 -1
  27. {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/METADATA +20 -4
  28. epstein_files-1.0.2.dist-info/RECORD +33 -0
  29. epstein_files-1.0.2.dist-info/entry_points.txt +7 -0
  30. epstein_files-1.0.0.dist-info/RECORD +0 -28
  31. {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/LICENSE +0 -0
  32. {epstein_files-1.0.0.dist-info → epstein_files-1.0.2.dist-info}/WHEEL +0 -0
@@ -3,17 +3,13 @@ Helpers for dealing with various kinds of data.
3
3
  """
4
4
  import itertools
5
5
  import re
6
- import time
7
- from dataclasses import dataclass, field
8
6
  from datetime import datetime, timezone
9
7
  from dateutil import tz
10
8
  from typing import TypeVar
11
9
 
12
- from dateutil.parser import parse
13
- from rich.text import Text
14
-
15
10
  from epstein_files.util.constant import names
16
- from epstein_files.util.env import args, logger
11
+ from epstein_files.util.env import args
12
+ from epstein_files.util.logging import logger
17
13
 
18
14
  T = TypeVar('T')
19
15
 
@@ -23,37 +19,22 @@ CONSTANT_VAR_REGEX = re.compile(r"^[A-Z_]+$")
23
19
  ALL_NAMES = [v for k, v in vars(names).items() if isinstance(v, str) and CONSTANT_VAR_REGEX.match(k)]
24
20
 
25
21
  PACIFIC_TZ = tz.gettz("America/Los_Angeles")
26
- TIMEZONE_INFO = {"PST": PACIFIC_TZ, "PDT": PACIFIC_TZ} # Suppresses annoying warnings from parse() calls
27
-
28
-
29
- def collapse_newlines(text: str) -> str:
30
- return MULTINEWLINE_REGEX.sub('\n\n', text)
22
+ TIMEZONE_INFO = {"PDT": PACIFIC_TZ, "PST": PACIFIC_TZ} # Suppresses annoying warnings from parse() calls
31
23
 
32
24
 
33
- def date_str(timestamp: datetime | None) -> str | None:
34
- return timestamp.isoformat()[0:10] if timestamp else None
25
+ collapse_newlines = lambda text: MULTINEWLINE_REGEX.sub('\n\n', text)
26
+ date_str = lambda dt: dt.isoformat()[0:10] if dt else None
27
+ escape_double_quotes = lambda text: text.replace('"', r'\"')
28
+ escape_single_quotes = lambda text: text.replace("'", r"\'")
29
+ iso_timestamp = lambda dt: dt.isoformat().replace('T', ' ')
30
+ uniquify = lambda _list: list(set(_list))
31
+ without_nones = lambda _list: [e for e in _list if e]
35
32
 
36
33
 
37
34
  def dict_sets_to_lists(d: dict[str, set]) -> dict[str, list]:
38
35
  return {k: sorted(list(v)) for k, v in d.items()}
39
36
 
40
37
 
41
- def extract_datetime(s: str) -> datetime | None:
42
- match = ISO_DATE_REGEX.search(s)
43
-
44
- if not match:
45
- return None
46
-
47
- date_str = match.group(0)
48
-
49
- if len(date_str) == 4:
50
- date_str += '-01-01'
51
- elif len(date_str) == 7:
52
- date_str += '-01'
53
-
54
- return parse(date_str, tzinfos=TIMEZONE_INFO)
55
-
56
-
57
38
  def extract_last_name(name: str) -> str:
58
39
  if ' ' not in name:
59
40
  return name
@@ -70,14 +51,19 @@ def flatten(_list: list[list[T]]) -> list[T]:
70
51
  return list(itertools.chain.from_iterable(_list))
71
52
 
72
53
 
73
- def iso_timestamp(dt: datetime) -> str:
74
- return dt.isoformat().replace('T', ' ')
54
+ def json_safe(d: dict) -> dict:
55
+ return {
56
+ 'None' if k is None else k: v.isoformat() if isinstance(v, datetime) else v
57
+ for k,v in d.items()
58
+ }
75
59
 
76
60
 
77
- def listify(listlike: list | str | Text | None) -> list:
61
+ def listify(listlike) -> list:
78
62
  """Create a list of 'listlike'. Returns empty list if 'listlike' is None or empty string."""
79
63
  if isinstance(listlike, list):
80
64
  return listlike
65
+ elif listlike is None:
66
+ return [None]
81
67
  elif listlike:
82
68
  return [listlike]
83
69
  else:
@@ -93,8 +79,8 @@ def ordinal_str(n: int) -> str:
93
79
  return str(n) + suffix
94
80
 
95
81
 
96
- def patternize(_pattern: str | re.Pattern):
97
- return _pattern if isinstance(_pattern, re.Pattern) else re.compile(rf"({_pattern})", re.IGNORECASE)
82
+ def patternize(_pattern: str | re.Pattern) -> re.Pattern:
83
+ return _pattern if isinstance(_pattern, re.Pattern) else re.compile(fr"({_pattern})", re.IGNORECASE)
98
84
 
99
85
 
100
86
  def remove_timezone(timestamp: datetime) -> datetime:
@@ -108,24 +94,3 @@ def remove_timezone(timestamp: datetime) -> datetime:
108
94
  def sort_dict(d: dict[str | None, int] | dict[str, int]) -> list[tuple[str | None, int]]:
109
95
  sort_key = lambda e: (e[0] or '').lower() if args.sort_alphabetical else [-e[1], (e[0] or '').lower()]
110
96
  return sorted(d.items(), key=sort_key)
111
-
112
-
113
- @dataclass
114
- class Timer:
115
- started_at: float = field(default_factory=lambda: time.perf_counter())
116
- checkpoint_at: float = field(default_factory=lambda: time.perf_counter())
117
-
118
- def print_at_checkpoint(self, msg: str) -> None:
119
- logger.warning(f"{msg} in {self.seconds_since_checkpoint()}")
120
- self.checkpoint_at = time.perf_counter()
121
-
122
- def seconds_since_checkpoint(self) -> str:
123
- return f"{(time.perf_counter() - self.checkpoint_at):.2f} seconds"
124
-
125
- def seconds_since_start(self) -> str:
126
- return f"{(time.perf_counter() - self.started_at):.2f} seconds"
127
-
128
-
129
- escape_double_quotes = lambda text: text.replace('"', r'\"')
130
- escape_single_quotes = lambda text: text.replace("'", r"\'")
131
- uniquify = lambda _list: list(set(_list))
@@ -6,22 +6,27 @@ from typing import Generator, Literal
6
6
 
7
7
  from dateutil.parser import parse
8
8
 
9
- from epstein_files.util.constant.names import constantize_name
10
- from epstein_files.util.constant.strings import AUTHOR
9
+ from epstein_files.util.constant.names import *
10
+ from epstein_files.util.constant.strings import *
11
+ from epstein_files.util.data import without_nones
11
12
 
12
- DuplicateType = Literal['same', 'earlier', 'quoted', 'redacted']
13
+ DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
14
+ Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]
13
15
 
16
+ # Misc
17
+ CONSTANTIZE_NAMES = False # A flag set to True that causes repr() of these classes to return strings of usable code
14
18
  INDENT = ' '
15
19
  INDENT_NEWLINE = f'\n{INDENT}'
16
20
  INDENTED_JOIN = f',{INDENT_NEWLINE}'
17
- CONSTANTIZE_NAMES = False # A flag set to True that causes repr() of these classes to return strings of usable code
18
- MAX_LINE_LENGTH = 250
21
+ MAX_LINE_LENGTH = 150
22
+ REPUTATION_MGMT = f'{REPUTATION} management'
23
+ SAME = 'same'
19
24
 
20
- REASON_MAPPING: dict[DuplicateType, str] = {
21
- 'earlier': 'earlier draft of',
25
+ DUPE_TYPE_STRS: dict[DuplicateType, str] = {
26
+ 'earlier': 'an earlier draft of',
22
27
  'quoted': 'quoted in full in',
23
- 'redacted': 'redacted version of',
24
- 'same': 'the same as',
28
+ 'redacted': 'a redacted version of',
29
+ SAME: 'the same as',
25
30
  }
26
31
 
27
32
  FIELD_SORT_KEY = {
@@ -30,57 +35,116 @@ FIELD_SORT_KEY = {
30
35
  'attribution_reason': 'zz',
31
36
  }
32
37
 
38
+ FINANCIAL_REPORTS_AUTHORS = [
39
+ BOFA,
40
+ DEUTSCHE_BANK,
41
+ ELECTRON_CAPITAL_PARTNERS,
42
+ GOLDMAN_INVESTMENT_MGMT,
43
+ 'Invesco',
44
+ JP_MORGAN,
45
+ 'Morgan Stanley',
46
+ 'S&P',
47
+ ]
48
+
49
+ # Fields like timestamp and author are better added from the Document object
50
+ INVALID_FOR_METADATA = [
51
+ 'actual_text',
52
+ 'date',
53
+ 'id',
54
+ 'timestamp',
55
+ 'was_generated',
56
+ ]
57
+
33
58
 
34
59
  @dataclass(kw_only=True)
35
- class FileCfg:
36
- """Convenience class that encapsulates configuring info about files that need to be manually configured.
60
+ class DocCfg:
61
+ """
62
+ Encapsulates info about files that needs to be manually configured because it cannot be programmatically inferred.
37
63
 
38
64
  Attributes:
39
65
  id (str): ID of file
40
66
  author (str | None): Author of the document (if any)
67
+ category (str | None): Type of file
41
68
  date (str | None): If passed will be immediated parsed into the 'timestamp' field
42
69
  dupe_of_id (str | None): If this is a dupe the ID of the duplicated file. This file will be suppressed
43
- dupe_type (DuplicateType | None): The type of duplicate this file is (redacted, quoted, etc.)
44
- duplicate_ids (list[str]): Inverse of 'dupe_of_id' - this file will NOT be suppressed but 'duplicate_ids' will be.
70
+ dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
71
+ duplicate_ids (list[str]): Inverse of 'dupe_of_id' - this file will NOT be suppressed but 'duplicate_ids' will be
72
+ is_interesting (bool): Override other considerations and always consider this file interesting
45
73
  timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
46
74
  was_generated (bool): True if this object was generated by the duplicate_cfgs() method
47
75
  """
48
76
  id: str
49
77
  author: str | None = None
78
+ category: str | None = None
50
79
  date: str | None = None
51
80
  description: str | None = None
52
81
  dupe_of_id: str | None = None
53
82
  dupe_type: DuplicateType | None = None
54
83
  duplicate_ids: list[str] = field(default_factory=list)
84
+ is_interesting: bool = False
55
85
  timestamp: datetime | None = None
56
- was_generated: bool = False # True if this object was generated by duplicate_cfgs()
86
+ was_generated: bool = False
57
87
 
58
88
  def __post_init__(self):
59
- if self.dupe_of_id:
60
- self.dupe_type = self.dupe_type or 'same'
61
-
62
89
  if self.date:
63
90
  self.timestamp = parse(self.date)
64
91
 
92
+ if self.dupe_of_id or self.duplicate_ids:
93
+ self.dupe_type = self.dupe_type or SAME
94
+
65
95
  def duplicate_reason(self) -> str | None:
66
96
  if self.dupe_type is not None:
67
- return REASON_MAPPING[self.dupe_type]
97
+ return DUPE_TYPE_STRS[self.dupe_type]
68
98
 
69
- def duplicate_cfgs(self) -> Generator['FileCfg', None, None]:
99
+ def duplicate_cfgs(self) -> Generator['DocCfg', None, None]:
100
+ """Create synthetic DocCfg objects that set the 'dupe_of_id' field to point back to this object."""
70
101
  for id in self.duplicate_ids:
71
102
  dupe_cfg = deepcopy(self)
72
103
  dupe_cfg.id = id
73
104
  dupe_cfg.dupe_of_id = self.id
74
- dupe_cfg.dupe_type = self.dupe_type or 'same'
105
+ dupe_cfg.duplicate_ids = []
106
+ dupe_cfg.dupe_type = self.dupe_type
75
107
  dupe_cfg.was_generated = True
76
108
  yield dupe_cfg
77
109
 
110
+ def info_str(self) -> str | None:
111
+ """String that summarizes what is known about this document."""
112
+ if self.category == REPUTATION:
113
+ return f"{REPUTATION_MGMT}: {self.description}"
114
+ elif self.author and self.description:
115
+ if self.category in [ACADEMIA, BOOK]:
116
+ return self.title_by_author()
117
+ elif self.category == FINANCE and self.author in FINANCIAL_REPORTS_AUTHORS:
118
+ return f"{self.author} report: '{self.description}'"
119
+ elif self.category == LEGAL and 'v.' in self.author:
120
+ return f"{self.author}: '{self.description}'"
121
+ elif self.category and self.author is None and self.description is None:
122
+ return self.category
123
+
124
+ pieces = without_nones([self.author, self.description])
125
+ return ' '.join(pieces) if pieces else None
126
+
127
+ def metadata(self) -> Metadata:
128
+ non_null_fields = {k: v for k, v in asdict(self).items() if v and k not in INVALID_FOR_METADATA}
129
+
130
+ if self.category in [EMAIL, TEXT_MESSAGE]:
131
+ del non_null_fields['category']
132
+
133
+ return non_null_fields
134
+
78
135
  def non_null_field_names(self) -> list[str]:
79
136
  return [f.name for f in self.sorted_fields() if getattr(self, f.name)]
80
137
 
81
138
  def sorted_fields(self) -> list[Field]:
82
139
  return sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name))
83
140
 
141
+ def title_by_author(self) -> str:
142
+ if not (self.author and self.description):
143
+ raise RuntimeError(f"Can't call title_by_author() without author and description!")
144
+
145
+ title = self.description if '"' in self.description else f"'{self.description}'"
146
+ return f"{title} by {self.author}"
147
+
84
148
  def _props_strs(self) -> list[str]:
85
149
  props = []
86
150
  add_prop = lambda f, value: props.append(f"{f.name}={value}")
@@ -92,14 +156,16 @@ class FileCfg:
92
156
  continue
93
157
  elif _field.name == AUTHOR:
94
158
  add_prop(_field, constantize_name(str(value)) if CONSTANTIZE_NAMES else f"'{value}'")
159
+ elif _field.name == 'category' and value in [EMAIL, TEXT_MESSAGE]:
160
+ continue
95
161
  elif _field.name == 'recipients' and isinstance(value, list):
96
162
  recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
97
163
  add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
164
+ elif _field.name == 'timestamp' and self.date is not None:
165
+ continue # Don't print both timestamp and date
98
166
  elif isinstance(value, datetime):
99
167
  value_str = re.sub(' 00:00:00', '', str(value))
100
168
  add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
101
- elif _field.name == 'description':
102
- add_prop(_field, value.strip())
103
169
  elif isinstance(value, str):
104
170
  if "'" in value:
105
171
  value = '"' + value.replace('"', r'\"') + '"'
@@ -112,22 +178,12 @@ class FileCfg:
112
178
 
113
179
  return props
114
180
 
115
- def __eq__(self, other: 'FileCfg') -> bool:
116
- """Return True if everything matches other than the two 'dupe_' fields ('duplicate_ids' is compared)."""
117
- for _field in self.sorted_fields():
118
- if _field.name == 'id' or _field.name.startswith('dupe'):
119
- continue
120
- elif getattr(self, _field.name) != getattr(other, _field.name):
121
- return False
122
-
123
- return True
124
-
125
181
  def __repr__(self) -> str:
126
182
  props = self._props_strs()
127
183
  type_str = f"{type(self).__name__}("
128
184
  single_line_repr = type_str + ', '.join(props) + f')'
129
185
 
130
- if (len(single_line_repr) < MAX_LINE_LENGTH or self.non_null_field_names() == ['id', 'description']) and '#' not in (self.description or ''):
186
+ if len(single_line_repr) < MAX_LINE_LENGTH:
131
187
  repr_str = single_line_repr
132
188
  else:
133
189
  repr_str = f"{type_str}{INDENT_NEWLINE}" + INDENTED_JOIN.join(props)
@@ -142,31 +198,53 @@ class FileCfg:
142
198
 
143
199
 
144
200
  @dataclass(kw_only=True)
145
- class MessageCfg(FileCfg):
201
+ class CommunicationCfg(DocCfg):
146
202
  """
147
- Convenience class to unite various configured properties for a given Communication file.
148
203
  Manual config is always required for MessengerLog author attribution. It's also often needed for Email
149
204
  files to handle the terrible OCR text that Congress provided which messes up a lot of the email headers.
150
205
 
151
206
  Attributes:
152
- actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
153
207
  attribution_reason (str | None): Optional explanation of why this email was attributed to this author.
154
208
  is_attribution_uncertain (bool): True if we have a good idea of who the author is but are not 100% certain
209
+ """
210
+ attribution_reason: str | None = None
211
+ is_attribution_uncertain: bool = False
212
+
213
+ def __repr__(self) -> str:
214
+ return super().__repr__()
215
+
216
+
217
+ @dataclass(kw_only=True)
218
+ class EmailCfg(CommunicationCfg):
219
+ """
220
+ Attributes:
221
+ actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
155
222
  is_fwded_article (bool): True if this is a newspaper article someone fwded. Used to exclude articles from word counting.
156
223
  recipients (list[str | None]): Who received the email
157
224
  """
158
225
  actual_text: str | None = None # Override for the Email._actual_text() method for particularly broken emails
159
- attribution_reason: str | None = None
160
- is_attribution_uncertain: bool = False
161
226
  is_fwded_article: bool = False
162
227
  recipients: list[str | None] = field(default_factory=list)
163
228
 
164
- def __eq__(self, other: 'FileCfg') -> bool:
165
- return super().__eq__(other)
229
+ def __post_init__(self):
230
+ super().__post_init__()
231
+ self.category = EMAIL
232
+
233
+ @classmethod
234
+ def from_doc_cfg(cls, cfg: DocCfg) -> 'EmailCfg':
235
+ return cls(**asdict(cfg))
166
236
 
237
+ # This is necessary for some dumb reason. @dataclass(repr=False) doesn't cut it
167
238
  def __repr__(self) -> str:
168
239
  return super().__repr__()
169
240
 
170
- @classmethod
171
- def from_file_cfg(cls, cfg: FileCfg) -> 'MessageCfg':
172
- return cls(**asdict(cfg))
241
+
242
+ @dataclass(kw_only=True)
243
+ class TextCfg(CommunicationCfg):
244
+ def __post_init__(self):
245
+ super().__post_init__()
246
+ self.category = TEXT_MESSAGE
247
+
248
+ # This is necessary for some dumb reason. @dataclass(repr=False) doesn't cut it
249
+ def __repr__(self) -> str:
250
+ return super().__repr__()
epstein_files/util/env.py CHANGED
@@ -4,42 +4,43 @@ from os import environ
4
4
  from pathlib import Path
5
5
  from sys import argv
6
6
 
7
- from rich.logging import RichHandler
7
+ from epstein_files.util.logging import datefinder_logger, env_log_level, logger
8
8
 
9
9
  DEFAULT_WIDTH = 154
10
- HTML_SCRIPTS = ['generate_html.py', 'count_words.py']
10
+ HTML_SCRIPTS = ['epstein_generate', 'generate_html.py', 'count_words.py']
11
11
 
12
12
 
13
13
  parser = ArgumentParser(description="Parse epstein OCR docs and generate HTML page.")
14
- parser.add_argument('--build', '-b', action='store_true', help='write HTML to docs/index.html')
15
- parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails')
16
- parser.add_argument('--all-email-tables', '-aet', action='store_true', help='all email tables (except Epstein)')
17
- parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of a limited selection')
14
+ parser.add_argument('--build', '-b', action='store_true', help='write output to file')
15
+ parser.add_argument('--all-emails', '-ae', action='store_true', help='all the emails instead of just the interesting ones')
16
+ parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just interesting ones')
18
17
  parser.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
19
18
  parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
19
+ parser.add_argument('--output-file', '-out', metavar='FILE', default='index.html', help='write output to FILE in docs/ (default=index.html)')
20
20
  parser.add_argument('--output-emails', '-oe', action='store_true', help='generate other files section')
21
21
  parser.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
22
22
  parser.add_argument('--output-texts', '-ot', action='store_true', help='generate other files section')
23
23
  parser.add_argument('--pickled', '-p', action='store_true', help='use pickled EpsteinFiles object')
24
24
  parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='generate new pickled EpsteinFiles object')
25
+ parser.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (only used by scripts)')
25
26
  parser.add_argument('--sort-alphabetical', '-alpha', action='store_true', help='sort emailers alphabetically in counts table')
26
27
  parser.add_argument('--suppress-output', '-s', action='store_true', help='no output to terminal (use with --build)')
27
- parser.add_argument('--use-epstein-web-links', '-use', action='store_true', help='use epsteinweb.org links instead of epsteinify.com')
28
- parser.add_argument('--search-other', '-so', action='store_true', help='search for string in non email/text files (only used by search script)')
28
+ parser.add_argument('--use-epstein-web-links', '-use', action='store_true', help='use epsteinweb.org links instead of epstein.media')
29
29
  parser.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use')
30
30
  parser.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (only used by search script)')
31
31
  parser.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
32
32
  parser.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
33
+ parser.add_argument('--make-clean', '-mc', action='store_true', help='delete all build artifact HTML and JSON files')
33
34
  parser.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
35
+ parser.add_argument('--json-metadata', '-jm', action='store_true', help='dump JSON metadata for all files')
34
36
  parser.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats at the end')
35
37
  parser.add_argument('positional_args', nargs='*', help='Optional args (only used by helper scripts)')
36
38
  args = parser.parse_args()
37
39
 
38
- is_env_var_set = lambda s: len(environ.get(s) or '') > 0
39
40
  current_script = Path(argv[0]).name
41
+ is_env_var_set = lambda s: len(environ.get(s) or '') > 0
40
42
  is_html_script = current_script in HTML_SCRIPTS
41
43
 
42
- args.deep_debug = args.deep_debug or is_env_var_set('DEEP_DEBUG')
43
44
  args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
44
45
  args.output_emails = args.output_emails or args.all_emails
45
46
  args.output_other_files = args.output_other_files or args.all_other_files
@@ -48,27 +49,25 @@ args.width = args.width if is_html_script else None
48
49
  specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]
49
50
 
50
51
 
51
- # Setup logging
52
- logging.basicConfig(level="NOTSET", format="%(message)s", datefmt="[%X]", handlers=[RichHandler()])
53
- # logging.basicConfig(level="DEBUG", handlers=[RichHandler()])
54
- logger = logging.getLogger("rich")
55
-
52
+ # Log level args
56
53
  if args.deep_debug:
57
54
  logger.setLevel(logging.DEBUG)
58
55
  elif args.debug:
59
56
  logger.setLevel(logging.INFO)
60
57
  elif args.suppress_logs:
61
58
  logger.setLevel(logging.FATAL)
62
- else:
59
+ elif not env_log_level:
63
60
  logger.setLevel(logging.WARNING)
64
61
 
65
- datefinder_logger = logging.getLogger('datefinder') # Suppress annoying output
62
+ logger.info(f'Log level set to {logger.level}...')
66
63
  datefinder_logger.setLevel(logger.level)
67
64
 
68
65
 
69
66
  # Massage args that depend on other args to the appropriate state
70
- if not (args.output_texts or args.output_emails or args.output_other_files):
71
- logger.warning(f"No output section chosen; outputting default of texts, selected emails, and other files...")
67
+ if not (args.json_metadata or args.output_texts or args.output_emails or args.output_other_files):
68
+ if is_html_script:
69
+ logger.warning(f"No output section chosen; outputting default of texts, selected emails, and other files...")
70
+
72
71
  args.output_texts = True
73
72
  args.output_emails = True
74
73
  args.output_other_files = True
@@ -77,4 +76,4 @@ if args.use_epstein_web_links:
77
76
  logger.warning(f"Using links to epsteinweb.org links instead of epsteinify.com...")
78
77
 
79
78
  if args.debug:
80
- logger.warning(f"is_html_script={is_html_script}, specified_names={specified_names}, args={args}")
79
+ logger.warning(f"Invocation args:\nis_html_script={is_html_script},\nspecified_names={specified_names},\nargs={args}")
@@ -3,7 +3,7 @@ from os import environ
3
3
  from pathlib import Path
4
4
  from sys import exit
5
5
 
6
- from epstein_files.util.constant.strings import HOUSE_OVERSIGHT_PREFIX
6
+ from epstein_files.util.constant.strings import FILE_NAME_REGEX, FILE_STEM_REGEX, HOUSE_OVERSIGHT_PREFIX
7
7
 
8
8
  EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
9
9
  DOCS_DIR_ENV = environ[EPSTEIN_DOCS_DIR_ENV_VAR_NAME]
@@ -16,30 +16,23 @@ elif not DOCS_DIR.exists():
16
16
  print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!")
17
17
  exit(1)
18
18
 
19
- JSON_DIR = DOCS_DIR.joinpath('json_files')
20
- HTML_DIR = Path('docs')
21
19
  EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
22
- GH_PAGES_HTML_PATH = HTML_DIR.joinpath('index.html')
23
- WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_emails_word_count.html')
24
- EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
25
- PICKLED_PATH = Path("the_epstein_files.pkl.gz")
26
-
27
- FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}(\d{{6}})")
28
- FILE_ID_REGEX = re.compile(fr".*{FILE_STEM_REGEX.pattern}(_\d{{1,2}})?(\.txt(\.json)?)?")
20
+ FILE_ID_REGEX = re.compile(fr".*{FILE_NAME_REGEX.pattern}")
29
21
  FILENAME_LENGTH = len(HOUSE_OVERSIGHT_PREFIX) + 6
30
22
  KB = 1024
31
23
  MB = KB * KB
32
24
 
33
25
 
34
26
  # Handles both string and int 'id' args.
35
- file_stem_for_id = lambda id: f"{HOUSE_OVERSIGHT_PREFIX}{int(id):06d}"
27
+ id_str = lambda id: f"{int(id):06d}"
36
28
  filename_for_id = lambda id: file_stem_for_id(id) + '.txt'
37
29
 
38
30
 
39
31
  def coerce_file_stem(filename_or_id: int | str) -> str:
40
32
  """Generate a valid file_stem no matter what form the argument comes in."""
41
33
  if isinstance(filename_or_id, str) and filename_or_id.startswith(HOUSE_OVERSIGHT_PREFIX):
42
- file_stem = file_stem_for_id(extract_file_id(filename_or_id))
34
+ file_id = extract_file_id(filename_or_id)
35
+ file_stem = file_stem_for_id(file_id)
43
36
  else:
44
37
  file_stem = file_stem_for_id(filename_or_id)
45
38
 
@@ -49,32 +42,56 @@ def coerce_file_stem(filename_or_id: int | str) -> str:
49
42
  return file_stem
50
43
 
51
44
 
52
- def extract_file_id(filename: str | Path) -> str:
53
- file_match = FILE_ID_REGEX.match(str(filename))
45
+ def coerce_file_name(filename_or_id: int | str) -> str:
46
+ return coerce_file_stem(filename_or_id) + '.txt'
47
+
48
+
49
+ def coerce_file_path(filename_or_id: int | str) -> Path:
50
+ return DOCS_DIR.joinpath(coerce_file_name(filename_or_id))
51
+
52
+
53
+ def extract_file_id(filename_or_id: int | str | Path) -> str:
54
+ if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
55
+ return id_str(filename_or_id)
56
+
57
+ file_match = FILE_ID_REGEX.match(str(filename_or_id))
54
58
 
55
59
  if not file_match:
56
- raise RuntimeError(f"Failed to extract file ID from {filename}")
60
+ raise RuntimeError(f"Failed to extract file ID from {filename_or_id}")
57
61
 
58
62
  return file_match.group(1)
59
63
 
60
64
 
65
+ def file_size(file_path: str | Path) -> int:
66
+ return Path(file_path).stat().st_size
67
+
68
+
61
69
  def file_size_str(file_path: str | Path) -> str:
62
- file_size = float(Path(file_path).stat().st_size)
70
+ size = file_size(file_path)
63
71
  digits = 2
64
72
 
65
- if file_size > MB:
66
- size_num = file_size / MB
73
+ if size > MB:
74
+ size_num = float(size) / MB
67
75
  size_str = 'MB'
68
- elif file_size > KB:
69
- size_num = file_size / KB
76
+ elif size > KB:
77
+ size_num = float(size) / KB
70
78
  size_str = 'kb'
71
79
  digits = 1
72
80
  else:
73
- return f"{int(file_size)} b"
81
+ return f"{size} b"
74
82
 
75
83
  return f"{size_num:,.{digits}f} {size_str}"
76
84
 
77
85
 
86
+ def file_stem_for_id(id: int | str) -> str:
87
+ if isinstance(id, int) or (isinstance(id, str) and len(id) <= 6):
88
+ return f"{HOUSE_OVERSIGHT_PREFIX}{id_str(id)}"
89
+ elif len(id) == 8:
90
+ return f"{HOUSE_OVERSIGHT_PREFIX}{id}"
91
+ else:
92
+ raise RuntimeError(f"Unknown kind of file id {id}")
93
+
94
+
78
95
  def is_local_extract_file(filename) -> bool:
79
96
  """Return true if filename is of form 'HOUSE_OVERSIGHT_029835_1.txt'."""
80
97
  file_match = FILE_ID_REGEX.match(str(filename))