epstein-files 1.2.1__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -22,6 +22,7 @@ ALL_NAMES = [v for k, v in vars(names).items() if isinstance(v, str) and CONSTAN
22
22
  PACIFIC_TZ = tz.gettz("America/Los_Angeles")
23
23
  TIMEZONE_INFO = {"PDT": PACIFIC_TZ, "PST": PACIFIC_TZ} # Suppresses annoying warnings from parse() calls
24
24
 
25
+ all_elements_same = lambda _list: len(_list) == 0 or all(x == _list[0] for x in _list)
25
26
  collapse_newlines = lambda text: MULTINEWLINE_REGEX.sub('\n\n', text)
26
27
  date_str = lambda dt: dt.isoformat()[0:10] if dt else None
27
28
  escape_double_quotes = lambda text: text.replace('"', r'\"')
@@ -1,3 +1,4 @@
1
+ import json
1
2
  import re
2
3
  from copy import deepcopy
3
4
  from dataclasses import Field, asdict, dataclass, field, fields
@@ -9,20 +10,21 @@ from dateutil.parser import parse
9
10
  from epstein_files.util.constant.names import *
10
11
  from epstein_files.util.constant.strings import *
11
12
  from epstein_files.util.data import remove_zero_time, without_falsey
13
+ from epstein_files.util.env import args
12
14
 
13
- DuplicateType = Literal['earlier', 'quoted', 'redacted', 'same']
15
+ DuplicateType = Literal['bounced', 'earlier', 'quoted', 'redacted', 'same']
14
16
  Metadata = dict[str, bool | datetime | int | str | list[str | None] |dict[str, bool | str]]
15
17
 
16
18
  # Misc
17
- CONSTANTIZE_NAMES = False # A flag set to True that causes repr() of these classes to return strings of usable code
18
19
  INDENT = ' '
19
20
  INDENT_NEWLINE = f'\n{INDENT}'
20
21
  INDENTED_JOIN = f',{INDENT_NEWLINE}'
21
- MAX_LINE_LENGTH = 150
22
+ MAX_LINE_LENGTH = 135
22
23
  REPUTATION_MGMT = f'{REPUTATION} management'
23
24
  SAME = 'same'
24
25
 
25
26
  DUPE_TYPE_STRS: dict[DuplicateType, str] = {
27
+ 'bounced': 'a bounced copy of',
26
28
  'earlier': 'an earlier draft of',
27
29
  'quoted': 'quoted in full in',
28
30
  'redacted': 'a redacted version of',
@@ -32,7 +34,10 @@ DUPE_TYPE_STRS: dict[DuplicateType, str] = {
32
34
  FIELD_SORT_KEY = {
33
35
  'id': 'a',
34
36
  'author': 'aa',
35
- 'attribution_reason': 'zz',
37
+ 'comment': 'zz',
38
+ 'duplicate_ids': 'dup',
39
+ 'duplicate_of_id': 'dupe',
40
+ 'recipients': 'aaa',
36
41
  }
37
42
 
38
43
  FINANCIAL_REPORTS_AUTHORS = [
@@ -49,7 +54,6 @@ FINANCIAL_REPORTS_AUTHORS = [
49
54
  # Fields like timestamp and author are better added from the Document object
50
55
  NON_METADATA_FIELDS = [
51
56
  'actual_text',
52
- 'date',
53
57
  'id',
54
58
  'is_synthetic',
55
59
  ]
@@ -64,18 +68,18 @@ class DocCfg:
64
68
  id (str): ID of file
65
69
  author (Name): Author of the document (if any)
66
70
  category (str | None): Type of file
67
- date (str | None): If passed will be immediated parsed into the 'timestamp' field
71
+ date (str | None): Parsed to a datetime by timestamp() if it exists
68
72
  dupe_type (DuplicateType | None): The type of duplicate this file is or its 'duplicate_ids' are
69
73
  duplicate_ids (list[str]): IDs of *other* documents that are dupes of this document
70
74
  duplicate_of_id (str | None): If this is a dupe the ID of the duplicated file. This file will be suppressed
71
75
  is_interesting (bool | None): Override other considerations and always consider this file interesting (or not)
72
- timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
73
76
  is_synthetic (bool): True if this config was generated by the duplicate_cfgs() method
74
77
  """
75
78
  id: str
76
79
  attached_to_email_id: str | None = None
77
80
  author: Name = None
78
81
  category: str | None = None
82
+ comment: str = ''
79
83
  date: str | None = None
80
84
  description: str | None = None
81
85
  dupe_type: DuplicateType | None = None
@@ -84,12 +88,8 @@ class DocCfg:
84
88
  is_attribution_uncertain: bool = False
85
89
  is_interesting: bool | None = None
86
90
  is_synthetic: bool = False
87
- timestamp: datetime | None = None
88
91
 
89
92
  def __post_init__(self):
90
- if self.date:
91
- self.timestamp = parse(self.date)
92
-
93
93
  if self.duplicate_of_id or self.duplicate_ids:
94
94
  self.dupe_type = self.dupe_type or SAME
95
95
 
@@ -142,7 +142,16 @@ class DocCfg:
142
142
  yield dupe_cfg
143
143
 
144
144
  def metadata(self) -> Metadata:
145
- return {k: v for k, v in asdict(self).items() if k not in NON_METADATA_FIELDS and v}
145
+ metadata = {k: v for k, v in asdict(self).items() if k not in NON_METADATA_FIELDS and v}
146
+
147
+ if self.is_interesting is False:
148
+ metadata['is_interesting'] = False
149
+
150
+ return metadata
151
+
152
+ def timestamp(self) -> datetime | None:
153
+ if self.date:
154
+ return parse(self.date)
146
155
 
147
156
  def _props_strs(self) -> list[str]:
148
157
  props = []
@@ -151,20 +160,16 @@ class DocCfg:
151
160
  for _field in sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name)):
152
161
  value = getattr(self, _field.name)
153
162
 
154
- if value is None or value is False or (isinstance(value, list) and len(value) == 0):
163
+ if _field.name in ['actual_text', 'is_fwded_article', 'is_interesting']: # fields can be False or None or ''
164
+ if value is not None:
165
+ add_prop(_field, str(value))
166
+ elif not value or _field.name == 'dupe_type' and value == 'same':
155
167
  continue
156
168
  elif _field.name == AUTHOR:
157
- add_prop(_field, constantize_name(str(value)) if CONSTANTIZE_NAMES else f"'{value}'")
158
- elif _field.name == 'category' and value in [EMAIL, TEXT_MESSAGE]:
159
- continue
160
- elif _field.name == 'recipients' and value:
161
- recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
162
- add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
163
- elif _field.name == 'timestamp' and self.date is not None:
164
- continue # Don't print both timestamp and date
165
- elif isinstance(value, datetime):
166
- value_str = remove_zero_time(value)
167
- add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
169
+ add_prop(_field, constantize_name(str(value)) if args.constantize else f"'{value}'")
170
+ elif _field.name == 'recipients':
171
+ recipients_str = str([constantize_name(r) if (args.constantize and r) else r for r in value])
172
+ add_prop(_field, recipients_str.replace("'", '') if args.constantize else recipients_str)
168
173
  elif isinstance(value, str):
169
174
  if "'" in value:
170
175
  value = '"' + value.replace('"', r'\"') + '"'
@@ -182,14 +187,14 @@ class DocCfg:
182
187
  type_str = f"{type(self).__name__}("
183
188
  single_line_repr = type_str + ', '.join(props) + f')'
184
189
 
185
- if len(single_line_repr) < MAX_LINE_LENGTH:
190
+ if len(single_line_repr) < MAX_LINE_LENGTH or (self.comment and getattr(self, 'is_fwded_article')):
186
191
  repr_str = single_line_repr
187
192
  else:
188
193
  repr_str = f"{type_str}{INDENT_NEWLINE}" + INDENTED_JOIN.join(props)
189
194
  repr_str += ',' if props else ''
190
195
  repr_str += '\n)'
191
196
 
192
- if CONSTANTIZE_NAMES:
197
+ if args.constantize:
193
198
  repr_str = INDENT + INDENT_NEWLINE.join(repr_str.split('\n'))
194
199
  return repr_str.replace(',,', ',').replace(',),', '),').replace(',),', '),')
195
200
  else:
@@ -224,9 +229,10 @@ class EmailCfg(CommunicationCfg):
224
229
  """
225
230
  actual_text: str | None = None
226
231
  fwded_text_after: str | None = None
227
- is_fwded_article: bool = False
232
+ is_fwded_article: bool | None = None
228
233
  recipients: list[Name] = field(default_factory=list)
229
234
  subject: str | None = None
235
+ truncate_to: int | None = None
230
236
 
231
237
  # This is necessary because for some dumb reason @dataclass(repr=False) doesn't cut it
232
238
  def __repr__(self) -> str:
epstein_files/util/env.py CHANGED
@@ -38,7 +38,7 @@ output.add_argument('--all-emails', '-ae', action='store_true', help='all the em
38
38
  output.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of just the interesting ones')
39
39
  parser.add_argument('--build', '-b', nargs="?", default=None, const=DEFAULT_FILE, help='write output to HTML file')
40
40
  output.add_argument('--email-timeline', action='store_true', help='print a table of all emails in chronological order')
41
- output.add_argument('--emailers-info', action='store_true', help='write a .png of the eeailers info table')
41
+ output.add_argument('--emailers-info', '-ei', action='store_true', help='write a .png of the eeailers info table')
42
42
  output.add_argument('--json-files', action='store_true', help='pretty print all the raw JSON data files in the collection and exit')
43
43
  output.add_argument('--json-metadata', action='store_true', help='dump JSON metadata for all files and exit')
44
44
  output.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
@@ -49,43 +49,51 @@ output.add_argument('--suppress-output', action='store_true', help='no output to
49
49
  output.add_argument('--uninteresting', action='store_true', help='only output uninteresting other files')
50
50
  output.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use (in characters)')
51
51
 
52
- scripts = parser.add_argument_group('SCRIPTS', 'Options used by epstein_search, epstein_show, and epstein_diff.')
52
+ scripts = parser.add_argument_group('SCRIPTS', 'Options used by epstein_grep, epstein_show, and epstein_diff.')
53
53
  scripts.add_argument('positional_args', nargs='*', help='strings to searchs for, file IDs to show or diff, etc.')
54
+ scripts.add_argument('--email-body', action='store_true', help='epstein_grep but only for the body of the email')
55
+ scripts.add_argument('--min-line-length', type=int, help='epstein_grep minimum length of a matched line')
54
56
  scripts.add_argument('--raw', '-r', action='store_true', help='show raw contents of file (used by epstein_show)')
55
57
  scripts.add_argument('--whole-file', '-wf', action='store_true', help='print whole files')
56
58
 
57
59
  debug = parser.add_argument_group('DEBUG')
58
60
  debug.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
61
+ debug.add_argument('--constantize', action='store_true', help='constantize names when printing repr() of objects')
59
62
  debug.add_argument('--debug', '-d', action='store_true', help='set debug level to INFO')
60
63
  debug.add_argument('--deep-debug', '-dd', action='store_true', help='set debug level to DEBUG')
61
64
  debug.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats about the files')
62
65
  debug.add_argument('--skip-other-files', '-sof', action='store_true', help='skip parsing non email/text files')
63
66
  debug.add_argument('--suppress-logs', '-sl', action='store_true', help='set debug level to FATAL')
67
+ debug.add_argument('--truncate', '-t', type=int, help='truncate emails to this many characters')
68
+ debug.add_argument('--write-txt', '-wt', action='store_true', help='write a plain text version of output')
64
69
 
65
70
 
66
71
  # Parse args
67
72
  args = parser.parse_args()
68
73
  is_html_script = parser.prog in HTML_SCRIPTS
69
74
 
70
- args.build = args.build
71
75
  args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
72
76
  args.names = [None if n == 'None' else n.strip() for n in (args.names or [])]
73
77
  args.output_emails = args.output_emails or args.all_emails
74
78
  args.output_other = args.output_other or args.all_other_files or args.uninteresting
75
79
  args.overwrite_pickle = args.overwrite_pickle or (is_env_var_set('OVERWRITE_PICKLE') and not is_env_var_set('PICKLED'))
76
80
  args.width = args.width if is_html_script else None
81
+ args.any_output_selected = any([is_output_arg(arg) and val for arg, val in vars(args).items()])
82
+
83
+ if not (args.any_output_selected or args.email_timeline or args.emailers_info):
84
+ if is_html_script:
85
+ logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
86
+
87
+ args.output_emails = args.output_other = args.output_texts = True
77
88
 
78
89
  if is_html_script:
79
90
  if args.positional_args:
80
91
  exit_with_error(f"{parser.prog} does not accept positional arguments (receeived {args.positional_args})")
81
92
 
82
93
  if parser.prog == EPSTEIN_GENERATE:
83
- if any([is_output_arg(arg) and val for arg, val in vars(args).items()]):
94
+ if args.any_output_selected:
84
95
  if args.email_timeline:
85
96
  exit_with_error(f"--email-timeline option is mutually exlusive with other output options")
86
- elif not args.email_timeline and not args.emailers_info:
87
- logger.warning(f"No output section chosen; outputting default selection of texts, selected emails, and other files...")
88
- args.output_texts = args.output_emails = args.output_other = True
89
97
 
90
98
  if args.build == DEFAULT_FILE:
91
99
  if args.all_emails:
@@ -94,13 +102,15 @@ if is_html_script:
94
102
  args.build = CHRONOLOGICAL_EMAILS_PATH
95
103
  else:
96
104
  args.build = TEXT_MSGS_HTML_PATH
97
- elif parser.prog.startswith('epstein_') and not args.positional_args:
105
+ elif parser.prog.startswith('epstein_') and not args.positional_args and not args.names:
98
106
  exit_with_error(f"{parser.prog} requires positional arguments but got none!")
99
107
 
100
108
  if args.names:
101
109
  logger.warning(f"Output restricted to {args.names}")
102
110
  args.output_other = False
103
111
 
112
+ if args.truncate and args.whole_file:
113
+ exit_with_error(f"--whole-file and --truncate are incompatible")
104
114
 
105
115
  # Log level args
106
116
  if args.deep_debug:
@@ -38,6 +38,8 @@ def extract_file_id(filename_or_id: int | str | Path) -> str:
38
38
 
39
39
  if isinstance(filename_or_id, int) or (isinstance(filename_or_id, str) and len(filename_or_id) <= 6):
40
40
  return id_str(filename_or_id)
41
+ elif isinstance(filename_or_id, str) and len(filename_or_id) == 8:
42
+ return f"{HOUSE_OVERSIGHT_PREFIX}{filename_or_id}"
41
43
 
42
44
  file_match = FILE_ID_REGEX.match(str(filename_or_id).upper())
43
45