epstein-files 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,131 @@
+ """
+ Helpers for dealing with various kinds of data.
+ """
+ import itertools
+ import re
+ import time
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from typing import TypeVar
+
+ from dateutil import tz
+ from dateutil.parser import parse
+ from rich.text import Text
+
+ from epstein_files.util.constant import names
+ from epstein_files.util.env import args, logger
+
+ T = TypeVar('T')
+
+ ISO_DATE_REGEX = re.compile(r'\d{4}-\d{2}(-\d{2})?')
+ MULTINEWLINE_REGEX = re.compile(r"\n{2,}")
+ CONSTANT_VAR_REGEX = re.compile(r"^[A-Z_]+$")
+ ALL_NAMES = [v for k, v in vars(names).items() if isinstance(v, str) and CONSTANT_VAR_REGEX.match(k)]
+
+ PACIFIC_TZ = tz.gettz("America/Los_Angeles")
+ TIMEZONE_INFO = {"PST": PACIFIC_TZ, "PDT": PACIFIC_TZ}  # Suppresses annoying warnings from parse() calls
+
+
+ def collapse_newlines(text: str) -> str:
+     return MULTINEWLINE_REGEX.sub('\n\n', text)
+
+
+ def date_str(timestamp: datetime | None) -> str | None:
+     return timestamp.isoformat()[0:10] if timestamp else None
+
+
+ def dict_sets_to_lists(d: dict[str, set]) -> dict[str, list]:
+     return {k: sorted(v) for k, v in d.items()}
+
+
+ def extract_datetime(s: str) -> datetime | None:
+     match = ISO_DATE_REGEX.search(s)
+
+     if not match:
+         return None
+
+     date_str = match.group(0)  # NB: shadows the module level date_str() function
+
+     # NB: ISO_DATE_REGEX requires at least YYYY-MM, so the bare year branch appears unreachable
+     if len(date_str) == 4:
+         date_str += '-01-01'
+     elif len(date_str) == 7:
+         date_str += '-01'
+
+     return parse(date_str, tzinfos=TIMEZONE_INFO)
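+
+ # e.g. extract_datetime('scanned 2019-07 memo') returns datetime(2019, 7, 1, 0, 0) (hypothetical input)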
+
+
+ def extract_last_name(name: str) -> str:
+     if ' ' not in name:
+         return name
+
+     parts = name.split()
+
+     if parts[-1].startswith('Jr') and len(parts[-1]) <= 3:
+         return ' '.join(parts[-2:])
+     else:
+         return parts[-1]
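+
+ # e.g. (hypothetical names) extract_last_name('John Doe Jr.') -> 'Doe Jr.'; extract_last_name('Jane Doe') -> 'Doe'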
+
+
+ def flatten(_list: list[list[T]]) -> list[T]:
+     return list(itertools.chain.from_iterable(_list))
+
+
+ def iso_timestamp(dt: datetime) -> str:
+     return dt.isoformat().replace('T', ' ')
+
+
+ def listify(listlike: list | str | Text | None) -> list:
+     """Wrap 'listlike' in a list. Returns an empty list if 'listlike' is None or an empty string."""
+     if isinstance(listlike, list):
+         return listlike
+     elif listlike:
+         return [listlike]
+     else:
+         return []
+
+
+ def ordinal_str(n: int) -> str:
+     if 11 <= (n % 100) <= 13:
+         suffix = 'th'
+     else:
+         suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
+
+     return str(n) + suffix
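+
+ # e.g. ordinal_str(1) -> '1st', ordinal_str(12) -> '12th', ordinal_str(23) -> '23rd'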
+
+
+ def patternize(_pattern: str | re.Pattern) -> re.Pattern:
+     return _pattern if isinstance(_pattern, re.Pattern) else re.compile(rf"({_pattern})", re.IGNORECASE)
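+
+ # e.g. patternize('foo') compiles to r'(foo)' with re.IGNORECASE; an existing re.Pattern passes through unchanged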
+
+
+ def remove_timezone(timestamp: datetime) -> datetime:
+     if timestamp.tzinfo:
+         timestamp = timestamp.astimezone(timezone.utc).replace(tzinfo=None)
+         logger.debug(f" -> Converted to UTC: {timestamp}")
+
+     return timestamp
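+
+ # e.g. an aware datetime at 12:00 PDT (UTC-7) comes back as a naive datetime at 19:00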
+
+
+ def sort_dict(d: dict[str | None, int] | dict[str, int]) -> list[tuple[str | None, int]]:
+     sort_key = lambda e: (e[0] or '').lower() if args.sort_alphabetical else [-e[1], (e[0] or '').lower()]
+     return sorted(d.items(), key=sort_key)
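+
+ # Default sort is by count (descending) then lowercased name, e.g. {'bob': 1, 'ann': 2} -> [('ann', 2), ('bob', 1)];
+ # with --sort-alphabetical it sorts by lowercased name only.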
+
+
+ @dataclass
+ class Timer:
+     started_at: float = field(default_factory=time.perf_counter)
+     checkpoint_at: float = field(default_factory=time.perf_counter)
+
+     def print_at_checkpoint(self, msg: str) -> None:
+         logger.warning(f"{msg} in {self.seconds_since_checkpoint()}")
+         self.checkpoint_at = time.perf_counter()
+
+     def seconds_since_checkpoint(self) -> str:
+         return f"{(time.perf_counter() - self.checkpoint_at):.2f} seconds"
+
+     def seconds_since_start(self) -> str:
+         return f"{(time.perf_counter() - self.started_at):.2f} seconds"
+
+
+ escape_double_quotes = lambda text: text.replace('"', r'\"')
+ escape_single_quotes = lambda text: text.replace("'", r"\'")
+ uniquify = lambda _list: list(set(_list))  # NB: does not preserve element order
@@ -0,0 +1,80 @@
+ import logging
+ from argparse import ArgumentParser
+ from os import environ
+ from pathlib import Path
+ from sys import argv
+
+ from rich.logging import RichHandler
+
+ DEFAULT_WIDTH = 154
+ HTML_SCRIPTS = ['generate_html.py', 'count_words.py']
+
+
+ parser = ArgumentParser(description="Parse Epstein OCR docs and generate HTML page.")
+ parser.add_argument('--build', '-b', action='store_true', help='write HTML to docs/index.html')
+ parser.add_argument('--all-emails', '-ae', action='store_true', help='include all the emails')
+ parser.add_argument('--all-email-tables', '-aet', action='store_true', help='all email tables (except Epstein)')
+ parser.add_argument('--all-other-files', '-ao', action='store_true', help='all the non-email, non-text msg files instead of a limited selection')
+ parser.add_argument('--colors-only', '-c', action='store_true', help='print header with color key table and links and exit')
+ parser.add_argument('--name', '-n', action='append', dest='names', help='specify the name(s) whose communications should be output')
+ parser.add_argument('--output-emails', '-oe', action='store_true', help='generate emails section')
+ parser.add_argument('--output-other-files', '-oo', action='store_true', help='generate other files section')
+ parser.add_argument('--output-texts', '-ot', action='store_true', help='generate text messages section')
+ parser.add_argument('--pickled', '-p', action='store_true', help='use pickled EpsteinFiles object')
+ parser.add_argument('--overwrite-pickle', '-op', action='store_true', help='generate new pickled EpsteinFiles object')
+ parser.add_argument('--sort-alphabetical', '-alpha', action='store_true', help='sort emailers alphabetically in counts table')
+ parser.add_argument('--suppress-output', '-s', action='store_true', help='no output to terminal (use with --build)')
+ parser.add_argument('--use-epstein-web-links', '-use', action='store_true', help='use epsteinweb.org links instead of epsteinify.com')
+ parser.add_argument('--search-other', '-so', action='store_true', help='search for string in non-email/text files (only used by search script)')
+ parser.add_argument('--width', '-w', type=int, default=DEFAULT_WIDTH, help='screen width to use')
+ parser.add_argument('--whole-file', '-wf', action='store_true', help='print whole file (only used by search script)')
+ parser.add_argument('--debug', '-d', action='store_true', help='set log level to INFO')
+ parser.add_argument('--deep-debug', '-dd', action='store_true', help='set log level to DEBUG')
+ parser.add_argument('--suppress-logs', '-sl', action='store_true', help='set log level to FATAL')
+ parser.add_argument('--json-stats', '-j', action='store_true', help='print JSON formatted stats at the end')
+ parser.add_argument('positional_args', nargs='*', help='Optional args (only used by helper scripts)')
+ args = parser.parse_args()
+
+ is_env_var_set = lambda s: len(environ.get(s) or '') > 0
+ current_script = Path(argv[0]).name
+ is_html_script = current_script in HTML_SCRIPTS
+
+ args.deep_debug = args.deep_debug or is_env_var_set('DEEP_DEBUG')
+ args.debug = args.deep_debug or args.debug or is_env_var_set('DEBUG')
+ args.output_emails = args.output_emails or args.all_emails
+ args.output_other_files = args.output_other_files or args.all_other_files
+ args.pickled = args.pickled or is_env_var_set('PICKLED') or args.colors_only or len(args.names or []) > 0
+ args.width = args.width if is_html_script else None
+ specified_names: list[str | None] = [None if n == 'None' else n for n in (args.names or [])]
+
+
+ # Setup logging
+ logging.basicConfig(level="NOTSET", format="%(message)s", datefmt="[%X]", handlers=[RichHandler()])
+ logger = logging.getLogger("rich")
+
+ if args.deep_debug:
+     logger.setLevel(logging.DEBUG)
+ elif args.debug:
+     logger.setLevel(logging.INFO)
+ elif args.suppress_logs:
+     logger.setLevel(logging.FATAL)
+ else:
+     logger.setLevel(logging.WARNING)
+
+ datefinder_logger = logging.getLogger('datefinder')  # Suppress annoying output
+ datefinder_logger.setLevel(logger.level)
+
+
+ # Massage args that depend on other args into the appropriate state
+ if not (args.output_texts or args.output_emails or args.output_other_files):
+     logger.warning("No output section chosen; outputting default of texts, selected emails, and other files...")
+     args.output_texts = True
+     args.output_emails = True
+     args.output_other_files = True
+
+ if args.use_epstein_web_links:
+     logger.warning("Using epsteinweb.org links instead of epsteinify.com links...")
+
+ if args.debug:
+     logger.warning(f"is_html_script={is_html_script}, specified_names={specified_names}, args={args}")
@@ -0,0 +1,172 @@
+ import re
+ from copy import deepcopy
+ from dataclasses import Field, asdict, dataclass, field, fields
+ from datetime import datetime
+ from typing import Generator, Literal
+
+ from dateutil.parser import parse
+
+ from epstein_files.util.constant.names import constantize_name
+ from epstein_files.util.constant.strings import AUTHOR
+
+ DuplicateType = Literal['same', 'earlier', 'quoted', 'redacted']
+
+ INDENT = ' '
+ INDENT_NEWLINE = f'\n{INDENT}'
+ INDENTED_JOIN = f',{INDENT_NEWLINE}'
+ CONSTANTIZE_NAMES = False  # When True, repr() of these classes returns strings of usable code
+ MAX_LINE_LENGTH = 250
+
+ REASON_MAPPING: dict[DuplicateType, str] = {
+     'earlier': 'earlier draft of',
+     'quoted': 'quoted in full in',
+     'redacted': 'redacted version of',
+     'same': 'the same as',
+ }
+
+ FIELD_SORT_KEY = {
+     'id': 'a',
+     'author': 'aa',
+     'attribution_reason': 'zz',
+ }
+
+
+ @dataclass(kw_only=True)
+ class FileCfg:
+     """Convenience class encapsulating info about files that must be manually configured.
+
+     Attributes:
+         id (str): ID of the file
+         author (str | None): Author of the document (if any)
+         date (str | None): If passed it will be immediately parsed into the 'timestamp' field
+         description (str | None): Description of the file
+         dupe_of_id (str | None): If this is a dupe, the ID of the duplicated file. This file will be suppressed
+         dupe_type (DuplicateType | None): The type of duplicate this file is (redacted, quoted, etc.)
+         duplicate_ids (list[str]): Inverse of 'dupe_of_id' - this file will NOT be suppressed but 'duplicate_ids' will be
+         timestamp (datetime | None): Time this email was sent, file was created, article published, etc.
+         was_generated (bool): True if this object was generated by the duplicate_cfgs() method
+     """
+     id: str
+     author: str | None = None
+     date: str | None = None
+     description: str | None = None
+     dupe_of_id: str | None = None
+     dupe_type: DuplicateType | None = None
+     duplicate_ids: list[str] = field(default_factory=list)
+     timestamp: datetime | None = None
+     was_generated: bool = False  # True if this object was generated by duplicate_cfgs()
+
+     def __post_init__(self):
+         if self.dupe_of_id:
+             self.dupe_type = self.dupe_type or 'same'
+
+         if self.date:
+             self.timestamp = parse(self.date)
+
+     def duplicate_reason(self) -> str | None:
+         if self.dupe_type is not None:
+             return REASON_MAPPING[self.dupe_type]
+
+         return None
+
+     def duplicate_cfgs(self) -> Generator['FileCfg', None, None]:
+         for dupe_id in self.duplicate_ids:
+             dupe_cfg = deepcopy(self)
+             dupe_cfg.id = dupe_id
+             dupe_cfg.dupe_of_id = self.id
+             dupe_cfg.dupe_type = self.dupe_type or 'same'
+             dupe_cfg.was_generated = True
+             yield dupe_cfg
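+
+     # e.g. (hypothetical IDs) FileCfg(id='000001', duplicate_ids=['000002'], dupe_type='redacted').duplicate_cfgs()
+     # yields one copy with id='000002', dupe_of_id='000001', and was_generated=True.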
+
+     def non_null_field_names(self) -> list[str]:
+         return [f.name for f in self.sorted_fields() if getattr(self, f.name)]
+
+     def sorted_fields(self) -> list[Field]:
+         return sorted(fields(self), key=lambda f: FIELD_SORT_KEY.get(f.name, f.name))
+
+     def _props_strs(self) -> list[str]:
+         props = []
+         add_prop = lambda f, value: props.append(f"{f.name}={value}")
+
+         for _field in self.sorted_fields():
+             value = getattr(self, _field.name)
+
+             if value is None or value is False or (isinstance(value, list) and len(value) == 0):
+                 continue
+             elif _field.name == AUTHOR:
+                 add_prop(_field, constantize_name(str(value)) if CONSTANTIZE_NAMES else f"'{value}'")
+             elif _field.name == 'recipients' and isinstance(value, list):
+                 recipients_str = str([constantize_name(r) if (CONSTANTIZE_NAMES and r) else r for r in value])
+                 add_prop(_field, recipients_str.replace("'", '') if CONSTANTIZE_NAMES else recipients_str)
+             elif isinstance(value, datetime):
+                 value_str = re.sub(' 00:00:00', '', str(value))
+                 add_prop(_field, f"parse('{value_str}')" if CONSTANTIZE_NAMES else f"'{value}'")
+             elif _field.name == 'description':
+                 add_prop(_field, value.strip())
+             elif isinstance(value, str):
+                 if "'" in value:
+                     value = '"' + value.replace('"', r'\"') + '"'
+                 else:
+                     value = "'" + value.replace("'", r'\'') + "'"
+
+                 add_prop(_field, value)
+             else:
+                 add_prop(_field, str(value))
+
+         return props
+
+     def __eq__(self, other: 'FileCfg') -> bool:
+         """Return True if all fields match except 'id', 'dupe_of_id', and 'dupe_type' ('duplicate_ids' is compared)."""
+         for _field in self.sorted_fields():
+             if _field.name == 'id' or _field.name.startswith('dupe'):
+                 continue
+             elif getattr(self, _field.name) != getattr(other, _field.name):
+                 return False
+
+         return True
+
+     def __repr__(self) -> str:
+         props = self._props_strs()
+         type_str = f"{type(self).__name__}("
+         single_line_repr = type_str + ', '.join(props) + ')'
+
+         if (len(single_line_repr) < MAX_LINE_LENGTH or self.non_null_field_names() == ['id', 'description']) and '#' not in (self.description or ''):
+             repr_str = single_line_repr
+         else:
+             repr_str = f"{type_str}{INDENT_NEWLINE}" + INDENTED_JOIN.join(props)
+             repr_str += ',' if props else ''
+             repr_str += '\n)'
+
+         if CONSTANTIZE_NAMES:
+             repr_str = INDENT + INDENT_NEWLINE.join(repr_str.split('\n'))
+             return repr_str.replace(',,', ',').replace(',),', '),').replace(',),', '),')
+         else:
+             return repr_str
+
+
+ @dataclass(kw_only=True)
+ class MessageCfg(FileCfg):
+     """
+     Convenience class uniting the various configured properties for a given Communication file.
+     Manual config is always required for MessengerLog author attribution. It's also often needed for Email
+     files to handle the terrible OCR text provided by Congress, which mangles a lot of the email headers.
+
+     Attributes:
+         actual_text (str | None): In dire cases of broken OCR we just configure the body of the email as a string.
+         attribution_reason (str | None): Optional explanation of why this email was attributed to this author.
+         is_attribution_uncertain (bool): True if we have a good idea of who the author is but are not 100% certain.
+         is_fwded_article (bool): True if this is a newspaper article someone fwded. Used to exclude articles from word counting.
+         recipients (list[str | None]): Who received the email.
+     """
+     actual_text: str | None = None  # Override for the Email._actual_text() method for particularly broken emails
+     attribution_reason: str | None = None
+     is_attribution_uncertain: bool = False
+     is_fwded_article: bool = False
+     recipients: list[str | None] = field(default_factory=list)
+
+     # Redeclared because @dataclass would otherwise generate its own __eq__ and __repr__ for the subclass
+     def __eq__(self, other: 'FileCfg') -> bool:
+         return super().__eq__(other)
+
+     def __repr__(self) -> str:
+         return super().__repr__()
+
+     @classmethod
+     def from_file_cfg(cls, cfg: FileCfg) -> 'MessageCfg':
+         return cls(**asdict(cfg))
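+
+ # e.g. MessageCfg.from_file_cfg(FileCfg(id='000001')) builds a MessageCfg carrying over all FileCfg fields
+ # (hypothetical ID; asdict() works here because FileCfg fields are simple values and lists)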
@@ -0,0 +1,81 @@
+ import re
+ from os import environ
+ from pathlib import Path
+ from sys import exit
+
+ from epstein_files.util.constant.strings import HOUSE_OVERSIGHT_PREFIX
+
+ EPSTEIN_DOCS_DIR_ENV_VAR_NAME = 'EPSTEIN_DOCS_DIR'
+ DOCS_DIR_ENV = environ.get(EPSTEIN_DOCS_DIR_ENV_VAR_NAME)
+ DOCS_DIR = Path(DOCS_DIR_ENV or '').resolve()
+
+ if not DOCS_DIR_ENV:
+     print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME} env var not set!")
+     exit(1)
+ elif not DOCS_DIR.exists():
+     print(f"ERROR: {EPSTEIN_DOCS_DIR_ENV_VAR_NAME}='{DOCS_DIR}' does not exist!")
+     exit(1)
+
+ JSON_DIR = DOCS_DIR.joinpath('json_files')
+ HTML_DIR = Path('docs')
+ EXTRACTED_EMAILS_DIR = Path('emails_extracted_from_legal_filings')
+ GH_PAGES_HTML_PATH = HTML_DIR.joinpath('index.html')
+ WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_emails_word_count.html')
+ EPSTEIN_WORD_COUNT_HTML_PATH = HTML_DIR.joinpath('epstein_texts_and_emails_word_count.html')
+ PICKLED_PATH = Path("the_epstein_files.pkl.gz")
+
+ FILE_STEM_REGEX = re.compile(fr"{HOUSE_OVERSIGHT_PREFIX}(\d{{6}})")
+ FILE_ID_REGEX = re.compile(fr".*{FILE_STEM_REGEX.pattern}(_\d{{1,2}})?(\.txt(\.json)?)?")
+ FILENAME_LENGTH = len(HOUSE_OVERSIGHT_PREFIX) + 6
+ KB = 1024
+ MB = KB * KB
+
+
+ # Handles both string and int 'id' args.
+ file_stem_for_id = lambda id: f"{HOUSE_OVERSIGHT_PREFIX}{int(id):06d}"
+ filename_for_id = lambda id: file_stem_for_id(id) + '.txt'
+
+
+ def coerce_file_stem(filename_or_id: int | str) -> str:
+     """Generate a valid file_stem no matter what form the argument comes in."""
+     if isinstance(filename_or_id, str) and filename_or_id.startswith(HOUSE_OVERSIGHT_PREFIX):
+         file_stem = file_stem_for_id(extract_file_id(filename_or_id))
+     else:
+         file_stem = file_stem_for_id(filename_or_id)
+
+     if not FILE_STEM_REGEX.match(file_stem):
+         raise RuntimeError(f"Invalid stem '{file_stem}' from '{filename_or_id}'")
+
+     return file_stem
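+
+ # e.g. coerce_file_stem(29835) and coerce_file_stem('HOUSE_OVERSIGHT_029835_1.txt') should both
+ # return 'HOUSE_OVERSIGHT_029835' (assuming HOUSE_OVERSIGHT_PREFIX == 'HOUSE_OVERSIGHT_')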
+
+
+ def extract_file_id(filename: str | Path) -> str:
+     file_match = FILE_ID_REGEX.match(str(filename))
+
+     if not file_match:
+         raise RuntimeError(f"Failed to extract file ID from {filename}")
+
+     return file_match.group(1)
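+
+ # e.g. extract_file_id('HOUSE_OVERSIGHT_029835_1.txt') returns '029835' (assuming the standard prefix)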
+
+
+ def file_size_str(file_path: str | Path) -> str:
+     file_size = float(Path(file_path).stat().st_size)
+     digits = 2
+
+     if file_size > MB:
+         size_num = file_size / MB
+         size_str = 'MB'
+     elif file_size > KB:
+         size_num = file_size / KB
+         size_str = 'KB'
+         digits = 1
+     else:
+         return f"{int(file_size)} B"
+
+     return f"{size_num:,.{digits}f} {size_str}"
+
+
+ def is_local_extract_file(filename: str | Path) -> bool:
+     """Return True if 'filename' is of the form 'HOUSE_OVERSIGHT_029835_1.txt'."""
+     file_match = FILE_ID_REGEX.match(str(filename))
+     return bool(file_match and file_match.group(2))