epstein_files-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +194 -0
- epstein_files/documents/communication.py +53 -0
- epstein_files/documents/document.py +357 -0
- epstein_files/documents/email.py +655 -0
- epstein_files/documents/emails/email_header.py +167 -0
- epstein_files/documents/imessage/text_message.py +93 -0
- epstein_files/documents/json_file.py +23 -0
- epstein_files/documents/messenger_log.py +73 -0
- epstein_files/documents/other_file.py +117 -0
- epstein_files/epstein_files.py +437 -0
- epstein_files/util/constant/common_words.py +94 -0
- epstein_files/util/constant/html.py +57 -0
- epstein_files/util/constant/names.py +261 -0
- epstein_files/util/constant/strings.py +47 -0
- epstein_files/util/constant/urls.py +103 -0
- epstein_files/util/constants.py +1552 -0
- epstein_files/util/data.py +131 -0
- epstein_files/util/env.py +80 -0
- epstein_files/util/file_cfg.py +172 -0
- epstein_files/util/file_helper.py +81 -0
- epstein_files/util/highlighted_group.py +620 -0
- epstein_files/util/rich.py +324 -0
- epstein_files/util/search_result.py +15 -0
- epstein_files/util/word_count.py +191 -0
- epstein_files-1.0.0.dist-info/LICENSE +674 -0
- epstein_files-1.0.0.dist-info/METADATA +60 -0
- epstein_files-1.0.0.dist-info/RECORD +28 -0
- epstein_files-1.0.0.dist-info/WHEEL +4 -0
epstein_files/util/rich.py
@@ -0,0 +1,324 @@
# Rich reference: https://rich.readthedocs.io/en/latest/reference.html
import json
from os import devnull
from pathlib import Path
from typing import Literal

from rich.align import Align
from rich.console import Console, RenderableType
from rich.markup import escape
from rich.panel import Panel
from rich.padding import Padding
from rich.table import Table
from rich.text import Text
from rich.theme import Theme

from epstein_files.util.constant.html import CONSOLE_HTML_FORMAT, HTML_TERMINAL_THEME, PAGE_TITLE
from epstein_files.util.constant.names import UNKNOWN
from epstein_files.util.constant.strings import DEFAULT, EMAIL, NA, OTHER_SITE_LINK_STYLE, QUESTION_MARKS, SiteType
from epstein_files.util.constant.urls import *
from epstein_files.util.constants import FALLBACK_TIMESTAMP, HEADER_ABBREVIATIONS
from epstein_files.util.env import args, logger
from epstein_files.util.file_helper import file_size_str
from epstein_files.util.highlighted_group import HIGHLIGHTED_GROUPS, InterestingNamesHighlighter

TITLE_WIDTH = 50
NUM_COLOR_KEY_COLS = 4
NA_TXT = Text(NA, style='dim')
QUESTION_MARK_TXT = Text(QUESTION_MARKS, style='dim')
GREY_NUMBERS = [58, 39, 39, 35, 30, 27, 23, 23, 19, 19, 15, 15, 15]

DEFAULT_NAME_STYLE = 'gray46'
KEY_STYLE='honeydew2 bold'
SECTION_HEADER_STYLE = 'bold white on blue3'
SOCIAL_MEDIA_LINK_STYLE = 'cyan3 bold'
SUBSTACK_POST_LINK_STYLE = 'bright_cyan'
SYMBOL_STYLE = 'grey70'
TITLE_STYLE = 'black on bright_white bold'

HIGHLIGHTED_GROUP_COLOR_KEYS = [
    Text(highlight_group.label.replace('_', ' '), style=highlight_group.style)
    for highlight_group in sorted(HIGHLIGHTED_GROUPS, key=lambda hg: hg.label)
    if not highlight_group.is_multiline
]

THEME_STYLES = {
    DEFAULT: 'wheat4',
    TEXT_LINK: 'deep_sky_blue4 underline',
    **{hg.theme_style_name: hg.style for hg in HIGHLIGHTED_GROUPS}, # Inject style names for HighlightedGroups
}

# Instantiate console object
CONSOLE_ARGS = {
    'color_system': '256',
    'highlighter': InterestingNamesHighlighter(),
    'record': args.build,
    'safe_box': False,
    'theme': Theme(THEME_STYLES),
    'width': args.width,
}

if args.suppress_output:
    logger.warning(f"Suppressing terminal output because args.suppress_output={args.suppress_output}...")
    CONSOLE_ARGS.update({'file': open(devnull, "wt")})

console = Console(**CONSOLE_ARGS)
highlighter = CONSOLE_ARGS['highlighter']


def add_cols_to_table(table: Table, col_names: list[str]) -> None:
    """Left most col will be left justified, rest are center justified."""
    for i, col in enumerate(col_names):
        table.add_column(col, justify='left' if i == 0 else 'center')


def join_texts(txts: list[Text], join: str = ' ', encloser: str = '') -> Text:
    """Join rich.Text objs into one."""
    if encloser:
        if len(encloser) != 2:
            raise ValueError(f"'encloser' arg is '{encloser}' which is not 2 characters long")

        enclose_start, enclose_end = (encloser[0], encloser[1])
    else:
        enclose_start = enclose_end = ''

    txt = Text('')

    for i, link in enumerate(txts):
        txt.append(join if i >= 1 else '').append(enclose_start).append(link).append(enclose_end)

    return txt


def key_value_txt(key: str, value: Text | str) -> Text:
    """Generate a Text obj for 'key=value'."""
    return Text('').append(key, style=KEY_STYLE).append('=', style=SYMBOL_STYLE).append(value)


def parenthesize(msg: str | Text, style: str = '') -> Text:
    txt = Text(msg) if isinstance(msg, str) else msg
    return Text('(', style=style).append(txt).append(')')


def print_author_header(msg: str, color: str | None, footer: str | None = None) -> None:
    txt = Text(msg, justify='center')
    color = color or 'white'
    color = 'white' if color == DEFAULT else color
    panel = Panel(txt, width=80, style=f"black on {color} bold")
    console.print('\n', Align.center(panel))

    if footer:
        console.print(Align.center(f"({footer})"), highlight=False, style=f'{color} italic')

    console.line()


def print_centered(obj: RenderableType, style: str = '') -> None:
    console.print(Align.center(obj), style=style)


def print_centered_link(url: str, link_text: str, style: str | None = None) -> None:
    print_centered(link_markup(url, link_text, style or ARCHIVE_LINK_COLOR))


def print_color_key(_key_type: Literal["Groups", "People"] = "Groups") -> None:
    color_table = Table(title=f'Rough Guide to Highlighted Colors', show_header=False)
    num_colors = len(HIGHLIGHTED_GROUP_COLOR_KEYS)
    row_number = 0

    for i in range(0, NUM_COLOR_KEY_COLS):
        color_table.add_column(f"color_col_{i}", justify='center')

    while (row_number * NUM_COLOR_KEY_COLS) < num_colors:
        idx = row_number * NUM_COLOR_KEY_COLS
        color_table.add_row(*HIGHLIGHTED_GROUP_COLOR_KEYS[idx:(idx + NUM_COLOR_KEY_COLS)])
        row_number += 1

    print_centered(vertically_pad(color_table))


def print_header(epstein_files: 'EpsteinFiles') -> None:
    console.print(f"This site isn't optimized for mobile but if you get past the header it should be readable.", style='dim')
    print_page_title(width=TITLE_WIDTH)
    print_other_site_link()
    _print_external_links()
    console.line()
    _print_abbreviations_table()
    epstein_files.print_files_summary()
    print_color_key()
    print_centered(f"if you think there's an attribution error or can deanonymize an {UNKNOWN} contact {CRYPTADAMUS_TWITTER}", 'grey46')
    print_centered('note this site is based on the OCR text provided by Congress which is not always the greatest', 'grey23')
    print_centered(f"(thanks to {link_markup('https://x.com/ImDrinknWyn', '@ImDrinknWyn', 'dodger_blue3')} + others for help attributing redacted emails)")
    print_centered_link(ATTRIBUTIONS_URL, "(some explanations of author attributions)", style='magenta')


def print_json(label: str, obj: object, skip_falsey: bool = False) -> None:
    if isinstance(obj, dict):
        if skip_falsey:
            obj = {k: v for k, v in obj.items() if v}

        if None in obj:
            obj = {k or UNKNOWN: v for k, v in obj.items()}


    console.line()
    console.print(Panel(label, expand=False))
    console.print_json(json.dumps(obj, sort_keys=True), indent=4)
    console.line()


def print_numbered_list_of_emailers(_list: list[str | None], epstein_files = None) -> None:
    """Add the first emailed_at timestamp for each emailer if 'epstein_files' provided."""
    current_year = 1990
    current_year_month = current_year * 12
    grey_idx = 0
    console.line()

    for i, name in enumerate(_list):
        indent = ' ' if i < 9 else (' ' if i < 99 else ' ')
        txt = Text((indent) + F" {i + 1}. ", style=DEFAULT_NAME_STYLE)

        if epstein_files:
            earliest_email_date = (epstein_files.earliest_email_at(name) or FALLBACK_TIMESTAMP).date()
            year_months = (earliest_email_date.year * 12) + earliest_email_date.month

            # Color year rollovers more brightly
            if current_year != earliest_email_date.year:
                grey_idx = 0
            elif current_year_month != year_months:
                grey_idx = ((current_year_month - 1) % 12) + 1

            current_year_month = year_months
            current_year = earliest_email_date.year
            txt.append(escape(f"[{earliest_email_date}] "), style=f"grey{GREY_NUMBERS[grey_idx]}")

        txt.append(highlighter(name or UNKNOWN))

        if epstein_files:
            num_days_in_converation = epstein_files.email_conversation_length_in_days(name)
            msg = f" ({len(epstein_files.emails_for(name))} emails over {num_days_in_converation:,} days)"
            txt.append(msg, style=f'dim italic')

        console.print(txt)

    console.line()


def print_other_site_link(is_header: bool = True) -> None:
    """Print a link to the emails site if we're building text messages site and vice versa."""
    site_type: SiteType = EMAIL if args.all_emails else TEXT_MESSAGE

    if is_header:
        print_starred_header(f"This is the Epstein {site_type.title()}s site", num_spaces=4, num_stars=14)

    other_site_type: SiteType = TEXT_MESSAGE if site_type == EMAIL else EMAIL
    other_site_msg = "another site for" + (' all of' if other_site_type == EMAIL else '')
    other_site_msg += f" Epstein's {other_site_type}s also generated by this code"
    markup_msg = link_markup(SITE_URLS[other_site_type], other_site_msg, OTHER_SITE_LINK_STYLE)
    print_centered(parenthesize(Text.from_markup(markup_msg)), style='bold')
    word_count_link = link_text_obj(WORD_COUNT_URL, 'site showing the most frequently used words in these communiques', OTHER_SITE_LINK_STYLE)
    print_centered(parenthesize(word_count_link))


def print_page_title(expand: bool = True, width: int | None = None) -> None:
    title_panel = Panel(Text(PAGE_TITLE, justify='center'), expand=expand, style=TITLE_STYLE, width=width)
    console.print(Align.center(vertically_pad(title_panel)))
    print_social_media_links()
    console.line(2)


def print_panel(msg: str, style: str = 'black on white', padding: tuple | None = None, centered: bool = False) -> None:
    _padding: list[int] = list(padding or [0, 0, 0, 0])
    _padding[2] += 1 # Bottom pad
    panel = Panel(Text.from_markup(msg, justify='center'), width=70, style=style)
    actual_padding: tuple[int, int, int, int] = tuple(_padding)

    if centered:
        console.print(Align.center(Padding(panel, actual_padding)))
    else:
        console.print(Padding(panel, actual_padding))


def print_section_header(msg: str, style: str = SECTION_HEADER_STYLE, is_centered: bool = False) -> None:
    panel = Panel(Text(msg, justify='center'), expand=True, padding=(1, 1), style=style)
    panel = Align.center(panel) if is_centered else panel
    console.print(Padding(panel, (3, 0, 1, 0)))


def print_social_media_links() -> None:
    print_centered_link(SUBSTACK_URL, "I Made Epstein's Text Messages Great Again (And You Should Read Them)", style=f'{SUBSTACK_POST_LINK_STYLE} bold')
    print_centered_link(SUBSTACK_URL, SUBSTACK_URL.removeprefix('https://'), style=f'{SUBSTACK_POST_LINK_STYLE} dim')

    social_links = [
        link_text_obj('https://x.com/Cryptadamist/status/1990866804630036988', '@cryptadamist', style=SOCIAL_MEDIA_LINK_STYLE),
        link_text_obj('https://cryptadamus.substack.com/', 'substack', style=SOCIAL_MEDIA_LINK_STYLE),
        link_text_obj('https://universeodon.com/@cryptadamist/115572634993386057', 'mastodon', style=SOCIAL_MEDIA_LINK_STYLE),
    ]

    print_centered(join_texts(social_links, join=' ', encloser='[]'))


def print_starred_header(msg: str, num_stars: int = 7, num_spaces: int = 2, style: str = TITLE_STYLE) -> None:
    stars = '*' * num_stars
    spaces = ' ' * num_spaces
    msg = f"{spaces}{stars} {msg} {stars}{spaces}"
    print_centered(wrap_in_markup_style(msg, style))


def vertically_pad(obj: RenderableType, amount: int = 1) -> Padding:
    return Padding(obj, (amount, 0, amount, 0))


def wrap_in_markup_style(msg: str, style: str | None = None) -> str:
    if style is None or len(style.strip()) == 0:
        return msg

    modifier = ''

    for style_word in style.split():
        if style_word == 'on':
            modifier = style_word
            continue

        style = f"{modifier} {style_word}".strip()
        msg = f"[{style}]{msg}[/{style}]"
        modifier = ''

    return msg


def write_html(output_path: Path) -> None:
    if not args.build:
        logger.warning(f"Not writing HTML because args.build={args.build}.")
        return

    console.save_html(output_path, code_format=CONSOLE_HTML_FORMAT, theme=HTML_TERMINAL_THEME)
    logger.warning(f"Wrote {file_size_str(output_path)} to '{output_path}'")


def _print_abbreviations_table() -> None:
    table = Table(title="Abbreviations Used Frequently In These Conversations", header_style="bold", show_header=False)
    table.add_column("Abbreviation", justify="center", style='bold')
    table.add_column("Translation", style="white", justify="center")

    for k, v in HEADER_ABBREVIATIONS.items():
        table.add_row(highlighter(k), v)

    console.print(Align.center(vertically_pad(table)))


def _print_external_links() -> None:
    console.line()
    print_starred_header('External Links', num_stars=0, num_spaces=20, style=f"italic")
    presser_link = link_text_obj(OVERSIGHT_REPUBLICANS_PRESSER_URL, 'Official Oversight Committee Press Release')
    raw_docs_link = join_texts([link_text_obj(RAW_OVERSIGHT_DOCS_GOOGLE_DRIVE_URL, 'raw files', style=f"{ARCHIVE_LINK_COLOR} dim")], encloser='()')
    print_centered(join_texts([presser_link, raw_docs_link]))
    print_centered(link_markup(JMAIL_URL, JMAIL) + " (read His Emails via Gmail interface)")
    print_centered(link_markup(COFFEEZILLA_ARCHIVE_URL, 'Archive Of Epstein Materials') + " (Coffeezilla)")
    print_centered(link_markup(COURIER_NEWSROOM_ARCHIVE_URL, 'Searchable Archive') + " (Courier Newsroom)")
    print_centered(link_markup(EPSTEINIFY_URL) + " (raw document images)")
    print_centered(link_markup(EPSTEIN_WEB_URL) + " (character summaries)")


if args.deep_debug:
    print_json('THEME_STYLES', THEME_STYLES)
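Most of rich.py runs at import time (it builds the shared Console from CLI args), but its pure helpers can be exercised on their own. A minimal sketch, assuming the wheel is installed and that importing epstein_files.util.env from a plain script resolves its args without complaint; the expected outputs in the comments follow from the function bodies above:

from rich.text import Text

from epstein_files.util.rich import join_texts, key_value_txt, wrap_in_markup_style

# join_texts() concatenates Text objects, optionally wrapping each one in a 2-character encloser.
links = [Text('@cryptadamist'), Text('substack'), Text('mastodon')]
print(join_texts(links, join=' ', encloser='[]').plain)  # [@cryptadamist] [substack] [mastodon]

# key_value_txt() styles the key and the '=' separately; .plain drops the styling.
print(key_value_txt('width', '120').plain)  # width=120

# wrap_in_markup_style() wraps the message in one markup tag per style word, keeping
# 'on <color>' pairs together, so later style words become the outermost tags.
print(wrap_in_markup_style('External Links', 'bold white on blue3'))
# [on blue3][white][bold]External Links[/bold][/white][/on blue3]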
epstein_files/util/search_result.py
@@ -0,0 +1,15 @@
from dataclasses import dataclass

from rich.text import Text

from epstein_files.documents.document import Document


@dataclass
class SearchResult:
    """Simple class used for collecting documents that match a given search term."""
    document: Document
    lines: list[Text] # The lines that match the search

    def unprefixed_lines(self) -> list[str]:
        return [line.plain.split(':', 1)[1] for line in self.lines]
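A quick sketch of how unprefixed_lines() behaves, using hypothetical line prefixes and None standing in for a real Document (the dataclass does not enforce the annotation):

from rich.text import Text

from epstein_files.util.search_result import SearchResult

result = SearchResult(
    document=None,  # a Document instance in real use; None is enough to exercise unprefixed_lines()
    lines=[
        Text("HOUSE_OVERSIGHT_001: please call me tomorrow"),
        Text("HOUSE_OVERSIGHT_001: re: flight plans"),
    ],
)

print(result.unprefixed_lines())  # [' please call me tomorrow', ' re: flight plans']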
epstein_files/util/word_count.py
@@ -0,0 +1,191 @@
import re
from collections import defaultdict
from dataclasses import dataclass, field

from inflection import singularize
from rich.columns import Columns
from rich.console import Console, ConsoleOptions, RenderResult
from rich.padding import Padding
from rich.text import Text

from epstein_files.documents.emails.email_header import EmailHeader
from epstein_files.util.constant.common_words import COMMON_WORDS, UNSINGULARIZABLE_WORDS
from epstein_files.util.constant.names import OTHER_NAMES
from epstein_files.util.data import ALL_NAMES, flatten, sort_dict
from epstein_files.util.env import args, logger
from epstein_files.util.rich import highlighter
from epstein_files.util.search_result import SearchResult

FIRST_AND_LAST_NAMES = flatten([n.split() for n in ALL_NAMES])
FIRST_AND_LAST_NAMES = [n.lower() for n in FIRST_AND_LAST_NAMES] + OTHER_NAMES

NON_SINGULARIZABLE = UNSINGULARIZABLE_WORDS + [n for n in FIRST_AND_LAST_NAMES if n.endswith('s')]
SKIP_WORDS_REGEX = re.compile(r"^(asmallworld@|enwiki|http|imagepng|nymagcomnymetro|addresswww|mailto|www|/font|colordu|classdms|targetdblank|nymagcom|palmbeachdailynews)|jee[vy]acation|fontfamily|(gif|html?|jpe?g|utm)$")
BAD_CHARS_REGEX = re.compile(r"[-–=+()$€£©°«—^&%!#_`,.;:'‘’\"„“”?\d\\]")
NO_SINGULARIZE_REGEX = re.compile(r".*[io]us$")
PADDING = (0, 0, 2, 2)
MIN_COUNT_CUTOFF = 3
MAX_WORD_LEN = 45

BAD_WORDS = [
    'charsetutf',
    'classdhoenzbfont',
    'classdmsonormaluucauup',
    'contenttransferencoding',
    'dbfcefacdbfla',
    'ehomep',
    'facedarial',
    'fortunehtmlsmidnytnowsharesmprodnytnow',
    'inthe',
    'quotedprintable',
    'researchdisclosureinquiries@jpmorgancom',
    'summarypricesquotesstatistic',
    'emichotpmiamiheraldcom',
]

BAD_CHARS_OK = [
    "he'll",
    'MLPF&S'.lower(),
    'reis-dennis',
]

# inflection.singularize() messes these up
SINGULARIZATIONS = {
    'abuses': 'abuse',
    'approves': 'approve',
    'arrives': 'arrive',
    'awards/awards': 'award',
    'bann': 'bannon',
    'believes': 'believe',
    'busses': 'bus',
    'colletcions': 'collection',
    'deserves': 'deserve',
    'dies': 'die',
    'dives': 'dive',
    'drives': 'drive',
    'enterpris': 'enterprise',
    'focuses': 'focus',
    'foes': 'foe',
    'girsl': 'girl',
    'gives': 'give',
    'involves': 'involve',
    'jackies': 'jackie',
    'leaves': 'leave',
    'lies': 'lie',
    'lives': 'live',
    'loves': 'love',
    'missives': 'missive',
    'police': 'police',
    'proves': 'prove',
    'receives': 'receive',
    'reserves': 'reserve',
    'selfies': 'selfie',
    'serves': 'serve',
    'shes': 'she',
    'sholes': 'scholes',
    'slaves': 'slave',
    'thnks': 'thank',
    'ties': 'tie',
    'thieves': 'thief',
    'toes': 'toe',
    #'trying': 'try',
    'viruses': 'virus',
    'waves': 'wave',
    'woes': 'woe',
    # spelling
    'prostituion': 'prostitution',
    'visoki': 'visoski',
    # eh...
    'twittercom': 'twitter',
}

HTML_REGEX = re.compile(r"com/|cae-v2w=|content-(transfe|type)|font(/|-(family|size))|http|\.html?\??|margin-bottom|padding-left|quoted-printable|region=|text-decoration|ttps|www|\.(gif|jpe?g|png);?$")
HYPHENATED_WORD_REGEX = re.compile(r"[a-z]+-[a-z]+", re.IGNORECASE)
OK_SYMBOL_WORDS = ['mar-a-lago', 'p/e', 's&p', ':)', ':).', ';)', ':-)', ';-)']
SYMBOL_WORD_REGEX = re.compile(r"^[-—–@%/?.,&=]+$")
ONLY_SYMBOLS_REGEX = re.compile(r"^[^a-zA-Z0-9]+$")
SPLIT_WORDS_BY = ['@', '/']
FLAGGED_WORDS = [] # For debugging, log extra info when one of these is encountered


@dataclass
class WordCount:
    count: dict[str, int] = field(default_factory=lambda: defaultdict(int))
    singularized: dict[str, int] = field(default_factory=lambda: defaultdict(int))

    def count_word(self, word: str, document_line: SearchResult) -> None:
        word = EmailHeader.cleanup_str(word).lower().strip()
        raw_word = word

        if HTML_REGEX.search(word):
            logger.info(f" Skipping HTML word '{word}'")
            return
        elif SYMBOL_WORD_REGEX.match(word):
            logger.debug(f" Skipping symbol word '{word}'")
            return
        elif word in OK_SYMBOL_WORDS:
            self.count[':)' if word == ':).' else word] += 1
            return
        elif HYPHENATED_WORD_REGEX.search(word):
            logger.info(f" Word with hyphen: '{word}'")

        if ONLY_SYMBOLS_REGEX.match(word):
            logger.info(f" ONLY_SYMBOLS_REGEX match: '{word}'")
            return

        if word not in BAD_CHARS_OK:
            word = BAD_CHARS_REGEX.sub('', word).strip()

        if self._is_invalid_word(word):
            return
        elif SYMBOL_WORD_REGEX.match(word):
            logger.debug(f" Skipping symbol word '{word}'")
            return

        for symbol in SPLIT_WORDS_BY:
            if symbol not in word:
                continue

            for w in word.split(symbol):
                self.count_word(w, document_line)

            logger.info(f" Split word with '{symbol}' in it '{word}'...")
            return

        if word in SINGULARIZATIONS:
            word = SINGULARIZATIONS[word]
        elif not (word in NON_SINGULARIZABLE or NO_SINGULARIZE_REGEX.match(word) or len(word) <= 2):
            word = singularize(word)
            self.singularized[raw_word] += 1

            # Log the raw_word if we've seen it more than once (but only once)
            if raw_word.endswith('s') and self.singularized[raw_word] == 2:
                logger.info(f" Singularized '{raw_word}' to '{word}'...")

        if not self._is_invalid_word(word):
            self.count[word] += 1

            if word in FLAGGED_WORDS:
                logger.warning(f"{document_line.document.filename}: Found '{word}' in '{document_line.lines[0]}'")

    def _is_invalid_word(self, w: str) -> bool:
        return bool(SKIP_WORDS_REGEX.search(w)) \
            or len(w) <= 1 \
            or len(w) >= MAX_WORD_LEN \
            or w in COMMON_WORDS \
            or w in BAD_WORDS

    def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
        word_txts = [
            highlighter(Text('').append(f"{word}", style=_word_style(word)).append(': ').append(f"{count:,}"))
            for word, count in [kv for kv in sort_dict(self.count) if kv[1] >= MIN_COUNT_CUTOFF]
        ]

        cols = Columns(word_txts, column_first=False, equal=False, expand=True)
        yield Padding(cols, PADDING)
        yield f"Showing {len(word_txts):,} words appearing at least {MIN_COUNT_CUTOFF} times (out of {len(self.count):,} words)."


def _word_style(word: str | None) -> str:
    word = word or ''
    return 'bright_white' if word in FIRST_AND_LAST_NAMES else 'grey53'