epstein-files 1.0.10__py3-none-any.whl → 1.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epstein_files/__init__.py +4 -6
- epstein_files/documents/document.py +92 -49
- epstein_files/documents/email.py +7 -4
- epstein_files/documents/imessage/text_message.py +3 -12
- epstein_files/documents/json_file.py +13 -1
- epstein_files/documents/messenger_log.py +32 -19
- epstein_files/documents/other_file.py +66 -43
- epstein_files/epstein_files.py +22 -15
- epstein_files/util/constant/names.py +2 -2
- epstein_files/util/constants.py +84 -78
- epstein_files/util/doc_cfg.py +17 -25
- epstein_files/util/env.py +29 -17
- epstein_files/util/file_helper.py +13 -24
- epstein_files/util/highlighted_group.py +22 -14
- epstein_files/util/logging.py +0 -6
- epstein_files/util/output.py +12 -7
- epstein_files/util/rich.py +15 -10
- epstein_files/util/word_count.py +65 -5
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.11.dist-info}/METADATA +1 -1
- epstein_files-1.0.11.dist-info/RECORD +33 -0
- epstein_files/count_words.py +0 -72
- epstein_files-1.0.10.dist-info/RECORD +0 -34
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.11.dist-info}/LICENSE +0 -0
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.11.dist-info}/WHEEL +0 -0
- {epstein_files-1.0.10.dist-info → epstein_files-1.0.11.dist-info}/entry_points.txt +0 -0
|
@@ -2,6 +2,7 @@ import re
|
|
|
2
2
|
from dataclasses import dataclass, field
|
|
3
3
|
|
|
4
4
|
from rich.highlighter import RegexHighlighter
|
|
5
|
+
from rich.text import Text
|
|
5
6
|
|
|
6
7
|
from epstein_files.util.constant.names import *
|
|
7
8
|
from epstein_files.util.constant.strings import *
|
|
@@ -21,7 +22,7 @@ EPSTEIN_ESTATE_EXECUTOR = f"Epstein {ESTATE_EXECUTOR}"
|
|
|
21
22
|
REGEX_STYLE_PREFIX = 'regex'
|
|
22
23
|
SIMPLE_NAME_REGEX = re.compile(r"^[-\w ]+$", re.IGNORECASE)
|
|
23
24
|
|
|
24
|
-
CATEGORY_LABEL_MAPPING = {
|
|
25
|
+
CATEGORY_STYLE_MAPPING = {
|
|
25
26
|
ARTICLE: JOURNALIST,
|
|
26
27
|
ARTS: ENTERTAINER,
|
|
27
28
|
BOOK: JOURNALIST,
|
|
@@ -31,6 +32,12 @@ CATEGORY_LABEL_MAPPING = {
|
|
|
31
32
|
REPUTATION: PUBLICIST,
|
|
32
33
|
}
|
|
33
34
|
|
|
35
|
+
CATEGORY_STYLES = {
|
|
36
|
+
JSON: 'dark_red',
|
|
37
|
+
JUNK: 'grey19',
|
|
38
|
+
'letter': 'medium_orchid1'
|
|
39
|
+
}
|
|
40
|
+
|
|
34
41
|
|
|
35
42
|
@dataclass(kw_only=True)
|
|
36
43
|
class HighlightedText:
|
|
@@ -156,7 +163,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
156
163
|
HighlightedNames(
|
|
157
164
|
label=BUSINESS,
|
|
158
165
|
style='spring_green4',
|
|
159
|
-
pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
|
|
166
|
+
pattern=r'Gruterite|(John\s*)?Kluge|Marc Rich|(Mi(chael|ke)\s*)?Ovitz|(Steve\s+)?Wynn|(Les(lie)?\s+)?Wexner|New Leaf Ventures|Park Partners|SALSS|Swedish[-\s]*American\s*Life\s*Science\s*Summit|Valhi|(Yves\s*)?Bouvier',
|
|
160
167
|
emailers = {
|
|
161
168
|
ALIREZA_ITTIHADIEH: 'CEO Freestream Aircraft Limited',
|
|
162
169
|
BARBRO_C_EHNBOM: 'Swedish pharmaceuticals, SALSS',
|
|
@@ -305,7 +312,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
305
312
|
HighlightedNames(
|
|
306
313
|
label='finance',
|
|
307
314
|
style='green',
|
|
308
|
-
pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|(money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
|
|
315
|
+
pattern=r'Apollo|Ari\s*Glass|Bank|(Bernie\s*)?Madoff|Black(rock|stone)|B\s*of\s*A|Boothbay(\sFund\sManagement)?|Chase\s*Bank|Credit\s*Suisse|DB|Deutsche\s*(Asset|Bank)|Electron\s*Capital\s*(Partners)?|Fenner|FRBNY|Goldman(\s*Sachs)|HSBC|Invesco|(Janet\s*)?Yellen|(Jerome\s*)?Powell(?!M\. Cabot)|(Jimmy\s*)?Cayne|JPMC?|j\.?p\.?\s*morgan(\.?com|\s*Chase)?|Madoff|Merrill(\s*Lynch)?|(Michael\s*)?(Cembalest|Milken)|Mizrahi\s*Bank|MLPF&S|((anti.?)?money\s+)?launder(s?|ers?|ing)?(\s+money)?|Morgan Stanley|(Peter L. )?Scher|(Ray\s*)?Dalio|Schwartz?man|Serageldin|UBS|us.gio@jpmorgan.com',
|
|
309
316
|
emailers={
|
|
310
317
|
AMANDA_ENS: 'Citigroup',
|
|
311
318
|
DANIEL_SABBA: 'UBS Investment Bank',
|
|
@@ -325,6 +332,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
325
332
|
style='deep_pink2',
|
|
326
333
|
pattern=r'Cambridge|(Derek\s*)?Bok|Elisa(\s*New)?|Harvard(\s*(Business|Law|University)(\s*School)?)?|(Jonathan\s*)?Zittrain|(Stephen\s*)?Kosslyn',
|
|
327
334
|
emailers = {
|
|
335
|
+
"Donald Rubin": f"Professor of Statistics",
|
|
328
336
|
"Kelly Friendly": f"longtime aide and spokesperson of {LARRY_SUMMERS}",
|
|
329
337
|
LARRY_SUMMERS: 'board of Digital Currency Group (DCG), Harvard president, Obama economic advisor',
|
|
330
338
|
'Leah Reis-Dennis': 'producer for Lisa New\'s Poetry in America',
|
|
@@ -390,7 +398,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
390
398
|
HighlightedNames(
|
|
391
399
|
label='law enforcement',
|
|
392
400
|
style='color(24) bold',
|
|
393
|
-
pattern=r'ag|(Alicia\s*)?Valle|attorney|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC
|
|
401
|
+
pattern=r'ag|(Alicia\s*)?Valle|AML|attorney|((Bob|Robert)\s*)?Mueller|(Byung\s)?Pak|CFTC?|CIA|CIS|CVRA|Dep(artmen)?t\.?\s*of\s*(the\s*)?(Justice|Treasury)|DHS|DOJ|FBI|FCPA|FDIC|Federal\s*Bureau\s*of\s*Investigation|FinCEN|FINRA|FOIA|FTC|IRS|(James\s*)?Comey|(Jennifer\s*Shasky\s*)?Calvery|((Judge|Mark)\s*)?(Carney|Filip)|(Kirk )?Blouin|KYC|NIH|NS(A|C)|OCC|OFAC|(Lann?a\s*)?Belohlavek|lawyer|(Michael\s*)?Reiter|OGE|Office\s*of\s*Government\s*Ethics|Police Code Enforcement|(Preet\s*)?Bharara|SCOTUS|SD(FL|NY)|Southern\s*District\s*of\s*(Florida|New\s*York)|SEC|Secret\s*Service|Securities\s*and\s*Exchange\s*Commission|State\s*Dep(artmen)?t|Strzok|Supreme\s*Court|Treasury\s*(Dep(artmen)?t|Secretary)|TSA|USAID|(William\s*J\.?\s*)?Zloch',
|
|
394
402
|
emailers = {
|
|
395
403
|
ANN_MARIE_VILLAFANA: 'southern district of Florida U.S. Attorney',
|
|
396
404
|
DANNY_FROST: 'Director of Communications at Manhattan DA',
|
|
@@ -588,7 +596,7 @@ HIGHLIGHTED_NAMES = [
|
|
|
588
596
|
HighlightedText(
|
|
589
597
|
label='phone_number',
|
|
590
598
|
style='bright_green',
|
|
591
|
-
pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})
|
|
599
|
+
pattern=r"\+?(1?\(?\d{3}\)?[- ]\d{3}[- ]\d{4}|\d{2}[- ]\(?0?\)?\d{2}[- ]\d{4}[- ]\d{4})|(\b|\+)[\d+]{10,12}\b",
|
|
592
600
|
),
|
|
593
601
|
]
|
|
594
602
|
|
|
@@ -648,18 +656,14 @@ def get_info_for_name(name: str) -> str | None:
|
|
|
648
656
|
|
|
649
657
|
|
|
650
658
|
def get_style_for_category(category: str) -> str | None:
|
|
651
|
-
if category in [CONFERENCE, SPEECH]:
|
|
659
|
+
if category in CATEGORY_STYLES:
|
|
660
|
+
return CATEGORY_STYLES[category]
|
|
661
|
+
elif category in [CONFERENCE, SPEECH]:
|
|
652
662
|
return f"{get_style_for_category(ACADEMIA)} dim"
|
|
653
|
-
elif category == JSON:
|
|
654
|
-
return 'dark_red'
|
|
655
|
-
elif category == JUNK:
|
|
656
|
-
return 'grey19'
|
|
657
|
-
elif category == 'letter':
|
|
658
|
-
return 'medium_orchid1'
|
|
659
663
|
elif category == SOCIAL:
|
|
660
|
-
return f"{get_style_for_category(PUBLICIST)}
|
|
664
|
+
return f"{get_style_for_category(PUBLICIST)}"
|
|
661
665
|
|
|
662
|
-
category = CATEGORY_LABEL_MAPPING.get(category, category)
|
|
666
|
+
category = CATEGORY_STYLE_MAPPING.get(category, category)
|
|
663
667
|
|
|
664
668
|
for highlight_group in HIGHLIGHTED_NAMES:
|
|
665
669
|
if highlight_group.label == category:
|
|
@@ -672,6 +676,10 @@ def get_style_for_name(name: str | None, default_style: str = DEFAULT, allow_bol
|
|
|
672
676
|
return style if allow_bold else style.replace('bold', '').strip()
|
|
673
677
|
|
|
674
678
|
|
|
679
|
+
def styled_category(category: str) -> Text:
|
|
680
|
+
return Text(category, get_style_for_category(category) or 'wheat4')
|
|
681
|
+
|
|
682
|
+
|
|
675
683
|
def _get_highlight_group_for_name(name: str) -> HighlightedNames | None:
|
|
676
684
|
for highlight_group in HIGHLIGHTED_NAMES:
|
|
677
685
|
if highlight_group.regex.search(name):
|
epstein_files/util/logging.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from os import environ
|
|
3
|
-
from pathlib import Path
|
|
4
3
|
|
|
5
4
|
import datefinder
|
|
6
5
|
import rich_argparse_plus
|
|
@@ -10,7 +9,6 @@ from rich.logging import RichHandler
|
|
|
10
9
|
from rich.theme import Theme
|
|
11
10
|
|
|
12
11
|
from epstein_files.util.constant.strings import *
|
|
13
|
-
from epstein_files.util.file_helper import file_size_str
|
|
14
12
|
|
|
15
13
|
FILENAME_STYLE = 'gray27'
|
|
16
14
|
|
|
@@ -60,7 +58,3 @@ if env_log_level_str:
|
|
|
60
58
|
|
|
61
59
|
logger.warning(f"Setting log level to {env_log_level} based on {LOG_LEVEL_ENV_VAR} env var...")
|
|
62
60
|
logger.setLevel(env_log_level)
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def log_file_write(file_path: str | Path) -> None:
|
|
66
|
-
logger.warning(f"Wrote {file_size_str(file_path)} to '{file_path}'")
|
epstein_files/util/output.py
CHANGED
|
@@ -11,7 +11,8 @@ from epstein_files.util.constant.names import *
|
|
|
11
11
|
from epstein_files.util.constant.output_files import JSON_FILES_JSON_PATH, JSON_METADATA_PATH
|
|
12
12
|
from epstein_files.util.data import dict_sets_to_lists
|
|
13
13
|
from epstein_files.util.env import args, specified_names
|
|
14
|
-
from epstein_files.util.
|
|
14
|
+
from epstein_files.util.file_helper import log_file_write
|
|
15
|
+
from epstein_files.util.logging import logger
|
|
15
16
|
from epstein_files.util.rich import *
|
|
16
17
|
|
|
17
18
|
PRINT_COLOR_KEY_EVERY_N_EMAILS = 150
|
|
@@ -60,7 +61,6 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
|
|
|
60
61
|
"""Returns number of emails printed."""
|
|
61
62
|
print_section_header(('Selections from ' if not args.all_emails else '') + 'His Emails')
|
|
62
63
|
print_other_site_link(is_header=False)
|
|
63
|
-
|
|
64
64
|
emailers_to_print: list[str | None]
|
|
65
65
|
emailer_tables: list[str | None] = []
|
|
66
66
|
already_printed_emails: list[Email] = []
|
|
@@ -106,8 +106,8 @@ def print_emails(epstein_files: EpsteinFiles) -> int:
|
|
|
106
106
|
_verify_all_emails_were_printed(epstein_files, already_printed_emails)
|
|
107
107
|
|
|
108
108
|
fwded_articles = [e for e in already_printed_emails if e.config and e.config.is_fwded_article]
|
|
109
|
-
|
|
110
|
-
logger.warning(f"
|
|
109
|
+
log_msg = f"Rewrote {len(Email.rewritten_header_ids)} email headers (out of {len(already_printed_emails)})"
|
|
110
|
+
logger.warning(f"{log_msg}, {len(fwded_articles)} of the emails were forwarded articles.")
|
|
111
111
|
return len(already_printed_emails)
|
|
112
112
|
|
|
113
113
|
|
|
@@ -121,7 +121,7 @@ def print_json_files(epstein_files: EpsteinFiles):
|
|
|
121
121
|
else:
|
|
122
122
|
for json_file in epstein_files.json_files:
|
|
123
123
|
console.line(2)
|
|
124
|
-
console.print(json_file.
|
|
124
|
+
console.print(json_file.summary_panel())
|
|
125
125
|
console.print_json(json_file.json_str(), indent=4, sort_keys=False)
|
|
126
126
|
|
|
127
127
|
|
|
@@ -187,8 +187,13 @@ def write_urls() -> None:
|
|
|
187
187
|
def _verify_all_emails_were_printed(epstein_files: EpsteinFiles, already_printed_emails: list[Email]) -> None:
|
|
188
188
|
"""Log warnings if some emails were never printed."""
|
|
189
189
|
email_ids_that_were_printed = set([email.file_id for email in already_printed_emails])
|
|
190
|
-
logger.warning(f"Printed {len(already_printed_emails)} emails of {len(email_ids_that_were_printed)} unique file IDs.")
|
|
190
|
+
logger.warning(f"Printed {len(already_printed_emails):,} emails of {len(email_ids_that_were_printed):,} unique file IDs.")
|
|
191
|
+
missed_an_email = False
|
|
191
192
|
|
|
192
193
|
for email in epstein_files.emails:
|
|
193
|
-
if email.file_id not in email_ids_that_were_printed and not email.is_duplicate:
|
|
194
|
+
if email.file_id not in email_ids_that_were_printed and not email.is_duplicate():
|
|
194
195
|
logger.warning(f"Failed to print {email.summary()}")
|
|
196
|
+
missed_an_email = True
|
|
197
|
+
|
|
198
|
+
if not missed_an_email:
|
|
199
|
+
logger.warning(f"All {len(epstein_files.emails):,} emails printed at least once.")
|
epstein_files/util/rich.py
CHANGED
|
@@ -20,8 +20,9 @@ from epstein_files.util.constant.urls import *
|
|
|
20
20
|
from epstein_files.util.constants import FALLBACK_TIMESTAMP, HEADER_ABBREVIATIONS
|
|
21
21
|
from epstein_files.util.data import json_safe
|
|
22
22
|
from epstein_files.util.env import args
|
|
23
|
+
from epstein_files.util.file_helper import log_file_write
|
|
23
24
|
from epstein_files.util.highlighted_group import ALL_HIGHLIGHTS, HIGHLIGHTED_NAMES, EpsteinHighlighter
|
|
24
|
-
from epstein_files.util.logging import
|
|
25
|
+
from epstein_files.util.logging import logger
|
|
25
26
|
|
|
26
27
|
TITLE_WIDTH = 50
|
|
27
28
|
NUM_COLOR_KEY_COLS = 4
|
|
@@ -30,6 +31,7 @@ QUESTION_MARK_TXT = Text(QUESTION_MARKS, style='dim')
|
|
|
30
31
|
GREY_NUMBERS = [58, 39, 39, 35, 30, 27, 23, 23, 19, 19, 15, 15, 15]
|
|
31
32
|
|
|
32
33
|
DEFAULT_NAME_STYLE = 'gray46'
|
|
34
|
+
INFO_STYLE = 'white dim italic'
|
|
33
35
|
KEY_STYLE='honeydew2 bold'
|
|
34
36
|
SECTION_HEADER_STYLE = 'bold white on blue3'
|
|
35
37
|
SOCIAL_MEDIA_LINK_STYLE = 'pale_turquoise4'
|
|
@@ -239,23 +241,26 @@ def print_numbered_list_of_emailers(_list: list[str | None], epstein_files = Non
|
|
|
239
241
|
def print_other_site_link(is_header: bool = True) -> None:
|
|
240
242
|
"""Print a link to the emails site if we're building text messages site and vice versa."""
|
|
241
243
|
site_type: SiteType = EMAIL if args.all_emails else TEXT_MESSAGE
|
|
244
|
+
link_style = OTHER_SITE_LINK_STYLE if is_header else 'light_slate_grey bold'
|
|
242
245
|
|
|
243
246
|
if is_header:
|
|
244
247
|
print_starred_header(f"This is the Epstein {site_type.title()}s site", num_spaces=4, num_stars=14)
|
|
245
248
|
|
|
246
249
|
other_site_type: SiteType = TEXT_MESSAGE if site_type == EMAIL else EMAIL
|
|
247
|
-
other_site_msg = "another site
|
|
250
|
+
other_site_msg = "another site with" + (' all of' if other_site_type == EMAIL else '')
|
|
248
251
|
other_site_msg += f" Epstein's {other_site_type}s also generated by this code"
|
|
249
|
-
markup_msg = link_markup(SITE_URLS[other_site_type], other_site_msg,
|
|
252
|
+
markup_msg = link_markup(SITE_URLS[other_site_type], other_site_msg, link_style)
|
|
250
253
|
print_centered(parenthesize(Text.from_markup(markup_msg)), style='bold')
|
|
251
254
|
|
|
252
|
-
if is_header:
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
255
|
+
if not is_header:
|
|
256
|
+
return
|
|
257
|
+
|
|
258
|
+
word_count_link = link_text_obj(WORD_COUNT_URL, 'most frequently used words in the emails and texts', AUX_SITE_LINK_STYLE)
|
|
259
|
+
print_centered(parenthesize(word_count_link))
|
|
260
|
+
metadata_link = link_text_obj(JSON_METADATA_URL, 'author attribution explanations', AUX_SITE_LINK_STYLE)
|
|
261
|
+
print_centered(parenthesize(metadata_link))
|
|
262
|
+
json_link = link_text_obj(WORD_COUNT_URL, "epstein's json files", AUX_SITE_LINK_STYLE)
|
|
263
|
+
print_centered(parenthesize(json_link))
|
|
259
264
|
|
|
260
265
|
|
|
261
266
|
def print_page_title(expand: bool = True, width: int | None = None) -> None:
|
epstein_files/util/word_count.py
CHANGED
|
@@ -9,18 +9,22 @@ from rich.padding import Padding
|
|
|
9
9
|
from rich.text import Text
|
|
10
10
|
|
|
11
11
|
from epstein_files.documents.emails.email_header import EmailHeader
|
|
12
|
-
from epstein_files.
|
|
12
|
+
from epstein_files.epstein_files import EpsteinFiles
|
|
13
|
+
from epstein_files.util.constant.common_words import COMMON_WORDS_LIST, COMMON_WORDS, UNSINGULARIZABLE_WORDS
|
|
13
14
|
from epstein_files.util.constant.names import OTHER_NAMES
|
|
15
|
+
from epstein_files.util.constant.output_files import WORD_COUNT_HTML_PATH
|
|
14
16
|
from epstein_files.util.data import ALL_NAMES, flatten, sort_dict
|
|
15
|
-
from epstein_files.util.env import args
|
|
17
|
+
from epstein_files.util.env import args, specified_names
|
|
16
18
|
from epstein_files.util.logging import logger
|
|
17
|
-
from epstein_files.util.rich import highlighter
|
|
18
|
-
|
|
19
|
+
from epstein_files.util.rich import (console, highlighter, print_centered, print_color_key, print_page_title,
|
|
20
|
+
print_panel, print_starred_header, write_html)
|
|
21
|
+
from epstein_files.util.search_result import MatchedLine, SearchResult
|
|
22
|
+
from epstein_files.util.timer import Timer
|
|
19
23
|
|
|
20
24
|
FIRST_AND_LAST_NAMES = flatten([n.split() for n in ALL_NAMES])
|
|
21
25
|
FIRST_AND_LAST_NAMES = [n.lower() for n in FIRST_AND_LAST_NAMES] + OTHER_NAMES
|
|
22
26
|
|
|
23
|
-
HTML_REGEX = re.compile(r"com/|cae-v2w=|content-(transfe|type)|font(/|-(family|size))|http|\.html?\??|margin-bottom|padding-left|quoted-printable|region=|text-decoration|ttps|www|\.(gif|jpe?g|png);?$")
|
|
27
|
+
HTML_REGEX = re.compile(r"^http|#yiv|com/|cae-v2w=|content-(transfe|type)|font(/|-(family|size))|http|\.html?\??|margin-bottom|padding-left|quoted-printable|region=|text-decoration|ttps|www|\.(gif|jpe?g|png);?$")
|
|
24
28
|
HYPHENATED_WORD_REGEX = re.compile(r"[a-z]+-[a-z]+", re.IGNORECASE)
|
|
25
29
|
OK_SYMBOL_WORDS = ['mar-a-lago', 'p/e', 's&p', ':)', ':).', ';)', ':-)', ';-)']
|
|
26
30
|
ONLY_SYMBOLS_REGEX = re.compile(r"^[^a-zA-Z0-9]+$")
|
|
@@ -187,6 +191,62 @@ class WordCount:
|
|
|
187
191
|
yield f"Showing {len(word_txts):,} words appearing at least {MIN_COUNT_CUTOFF} times (out of {len(self.count):,} words)."
|
|
188
192
|
|
|
189
193
|
|
|
194
|
+
def write_word_counts_html() -> None:
|
|
195
|
+
timer = Timer()
|
|
196
|
+
epstein_files = EpsteinFiles.get_files(timer)
|
|
197
|
+
email_subjects: set[str] = set()
|
|
198
|
+
word_count = WordCount()
|
|
199
|
+
|
|
200
|
+
# Remove dupes, junk mail, and fwded articles from emails
|
|
201
|
+
emails = [e for e in epstein_files.emails if not (e.is_duplicate() or e.is_junk_mail() or e.is_fwded_article())]
|
|
202
|
+
|
|
203
|
+
for email in emails:
|
|
204
|
+
if specified_names and email.author not in specified_names:
|
|
205
|
+
continue
|
|
206
|
+
|
|
207
|
+
logger.info(f"Counting words in {email}\n [SUBJECT] {email.subject()}")
|
|
208
|
+
lines = email.actual_text.split('\n')
|
|
209
|
+
|
|
210
|
+
if email.subject() not in email_subjects and f'Re: {email.subject()}' not in email_subjects:
|
|
211
|
+
email_subjects.add(email.subject())
|
|
212
|
+
lines.append(email.subject())
|
|
213
|
+
|
|
214
|
+
for i, line in enumerate(lines):
|
|
215
|
+
if HTML_REGEX.search(line):
|
|
216
|
+
continue
|
|
217
|
+
|
|
218
|
+
for word in line.split():
|
|
219
|
+
word_count.tally_word(word, SearchResult(email, [MatchedLine(line, i)]))
|
|
220
|
+
|
|
221
|
+
# Add in iMessage conversation words
|
|
222
|
+
imessage_logs = epstein_files.imessage_logs_for(specified_names) if specified_names else epstein_files.imessage_logs
|
|
223
|
+
|
|
224
|
+
for imessage_log in imessage_logs:
|
|
225
|
+
logger.info(f"Counting words in {imessage_log}")
|
|
226
|
+
|
|
227
|
+
for i, msg in enumerate(imessage_log.messages):
|
|
228
|
+
if specified_names and msg.author not in specified_names:
|
|
229
|
+
continue
|
|
230
|
+
elif HTML_REGEX.search(line):
|
|
231
|
+
continue
|
|
232
|
+
|
|
233
|
+
for word in msg.text.split():
|
|
234
|
+
word_count.tally_word(word, SearchResult(imessage_log, [MatchedLine(msg.text, i)]))
|
|
235
|
+
|
|
236
|
+
print_page_title(expand=False)
|
|
237
|
+
print_starred_header(f"Most Common Words in {len(emails):,} Emails and {len(imessage_logs)} iMessage Logs")
|
|
238
|
+
print_centered(f"(excluding {len(COMMON_WORDS_LIST)} particularly common words at bottom)", style='dim')
|
|
239
|
+
console.line()
|
|
240
|
+
print_color_key()
|
|
241
|
+
console.line()
|
|
242
|
+
console.print(word_count)
|
|
243
|
+
console.line(2)
|
|
244
|
+
print_panel(f"{len(COMMON_WORDS_LIST):,} Excluded Words", centered=True)
|
|
245
|
+
console.print(', '.join(COMMON_WORDS_LIST), highlight=False)
|
|
246
|
+
write_html(WORD_COUNT_HTML_PATH)
|
|
247
|
+
timer.print_at_checkpoint(f"Finished counting words")
|
|
248
|
+
|
|
249
|
+
|
|
190
250
|
def _word_style(word: str | None) -> str:
|
|
191
251
|
word = word or ''
|
|
192
252
|
return 'bright_white' if word in FIRST_AND_LAST_NAMES else 'grey53'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: epstein-files
|
|
3
|
-
Version: 1.0.10
|
|
3
|
+
Version: 1.0.11
|
|
4
4
|
Summary: Tools for working with the Jeffrey Epstein documents released in November 2025.
|
|
5
5
|
Home-page: https://michelcrypt4d4mus.github.io/epstein_text_messages/
|
|
6
6
|
License: GPL-3.0-or-later
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
epstein_files/__init__.py,sha256=4zxX1tw-0xMwpM-Sbq7PezV0YNS9zN-P6gc9BQ1BqKU,4710
|
|
2
|
+
epstein_files/documents/communication.py,sha256=SunZdjMhR9v6y8LlQ6jhIu8vYjSndaBK0Su1mKnhfj0,2060
|
|
3
|
+
epstein_files/documents/document.py,sha256=dECV0bSnOJzPfOIHyHeG5rNxKd6uwuiso35-sQZg9No,18353
|
|
4
|
+
epstein_files/documents/email.py,sha256=yXiW7mB4myU8G9DY7PnnqazaCqeAR3dHr35NfBplfRU,38519
|
|
5
|
+
epstein_files/documents/emails/email_header.py,sha256=wkPfSLbmzkAeQwvhf0bAeFDLPbQT-EeG0v8vNNLYktM,7502
|
|
6
|
+
epstein_files/documents/imessage/text_message.py,sha256=3HlNp75JIoMlWj7PaUWIFry3qlGEmpGu5OmdmsBYS34,2807
|
|
7
|
+
epstein_files/documents/json_file.py,sha256=HsnVWPZXVxTF_DadL2YtJtsiXKXOd18PUs05O33tjNc,1317
|
|
8
|
+
epstein_files/documents/messenger_log.py,sha256=uSPlg85jGTwod1cV9f7MtxSNqmMZ61JBFzoiRNqg52M,6263
|
|
9
|
+
epstein_files/documents/other_file.py,sha256=S_Y-SxYYYXtx42JHmhFWl5BbTduNI7cwQjeYHBJA7sc,9950
|
|
10
|
+
epstein_files/epstein_files.py,sha256=SaD4DJJ5tRxY97Ei4BdOgLzHQ9wrBVGrP64CSqdmk-w,18691
|
|
11
|
+
epstein_files/util/constant/common_words.py,sha256=aR0UjoWmxyR49XS-DtHECQ1CiA_bK8hNP6CQ1TS9yZA,3696
|
|
12
|
+
epstein_files/util/constant/html.py,sha256=9U098TGzlghGg4WfxLYHyub5JGR17Dv7VP5i2MSu8Kk,1415
|
|
13
|
+
epstein_files/util/constant/names.py,sha256=KKJEYFpdOp4xDwXe5dhrqYgF12oJODvVSFpAB28Q76A,10153
|
|
14
|
+
epstein_files/util/constant/output_files.py,sha256=BkV4_gmdj46RfGy5SFYp6dgTty3FtlBth5YGmaGutls,1700
|
|
15
|
+
epstein_files/util/constant/strings.py,sha256=FDtksfH50PSxtSBw9XhmqxtrgRgGxdIvGiAR2bbPpu4,1899
|
|
16
|
+
epstein_files/util/constant/urls.py,sha256=0IdCVVvXib0i-4TZFkVHoS4zCbjOBZWcr6NkGxsmQWM,4981
|
|
17
|
+
epstein_files/util/constants.py,sha256=LPSI6Z0n3ChFDnMGYVO80cGuSKZf0OoyUzLih_jlRKI,111434
|
|
18
|
+
epstein_files/util/data.py,sha256=xwTqrbAi7ZDJM0iyFVOevnokP_oIQ2npkRjHzF1KGGY,2908
|
|
19
|
+
epstein_files/util/doc_cfg.py,sha256=OZlocAWldfR8Nomiad4FxQeyhNMbd0PQ-rumKn2nWBg,9641
|
|
20
|
+
epstein_files/util/env.py,sha256=HnYcfHSNkwVJ_T75Woy43_OpDyxD0KHPj3GxcVx86N4,5751
|
|
21
|
+
epstein_files/util/file_helper.py,sha256=-higKqc9J5IfNpzMzg-9j1ps3beV4N2cw8kdAxfm7NA,2835
|
|
22
|
+
epstein_files/util/highlighted_group.py,sha256=fU-8ns50uUolzPEAxadF5AnPLjn383KpEeyRXfFbv_U,35971
|
|
23
|
+
epstein_files/util/logging.py,sha256=8e22WaBfDAKEmkcr3Gb4TdqtFSkU4FQDpk3Z6hfSzbw,1977
|
|
24
|
+
epstein_files/util/output.py,sha256=UzTU0mNHEmeJr3w2TXAp19X497GB6_-HyW0mfztI1jk,8120
|
|
25
|
+
epstein_files/util/rich.py,sha256=8-4IA5bwPBdDPqkPdymq3zVKB9hfy3nrT7fUrN_XevY,14744
|
|
26
|
+
epstein_files/util/search_result.py,sha256=1fxe0KPBQXBk4dLfu6m0QXIzYfZCzvaSkWqvghJGzxY,567
|
|
27
|
+
epstein_files/util/timer.py,sha256=8hxW4Y1JcTUfnBrHh7sL2pM9xu1sL4HFQM4CmmzTarU,837
|
|
28
|
+
epstein_files/util/word_count.py,sha256=8qBTuq3d0Q-3fwiuECKWi2RfL-KUiZD8TciwvfL0D_o,9353
|
|
29
|
+
epstein_files-1.0.11.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
30
|
+
epstein_files-1.0.11.dist-info/METADATA,sha256=HBW3t1F9lkoN6GIR7ySV2kBYnJhNEF9otDZWnf03jUo,5480
|
|
31
|
+
epstein_files-1.0.11.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
32
|
+
epstein_files-1.0.11.dist-info/entry_points.txt,sha256=5qYgwAXpxegeAicD_rzda_trDRnUC51F5UVDpcZ7j6Q,240
|
|
33
|
+
epstein_files-1.0.11.dist-info/RECORD,,
|
epstein_files/count_words.py
DELETED
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
# Count word usage in emails and texts
|
|
2
|
-
import re
|
|
3
|
-
|
|
4
|
-
from epstein_files.epstein_files import EpsteinFiles
|
|
5
|
-
from epstein_files.util.constant.common_words import COMMON_WORDS_LIST
|
|
6
|
-
from epstein_files.util.constant.output_files import WORD_COUNT_HTML_PATH
|
|
7
|
-
from epstein_files.util.env import args, specified_names
|
|
8
|
-
from epstein_files.util.logging import logger
|
|
9
|
-
from epstein_files.util.rich import (console, print_centered, print_color_key, print_page_title, print_panel,
|
|
10
|
-
print_starred_header, write_html)
|
|
11
|
-
from epstein_files.util.search_result import MatchedLine, SearchResult
|
|
12
|
-
from epstein_files.util.timer import Timer
|
|
13
|
-
from epstein_files.util.word_count import WordCount
|
|
14
|
-
|
|
15
|
-
HTML_REGEX = re.compile(r"^http|#yiv")
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def write_word_counts_html() -> None:
|
|
19
|
-
timer = Timer()
|
|
20
|
-
epstein_files = EpsteinFiles.get_files(timer)
|
|
21
|
-
email_subjects: set[str] = set()
|
|
22
|
-
word_count = WordCount()
|
|
23
|
-
|
|
24
|
-
# Remove dupes, junk mail, and fwded articles from emails
|
|
25
|
-
emails = [
|
|
26
|
-
e for e in epstein_files.emails
|
|
27
|
-
if not (e.is_duplicate or e.is_junk_mail() or (e.config and e.config.is_fwded_article)) \
|
|
28
|
-
and (len(specified_names) == 0 or e.author in specified_names)
|
|
29
|
-
]
|
|
30
|
-
|
|
31
|
-
for email in emails:
|
|
32
|
-
logger.info(f"Counting words in {email}\n [SUBJECT] {email.subject()}")
|
|
33
|
-
lines = email.actual_text.split('\n')
|
|
34
|
-
|
|
35
|
-
if email.subject() not in email_subjects and f'Re: {email.subject()}' not in email_subjects:
|
|
36
|
-
email_subjects.add(email.subject())
|
|
37
|
-
lines.append(email.subject())
|
|
38
|
-
|
|
39
|
-
for i, line in enumerate(lines):
|
|
40
|
-
if HTML_REGEX.search(line):
|
|
41
|
-
continue
|
|
42
|
-
|
|
43
|
-
for word in line.split():
|
|
44
|
-
word_count.tally_word(word, SearchResult(email, [MatchedLine(line, i)]))
|
|
45
|
-
|
|
46
|
-
# Add in iMessage conversation words
|
|
47
|
-
imessage_logs = epstein_files.imessage_logs_for(specified_names) if specified_names else epstein_files.imessage_logs
|
|
48
|
-
|
|
49
|
-
for imessage_log in imessage_logs:
|
|
50
|
-
logger.info(f"Counting words in {imessage_log}")
|
|
51
|
-
|
|
52
|
-
for msg in imessage_log.messages():
|
|
53
|
-
if len(specified_names) > 0 and msg.author not in specified_names:
|
|
54
|
-
continue
|
|
55
|
-
elif HTML_REGEX.search(line):
|
|
56
|
-
continue
|
|
57
|
-
|
|
58
|
-
for word in msg.text.split():
|
|
59
|
-
word_count.tally_word(word, SearchResult(imessage_log, [msg.text]))
|
|
60
|
-
|
|
61
|
-
print_page_title(expand=False)
|
|
62
|
-
print_starred_header(f"Most Common Words in {len(emails):,} Emails and {len(imessage_logs)} iMessage Logs")
|
|
63
|
-
print_centered(f"(excluding {len(COMMON_WORDS_LIST)} particularly common words at bottom)", style='dim')
|
|
64
|
-
console.line()
|
|
65
|
-
print_color_key()
|
|
66
|
-
console.line()
|
|
67
|
-
console.print(word_count)
|
|
68
|
-
console.line(2)
|
|
69
|
-
print_panel(f"{len(COMMON_WORDS_LIST):,} Excluded Words", centered=True)
|
|
70
|
-
console.print(', '.join(COMMON_WORDS_LIST), highlight=False)
|
|
71
|
-
write_html(WORD_COUNT_HTML_PATH)
|
|
72
|
-
timer.print_at_checkpoint(f"Finished counting words")
|
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
epstein_files/__init__.py,sha256=SfLLu9X7rfHdmZcl8JGmiIxZ_E1RVsmCrh8sLO4jNPU,4859
|
|
2
|
-
epstein_files/count_words.py,sha256=i1pYaQzX7b9S3pyV3RM_8asbQJ1PEk8wJgLOG6Mf0D8,2966
|
|
3
|
-
epstein_files/documents/communication.py,sha256=SunZdjMhR9v6y8LlQ6jhIu8vYjSndaBK0Su1mKnhfj0,2060
|
|
4
|
-
epstein_files/documents/document.py,sha256=BUaioSvOmfsR-ULa6hJy3WYg-hBDC-kqafUheMJ-jFY,16665
|
|
5
|
-
epstein_files/documents/email.py,sha256=H34b2zt_TrPUgXHwZXybjmLE9-QNAtezs9NVSCPOSGM,38462
|
|
6
|
-
epstein_files/documents/emails/email_header.py,sha256=wkPfSLbmzkAeQwvhf0bAeFDLPbQT-EeG0v8vNNLYktM,7502
|
|
7
|
-
epstein_files/documents/imessage/text_message.py,sha256=wfWPQhwGG5Yzyhbr1NAQAY0bzRjjqVZmh8SPl48XmAM,3025
|
|
8
|
-
epstein_files/documents/json_file.py,sha256=1Cx_3uM38Dwgrbs8fU55TUZKSrCsmd4QpHKWtfWdudw,1089
|
|
9
|
-
epstein_files/documents/messenger_log.py,sha256=DHlQpbLbMITMpMtCYk2vcRc7-CTvYvOXql-9nDUc3tQ,5887
|
|
10
|
-
epstein_files/documents/other_file.py,sha256=NdVlCYcyzHvOInReqF-zvHJI1hwtzMWW9ekDojHIb4U,9091
|
|
11
|
-
epstein_files/epstein_files.py,sha256=EEx8Auwv8z0FkRrCi7wE8iuuRQd6K1rQDMc2vdbrsh4,18298
|
|
12
|
-
epstein_files/util/constant/common_words.py,sha256=aR0UjoWmxyR49XS-DtHECQ1CiA_bK8hNP6CQ1TS9yZA,3696
|
|
13
|
-
epstein_files/util/constant/html.py,sha256=9U098TGzlghGg4WfxLYHyub5JGR17Dv7VP5i2MSu8Kk,1415
|
|
14
|
-
epstein_files/util/constant/names.py,sha256=uYhv9xa4NO5jCk9zrGpPKFkcVVaMY2qtBC7ZaKGK3J8,10135
|
|
15
|
-
epstein_files/util/constant/output_files.py,sha256=BkV4_gmdj46RfGy5SFYp6dgTty3FtlBth5YGmaGutls,1700
|
|
16
|
-
epstein_files/util/constant/strings.py,sha256=FDtksfH50PSxtSBw9XhmqxtrgRgGxdIvGiAR2bbPpu4,1899
|
|
17
|
-
epstein_files/util/constant/urls.py,sha256=0IdCVVvXib0i-4TZFkVHoS4zCbjOBZWcr6NkGxsmQWM,4981
|
|
18
|
-
epstein_files/util/constants.py,sha256=gp5HWHt5FHd916r4UpjcJKslO5L-Wno6kjA4F3ZA4YU,110884
|
|
19
|
-
epstein_files/util/data.py,sha256=xwTqrbAi7ZDJM0iyFVOevnokP_oIQ2npkRjHzF1KGGY,2908
|
|
20
|
-
epstein_files/util/doc_cfg.py,sha256=5Pb__bP00mKi9ACv33omZQA-TBzumc7D2Td_Mk4M5DY,9822
|
|
21
|
-
epstein_files/util/env.py,sha256=PaPBi27-npU9egt9LHxr5qR65B2DPHwt7Xc9sx5VN-M,5225
|
|
22
|
-
epstein_files/util/file_helper.py,sha256=v_bE10MHEcXti9DVJo4WqyOsG83Xrv05S3Vc70cYJkk,3082
|
|
23
|
-
epstein_files/util/highlighted_group.py,sha256=dajLYuSbT69zMWf6XKUOZI6ZcgFy-Beq7Nsg9qlteck,35715
|
|
24
|
-
epstein_files/util/logging.py,sha256=4hVl1Qw1qRMSVEYKXZxrvdQuSIMBgTPskzvNMNu8268,2185
|
|
25
|
-
epstein_files/util/output.py,sha256=wLjFBGR5ffn4cLep12G3OmUR0H3WtEMXeVMOXtd-6ig,7909
|
|
26
|
-
epstein_files/util/rich.py,sha256=rdHzn4XRB2erQSf2yYyPakRmd9ixqBUdS8-BVOUAXnE,14603
|
|
27
|
-
epstein_files/util/search_result.py,sha256=1fxe0KPBQXBk4dLfu6m0QXIzYfZCzvaSkWqvghJGzxY,567
|
|
28
|
-
epstein_files/util/timer.py,sha256=8hxW4Y1JcTUfnBrHh7sL2pM9xu1sL4HFQM4CmmzTarU,837
|
|
29
|
-
epstein_files/util/word_count.py,sha256=eGzcsoAvMcutRUFOJnVuEp9_28H74to7T9jTdGUZnuI,6757
|
|
30
|
-
epstein_files-1.0.10.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
31
|
-
epstein_files-1.0.10.dist-info/METADATA,sha256=zi10sSw5g5BZDRovIeWlpMYEgLbqFxSl7QII9jUuKdw,5480
|
|
32
|
-
epstein_files-1.0.10.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
33
|
-
epstein_files-1.0.10.dist-info/entry_points.txt,sha256=5qYgwAXpxegeAicD_rzda_trDRnUC51F5UVDpcZ7j6Q,240
|
|
34
|
-
epstein_files-1.0.10.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|