epstein-files 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,324 @@
1
+ # Rich reference: https://rich.readthedocs.io/en/latest/reference.html
2
+ import json
3
+ from os import devnull
4
+ from pathlib import Path
5
+ from typing import Literal
6
+
7
+ from rich.align import Align
8
+ from rich.console import Console, RenderableType
9
+ from rich.markup import escape
10
+ from rich.panel import Panel
11
+ from rich.padding import Padding
12
+ from rich.table import Table
13
+ from rich.text import Text
14
+ from rich.theme import Theme
15
+
16
+ from epstein_files.util.constant.html import CONSOLE_HTML_FORMAT, HTML_TERMINAL_THEME, PAGE_TITLE
17
+ from epstein_files.util.constant.names import UNKNOWN
18
+ from epstein_files.util.constant.strings import DEFAULT, EMAIL, NA, OTHER_SITE_LINK_STYLE, QUESTION_MARKS, SiteType
19
+ from epstein_files.util.constant.urls import *
20
+ from epstein_files.util.constants import FALLBACK_TIMESTAMP, HEADER_ABBREVIATIONS
21
+ from epstein_files.util.env import args, logger
22
+ from epstein_files.util.file_helper import file_size_str
23
+ from epstein_files.util.highlighted_group import HIGHLIGHTED_GROUPS, InterestingNamesHighlighter
24
+
25
# Layout settings and reusable placeholder Text objects.
TITLE_WIDTH = 50
NUM_COLOR_KEY_COLS = 4
NA_TXT = Text(NA, style='dim')
QUESTION_MARK_TXT = Text(QUESTION_MARKS, style='dim')
# Numeric suffixes appended to "grey" to build a style name in print_numbered_list_of_emailers().
GREY_NUMBERS = [58, 39, 39, 35, 30, 27, 23, 23, 19, 19, 15, 15, 15]

# Style strings shared by the printing helpers in this module.
DEFAULT_NAME_STYLE = 'gray46'
KEY_STYLE = 'honeydew2 bold'
SECTION_HEADER_STYLE = 'bold white on blue3'
SOCIAL_MEDIA_LINK_STYLE = 'cyan3 bold'
SUBSTACK_POST_LINK_STYLE = 'bright_cyan'
SYMBOL_STYLE = 'grey70'
TITLE_STYLE = 'black on bright_white bold'

# One styled label per non-multiline highlighted group, sorted by label (used by print_color_key()).
HIGHLIGHTED_GROUP_COLOR_KEYS = [
    Text(group.label.replace('_', ' '), style=group.style)
    for group in sorted(HIGHLIGHTED_GROUPS, key=lambda hg: hg.label)
    if not group.is_multiline
]

# Style names for the console Theme: two fixed entries followed by one entry per HighlightedGroup.
THEME_STYLES = {
    DEFAULT: 'wheat4',
    TEXT_LINK: 'deep_sky_blue4 underline',
}
THEME_STYLES.update({hg.theme_style_name: hg.style for hg in HIGHLIGHTED_GROUPS})
50
+
51
# Keyword arguments for the single Console instance shared by this module.
CONSOLE_ARGS = dict(
    color_system='256',
    highlighter=InterestingNamesHighlighter(),
    record=args.build,
    safe_box=False,
    theme=Theme(THEME_STYLES),
    width=args.width,
)

if args.suppress_output:
    logger.warning(f"Suppressing terminal output because args.suppress_output={args.suppress_output}...")
    # Send console output to the null device. Nothing in this section closes the handle.
    CONSOLE_ARGS['file'] = open(devnull, "wt")

console = Console(**CONSOLE_ARGS)
# Expose the same highlighter instance the console uses so callers can apply it directly.
highlighter = CONSOLE_ARGS['highlighter']
67
+
68
+
69
def add_cols_to_table(table: Table, col_names: list[str]) -> None:
    """Add one column per name: the first is left justified, all others are centered."""
    for position, name in enumerate(col_names):
        justification = 'center' if position else 'left'
        table.add_column(name, justify=justification)
73
+
74
+
75
def join_texts(txts: list[Text], join: str = ' ', encloser: str = '') -> Text:
    """Concatenate rich Text objects into a single Text.

    'join' goes between consecutive elements. 'encloser', when given, must be exactly two
    characters: the first is put before every element and the second after it.

    Raises ValueError if 'encloser' is non-empty and not 2 characters long.
    """
    if encloser and len(encloser) != 2:
        raise ValueError(f"'encloser' arg is '{encloser}' which is not 2 characters long")

    opener, closer = (encloser[0], encloser[1]) if encloser else ('', '')
    joined = Text('')

    for position, piece in enumerate(txts):
        joined.append(join if position else '')  # No separator before the first element
        joined.append(opener)
        joined.append(piece)
        joined.append(closer)

    return joined
91
+
92
+
93
def key_value_txt(key: str, value: Text | str) -> Text:
    """Build a Text reading 'key=value', styling the key with KEY_STYLE and '=' with SYMBOL_STYLE."""
    pair = Text('')
    pair.append(key, style=KEY_STYLE)
    pair.append('=', style=SYMBOL_STYLE)
    pair.append(value)
    return pair
96
+
97
+
98
def parenthesize(msg: str | Text, style: str = '') -> Text:
    """Return 'msg' wrapped in parentheses; 'style' becomes the style of the returned Text."""
    inner = msg if not isinstance(msg, str) else Text(msg)
    wrapped = Text('(', style=style)
    wrapped.append(inner)
    wrapped.append(')')
    return wrapped
101
+
102
+
103
def print_author_header(msg: str, color: str | None, footer: str | None = None) -> None:
    """Print 'msg' in a centered 80 column panel on a 'color' background, then an optional footer.

    A falsy 'color' and the DEFAULT style name are both replaced by 'white'. The footer, if
    given, is printed centered in parentheses using 'color' in italics.
    """
    if not color or color == DEFAULT:
        color = 'white'

    header_panel = Panel(Text(msg, justify='center'), width=80, style=f"black on {color} bold")
    console.print('\n', Align.center(header_panel))

    if footer:
        footer_line = Align.center(f"({footer})")
        console.print(footer_line, highlight=False, style=f'{color} italic')

    console.line()
114
+
115
+
116
def print_centered(obj: RenderableType, style: str = '') -> None:
    """Print 'obj' horizontally centered on the shared console with the given style."""
    centered = Align.center(obj)
    console.print(centered, style=style)
118
+
119
+
120
def print_centered_link(url: str, link_text: str, style: str | None = None) -> None:
    """Print centered link markup for 'url'; a falsy 'style' falls back to ARCHIVE_LINK_COLOR."""
    link_style = style or ARCHIVE_LINK_COLOR
    markup = link_markup(url, link_text, link_style)
    print_centered(markup)
122
+
123
+
124
def print_color_key(_key_type: Literal["Groups", "People"] = "Groups") -> None:
    """Print a centered, headerless table of the highlighted group labels in their own styles.

    '_key_type' is accepted but not read by this function.
    """
    color_table = Table(title='Rough Guide to Highlighted Colors', show_header=False)

    for col_idx in range(NUM_COLOR_KEY_COLS):
        color_table.add_column(f"color_col_{col_idx}", justify='center')

    # Each row takes the next NUM_COLOR_KEY_COLS labels; the last row may be shorter.
    for start in range(0, len(HIGHLIGHTED_GROUP_COLOR_KEYS), NUM_COLOR_KEY_COLS):
        row_labels = HIGHLIGHTED_GROUP_COLOR_KEYS[start:start + NUM_COLOR_KEY_COLS]
        color_table.add_row(*row_labels)

    print_centered(vertically_pad(color_table))
138
+
139
+
140
def print_header(epstein_files: 'EpsteinFiles') -> None:
    """Print the page header: title, site links, abbreviations, files summary, color key and notes."""
    mobile_note = "This site isn't optimized for mobile but if you get past the header it should be readable."
    console.print(mobile_note, style='dim')
    print_page_title(width=TITLE_WIDTH)
    print_other_site_link()
    _print_external_links()
    console.line()
    _print_abbreviations_table()
    epstein_files.print_files_summary()
    print_color_key()

    # Closing notes about attribution and data quality
    contact_note = f"if you think there's an attribution error or can deanonymize an {UNKNOWN} contact {CRYPTADAMUS_TWITTER}"
    print_centered(contact_note, 'grey46')
    print_centered('note this site is based on the OCR text provided by Congress which is not always the greatest', 'grey23')
    helper_link = link_markup('https://x.com/ImDrinknWyn', '@ImDrinknWyn', 'dodger_blue3')
    print_centered(f"(thanks to {helper_link} + others for help attributing redacted emails)")
    print_centered_link(ATTRIBUTIONS_URL, "(some explanations of author attributions)", style='magenta')
153
+
154
+
155
def print_json(label: str, obj: object, skip_falsey: bool = False) -> None:
    """Print 'obj' as key-sorted, indented JSON beneath a panel showing 'label'.

    Dicts get two extra steps first: entries with falsey values are dropped if 'skip_falsey'
    is set, and if None is one of the keys then every falsey key is replaced by UNKNOWN.
    """
    payload = obj

    if isinstance(payload, dict):
        if skip_falsey:
            payload = {key: value for key, value in payload.items() if value}

        if None in payload:
            payload = {(key if key else UNKNOWN): value for key, value in payload.items()}

    console.line()
    console.print(Panel(label, expand=False))
    serialized = json.dumps(payload, sort_keys=True)
    console.print_json(serialized, indent=4)
    console.line()
168
+
169
+
170
def print_numbered_list_of_emailers(_list: list[str | None], epstein_files = None) -> None:
    """Print a numbered list of emailer names; a falsy name is shown as UNKNOWN.

    If 'epstein_files' is provided each entry also shows the date of the emailer's earliest
    email (FALLBACK_TIMESTAMP when there is none), the number of emails and the length of
    the conversation in days.
    """
    prev_year = 1990
    prev_year_month = prev_year * 12
    grey_idx = 0
    console.line()

    for position, name in enumerate(_list):
        # NOTE(review): as rendered here all three indent literals are a single space. They
        # presumably differed in width to right-align the numbers - confirm against the package.
        if position < 9:
            indent = ' '
        elif position < 99:
            indent = ' '
        else:
            indent = ' '

        txt = Text(f"{indent} {position + 1}. ", style=DEFAULT_NAME_STYLE)

        if epstein_files:
            first_email_at = epstein_files.earliest_email_at(name) or FALLBACK_TIMESTAMP
            first_date = first_email_at.date()
            year_month = (first_date.year * 12) + first_date.month

            # Color year rollovers more brightly
            if first_date.year != prev_year:
                grey_idx = 0
            elif year_month != prev_year_month:
                # Computed from the previous entry's year/month value; always in 1..12.
                grey_idx = ((prev_year_month - 1) % 12) + 1

            prev_year_month = year_month
            prev_year = first_date.year
            date_style = f"grey{GREY_NUMBERS[grey_idx]}"
            txt.append(escape(f"[{first_date}] "), style=date_style)

        txt.append(highlighter(name or UNKNOWN))

        if epstein_files:
            conversation_days = epstein_files.email_conversation_length_in_days(name)
            email_count = len(epstein_files.emails_for(name))
            txt.append(f" ({email_count} emails over {conversation_days:,} days)", style='dim italic')

        console.print(txt)

    console.line()
205
+
206
+
207
def print_other_site_link(is_header: bool = True) -> None:
    """Print links to the sibling site (emails vs. text messages) and to the word count site.

    The current site type is EMAIL when args.all_emails is set, otherwise TEXT_MESSAGE. When
    'is_header' is True a starred banner naming the current site is printed first.
    """
    if args.all_emails:
        site_type: SiteType = EMAIL
        other_site_type: SiteType = TEXT_MESSAGE
    else:
        site_type = TEXT_MESSAGE
        # Mirrors the original comparison in case TEXT_MESSAGE and EMAIL ever compare equal.
        other_site_type = TEXT_MESSAGE if site_type == EMAIL else EMAIL

    if is_header:
        print_starred_header(f"This is the Epstein {site_type.title()}s site", num_spaces=4, num_stars=14)

    all_of = ' all of' if other_site_type == EMAIL else ''
    other_site_msg = f"another site for{all_of} Epstein's {other_site_type}s also generated by this code"
    markup_msg = link_markup(SITE_URLS[other_site_type], other_site_msg, OTHER_SITE_LINK_STYLE)
    print_centered(parenthesize(Text.from_markup(markup_msg)), style='bold')

    word_count_link = link_text_obj(
        WORD_COUNT_URL,
        'site showing the most frequently used words in these communiques',
        OTHER_SITE_LINK_STYLE,
    )
    print_centered(parenthesize(word_count_link))
221
+
222
+
223
def print_page_title(expand: bool = True, width: int | None = None) -> None:
    """Print PAGE_TITLE in a centered, vertically padded panel followed by the social media links."""
    title_text = Text(PAGE_TITLE, justify='center')
    title_panel = Panel(title_text, expand=expand, style=TITLE_STYLE, width=width)
    padded_title = vertically_pad(title_panel)
    console.print(Align.center(padded_title))
    print_social_media_links()
    console.line(2)
228
+
229
+
230
def print_panel(msg: str, style: str = 'black on white', padding: tuple | None = None, centered: bool = False) -> None:
    """Print 'msg' in a 70 column panel with one extra line of padding underneath.

    Args:
        msg: Panel contents, parsed as rich markup and center justified.
        style: Style applied to the panel.
        padding: Optional padding in any of the forms rich accepts (1, 2 or 4 elements).
            A falsy value means no padding besides the extra bottom line.
        centered: If True the padded panel is horizontally centered.

    Raises:
        ValueError: if 'padding' doesn't have 1, 2 or 4 elements (raised by Padding.unpack()).
    """
    # Normalize to (top, right, bottom, left) first. Indexing the raw value with [2] raised
    # an IndexError for the 1 and 2 element paddings that rich otherwise supports.
    top, right, bottom, left = Padding.unpack(padding or (0, 0, 0, 0))
    panel = Panel(Text.from_markup(msg, justify='center'), width=70, style=style)
    padded_panel = Padding(panel, (top, right, bottom + 1, left))  # +1 is the bottom pad

    if centered:
        console.print(Align.center(padded_panel))
    else:
        console.print(padded_panel)
240
+
241
+
242
def print_section_header(msg: str, style: str = SECTION_HEADER_STYLE, is_centered: bool = False) -> None:
    """Print 'msg' in an expanded panel with 3 blank lines above and 1 below, optionally centered."""
    header = Panel(Text(msg, justify='center'), expand=True, padding=(1, 1), style=style)

    if is_centered:
        header = Align.center(header)

    console.print(Padding(header, (3, 0, 1, 0)))
246
+
247
+
248
def print_social_media_links() -> None:
    """Print the centered substack post link (title and bare URL) plus a row of social media links."""
    post_title = "I Made Epstein's Text Messages Great Again (And You Should Read Them)"
    print_centered_link(SUBSTACK_URL, post_title, style=f'{SUBSTACK_POST_LINK_STYLE} bold')
    bare_url = SUBSTACK_URL.removeprefix('https://')
    print_centered_link(SUBSTACK_URL, bare_url, style=f'{SUBSTACK_POST_LINK_STYLE} dim')

    accounts = [
        ('https://x.com/Cryptadamist/status/1990866804630036988', '@cryptadamist'),
        ('https://cryptadamus.substack.com/', 'substack'),
        ('https://universeodon.com/@cryptadamist/115572634993386057', 'mastodon'),
    ]

    social_links = [link_text_obj(url, label, style=SOCIAL_MEDIA_LINK_STYLE) for url, label in accounts]
    # Each link is shown in square brackets
    print_centered(join_texts(social_links, join=' ', encloser='[]'))
259
+
260
+
261
def print_starred_header(msg: str, num_stars: int = 7, num_spaces: int = 2, style: str = TITLE_STYLE) -> None:
    """Print 'msg' centered with 'num_stars' asterisks and 'num_spaces' spaces on either side."""
    stars = num_stars * '*'
    spaces = num_spaces * ' '
    banner = spaces + stars + ' ' + msg + ' ' + stars + spaces
    print_centered(wrap_in_markup_style(banner, style))
266
+
267
+
268
def vertically_pad(obj: RenderableType, amount: int = 1) -> Padding:
    """Wrap 'obj' in a Padding with 'amount' lines above and below and none on the sides."""
    top = bottom = amount
    return Padding(obj, (top, 0, bottom, 0))
270
+
271
+
272
+ def wrap_in_markup_style(msg: str, style: str | None = None) -> str:
273
+ if style is None or len(style.strip()) == 0:
274
+ return msg
275
+
276
+ modifier = ''
277
+
278
+ for style_word in style.split():
279
+ if style_word == 'on':
280
+ modifier = style_word
281
+ continue
282
+
283
+ style = f"{modifier} {style_word}".strip()
284
+ msg = f"[{style}]{msg}[/{style}]"
285
+ modifier = ''
286
+
287
+ return msg
288
+
289
+
290
def write_html(output_path: Path) -> None:
    """Save the console's recorded output as HTML to 'output_path'; does nothing but log unless args.build."""
    if args.build:
        console.save_html(output_path, code_format=CONSOLE_HTML_FORMAT, theme=HTML_TERMINAL_THEME)
        logger.warning(f"Wrote {file_size_str(output_path)} to '{output_path}'")
    else:
        logger.warning(f"Not writing HTML because args.build={args.build}.")
297
+
298
+
299
def _print_abbreviations_table() -> None:
    """Print a centered, headerless two column table built from HEADER_ABBREVIATIONS."""
    abbreviations = Table(
        title="Abbreviations Used Frequently In These Conversations",
        header_style="bold",
        show_header=False,
    )
    abbreviations.add_column("Abbreviation", justify="center", style='bold')
    abbreviations.add_column("Translation", style="white", justify="center")

    for abbreviation, translation in HEADER_ABBREVIATIONS.items():
        # Only the abbreviation is run through the highlighter
        abbreviations.add_row(highlighter(abbreviation), translation)

    console.print(Align.center(vertically_pad(abbreviations)))
308
+
309
+
310
def _print_external_links() -> None:
    """Print the 'External Links' banner followed by centered links to outside resources."""
    console.line()
    print_starred_header('External Links', num_stars=0, num_spaces=20, style="italic")

    # First row: the press release link followed by the parenthesized raw files link
    presser_link = link_text_obj(OVERSIGHT_REPUBLICANS_PRESSER_URL, 'Official Oversight Committee Press Release')
    raw_files_link = link_text_obj(RAW_OVERSIGHT_DOCS_GOOGLE_DRIVE_URL, 'raw files', style=f"{ARCHIVE_LINK_COLOR} dim")
    raw_docs_link = join_texts([raw_files_link], encloser='()')
    print_centered(join_texts([presser_link, raw_docs_link]))

    # Remaining rows: link markup followed by a short parenthetical description
    described_links = [
        ((JMAIL_URL, JMAIL), " (read His Emails via Gmail interface)"),
        ((COFFEEZILLA_ARCHIVE_URL, 'Archive Of Epstein Materials'), " (Coffeezilla)"),
        ((COURIER_NEWSROOM_ARCHIVE_URL, 'Searchable Archive'), " (Courier Newsroom)"),
        ((EPSTEINIFY_URL,), " (raw document images)"),
        ((EPSTEIN_WEB_URL,), " (character summaries)"),
    ]

    for link_args, description in described_links:
        print_centered(link_markup(*link_args) + description)
321
+
322
+
323
# Dump the theme's style names at import time when deep debugging is enabled.
if args.deep_debug:
    print_json('THEME_STYLES', THEME_STYLES)
@@ -0,0 +1,15 @@
1
+ from dataclasses import dataclass
2
+
3
+ from rich.text import Text
4
+
5
+ from epstein_files.documents.document import Document
6
+
7
+
8
@dataclass
class SearchResult:
    """Simple class used for collecting documents that match a given search term."""
    document: Document
    lines: list[Text]  # The lines that match the search

    def unprefixed_lines(self) -> list[str]:
        """Return the plain text of each matching line with everything up to the first ':' removed.

        A line that contains no ':' has no prefix to strip and is returned whole (indexing the
        result of split() raised an IndexError for such lines).
        """
        unprefixed = []

        for line in self.lines:
            _prefix, separator, remainder = line.plain.partition(':')
            unprefixed.append(remainder if separator else line.plain)

        return unprefixed
@@ -0,0 +1,191 @@
1
+ import re
2
+ from collections import defaultdict
3
+ from dataclasses import dataclass, field
4
+
5
+ from inflection import singularize
6
+ from rich.columns import Columns
7
+ from rich.console import Console, ConsoleOptions, RenderResult
8
+ from rich.padding import Padding
9
+ from rich.text import Text
10
+
11
+ from epstein_files.documents.emails.email_header import EmailHeader
12
+ from epstein_files.util.constant.common_words import COMMON_WORDS, UNSINGULARIZABLE_WORDS
13
+ from epstein_files.util.constant.names import OTHER_NAMES
14
+ from epstein_files.util.data import ALL_NAMES, flatten, sort_dict
15
+ from epstein_files.util.env import args, logger
16
+ from epstein_files.util.rich import highlighter
17
+ from epstein_files.util.search_result import SearchResult
18
+
19
# Lowercased individual words of every name in ALL_NAMES, plus OTHER_NAMES as-is.
FIRST_AND_LAST_NAMES = [n.lower() for n in flatten([n.split() for n in ALL_NAMES])] + OTHER_NAMES

# Words that must never be passed to singularize(): the configured list plus names ending in 's'.
NON_SINGULARIZABLE = UNSINGULARIZABLE_WORDS + [n for n in FIRST_AND_LAST_NAMES if n.endswith('s')]
# Words matching this are rejected by WordCount._is_invalid_word()
SKIP_WORDS_REGEX = re.compile(r"^(asmallworld@|enwiki|http|imagepng|nymagcomnymetro|addresswww|mailto|www|/font|colordu|classdms|targetdblank|nymagcom|palmbeachdailynews)|jee[vy]acation|fontfamily|(gif|html?|jpe?g|utm)$")
# Characters stripped out of words before counting (unless the word is in BAD_CHARS_OK)
BAD_CHARS_REGEX = re.compile(r"[-–=+()$€£©°«—^&%!#_`,.;:'‘’\"„“”?\d\\]")
# Words ending in 'ious' / 'ous' are not singularized
NO_SINGULARIZE_REGEX = re.compile(r".*[io]us$")
PADDING = (0, 0, 2, 2)
MIN_COUNT_CUTOFF = 3  # Minimum count for a word to be rendered
MAX_WORD_LEN = 45     # Words this long or longer are rejected

# Words that are never counted
BAD_WORDS = [
    'charsetutf',
    'classdhoenzbfont',
    'classdmsonormaluucauup',
    'contenttransferencoding',
    'dbfcefacdbfla',
    'ehomep',
    'facedarial',
    'fortunehtmlsmidnytnowsharesmprodnytnow',
    'inthe',
    'quotedprintable',
    'researchdisclosureinquiries@jpmorgancom',
    'summarypricesquotesstatistic',
    'emichotpmiamiheraldcom',
]

# Words that are exempt from BAD_CHARS_REGEX stripping
BAD_CHARS_OK = [
    "he'll",
    'MLPF&S'.lower(),
    'reis-dennis',
]

# inflection.singularize() messes these up
SINGULARIZATIONS = {
    'abuses': 'abuse',
    'approves': 'approve',
    'arrives': 'arrive',
    'awards/awards': 'award',
    'bann': 'bannon',
    'believes': 'believe',
    'busses': 'bus',
    'colletcions': 'collection',
    'deserves': 'deserve',
    'dies': 'die',
    'dives': 'dive',
    'drives': 'drive',
    'enterpris': 'enterprise',
    'focuses': 'focus',
    'foes': 'foe',
    'girsl': 'girl',
    'gives': 'give',
    'involves': 'involve',
    'jackies': 'jackie',
    'leaves': 'leave',
    'lies': 'lie',
    'lives': 'live',
    'loves': 'love',
    'missives': 'missive',
    'police': 'police',
    'proves': 'prove',
    'receives': 'receive',
    'reserves': 'reserve',
    'selfies': 'selfie',
    'serves': 'serve',
    'shes': 'she',
    'sholes': 'scholes',
    'slaves': 'slave',
    'thnks': 'thank',
    'ties': 'tie',
    'thieves': 'thief',
    'toes': 'toe',
    #'trying': 'try',
    'viruses': 'virus',
    'waves': 'wave',
    'woes': 'woe',
    # spelling
    'prostituion': 'prostitution',
    'visoki': 'visoski',
    # eh...
    'twittercom': 'twitter',
}

# Words matching this anywhere are skipped as HTML / URL / email header residue
HTML_REGEX = re.compile(r"com/|cae-v2w=|content-(transfe|type)|font(/|-(family|size))|http|\.html?\??|margin-bottom|padding-left|quoted-printable|region=|text-decoration|ttps|www|\.(gif|jpe?g|png);?$")
HYPHENATED_WORD_REGEX = re.compile(r"[a-z]+-[a-z]+", re.IGNORECASE)
# Symbol containing words that are counted without any further cleanup
OK_SYMBOL_WORDS = ['mar-a-lago', 'p/e', 's&p', ':)', ':).', ';)', ':-)', ';-)']
SYMBOL_WORD_REGEX = re.compile(r"^[-—–@%/?.,&=]+$")
ONLY_SYMBOLS_REGEX = re.compile(r"^[^a-zA-Z0-9]+$")
# Words containing one of these are split on it and each piece is counted separately
SPLIT_WORDS_BY = ['@', '/']
FLAGGED_WORDS = []  # For debugging, log extra info when one of these is encountered
109
+
110
+
111
@dataclass
class WordCount:
    """Tally of cleaned up words that can be rendered by rich as columns of 'word: count'."""
    count: dict[str, int] = field(default_factory=lambda: defaultdict(int))
    # Number of times each raw (pre-cleanup) word was passed through singularize()
    singularized: dict[str, int] = field(default_factory=lambda: defaultdict(int))

    def count_word(self, word: str, document_line: SearchResult) -> None:
        """Clean up 'word' and add it to the tally unless it is filtered out along the way.

        Steps, in order: skip HTML-ish and symbol-only words, count OK_SYMBOL_WORDS as they
        are, strip BAD_CHARS_REGEX characters, drop invalid words, split on SPLIT_WORDS_BY
        (counting each piece recursively), singularize, then count if still valid.
        'document_line' is only used for logging FLAGGED_WORDS.
        """
        word = EmailHeader.cleanup_str(word).lower().strip()
        raw_word = word

        if HTML_REGEX.search(word):
            logger.info(f" Skipping HTML word '{word}'")
            return

        if SYMBOL_WORD_REGEX.match(word):
            logger.debug(f" Skipping symbol word '{word}'")
            return

        if word in OK_SYMBOL_WORDS:
            # ':).' is tallied together with ':)'
            self.count[':)' if word == ':).' else word] += 1
            return

        if HYPHENATED_WORD_REGEX.search(word):
            logger.info(f" Word with hyphen: '{word}'")

        if ONLY_SYMBOLS_REGEX.match(word):
            logger.info(f" ONLY_SYMBOLS_REGEX match: '{word}'")
            return

        if word not in BAD_CHARS_OK:
            word = BAD_CHARS_REGEX.sub('', word).strip()

        if self._is_invalid_word(word):
            return

        if SYMBOL_WORD_REGEX.match(word):
            logger.debug(f" Skipping symbol word '{word}'")
            return

        # Only the first SPLIT_WORDS_BY symbol found in the word is used to split it; the
        # recursive calls deal with any other symbols in the pieces.
        splitter = next((symbol for symbol in SPLIT_WORDS_BY if symbol in word), None)

        if splitter is not None:
            for piece in word.split(splitter):
                self.count_word(piece, document_line)

            logger.info(f" Split word with '{splitter}' in it '{word}'...")
            return

        if word in SINGULARIZATIONS:
            word = SINGULARIZATIONS[word]
        elif not (word in NON_SINGULARIZABLE or NO_SINGULARIZE_REGEX.match(word) or len(word) <= 2):
            word = singularize(word)
            self.singularized[raw_word] += 1

            # Log the raw_word if we've seen it more than once (but only once)
            if raw_word.endswith('s') and self.singularized[raw_word] == 2:
                logger.info(f" Singularized '{raw_word}' to '{word}'...")

        if not self._is_invalid_word(word):
            self.count[word] += 1

            if word in FLAGGED_WORDS:
                logger.warning(f"{document_line.document.filename}: Found '{word}' in '{document_line.lines[0]}'")

    def _is_invalid_word(self, w: str) -> bool:
        """True if 'w' matches SKIP_WORDS_REGEX, is too short or long, or is a common / bad word."""
        if SKIP_WORDS_REGEX.search(w):
            return True

        if len(w) <= 1 or len(w) >= MAX_WORD_LEN:
            return True

        return w in COMMON_WORDS or w in BAD_WORDS

    def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
        """Yield padded columns of 'word: count' for words seen at least MIN_COUNT_CUTOFF times, then a summary."""
        word_txts = []

        for word, count in sort_dict(self.count):
            if count < MIN_COUNT_CUTOFF:
                continue

            entry = Text('')
            entry.append(f"{word}", style=_word_style(word))
            entry.append(': ')
            entry.append(f"{count:,}")
            word_txts.append(highlighter(entry))

        cols = Columns(word_txts, column_first=False, equal=False, expand=True)
        yield Padding(cols, PADDING)
        yield f"Showing {len(word_txts):,} words appearing at least {MIN_COUNT_CUTOFF} times (out of {len(self.count):,} words)."
187
+
188
+
189
def _word_style(word: str | None) -> str:
    """Return 'bright_white' if 'word' is in FIRST_AND_LAST_NAMES, else 'grey53' (None counts as '')."""
    if (word or '') in FIRST_AND_LAST_NAMES:
        return 'bright_white'

    return 'grey53'