hashsmith-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,629 @@
1
+ import base64
2
+ import binascii
3
+ import re
4
+ import string
5
+ from typing import List, Tuple
6
+
7
+ from ..algorithms.decoding import (
8
+ decode_base58,
9
+ decode_hex,
10
+ decode_binary,
11
+ decode_decimal,
12
+ decode_octal,
13
+ decode_base64,
14
+ decode_base32,
15
+ decode_base85,
16
+ decode_base64url,
17
+ decode_morse_code,
18
+ decode_baconian,
19
+ decode_polybius,
20
+ decode_unicode_escaped,
21
+ decode_url,
22
+ decode_rot13,
23
+ decode_atbash,
24
+ decode_caesar,
25
+ decode_leet_speak,
26
+ decode_reverse,
27
+ decode_brainfuck,
28
+ decode_rail_fence,
29
+ decode_vigenere,
30
+ decode_xor,
31
+ )
32
+ from ..algorithms.encoding import (
33
+ encode_base58,
34
+ encode_hex,
35
+ encode_binary,
36
+ encode_decimal,
37
+ encode_octal,
38
+ encode_base64,
39
+ encode_base32,
40
+ encode_base85,
41
+ encode_base64url,
42
+ encode_morse_code,
43
+ encode_baconian,
44
+ encode_polybius,
45
+ encode_unicode_escaped,
46
+ encode_url,
47
+ encode_caesar,
48
+ )
49
+ from ..algorithms.morse import REVERSE_MORSE
50
+
51
+
52
+ COMMON_BIGRAMS = (
53
+ "th",
54
+ "he",
55
+ "in",
56
+ "er",
57
+ "an",
58
+ "re",
59
+ "nd",
60
+ "on",
61
+ "en",
62
+ "at",
63
+ "ou",
64
+ "ed",
65
+ "ha",
66
+ "to",
67
+ "or",
68
+ "it",
69
+ "is",
70
+ "hi",
71
+ "es",
72
+ "ng",
73
+ "st",
74
+ "ar",
75
+ "te",
76
+ "se",
77
+ "me",
78
+ "ve",
79
+ "of",
80
+ )
81
+
82
+ COMMON_WORDS = (
83
+ "the",
84
+ "and",
85
+ "you",
86
+ "that",
87
+ "have",
88
+ "for",
89
+ "not",
90
+ "with",
91
+ "this",
92
+ "but",
93
+ "from",
94
+ "hello",
95
+ "secret",
96
+ "message",
97
+ "attack",
98
+ "dawn",
99
+ )
100
+
101
+
102
+ def _normalize_spaces(value: str) -> str:
103
+ return " ".join(value.strip().split())
104
+
105
+
106
+ def _is_hex(value: str) -> bool:
107
+ return bool(value) and all(ch in "0123456789abcdefABCDEF" for ch in value)
108
+
109
+
110
+ def _vowel_ratio(value: str) -> float:
111
+ letters = [ch.lower() for ch in value if ch.isalpha()]
112
+ if not letters:
113
+ return 0.0
114
+ vowels = sum(1 for ch in letters if ch in "aeiou")
115
+ return vowels / len(letters)
116
+
117
+
118
+ def _alpha_count(value: str) -> int:
119
+ return sum(1 for ch in value if ch.isalpha())
120
+
121
+
122
+ def _alpha_ratio(value: str) -> float:
123
+ if not value:
124
+ return 0.0
125
+ return _alpha_count(value) / len(value)
126
+
127
+
128
+ def _printable_ratio(value: bytes) -> float:
129
+ if not value:
130
+ return 0.0
131
+ printable = sum(1 for ch in value if chr(ch) in string.printable)
132
+ return printable / len(value)
133
+
134
+
135
+ def _alnum_space_ratio(value: bytes) -> float:
136
+ if not value:
137
+ return 0.0
138
+ allowed = set(string.ascii_letters + string.digits + " ")
139
+ count = sum(1 for ch in value if chr(ch) in allowed)
140
+ return count / len(value)
141
+
142
+
143
+ def _bigram_score(value: str) -> float:
144
+ text = re.sub(r"[^a-z]", "", value.lower())
145
+ if len(text) < 2:
146
+ return 0.0
147
+ count = 0
148
+ for i in range(len(text) - 1):
149
+ if text[i : i + 2] in COMMON_BIGRAMS:
150
+ count += 1
151
+ return count / max(len(text) - 1, 1)
152
+
153
+
154
def _word_hit(value: str) -> bool:
    """True when value contains one long (>=4 chars) or two common English words."""
    cleaned = re.sub(r"[^a-z ]", " ", value.lower())
    tokens = set(cleaned.split())
    found = [word for word in COMMON_WORDS if word in tokens]
    if any(len(word) >= 4 for word in found):
        return True
    return len(found) >= 2
161
+
162
+
163
def _text_score(value: str) -> float:
    """Combined English-likeness score: bigram density + word bonus + weighted vowel ratio."""
    score = _bigram_score(value)
    if _word_hit(value):
        score += 0.6
    score += _vowel_ratio(value) * 0.4
    return score
165
+
166
+
167
+ def _index_of_coincidence(value: str) -> float:
168
+ letters = [ch.lower() for ch in value if ch.isalpha()]
169
+ n = len(letters)
170
+ if n < 2:
171
+ return 0.0
172
+ counts = {}
173
+ for ch in letters:
174
+ counts[ch] = counts.get(ch, 0) + 1
175
+ numerator = sum(count * (count - 1) for count in counts.values())
176
+ return numerator / (n * (n - 1))
177
+
178
+
179
def _best_shift_score(value: str) -> Tuple[int, float, float]:
    """Try every Caesar shift 1-25; return (best_shift, original_score, best_score)."""
    baseline = _bigram_score(value)
    winning_shift, winning_score = 0, 0.0
    for shift in range(1, 26):
        candidate = _bigram_score(decode_caesar(value, shift))
        if candidate > winning_score:
            winning_shift, winning_score = shift, candidate
    return winning_shift, baseline, winning_score
190
+
191
+
192
def _best_shift_vowel_ratio(value: str) -> Tuple[int, float, float]:
    """Try every Caesar shift 1-25; return (best_shift, original_ratio, best_ratio)."""
    baseline = _vowel_ratio(value)
    winning_shift, winning_ratio = 0, 0.0
    for shift in range(1, 26):
        candidate = _vowel_ratio(decode_caesar(value, shift))
        if candidate > winning_ratio:
            winning_shift, winning_ratio = shift, candidate
    return winning_shift, baseline, winning_ratio
203
+
204
+
205
+ def _try_single_byte_xor(hex_text: str) -> Tuple[float, float]:
206
+ try:
207
+ raw = binascii.unhexlify(hex_text)
208
+ except (binascii.Error, ValueError):
209
+ return 0.0, 0.0
210
+ raw_printable = _printable_ratio(raw)
211
+ best_printable = 0.0
212
+ best_score = 0.0
213
+ for key in range(256):
214
+ decoded = bytes(b ^ key for b in raw)
215
+ printable_ratio = _printable_ratio(decoded)
216
+ if printable_ratio > best_printable:
217
+ best_printable = printable_ratio
218
+ if printable_ratio >= 0.9:
219
+ text = decoded.decode("utf-8", errors="ignore")
220
+ best_score = max(best_score, _bigram_score(text))
221
+ return raw_printable, best_score
222
+
223
+
224
def detect_encoding_types(text: str) -> List[str]:
    """Detect which encodings/ciphers were likely applied to ``text``.

    Strong matches (formats that survive an exact decode/encode round-trip,
    e.g. base64, hex, morse) take priority and are returned alone.  When no
    strong format matches, heuristic guesses (caesar, vigenere, reverse,
    leet, xor, ...) based on English-likeness scoring are returned instead.
    Recognized password-hash formats return an empty list so digests are
    never misreported as encodings.
    """
    value = text.strip()
    if not value:
        return []

    # Password-hash formats are one-way, not encodings: bail out early.
    if value.startswith(("$2a$", "$2b$", "$2y$", "$argon2", "scrypt$")):
        return []
    if value.startswith("*") and len(value) == 41:
        return []
    if value.startswith("md5") and len(value) == 35:
        return []
    if value.lower().startswith("0x0100"):
        return []

    strong_results: List[str] = []
    heuristic_results: List[str] = []

    # Binary (8-bit groups)
    normalized = _normalize_spaces(value)
    if re.fullmatch(r"[01]{8}( [01]{8})*", normalized):
        try:
            decoded = decode_binary(normalized)
            if _normalize_spaces(encode_binary(decoded)) == normalized:
                strong_results.append("binary")
        except Exception:
            pass

    # Decimal (space-separated 0-255)
    if re.fullmatch(r"\d{1,3}( \d{1,3})*", normalized):
        try:
            decoded = decode_decimal(normalized)
            if _normalize_spaces(encode_decimal(decoded)) == normalized:
                strong_results.append("decimal")
        except Exception:
            pass

    # Octal (space-separated 0-7)
    if re.fullmatch(r"[0-7]{1,3}( [0-7]{1,3})*", normalized):
        try:
            decoded = decode_octal(normalized)
            if _normalize_spaces(encode_octal(decoded)) == normalized:
                strong_results.append("octal")
        except Exception:
            pass

    # Polybius (1-5 pairs and / for spaces)
    if re.fullmatch(r"[1-5/ ]+", normalized):
        try:
            decoded = decode_polybius(normalized)
            if _normalize_spaces(encode_polybius(decoded)) == normalized:
                strong_results.append("polybius")
        except Exception:
            pass

    # Baconian (A/B tokens)
    if re.fullmatch(r"[ABab/ ]+", normalized):
        try:
            decoded = decode_baconian(normalized)
            if _normalize_spaces(encode_baconian(decoded).upper()) == normalized.upper():
                strong_results.append("baconian")
        except Exception:
            pass

    # Morse (tokens must be known)
    if re.fullmatch(r"[.\-/ ]+", normalized):
        tokens = normalized.split()
        if tokens and all(token in REVERSE_MORSE for token in tokens):
            try:
                decoded = decode_morse_code(normalized)
                if _normalize_spaces(encode_morse_code(decoded)) == normalized:
                    strong_results.append("morse")
            except Exception:
                pass

    # Unicode escaped (\uXXXX)
    if re.fullmatch(r"(?:\\u[0-9a-fA-F]{4})+", value):
        try:
            decoded = decode_unicode_escaped(value)
            if encode_unicode_escaped(decoded) == value.lower():
                strong_results.append("unicode")
        except Exception:
            pass

    # Hex (strict, UTF-8 round-trip).  All hex-derived state is initialized
    # up front so the later reads can never hit an unbound local, even if
    # the round-trip below raises after hex_bytes has been assigned.
    hex_bytes = None
    hex_printable = 0.0
    has_hex_alpha = False
    hex_alnum_space = 0.0
    if len(value) % 2 == 0 and _is_hex(value):
        try:
            decoded = decode_hex(value)
            if encode_hex(decoded).lower() == value.lower():
                strong_results.append("hex")
                hex_bytes = binascii.unhexlify(value)
                hex_printable = _printable_ratio(hex_bytes)
                has_hex_alpha = any(ch in "abcdef" for ch in value.lower())
                hex_alnum_space = _alnum_space_ratio(hex_bytes)
        except Exception:
            pass

    # Long pure-hex strings without a strong match are most likely digests.
    if _is_hex(value) and not strong_results and len(value) >= 16:
        return []

    # Base64 (strict, padded)
    try:
        decoded = decode_base64(value)
        if encode_base64(decoded) == value:
            strong_results.append("base64")
    except Exception:
        pass

    # Base64URL (unpadded, URL-safe)
    if "=" not in value and re.fullmatch(r"[A-Za-z0-9_-]+", value):
        try:
            decoded = decode_base64url(value)
            if encode_base64url(decoded) == value:
                strong_results.append("base64url")
        except Exception:
            pass

    # Base32 (strict, padded)
    try:
        decoded = decode_base32(value)
        if encode_base32(decoded) == value.upper():
            strong_results.append("base32")
    except Exception:
        pass

    # Base85 (strict round-trip)
    try:
        decoded = decode_base85(value)
        if encode_base85(decoded) == value:
            strong_results.append("base85")
    except Exception:
        pass

    # Base58 (strict round-trip)
    try:
        decoded = decode_base58(value)
        if encode_base58(decoded) == value:
            strong_results.append("base58")
    except Exception:
        pass

    # URL encoding (must include at least one valid %XX)
    if re.search(r"%[0-9A-Fa-f]{2}", value):
        try:
            decoded = decode_url(value)
            if encode_url(decoded) == value:
                strong_results.append("url")
        except Exception:
            pass

    # Brainfuck
    if re.fullmatch(r"[+\-<>\[\].,]+", value):
        try:
            decoded = decode_brainfuck(value)
            if decoded:
                strong_results.append("brainf*ck")
        except Exception:
            pass

    # Prefer polybius when present to avoid overlap with decimal/octal
    if "polybius" in strong_results:
        return ["polybius"]

    # If any strong format matched, return only those
    if strong_results:
        if "hex" in strong_results and hex_bytes is not None:
            raw_printable, xor_score = _try_single_byte_xor(value)
            raw_text_score = _bigram_score(hex_bytes.decode("utf-8", errors="ignore"))
            decoded_text = hex_bytes.decode("utf-8", errors="ignore")
            # Readable hex payload: report plain hex.
            if hex_printable >= 0.9 and (
                _word_hit(decoded_text)
                or (_alpha_ratio(decoded_text) >= 0.6 and _vowel_ratio(decoded_text) >= 0.25)
                or raw_text_score >= 0.1
            ):
                return ["hex"]
            # Unreadable raw bytes that decode well under some XOR key:
            # report single-byte XOR instead.
            if (
                (has_hex_alpha or hex_alnum_space < 0.85)
                and (
                    (raw_printable < 0.6 and xor_score - raw_text_score >= 0.05)
                    or (raw_text_score < 0.01 and xor_score - raw_text_score >= 0.1)
                )
            ):
                return ["xor"]
        return list(dict.fromkeys(strong_results))

    # ROT13 short-word check (avoid false positives by requiring word hit)
    if re.fullmatch(r"[A-Za-z ]+", value) and 4 <= len(value.strip()) < 6:
        rot13_decoded = decode_rot13(value)
        if _word_hit(rot13_decoded) and not _word_hit(value):
            return ["rot13"]

    # ROT13 / Caesar / Atbash / Reverse / Rail fence heuristics
    if re.fullmatch(r"[A-Za-z ]+", value) and len(value.strip()) >= 6:
        base_score = _text_score(value)
        base_word_hit = _word_hit(value)
        base_vowel = _vowel_ratio(value)
        base_alpha = _alpha_ratio(value)
        candidate_scores: dict[str, float] = {}
        candidate_texts: dict[str, str] = {}

        rot13_decoded = decode_rot13(value)
        candidate_scores["rot13"] = _text_score(rot13_decoded)
        candidate_texts["rot13"] = rot13_decoded

        # Track the two best Caesar shifts; a narrow margin between them
        # suggests a polyalphabetic cipher (vigenere) rather than Caesar.
        best_shift = 0
        best_caesar_score = 0.0
        second_caesar_score = 0.0
        best_caesar_text = value
        for shift in range(1, 26):
            if shift == 13:
                continue  # shift 13 is handled separately as rot13
            decoded = decode_caesar(value, shift)
            score = _text_score(decoded)
            if score > best_caesar_score:
                second_caesar_score = best_caesar_score
                best_caesar_score = score
                best_shift = shift
                best_caesar_text = decoded
            elif score > second_caesar_score:
                second_caesar_score = score
        if best_shift:
            candidate_scores["caesar"] = best_caesar_score
            candidate_texts["caesar"] = best_caesar_text

        atbash_decoded = decode_atbash(value)
        candidate_scores["atbash"] = _text_score(atbash_decoded)
        candidate_texts["atbash"] = atbash_decoded

        reverse_decoded = decode_reverse(value)
        candidate_scores["reverse"] = _text_score(reverse_decoded)
        candidate_texts["reverse"] = reverse_decoded

        best_rf_score = 0.0
        best_rf_text = ""
        for rails in range(2, 6):
            try:
                decoded = decode_rail_fence(value, rails)
            except Exception:
                continue
            score = _text_score(decoded)
            if score > best_rf_score:
                best_rf_score = score
                best_rf_text = decoded
        # Rail fence is noisy; only keep it with explicit word/bigram support.
        if best_rf_text and (_word_hit(best_rf_text) or _bigram_score(best_rf_text) >= 0.2):
            candidate_scores["railfence"] = best_rf_score
            candidate_texts["railfence"] = best_rf_text

        best_match, best_score = max(candidate_scores.items(), key=lambda item: item[1])
        ic_value = _index_of_coincidence(value)
        best_text = candidate_texts.get(best_match, "")
        any_word_hit = any(_word_hit(text) for text in candidate_texts.values())
        score_delta = best_score - base_score
        # Only guess when the input does not already read as English, or a
        # candidate beats it decisively.
        allow_heuristics = (
            (not base_word_hit and base_score < 0.18)
            or (score_delta >= 0.15 and best_score >= 0.25)
        )

        if allow_heuristics:
            if best_match == "caesar" and score_delta >= 0.05 and len(value.strip()) > 12:
                heuristic_results.append("caesar")
            elif best_match in {"caesar", "rot13", "atbash"} and not any_word_hit and len(value.strip()) <= 12:
                if not base_word_hit and base_score < 0.18:
                    heuristic_results.append("vigenere")
            elif best_match == "caesar" and (best_score - base_score) >= 0.12:
                heuristic_results.append("caesar")
            elif (
                best_match in {"caesar", "rot13", "atbash"}
                and not any_word_hit
                and score_delta < 0.05
                and len(value.strip()) >= 8
                and not base_word_hit
                and base_score < 0.18
            ):
                heuristic_results.append("vigenere")
            elif (
                best_match == "caesar"
                and not any_word_hit
                and (best_caesar_score - second_caesar_score) < 0.15
                and len(value.strip()) >= 8
                and not base_word_hit
                and base_score < 0.18
            ):
                heuristic_results.append("vigenere")
            elif (
                ic_value < 0.06
                and not any_word_hit
                and score_delta < 0.05
                and len(value.strip()) >= 8
                and not base_word_hit
                and base_score < 0.18
            ):
                heuristic_results.append("vigenere")
            elif (
                best_match in {"caesar", "rot13", "atbash"}
                and not _word_hit(best_text)
                and ic_value < 0.06
                and best_score < 0.35
                and len(value.strip()) >= 8
                and not base_word_hit
                and base_score < 0.18
            ):
                heuristic_results.append("vigenere")
            elif (
                ic_value < 0.055
                and (best_score - base_score) < 0.08
                and len(value.strip()) >= 8
                and not base_word_hit
                and base_score < 0.18
            ):
                heuristic_results.append("vigenere")
            elif best_match == "reverse":
                reverse_bigram = _bigram_score(best_text)
                if (
                    _word_hit(best_text)
                    or (
                        reverse_bigram >= 0.25
                        and _vowel_ratio(best_text) >= 0.3
                        and base_score <= 0.1
                        and (best_score - base_score) >= 0.12
                    )
                ):
                    heuristic_results.append("reverse")
            elif best_score >= max(0.2, base_score + 0.08):
                # Suppress the guess when the input already looks English.
                if not (base_alpha >= 0.85 and base_vowel >= 0.28 and base_score >= 0.18):
                    heuristic_results.append(best_match)

    # Leet (heuristic, require mixed letters+digits)
    if not _is_hex(value) and any(ch.isalpha() for ch in value) and any(ch in "013457" for ch in value):
        decoded = decode_leet_speak(value)
        original_score = _bigram_score(value)
        decoded_score = _bigram_score(decoded)
        if _word_hit(decoded) or (decoded_score >= 0.12 and original_score <= 0.05):
            heuristic_results.append("leet")

    # XOR (single-byte key heuristic on hex)
    if len(value) % 2 == 0 and _is_hex(value):
        if hex_bytes is not None:
            raw_printable, xor_score = _try_single_byte_xor(value)
            raw_text_score = _bigram_score(hex_bytes.decode("utf-8", errors="ignore"))
            if (
                (has_hex_alpha or hex_alnum_space < 0.85)
                and (
                    (raw_printable < 0.6 and xor_score - raw_text_score >= 0.05)
                    or (raw_text_score < 0.01 and xor_score - raw_text_score >= 0.1)
                )
            ):
                heuristic_results.append("xor")

    return list(dict.fromkeys(heuristic_results))
576
+
577
+
578
+ def _weights_for_hex_length(length: int) -> List[Tuple[str, float]]:
579
+ return {
580
+ 16: [("mysql323", 1.0)],
581
+ 32: [("md5", 0.7), ("ntlm", 0.2), ("md4", 0.1)],
582
+ 40: [("sha1", 0.85), ("mssql2000", 0.15)],
583
+ 56: [("sha224", 0.8), ("sha3_224", 0.2)],
584
+ 64: [("sha256", 0.7), ("sha3_256", 0.2), ("blake2s", 0.1)],
585
+ 96: [("sha384", 1.0)],
586
+ 128: [("sha512", 0.7), ("sha3_512", 0.2), ("blake2b", 0.1)],
587
+ }.get(length, [])
588
+
589
+
590
+ def _normalize_percentages(items: List[Tuple[str, float]]) -> List[Tuple[str, int]]:
591
+ if not items:
592
+ return []
593
+ total = sum(weight for _, weight in items) or 1.0
594
+ raw = [(name, weight / total * 100.0) for name, weight in items]
595
+ rounded = [(name, int(round(pct))) for name, pct in raw]
596
+ diff = 100 - sum(pct for _, pct in rounded)
597
+ if diff != 0:
598
+ name, pct = rounded[0]
599
+ rounded[0] = (name, pct + diff)
600
+ return rounded
601
+
602
+
603
def detect_hash_probabilities(value: str, top: int = 3) -> List[Tuple[str, int]]:
    """Guess likely hash algorithms for value as (name, percent) pairs summing to 100.

    Prefix-identifiable schemes (bcrypt, argon2, ...) are returned with 100%
    confidence; bare hex digests are ranked by length-based priors. Returns
    an empty list when the input matches no known hash shape.
    """
    text = value.strip()
    if not text:
        return []

    # Schemes with an unambiguous prefix or fixed shape.
    if text.startswith(("$2a$", "$2b$", "$2y$")):
        return [("bcrypt", 100)]
    if text.startswith("$argon2"):
        return [("argon2", 100)]
    if text.startswith("scrypt$"):
        return [("scrypt", 100)]
    if text.lower().startswith("0x0100"):
        return _normalize_percentages([("mssql2005", 0.5), ("mssql2012", 0.5)])
    if text.startswith("md5") and len(text) == 35:
        return [("postgres", 100)]
    if text.startswith("*") and len(text) == 41:
        return [("mysql41", 100)]

    # Anything else must be a bare hex digest to be classified.
    if not _is_hex(text):
        return []

    candidates = _weights_for_hex_length(len(text))
    if not candidates:
        return []

    ranked = sorted(candidates, key=lambda item: item[1], reverse=True)[:top]
    return _normalize_percentages(ranked)
@@ -0,0 +1,30 @@
1
+ from pathlib import Path
2
+ from typing import Optional
3
+
4
+
5
def read_text_from_file(path: str) -> str:
    """Read a UTF-8 text file and return its contents.

    Raises:
        ValueError: if the path does not exist, is a directory, or cannot
            be read due to permissions.
    """
    file_path = Path(path).expanduser().resolve()
    if not file_path.exists():
        raise ValueError(f"File not found: {path}")
    if file_path.is_dir():
        raise ValueError(f"Expected a file but got a directory: {path}")
    try:
        return file_path.read_text(encoding="utf-8")
    except PermissionError as exc:
        # Chain the original error so the OS-level cause stays visible.
        raise ValueError(f"Permission denied for file: {path}") from exc
    except IsADirectoryError as exc:
        raise ValueError(f"Expected a file but got a directory: {path}") from exc
17
+
18
+
19
def write_text_to_file(path: str, content: str) -> None:
    """Write content to path as UTF-8, creating any missing parent directories."""
    target = Path(path).expanduser().resolve()
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(content, encoding="utf-8")
23
+
24
+
25
def resolve_input(text: Optional[str], file_path: Optional[str]) -> str:
    """Return the inline text when provided (non-empty), else read file_path.

    Raises:
        ValueError: when neither a text value nor a file path is given.
    """
    if text:
        return text
    if not file_path:
        raise ValueError("Provide --text or --file")
    return read_text_from_file(file_path)
@@ -0,0 +1,20 @@
1
+ import time
2
+ from dataclasses import dataclass
3
+
4
+
5
@dataclass
class RateCounter:
    """Tracks a running total and reports its rate of change between calls."""

    # perf_counter timestamp at the previous rate() call (0.0 = not started).
    last_tick: float = 0.0
    # Total observed at the previous rate() call.
    last_count: int = 0

    def __post_init__(self) -> None:
        # Anchor a fresh counter to the current clock so the first rate()
        # call measures from construction time.
        if self.last_tick == 0.0:
            self.last_tick = time.perf_counter()

    def rate(self, total_count: int) -> float:
        """Return units/second since the previous call, then reset the baseline."""
        current = time.perf_counter()
        # Clamp the elapsed time to avoid division by zero on back-to-back calls.
        elapsed = max(current - self.last_tick, 1e-9)
        gained = total_count - self.last_count
        self.last_tick = current
        self.last_count = total_count
        return gained / elapsed
@@ -0,0 +1,11 @@
1
+ from pathlib import Path
2
+ from typing import Iterable
3
+
4
+
5
def iter_wordlist(path: str) -> Iterable[str]:
    """Lazily yield the non-empty, whitespace-stripped lines of a wordlist file."""
    source = Path(path).expanduser().resolve()
    # Undecodable bytes are ignored so a stray binary line cannot abort the scan.
    with source.open("r", encoding="utf-8", errors="ignore") as stream:
        for raw_line in stream:
            stripped = raw_line.strip()
            if stripped:
                yield stripped