optalph 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optalph/__init__.py +43 -0
- optalph/__main__.py +4 -0
- optalph/_base.py +63 -0
- optalph/_cli.py +132 -0
- optalph/config.py +67 -0
- optalph/dictionary.py +249 -0
- optalph/evaluator.py +242 -0
- optalph/frequencies.py +68 -0
- optalph/grid_evaluator.py +162 -0
- optalph/main.py +366 -0
- optalph/optimizer.py +546 -0
- optalph/py.typed +0 -0
- optalph/reporting.py +170 -0
- optalph/rules.py +158 -0
- optalph-1.0.0.dist-info/METADATA +243 -0
- optalph-1.0.0.dist-info/RECORD +19 -0
- optalph-1.0.0.dist-info/WHEEL +4 -0
- optalph-1.0.0.dist-info/entry_points.txt +2 -0
- optalph-1.0.0.dist-info/licenses/LICENSE +21 -0
optalph/__init__.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Expose the installed distribution's version as ``__version__``.
# This must stay above the submodule imports below, because
# ``optalph.dictionary`` imports ``__version__`` from this package.
try:
    from importlib.metadata import version as _version
    __version__ = _version("optalph")
except Exception:
    # Deliberate best-effort: if the lookup fails for any reason, fall back
    # to a placeholder instead of breaking ``import optalph``.
    __version__ = "0.0.0"

# Re-export the public API from the submodules.
from .config import ENGLISH_ALPHABET, RUSSIAN_ALPHABET, OptimizerConfig, RulesConfig, RunConfig
from .dictionary import detect_alphabet, download_dictionary, is_valid_word, load_dictionary
from .evaluator import IncrementalEvaluator, annotate, evaluate, evaluate_detailed
from .frequencies import load_frequencies
from .grid_evaluator import IncrementalGridEvaluator
from .optimizer import SwapEvaluator, greedy_init, iterated_local_search, multistart_sa, simulated_annealing
from .rules import Adjacent, Compose, ExtractionRule, GridRule, SkipAdjacent, build_rules

# Names exported by ``from optalph import *``.
__all__ = [
    "__version__",
    "RUSSIAN_ALPHABET",
    "ENGLISH_ALPHABET",
    "OptimizerConfig",
    "RulesConfig",
    "RunConfig",
    "load_dictionary",
    "detect_alphabet",
    "is_valid_word",
    "download_dictionary",
    "Adjacent",
    "SkipAdjacent",
    "Compose",
    "ExtractionRule",
    "GridRule",
    "build_rules",
    "evaluate",
    "evaluate_detailed",
    "annotate",
    "IncrementalEvaluator",
    "IncrementalGridEvaluator",
    "SwapEvaluator",
    "greedy_init",
    "simulated_annealing",
    "multistart_sa",
    "iterated_local_search",
    "load_frequencies",
]
|
optalph/__main__.py
ADDED
optalph/_base.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Any, Generic, TypeVar
|
|
3
|
+
|
|
4
|
+
_SK = TypeVar("_SK")
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class _IncrementalBase(ABC, Generic[_SK]):
|
|
8
|
+
dictionary: set[str]
|
|
9
|
+
weights: dict[str, float] | None
|
|
10
|
+
n: int
|
|
11
|
+
_current: list[str]
|
|
12
|
+
_current_score: float
|
|
13
|
+
_slots: dict[_SK, str]
|
|
14
|
+
_word_refs: dict[str, set[_SK]]
|
|
15
|
+
_valid_score: float
|
|
16
|
+
|
|
17
|
+
def _word_weight(self, word: str) -> float:
|
|
18
|
+
if word not in self.dictionary:
|
|
19
|
+
return 0.0
|
|
20
|
+
if self.weights is None:
|
|
21
|
+
return 1.0
|
|
22
|
+
return self.weights.get(word, 1.0)
|
|
23
|
+
|
|
24
|
+
def _put_slot(self, key: _SK, word: str) -> None:
|
|
25
|
+
old_word = self._slots.get(key)
|
|
26
|
+
if old_word is not None:
|
|
27
|
+
self._del_slot(key)
|
|
28
|
+
self._slots[key] = word
|
|
29
|
+
if word not in self._word_refs:
|
|
30
|
+
self._word_refs[word] = set()
|
|
31
|
+
self._valid_score += self._word_weight(word)
|
|
32
|
+
self._word_refs[word].add(key)
|
|
33
|
+
|
|
34
|
+
def _del_slot(self, key: _SK) -> None:
|
|
35
|
+
word = self._slots.pop(key)
|
|
36
|
+
refs = self._word_refs[word]
|
|
37
|
+
refs.discard(key)
|
|
38
|
+
if not refs:
|
|
39
|
+
del self._word_refs[word]
|
|
40
|
+
self._valid_score -= self._word_weight(word)
|
|
41
|
+
|
|
42
|
+
@abstractmethod
|
|
43
|
+
def full_eval(self, permutation: tuple[str, ...] | list[str]) -> float: ...
|
|
44
|
+
|
|
45
|
+
@abstractmethod
|
|
46
|
+
def try_swap(self, i: int, j: int) -> tuple[float, list[tuple[_SK, str]], Any]: ...
|
|
47
|
+
|
|
48
|
+
@abstractmethod
|
|
49
|
+
def undo_swap(self, i: int, j: int, snapshot: list[tuple[_SK, str]], affected: Any) -> float: ...
|
|
50
|
+
|
|
51
|
+
@property
|
|
52
|
+
def score(self) -> float:
|
|
53
|
+
return self._current_score
|
|
54
|
+
|
|
55
|
+
def get_permutation(self) -> tuple[str, ...]:
|
|
56
|
+
return tuple(self._current)
|
|
57
|
+
|
|
58
|
+
def _clone_common(self, target: Any) -> None:
|
|
59
|
+
target._current = self._current[:]
|
|
60
|
+
target._current_score = self._current_score
|
|
61
|
+
target._slots = dict(self._slots)
|
|
62
|
+
target._word_refs = {w: set(refs) for w, refs in self._word_refs.items()}
|
|
63
|
+
target._valid_score = self._valid_score
|
optalph/_cli.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
|
5
|
+
parser = argparse.ArgumentParser(
|
|
6
|
+
description="Optimize alphabet permutation for max dictionary words"
|
|
7
|
+
)
|
|
8
|
+
parser.add_argument(
|
|
9
|
+
"--alphabet", "-a", type=str, default=None,
|
|
10
|
+
help="Custom alphabet string (default: Russian)",
|
|
11
|
+
)
|
|
12
|
+
parser.add_argument(
|
|
13
|
+
"--window", "-w", type=int, nargs="+", default=[3],
|
|
14
|
+
help="Adjacent window size(s), e.g. -w 3 4 5",
|
|
15
|
+
)
|
|
16
|
+
parser.add_argument(
|
|
17
|
+
"--skip", "-s", type=int, nargs="+", default=[],
|
|
18
|
+
help="Skip rules as k:step pairs, e.g. -s 3 2 means SkipAdjacent(3,2)",
|
|
19
|
+
)
|
|
20
|
+
parser.add_argument(
|
|
21
|
+
"--method", choices=["sa", "ils"], default="sa",
|
|
22
|
+
help="Optimization method",
|
|
23
|
+
)
|
|
24
|
+
parser.add_argument(
|
|
25
|
+
"--iter", "-i", type=int, default=500000,
|
|
26
|
+
help="Max iterations for simulated annealing",
|
|
27
|
+
)
|
|
28
|
+
parser.add_argument(
|
|
29
|
+
"--t-start", type=float, default=100.0,
|
|
30
|
+
help="Starting temperature (default: 100)",
|
|
31
|
+
)
|
|
32
|
+
parser.add_argument(
|
|
33
|
+
"--t-end", type=float, default=0.01,
|
|
34
|
+
help="End temperature",
|
|
35
|
+
)
|
|
36
|
+
parser.add_argument(
|
|
37
|
+
"--cooling", type=float, default=0.9999,
|
|
38
|
+
help="Cooling rate per iteration (default: 0.9999)",
|
|
39
|
+
)
|
|
40
|
+
parser.add_argument(
|
|
41
|
+
"--seed", type=int, default=None,
|
|
42
|
+
help="Random seed for reproducibility",
|
|
43
|
+
)
|
|
44
|
+
parser.add_argument(
|
|
45
|
+
"--dict-file", type=str, default=None,
|
|
46
|
+
help="External dictionary file (one word per line)",
|
|
47
|
+
)
|
|
48
|
+
parser.add_argument(
|
|
49
|
+
"--no-dawg", action="store_true",
|
|
50
|
+
help="Skip DAWG dictionary, use only external file",
|
|
51
|
+
)
|
|
52
|
+
parser.add_argument(
|
|
53
|
+
"--download", choices=["english", "russian"],
|
|
54
|
+
help="Download dictionary automatically (cached in ~/.optalph/)",
|
|
55
|
+
)
|
|
56
|
+
parser.add_argument(
|
|
57
|
+
"--restarts", "-r", type=int, default=1,
|
|
58
|
+
help="Number of multistart restarts",
|
|
59
|
+
)
|
|
60
|
+
parser.add_argument(
|
|
61
|
+
"--freq-file", type=str, default=None,
|
|
62
|
+
help="Word frequency file for weighted scoring (format: word count)",
|
|
63
|
+
)
|
|
64
|
+
parser.add_argument(
|
|
65
|
+
"--incremental", action="store_true",
|
|
66
|
+
help="Use incremental evaluator (faster for large alphabets/grids)",
|
|
67
|
+
)
|
|
68
|
+
parser.add_argument(
|
|
69
|
+
"--save", type=str, default=None,
|
|
70
|
+
help="Save results to JSON file",
|
|
71
|
+
)
|
|
72
|
+
parser.add_argument(
|
|
73
|
+
"--detailed", action="store_true",
|
|
74
|
+
help="Show detailed results (found words)",
|
|
75
|
+
)
|
|
76
|
+
parser.add_argument(
|
|
77
|
+
"--show-invalid", action="store_true",
|
|
78
|
+
help="Also show invalid candidate words",
|
|
79
|
+
)
|
|
80
|
+
parser.add_argument(
|
|
81
|
+
"--quiet", "-q", action="store_true",
|
|
82
|
+
help="Minimal output",
|
|
83
|
+
)
|
|
84
|
+
parser.add_argument(
|
|
85
|
+
"--tabu", action="store_true",
|
|
86
|
+
help="Enable tabu list for SA (prevents cycling)",
|
|
87
|
+
)
|
|
88
|
+
parser.add_argument(
|
|
89
|
+
"--tabu-tenure", type=int, default=20,
|
|
90
|
+
help="Tabu list tenure (default: 20)",
|
|
91
|
+
)
|
|
92
|
+
parser.add_argument(
|
|
93
|
+
"--random-init", action="store_true",
|
|
94
|
+
help="Use random initial permutation instead of greedy bigram init",
|
|
95
|
+
)
|
|
96
|
+
parser.add_argument(
|
|
97
|
+
"--grid-rows", type=int, default=None,
|
|
98
|
+
help="Grid rows for 2D Boggle-like extraction (requires --grid-cols)",
|
|
99
|
+
)
|
|
100
|
+
parser.add_argument(
|
|
101
|
+
"--grid-cols", type=int, default=None,
|
|
102
|
+
help="Grid columns for 2D Boggle-like extraction (requires --grid-rows)",
|
|
103
|
+
)
|
|
104
|
+
parser.add_argument(
|
|
105
|
+
"--grid-min-len", type=int, default=3,
|
|
106
|
+
help="Minimum word length for grid extraction (default: 3)",
|
|
107
|
+
)
|
|
108
|
+
parser.add_argument(
|
|
109
|
+
"--ils", action="store_true",
|
|
110
|
+
help="Use Iterated Local Search (SA + perturbations)",
|
|
111
|
+
)
|
|
112
|
+
parser.add_argument(
|
|
113
|
+
"--ils-perturb", type=int, default=3,
|
|
114
|
+
help="ILS perturbation strength (number of swaps, default: 3)",
|
|
115
|
+
)
|
|
116
|
+
parser.add_argument(
|
|
117
|
+
"--ils-steps", type=int, default=10,
|
|
118
|
+
help="ILS number of perturbation steps (default: 10)",
|
|
119
|
+
)
|
|
120
|
+
parser.add_argument(
|
|
121
|
+
"--parallel", action="store_true",
|
|
122
|
+
help="Run multistart restarts in parallel (score_fn mode only)",
|
|
123
|
+
)
|
|
124
|
+
return parser.parse_args(argv)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def build_skip_rules(skip_args: list[int]) -> list[tuple[int, int]]:
    """Group the flat ``--skip`` values into ``(k, step)`` pairs.

    Args:
        skip_args: Flat sequence ``k step k step ...``; an empty value yields
            no rules.

    Returns:
        One ``(k, step)`` tuple per consecutive pair, in input order.

    Raises:
        ValueError: If an odd number of values was supplied.
    """
    if not skip_args:
        return []
    ks = skip_args[0::2]
    steps = skip_args[1::2]
    if len(ks) != len(steps):
        raise ValueError("--skip requires pairs of arguments (k step k step ...)")
    return list(zip(ks, steps))
|
optalph/config.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
|
|
3
|
+
__all__ = ["RUSSIAN_ALPHABET", "ENGLISH_ALPHABET", "OptimizerConfig", "RulesConfig", "RunConfig"]
|
|
4
|
+
|
|
5
|
+
RUSSIAN_ALPHABET = tuple("абвгдеёжзийклмнопрстуфхцчшщъыьэюя")
|
|
6
|
+
ENGLISH_ALPHABET = tuple("abcdefghijklmnopqrstuvwxyz")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
|
|
10
|
+
class OptimizerConfig:
|
|
11
|
+
"""Simulated annealing configuration."""
|
|
12
|
+
t_start: float = 100.0
|
|
13
|
+
t_end: float = 0.01
|
|
14
|
+
cooling_rate: float = 0.9999
|
|
15
|
+
max_iterations: int = 500000
|
|
16
|
+
seed: int | None = None
|
|
17
|
+
|
|
18
|
+
def __post_init__(self) -> None:
|
|
19
|
+
if self.t_start <= 0:
|
|
20
|
+
raise ValueError("t_start must be > 0")
|
|
21
|
+
if self.t_end < 0:
|
|
22
|
+
raise ValueError("t_end must be >= 0")
|
|
23
|
+
if self.t_end >= self.t_start:
|
|
24
|
+
raise ValueError("t_end must be < t_start")
|
|
25
|
+
if not 0 < self.cooling_rate < 1:
|
|
26
|
+
raise ValueError("cooling_rate must be in (0, 1)")
|
|
27
|
+
if self.max_iterations < 1:
|
|
28
|
+
raise ValueError("max_iterations must be >= 1")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class RulesConfig:
    """Which extraction rules to build: adjacent windows and (k, step) skip rules."""

    adjacent_windows: list[int] = field(default_factory=lambda: [3])
    skip_rules: list[tuple[int, int]] = field(default_factory=list)

    def __post_init__(self) -> None:
        """Validate rule parameters.

        Raises:
            ValueError: If a window, ``k`` or step is below 1, or if no rule
                of either kind is configured.
        """
        for width in self.adjacent_windows:
            if not width >= 1:
                raise ValueError("All adjacent_windows must be >= 1")
        for count, stride in self.skip_rules:
            if count < 1:
                raise ValueError("Skip rule k must be >= 1")
            if stride < 1:
                raise ValueError("Skip rule step must be >= 1")
        if not (self.adjacent_windows or self.skip_rules):
            raise ValueError("At least one rule must be specified")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class RunConfig:
    """Top-level settings for one run: alphabet, optimizer, rules and dictionary source."""

    alphabet: tuple[str, ...] = RUSSIAN_ALPHABET
    optimizer: OptimizerConfig = field(default_factory=OptimizerConfig)
    rules: RulesConfig = field(default_factory=RulesConfig)
    dict_file: str | None = None
    use_dawg: bool = False
    min_word_len: int = 3

    def __post_init__(self) -> None:
        """Check that the alphabet and minimum word length are consistent.

        Raises:
            ValueError: If ``min_word_len`` is below 1 or exceeds the alphabet
                size, or if the alphabet has fewer than 2 symbols or repeats
                a symbol.
        """
        if self.min_word_len < 1:
            raise ValueError("min_word_len must be >= 1")
        size = len(self.alphabet)
        if size < 2:
            raise ValueError("alphabet must contain at least 2 characters")
        if self.min_word_len > size:
            raise ValueError(f"min_word_len ({self.min_word_len}) > alphabet size ({size})")
        distinct = set(self.alphabet)
        if len(distinct) != size:
            raise ValueError("alphabet must not contain duplicate characters")
|
optalph/dictionary.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import unicodedata
|
|
4
|
+
import urllib.request
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from . import __version__
|
|
8
|
+
|
|
9
|
+
__all__ = ["load_dictionary", "detect_alphabet", "download_dictionary"]
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
_CACHE_DIR = Path.home() / ".optalph"
|
|
14
|
+
|
|
15
|
+
_DICTIONARY_URLS: dict[str, str] = {
|
|
16
|
+
"english": "https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt",
|
|
17
|
+
"russian": "https://raw.githubusercontent.com/hingston/russian/master/russian.txt",
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _ensure_cache_dir() -> Path:
    """Create the cache directory (and missing parents) if needed and return it."""
    cache_root = _CACHE_DIR
    cache_root.mkdir(parents=True, exist_ok=True)
    return cache_root
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _cache_path(name: str) -> Path:
    """Return the path of ``<name>.txt`` inside the cache directory, creating the directory."""
    directory = _ensure_cache_dir()
    return directory / f"{name}.txt"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def download_dictionary(name: str, force: bool = False) -> str:
    """Download a dictionary to ~/.optalph/ and return the cached file path.

    Supported names: 'english', 'russian'.
    Downloads only if not already cached. Use force=True to re-download.

    Args:
        name: Dictionary name (case-insensitive).
        force: Re-download even when a cached copy exists.

    Returns:
        Path of the cached dictionary file, as a string.

    Raises:
        ValueError: If ``name`` is not a known dictionary.
        RuntimeError: If the download or the final move into place fails.
    """
    name = name.lower()
    if name not in _DICTIONARY_URLS:
        raise ValueError(f"Unknown dictionary '{name}'. Available: {', '.join(_DICTIONARY_URLS)}")

    url = _DICTIONARY_URLS[name]
    cached = _cache_path(name)

    if cached.is_file() and not force:
        logger.info("Using cached dictionary: %s", cached)
        return str(cached)

    logger.info("Downloading dictionary '%s' from %s...", name, url)

    # Stream into a sibling temp file so an interrupted download never leaves
    # a truncated dictionary at the final path.
    tmp = cached.with_suffix(".tmp")
    try:
        req = urllib.request.Request(url, headers={"User-Agent": f"optalph/{__version__}"})
        with urllib.request.urlopen(req, timeout=60) as resp, open(tmp, "wb") as f:
            while chunk := resp.read(8192):
                f.write(chunk)
        # replace() overwrites an existing target on every platform; rename()
        # raises on Windows when the cached file already exists (force=True).
        tmp.replace(cached)
    except Exception as e:
        # Remove the partial download, tolerating the case where it was never created.
        tmp.unlink(missing_ok=True)
        raise RuntimeError(f"Failed to download dictionary: {e}") from e

    logger.info("Downloaded %s (%.1f MB)", name, cached.stat().st_size / 1e6)
    return str(cached)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def is_valid_word(word: str, allowed_chars: set[str] | None = None) -> bool:
|
|
70
|
+
if allowed_chars is not None:
|
|
71
|
+
return all(c in allowed_chars for c in word)
|
|
72
|
+
return word.isalpha()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def load_from_dawg(min_len: int = 1, max_len: int = 33, allowed_chars: set[str] | None = None) -> set[str]:
|
|
76
|
+
"""Load words from pymorphy2 DAWG dictionary files."""
|
|
77
|
+
if min_len > max_len:
|
|
78
|
+
raise ValueError(f"min_len ({min_len}) > max_len ({max_len})")
|
|
79
|
+
try:
|
|
80
|
+
import dawg_python
|
|
81
|
+
except ImportError:
|
|
82
|
+
raise ImportError("dawg_python not installed. Run: uv add dawg-python")
|
|
83
|
+
|
|
84
|
+
dict_path = None
|
|
85
|
+
try:
|
|
86
|
+
from pymorphy2_dicts_ru import get_path
|
|
87
|
+
dict_path = get_path()
|
|
88
|
+
except ImportError:
|
|
89
|
+
try:
|
|
90
|
+
import pymorphy2_dicts_ru
|
|
91
|
+
dict_path = os.path.dirname(pymorphy2_dicts_ru.__file__)
|
|
92
|
+
except ImportError:
|
|
93
|
+
pass
|
|
94
|
+
|
|
95
|
+
if dict_path is None:
|
|
96
|
+
raise RuntimeError("Cannot find pymorphy2_dicts_ru path")
|
|
97
|
+
|
|
98
|
+
words: set[str] = set()
|
|
99
|
+
|
|
100
|
+
words_file = os.path.join(dict_path, "words.dawg")
|
|
101
|
+
if not os.path.isfile(words_file):
|
|
102
|
+
raise FileNotFoundError(f"words.dawg not found at {dict_path}")
|
|
103
|
+
|
|
104
|
+
try:
|
|
105
|
+
d = dawg_python.BytesDAWG()
|
|
106
|
+
d.load(words_file)
|
|
107
|
+
for key in d.keys():
|
|
108
|
+
if isinstance(key, bytes):
|
|
109
|
+
word = key.decode("utf-8").lower()
|
|
110
|
+
else:
|
|
111
|
+
word = key.lower()
|
|
112
|
+
if min_len <= len(word) <= max_len and is_valid_word(word, allowed_chars):
|
|
113
|
+
words.add(word)
|
|
114
|
+
except (OSError, ValueError) as e:
|
|
115
|
+
raise ValueError(f"Failed to load words.dawg: {e}") from e
|
|
116
|
+
|
|
117
|
+
return words
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# Encodings tried in order when reading a dictionary file.
_ENCODINGS = ["utf-8-sig", "cp1251", "koi8-r", "cp866"]


def detect_alphabet(filepath: str, sample: int = 1000) -> set[str]:
    """Detect unique alphabetic characters from the first `sample` words.

    Each encoding in ``_ENCODINGS`` (then latin-1) is tried in turn; the first
    one that decodes the sampled lines and yields an acceptable character set
    wins.

    Args:
        filepath: Text file with one word per line.
        sample: Maximum number of non-empty lines to inspect.

    Returns:
        The lowercase alphabetic characters seen, or an empty set if no
        encoding yields at least 4 distinct letters.

    NOTE(review): only ``str.isalpha()`` characters are collected, and
    ``isalpha()`` accepts exactly the Unicode ``L*`` categories, so the 90%
    letter-ratio test below is always satisfied; the effective guard is the
    minimum of four distinct letters.
    """
    for enc in _ENCODINGS + ["latin-1"]:
        # Start every attempt from a clean slate.  A decode error can occur
        # after some lines were already read, and carrying the partial
        # character set and word count into the next encoding would shrink
        # its sample and mix in characters from the failed decode.
        chars: set[str] = set()
        count = 0
        try:
            with open(filepath, "r", encoding=enc) as f:
                for line in f:
                    word = line.strip().lower()
                    if not word:
                        continue
                    chars.update(c for c in word if c.isalpha())
                    count += 1
                    if count >= sample:
                        break
        except UnicodeError:
            # Wrong encoding for this file: try the next candidate.
            continue
        if not chars:
            continue
        letter_count = sum(
            1 for c in chars
            if unicodedata.category(c).startswith("L")
        )
        if letter_count >= 4 and letter_count / len(chars) >= 0.9:
            return chars
    return set()
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def load_from_file(
    filepath: str, min_len: int = 1, max_len: int = 33,
    allowed_chars: set[str] | None = None,
) -> set[str]:
    """Load words from a text file. Tries utf-8, cp1251, koi8-r, cp866, latin-1.

    The first encoding that decodes the whole file AND yields at least one
    valid word is used.

    Args:
        filepath: Text file with one word per line.
        min_len: Shortest word length to keep.
        max_len: Longest word length to keep.
        allowed_chars: Optional character whitelist forwarded to ``is_valid_word``.

    Returns:
        The set of lowercase words that pass the filters.

    Raises:
        FileNotFoundError: If ``filepath`` is not an existing file.
        ValueError: If ``min_len > max_len`` or no encoding yields a valid word.
    """
    if not os.path.isfile(filepath):
        raise FileNotFoundError(f"Dictionary file not found: {filepath}")
    if min_len > max_len:
        raise ValueError(f"min_len ({min_len}) > max_len ({max_len})")

    for enc in [*_ENCODINGS, "latin-1"]:
        # Fresh set per attempt so words from a partially decoded file are discarded.
        words: set[str] = set()
        try:
            with open(filepath, "r", encoding=enc) as f:
                for line in f:
                    word = line.strip().lower()
                    if min_len <= len(word) <= max_len and is_valid_word(word, allowed_chars):
                        words.add(word)
        except UnicodeError:
            continue
        # A clean decode that produces no valid word usually means the wrong
        # single-byte code page was guessed (cp1251 decodes almost any byte
        # stream), so keep trying the remaining encodings instead of failing
        # on the first one that happens not to raise.
        if words:
            return words

    raise ValueError(f"No valid words found in {filepath}")
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def load_dictionary(
    filepath: str | None = None,
    use_dawg: bool = True,
    min_len: int = 1,
    max_len: int = 33,
    quiet: bool = False,
    alphabet: tuple[str, ...] | None = None,
    download: str | None = None,
) -> set[str]:
    """Build the word set from the DAWG, an external file and/or a download.

    Args:
        filepath: Optional word-list file, merged with the DAWG words.
        use_dawg: Try the pymorphy2 DAWG dictionary first.
        min_len: Shortest word length to keep.
        max_len: Longest word length to keep.
        quiet: Suppress the progress log messages.
        alphabet: If provided, only words using characters from it are kept.
        download: 'english' or 'russian'; fetched only when the other sources
            produced no words.

    Returns:
        The union of all words loaded.

    Raises:
        ValueError: If no source yields any word.
    """
    verbose = not quiet
    allowed_chars = None if alphabet is None else set(alphabet)
    words: set[str] = set()

    if use_dawg:
        try:
            if verbose:
                logger.info("Loading dictionary from DAWG (pymorphy2-dicts-ru)...")
            words |= load_from_dawg(min_len, max_len, allowed_chars)
            if verbose:
                logger.info(" dawg: %d words", len(words))
        except (ImportError, RuntimeError, FileNotFoundError) as e:
            # The DAWG source is optional; report and carry on.
            if verbose:
                logger.info(" DAWG not available: %s", e)

    if filepath:
        if verbose:
            logger.info("Loading dictionary from %s...", filepath)
        try:
            file_words = load_from_file(filepath, min_len, max_len, allowed_chars)
            if verbose:
                logger.info(" file: %d words", len(file_words))
            words |= file_words
        except (ValueError, FileNotFoundError) as e:
            if verbose:
                logger.info(" file load failed: %s", e)

    # The download is a last resort, used only when nothing was loaded so far.
    if download and not words:
        cached = download_dictionary(download)
        if verbose:
            logger.info("Loading downloaded dictionary...")
        dl_words = load_from_file(cached, min_len, max_len, allowed_chars)
        if verbose:
            logger.info(" downloaded: %d words", len(dl_words))
        words |= dl_words

    if not words:
        raise ValueError(
            "No dictionary words loaded. Provide --dict-file, install pymorphy2-dicts-ru, "
            "or use --download english/russian"
        )

    if verbose:
        logger.info(" total unique: %d words", len(words))
    return words
|