optalph 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
optalph/__init__.py ADDED
@@ -0,0 +1,43 @@
1
# Resolve the installed distribution's version. Any failure (metadata API
# unavailable, distribution not installed, ...) falls back to "0.0.0".
try:
    from importlib.metadata import version as _version
except Exception:
    __version__ = "0.0.0"
else:
    try:
        __version__ = _version("optalph")
    except Exception:
        __version__ = "0.0.0"
6
+
7
+ from .config import ENGLISH_ALPHABET, RUSSIAN_ALPHABET, OptimizerConfig, RulesConfig, RunConfig
8
+ from .dictionary import detect_alphabet, download_dictionary, is_valid_word, load_dictionary
9
+ from .evaluator import IncrementalEvaluator, annotate, evaluate, evaluate_detailed
10
+ from .frequencies import load_frequencies
11
+ from .grid_evaluator import IncrementalGridEvaluator
12
+ from .optimizer import SwapEvaluator, greedy_init, iterated_local_search, multistart_sa, simulated_annealing
13
+ from .rules import Adjacent, Compose, ExtractionRule, GridRule, SkipAdjacent, build_rules
14
+
15
# Names re-exported at package level, grouped by the submodule they are
# imported from above.
__all__ = [
    "__version__",
    # .config
    "RUSSIAN_ALPHABET",
    "ENGLISH_ALPHABET",
    "OptimizerConfig",
    "RulesConfig",
    "RunConfig",
    # .dictionary
    "load_dictionary",
    "detect_alphabet",
    "is_valid_word",
    "download_dictionary",
    # .rules
    "Adjacent",
    "SkipAdjacent",
    "Compose",
    "ExtractionRule",
    "GridRule",
    "build_rules",
    # .evaluator
    "evaluate",
    "evaluate_detailed",
    "annotate",
    "IncrementalEvaluator",
    # .grid_evaluator
    "IncrementalGridEvaluator",
    # .optimizer
    "SwapEvaluator",
    "greedy_init",
    "simulated_annealing",
    "multistart_sa",
    "iterated_local_search",
    # .frequencies
    "load_frequencies",
]
optalph/__main__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .main import main
2
+
3
# Run the CLI entry point only when this module is executed as a script
# (e.g. ``python -m optalph``), not when it is imported.
if __name__ == "__main__":
    main()
optalph/_base.py ADDED
@@ -0,0 +1,63 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, Generic, TypeVar
3
+
4
+ _SK = TypeVar("_SK")
5
+
6
+
7
+ class _IncrementalBase(ABC, Generic[_SK]):
8
+ dictionary: set[str]
9
+ weights: dict[str, float] | None
10
+ n: int
11
+ _current: list[str]
12
+ _current_score: float
13
+ _slots: dict[_SK, str]
14
+ _word_refs: dict[str, set[_SK]]
15
+ _valid_score: float
16
+
17
+ def _word_weight(self, word: str) -> float:
18
+ if word not in self.dictionary:
19
+ return 0.0
20
+ if self.weights is None:
21
+ return 1.0
22
+ return self.weights.get(word, 1.0)
23
+
24
+ def _put_slot(self, key: _SK, word: str) -> None:
25
+ old_word = self._slots.get(key)
26
+ if old_word is not None:
27
+ self._del_slot(key)
28
+ self._slots[key] = word
29
+ if word not in self._word_refs:
30
+ self._word_refs[word] = set()
31
+ self._valid_score += self._word_weight(word)
32
+ self._word_refs[word].add(key)
33
+
34
+ def _del_slot(self, key: _SK) -> None:
35
+ word = self._slots.pop(key)
36
+ refs = self._word_refs[word]
37
+ refs.discard(key)
38
+ if not refs:
39
+ del self._word_refs[word]
40
+ self._valid_score -= self._word_weight(word)
41
+
42
+ @abstractmethod
43
+ def full_eval(self, permutation: tuple[str, ...] | list[str]) -> float: ...
44
+
45
+ @abstractmethod
46
+ def try_swap(self, i: int, j: int) -> tuple[float, list[tuple[_SK, str]], Any]: ...
47
+
48
+ @abstractmethod
49
+ def undo_swap(self, i: int, j: int, snapshot: list[tuple[_SK, str]], affected: Any) -> float: ...
50
+
51
+ @property
52
+ def score(self) -> float:
53
+ return self._current_score
54
+
55
+ def get_permutation(self) -> tuple[str, ...]:
56
+ return tuple(self._current)
57
+
58
+ def _clone_common(self, target: Any) -> None:
59
+ target._current = self._current[:]
60
+ target._current_score = self._current_score
61
+ target._slots = dict(self._slots)
62
+ target._word_refs = {w: set(refs) for w, refs in self._word_refs.items()}
63
+ target._valid_score = self._valid_score
optalph/_cli.py ADDED
@@ -0,0 +1,132 @@
1
+ import argparse
2
+
3
+
4
+ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
5
+ parser = argparse.ArgumentParser(
6
+ description="Optimize alphabet permutation for max dictionary words"
7
+ )
8
+ parser.add_argument(
9
+ "--alphabet", "-a", type=str, default=None,
10
+ help="Custom alphabet string (default: Russian)",
11
+ )
12
+ parser.add_argument(
13
+ "--window", "-w", type=int, nargs="+", default=[3],
14
+ help="Adjacent window size(s), e.g. -w 3 4 5",
15
+ )
16
+ parser.add_argument(
17
+ "--skip", "-s", type=int, nargs="+", default=[],
18
+ help="Skip rules as k:step pairs, e.g. -s 3 2 means SkipAdjacent(3,2)",
19
+ )
20
+ parser.add_argument(
21
+ "--method", choices=["sa", "ils"], default="sa",
22
+ help="Optimization method",
23
+ )
24
+ parser.add_argument(
25
+ "--iter", "-i", type=int, default=500000,
26
+ help="Max iterations for simulated annealing",
27
+ )
28
+ parser.add_argument(
29
+ "--t-start", type=float, default=100.0,
30
+ help="Starting temperature (default: 100)",
31
+ )
32
+ parser.add_argument(
33
+ "--t-end", type=float, default=0.01,
34
+ help="End temperature",
35
+ )
36
+ parser.add_argument(
37
+ "--cooling", type=float, default=0.9999,
38
+ help="Cooling rate per iteration (default: 0.9999)",
39
+ )
40
+ parser.add_argument(
41
+ "--seed", type=int, default=None,
42
+ help="Random seed for reproducibility",
43
+ )
44
+ parser.add_argument(
45
+ "--dict-file", type=str, default=None,
46
+ help="External dictionary file (one word per line)",
47
+ )
48
+ parser.add_argument(
49
+ "--no-dawg", action="store_true",
50
+ help="Skip DAWG dictionary, use only external file",
51
+ )
52
+ parser.add_argument(
53
+ "--download", choices=["english", "russian"],
54
+ help="Download dictionary automatically (cached in ~/.optalph/)",
55
+ )
56
+ parser.add_argument(
57
+ "--restarts", "-r", type=int, default=1,
58
+ help="Number of multistart restarts",
59
+ )
60
+ parser.add_argument(
61
+ "--freq-file", type=str, default=None,
62
+ help="Word frequency file for weighted scoring (format: word count)",
63
+ )
64
+ parser.add_argument(
65
+ "--incremental", action="store_true",
66
+ help="Use incremental evaluator (faster for large alphabets/grids)",
67
+ )
68
+ parser.add_argument(
69
+ "--save", type=str, default=None,
70
+ help="Save results to JSON file",
71
+ )
72
+ parser.add_argument(
73
+ "--detailed", action="store_true",
74
+ help="Show detailed results (found words)",
75
+ )
76
+ parser.add_argument(
77
+ "--show-invalid", action="store_true",
78
+ help="Also show invalid candidate words",
79
+ )
80
+ parser.add_argument(
81
+ "--quiet", "-q", action="store_true",
82
+ help="Minimal output",
83
+ )
84
+ parser.add_argument(
85
+ "--tabu", action="store_true",
86
+ help="Enable tabu list for SA (prevents cycling)",
87
+ )
88
+ parser.add_argument(
89
+ "--tabu-tenure", type=int, default=20,
90
+ help="Tabu list tenure (default: 20)",
91
+ )
92
+ parser.add_argument(
93
+ "--random-init", action="store_true",
94
+ help="Use random initial permutation instead of greedy bigram init",
95
+ )
96
+ parser.add_argument(
97
+ "--grid-rows", type=int, default=None,
98
+ help="Grid rows for 2D Boggle-like extraction (requires --grid-cols)",
99
+ )
100
+ parser.add_argument(
101
+ "--grid-cols", type=int, default=None,
102
+ help="Grid columns for 2D Boggle-like extraction (requires --grid-rows)",
103
+ )
104
+ parser.add_argument(
105
+ "--grid-min-len", type=int, default=3,
106
+ help="Minimum word length for grid extraction (default: 3)",
107
+ )
108
+ parser.add_argument(
109
+ "--ils", action="store_true",
110
+ help="Use Iterated Local Search (SA + perturbations)",
111
+ )
112
+ parser.add_argument(
113
+ "--ils-perturb", type=int, default=3,
114
+ help="ILS perturbation strength (number of swaps, default: 3)",
115
+ )
116
+ parser.add_argument(
117
+ "--ils-steps", type=int, default=10,
118
+ help="ILS number of perturbation steps (default: 10)",
119
+ )
120
+ parser.add_argument(
121
+ "--parallel", action="store_true",
122
+ help="Run multistart restarts in parallel (score_fn mode only)",
123
+ )
124
+ return parser.parse_args(argv)
125
+
126
+
127
def build_skip_rules(skip_args: list[int]) -> list[tuple[int, int]]:
    """Group the flat ``--skip`` values into ``(k, step)`` pairs.

    An empty input yields an empty list. Raises ``ValueError`` when the
    number of values is odd, since every rule needs both a ``k`` and a step.
    """
    if not skip_args:
        return []
    if len(skip_args) % 2:
        raise ValueError("--skip requires pairs of arguments (k step k step ...)")
    # Even positions hold k, odd positions hold the matching step.
    return list(zip(skip_args[0::2], skip_args[1::2]))
optalph/config.py ADDED
@@ -0,0 +1,67 @@
1
+ from dataclasses import dataclass, field
2
+
3
+ __all__ = ["RUSSIAN_ALPHABET", "ENGLISH_ALPHABET", "OptimizerConfig", "RulesConfig", "RunConfig"]
4
+
5
# Built-in alphabets, stored as tuples of single-character strings.
RUSSIAN_ALPHABET = tuple(
    "абвгдеёжзий"
    "клмнопрсту"
    "фхцчшщъыьэюя"
)
ENGLISH_ALPHABET = tuple(
    "abcdefghijklm"
    "nopqrstuvwxyz"
)
7
+
8
+
9
+ @dataclass
10
+ class OptimizerConfig:
11
+ """Simulated annealing configuration."""
12
+ t_start: float = 100.0
13
+ t_end: float = 0.01
14
+ cooling_rate: float = 0.9999
15
+ max_iterations: int = 500000
16
+ seed: int | None = None
17
+
18
+ def __post_init__(self) -> None:
19
+ if self.t_start <= 0:
20
+ raise ValueError("t_start must be > 0")
21
+ if self.t_end < 0:
22
+ raise ValueError("t_end must be >= 0")
23
+ if self.t_end >= self.t_start:
24
+ raise ValueError("t_end must be < t_start")
25
+ if not 0 < self.cooling_rate < 1:
26
+ raise ValueError("cooling_rate must be in (0, 1)")
27
+ if self.max_iterations < 1:
28
+ raise ValueError("max_iterations must be >= 1")
29
+
30
+
31
@dataclass
class RulesConfig:
    """Which extraction rules to apply.

    ``adjacent_windows`` lists window sizes and ``skip_rules`` lists
    ``(k, step)`` pairs. Every number must be at least 1 and at least one
    rule of either kind has to be present, otherwise ``ValueError`` is raised.
    """
    adjacent_windows: list[int] = field(default_factory=lambda: [3])
    skip_rules: list[tuple[int, int]] = field(default_factory=list)

    def __post_init__(self) -> None:
        if any(not window >= 1 for window in self.adjacent_windows):
            raise ValueError("All adjacent_windows must be >= 1")
        for rule in self.skip_rules:
            size, step = rule
            if size < 1:
                raise ValueError("Skip rule k must be >= 1")
            if step < 1:
                raise ValueError("Skip rule step must be >= 1")
        if not (self.adjacent_windows or self.skip_rules):
            raise ValueError("At least one rule must be specified")
47
+
48
+
49
@dataclass
class RunConfig:
    """Settings for a complete optimization run.

    Construction raises ``ValueError`` when ``min_word_len`` is below 1 or
    larger than the alphabet, when the alphabet has fewer than two symbols,
    or when it contains a repeated symbol.
    """
    alphabet: tuple[str, ...] = RUSSIAN_ALPHABET
    optimizer: OptimizerConfig = field(default_factory=OptimizerConfig)
    rules: RulesConfig = field(default_factory=RulesConfig)
    dict_file: str | None = None
    use_dawg: bool = False
    min_word_len: int = 3

    def __post_init__(self) -> None:
        shortest = self.min_word_len
        if shortest < 1:
            raise ValueError("min_word_len must be >= 1")

        symbols = self.alphabet
        size = len(symbols)
        if size < 2:
            raise ValueError("alphabet must contain at least 2 characters")
        if shortest > size:
            raise ValueError(f"min_word_len ({shortest}) > alphabet size ({size})")
        # A set drops repeats, so a smaller size means duplicated symbols.
        if len(set(symbols)) != size:
            raise ValueError("alphabet must not contain duplicate characters")
optalph/dictionary.py ADDED
@@ -0,0 +1,249 @@
1
+ import logging
2
+ import os
3
+ import unicodedata
4
+ import urllib.request
5
+ from pathlib import Path
6
+
7
+ from . import __version__
8
+
9
# Public API of this module. ``is_valid_word`` is listed because the package
# ``__init__`` imports and re-exports it together with the loaders.
__all__ = ["load_dictionary", "detect_alphabet", "is_valid_word", "download_dictionary"]

logger = logging.getLogger(__name__)

# Directory under the user's home where downloaded dictionaries are cached.
_CACHE_DIR = Path.home() / ".optalph"

# Supported dictionary names mapped to the URL of their plain-text word list.
_DICTIONARY_URLS: dict[str, str] = {
    "english": "https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt",
    "russian": "https://raw.githubusercontent.com/hingston/russian/master/russian.txt",
}
19
+
20
+
21
def _ensure_cache_dir() -> Path:
    """Create the cache directory (and any missing parents) and return it."""
    cache_dir = _CACHE_DIR
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir
24
+
25
+
26
def _cache_path(name: str) -> Path:
    """Return the cache file path for dictionary ``name``.

    The cache directory is created as a side effect if it does not exist.
    """
    filename = f"{name}.txt"
    return _ensure_cache_dir().joinpath(filename)
28
+
29
+
30
def download_dictionary(name: str, force: bool = False) -> str:
    """Download a dictionary to ~/.optalph/ and return the cached file path.

    Supported names: 'english', 'russian' (matched case-insensitively).
    Downloads only if not already cached. Use force=True to re-download.

    Raises:
        ValueError: if ``name`` is not a known dictionary.
        RuntimeError: if the download or the final move into the cache fails;
            the original exception is attached as ``__cause__``.
    """
    name = name.lower()
    if name not in _DICTIONARY_URLS:
        raise ValueError(f"Unknown dictionary '{name}'. Available: {', '.join(_DICTIONARY_URLS)}")

    url = _DICTIONARY_URLS[name]
    cached = _cache_path(name)

    if cached.is_file() and not force:
        logger.info("Using cached dictionary: %s", cached)
        return str(cached)

    logger.info("Downloading dictionary '%s' from %s...", name, url)

    # Stream into a sibling temp file so a failed download never leaves a
    # truncated dictionary at the cache path.
    tmp = cached.with_suffix(".tmp")
    try:
        req = urllib.request.Request(url, headers={"User-Agent": f"optalph/{__version__}"})
        with urllib.request.urlopen(req, timeout=60) as resp, open(tmp, "wb") as f:
            while chunk := resp.read(8192):
                f.write(chunk)
        # replace() overwrites an existing target on every platform, whereas
        # rename() raises FileExistsError on Windows when re-downloading with
        # force=True over an already cached file.
        tmp.replace(cached)
    except Exception as e:
        tmp.unlink(missing_ok=True)
        raise RuntimeError(f"Failed to download dictionary: {e}") from e

    logger.info("Downloaded %s (%.1f MB)", name, cached.stat().st_size / 1e6)
    return str(cached)
67
+
68
+
69
+ def is_valid_word(word: str, allowed_chars: set[str] | None = None) -> bool:
70
+ if allowed_chars is not None:
71
+ return all(c in allowed_chars for c in word)
72
+ return word.isalpha()
73
+
74
+
75
+ def load_from_dawg(min_len: int = 1, max_len: int = 33, allowed_chars: set[str] | None = None) -> set[str]:
76
+ """Load words from pymorphy2 DAWG dictionary files."""
77
+ if min_len > max_len:
78
+ raise ValueError(f"min_len ({min_len}) > max_len ({max_len})")
79
+ try:
80
+ import dawg_python
81
+ except ImportError:
82
+ raise ImportError("dawg_python not installed. Run: uv add dawg-python")
83
+
84
+ dict_path = None
85
+ try:
86
+ from pymorphy2_dicts_ru import get_path
87
+ dict_path = get_path()
88
+ except ImportError:
89
+ try:
90
+ import pymorphy2_dicts_ru
91
+ dict_path = os.path.dirname(pymorphy2_dicts_ru.__file__)
92
+ except ImportError:
93
+ pass
94
+
95
+ if dict_path is None:
96
+ raise RuntimeError("Cannot find pymorphy2_dicts_ru path")
97
+
98
+ words: set[str] = set()
99
+
100
+ words_file = os.path.join(dict_path, "words.dawg")
101
+ if not os.path.isfile(words_file):
102
+ raise FileNotFoundError(f"words.dawg not found at {dict_path}")
103
+
104
+ try:
105
+ d = dawg_python.BytesDAWG()
106
+ d.load(words_file)
107
+ for key in d.keys():
108
+ if isinstance(key, bytes):
109
+ word = key.decode("utf-8").lower()
110
+ else:
111
+ word = key.lower()
112
+ if min_len <= len(word) <= max_len and is_valid_word(word, allowed_chars):
113
+ words.add(word)
114
+ except (OSError, ValueError) as e:
115
+ raise ValueError(f"Failed to load words.dawg: {e}") from e
116
+
117
+ return words
118
+
119
+
120
# Text encodings tried, in this order, when reading dictionary files. The
# readers in this module add their own latin-1 fallback after these.
_ENCODINGS = ["utf-8-sig", "cp1251", "koi8-r", "cp866"]
121
+
122
+
123
def detect_alphabet(filepath: str, sample: int = 1000) -> set[str]:
    """Detect unique alphabetic characters from the first `sample` words.

    The file is read with each known encoding in turn (latin-1 last). The
    first encoding that decodes the sampled lines and yields at least 4
    distinct letters, with letters making up >= 90% of the collected
    characters, determines the result. Returns an empty set if no encoding
    qualifies.

    Every attempt starts from a clean state: characters and the word count
    gathered under an encoding that failed (or was rejected) never leak into
    the next attempt.

    NOTE(review): only ``str.isalpha()`` characters are collected, and those
    all have a Unicode category starting with "L", so the 90% ratio check
    looks like it is always satisfied and only the ">= 4 letters" condition
    filters anything. Confirm whether non-letter characters were meant to be
    counted as well.
    """
    for enc in [*_ENCODINGS, "latin-1"]:
        # Fresh accumulators per encoding so that a decode error part-way
        # through the file cannot pollute the following attempts.
        chars: set[str] = set()
        count = 0
        try:
            with open(filepath, "r", encoding=enc) as f:
                for line in f:
                    word = line.strip().lower()
                    if not word:
                        continue
                    chars.update(c for c in word if c.isalpha())
                    count += 1
                    if count >= sample:
                        break
        except UnicodeError:
            # Wrong encoding for this file (UnicodeDecodeError is a subclass).
            continue

        letter_count = sum(
            1 for c in chars
            if unicodedata.category(c).startswith("L")
        )
        # letter_count >= 4 also guarantees chars is non-empty before dividing.
        if letter_count >= 4 and letter_count / len(chars) >= 0.9:
            return chars
    return set()
153
+
154
+
155
+ def load_from_file(
156
+ filepath: str, min_len: int = 1, max_len: int = 33,
157
+ allowed_chars: set[str] | None = None,
158
+ ) -> set[str]:
159
+ """Load words from a text file. Tries utf-8, cp1251, koi8-r, cp866, latin-1."""
160
+ if not os.path.isfile(filepath):
161
+ raise FileNotFoundError(f"Dictionary file not found: {filepath}")
162
+ if min_len > max_len:
163
+ raise ValueError(f"min_len ({min_len}) > max_len ({max_len})")
164
+
165
+ words: set[str] = set()
166
+ encodings = _ENCODINGS
167
+ for enc in encodings:
168
+ try:
169
+ with open(filepath, "r", encoding=enc) as f:
170
+ for line in f:
171
+ word = line.strip().lower()
172
+ if min_len <= len(word) <= max_len and is_valid_word(word, allowed_chars):
173
+ words.add(word)
174
+ if not words:
175
+ raise ValueError(f"No valid words found in {filepath}")
176
+ return words
177
+ except (UnicodeDecodeError, UnicodeError):
178
+ words.clear()
179
+ continue
180
+
181
+ if not words:
182
+ with open(filepath, "r", encoding="latin-1") as f:
183
+ for line in f:
184
+ word = line.strip().lower()
185
+ if min_len <= len(word) <= max_len and is_valid_word(word, allowed_chars):
186
+ words.add(word)
187
+
188
+ if not words:
189
+ raise ValueError(f"No valid words found in {filepath}")
190
+ return words
191
+
192
+
193
+ def load_dictionary(
194
+ filepath: str | None = None,
195
+ use_dawg: bool = True,
196
+ min_len: int = 1,
197
+ max_len: int = 33,
198
+ quiet: bool = False,
199
+ alphabet: tuple[str, ...] | None = None,
200
+ download: str | None = None,
201
+ ) -> set[str]:
202
+ """Load dictionary from DAWG and/or external file. Returns set of words.
203
+ alphabet: if provided, only words using characters from alphabet are kept.
204
+ download: 'english' or 'russian' — auto-download if no other source works.
205
+ """
206
+ allowed_chars = set(alphabet) if alphabet is not None else None
207
+ words: set[str] = set()
208
+
209
+ if use_dawg:
210
+ try:
211
+ if not quiet:
212
+ logger.info("Loading dictionary from DAWG (pymorphy2-dicts-ru)...")
213
+ words.update(load_from_dawg(min_len, max_len, allowed_chars))
214
+ if not quiet:
215
+ logger.info(" dawg: %d words", len(words))
216
+ except (ImportError, RuntimeError, FileNotFoundError) as e:
217
+ if not quiet:
218
+ logger.info(" DAWG not available: %s", e)
219
+
220
+ if filepath:
221
+ if not quiet:
222
+ logger.info("Loading dictionary from %s...", filepath)
223
+ try:
224
+ file_words = load_from_file(filepath, min_len, max_len, allowed_chars)
225
+ if not quiet:
226
+ logger.info(" file: %d words", len(file_words))
227
+ words.update(file_words)
228
+ except (ValueError, FileNotFoundError) as e:
229
+ if not quiet:
230
+ logger.info(" file load failed: %s", e)
231
+
232
+ if not words and download:
233
+ cached = download_dictionary(download)
234
+ if not quiet:
235
+ logger.info("Loading downloaded dictionary...")
236
+ dl_words = load_from_file(cached, min_len, max_len, allowed_chars)
237
+ if not quiet:
238
+ logger.info(" downloaded: %d words", len(dl_words))
239
+ words.update(dl_words)
240
+
241
+ if not words:
242
+ raise ValueError(
243
+ "No dictionary words loaded. Provide --dict-file, install pymorphy2-dicts-ru, "
244
+ "or use --download english/russian"
245
+ )
246
+
247
+ if not quiet:
248
+ logger.info(" total unique: %d words", len(words))
249
+ return words