glitchlings-0.3.0-cp312-cp312-macosx_11_0_universal2.whl → glitchlings-0.4.0-cp312-cp312-macosx_11_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +4 -0
- glitchlings/_zoo_rust.cpython-312-darwin.so +0 -0
- glitchlings/config.py +258 -0
- glitchlings/config.toml +3 -0
- glitchlings/lexicon/__init__.py +191 -0
- glitchlings/lexicon/data/default_vector_cache.json +16 -0
- glitchlings/lexicon/graph.py +303 -0
- glitchlings/lexicon/metrics.py +169 -0
- glitchlings/lexicon/vector.py +610 -0
- glitchlings/lexicon/wordnet.py +182 -0
- glitchlings/main.py +145 -5
- glitchlings/zoo/__init__.py +15 -0
- glitchlings/zoo/_sampling.py +55 -0
- glitchlings/zoo/_text_utils.py +62 -0
- glitchlings/zoo/jargoyle.py +190 -200
- glitchlings/zoo/redactyl.py +26 -54
- glitchlings/zoo/reduple.py +10 -21
- glitchlings/zoo/rushmore.py +15 -21
- glitchlings/zoo/typogre.py +22 -1
- glitchlings/zoo/zeedub.py +40 -1
- {glitchlings-0.3.0.dist-info → glitchlings-0.4.0.dist-info}/METADATA +30 -8
- glitchlings-0.4.0.dist-info/RECORD +38 -0
- glitchlings-0.3.0.dist-info/RECORD +0 -29
- {glitchlings-0.3.0.dist-info → glitchlings-0.4.0.dist-info}/WHEEL +0 -0
- {glitchlings-0.3.0.dist-info → glitchlings-0.4.0.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.3.0.dist-info → glitchlings-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.3.0.dist-info → glitchlings-0.4.0.dist-info}/top_level.txt +0 -0
glitchlings/lexicon/wordnet.py
NEW

@@ -0,0 +1,182 @@
+"""WordNet-backed lexicon implementation."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+try:  # pragma: no cover - exercised when NLTK is available
+    import nltk  # type: ignore[import]
+except ModuleNotFoundError as exc:  # pragma: no cover - triggered when NLTK missing
+    nltk = None  # type: ignore[assignment]
+    find = None  # type: ignore[assignment]
+    _NLTK_IMPORT_ERROR = exc
+else:  # pragma: no cover - executed when NLTK is present
+    from nltk.corpus.reader import WordNetCorpusReader as _WordNetCorpusReader  # type: ignore[import]
+    from nltk.data import find as _nltk_find  # type: ignore[import]
+
+    find = _nltk_find
+    _NLTK_IMPORT_ERROR = None
+
+if TYPE_CHECKING:  # pragma: no cover - typing aid only
+    from nltk.corpus.reader import WordNetCorpusReader  # type: ignore[import]
+else:  # pragma: no cover - runtime fallback to avoid hard dependency
+    WordNetCorpusReader = Any
+
+if nltk is not None:  # pragma: no cover - guarded by import success
+    try:
+        from nltk.corpus import wordnet as _WORDNET_MODULE  # type: ignore[import]
+    except ModuleNotFoundError:  # pragma: no cover - only hit on namespace packages
+        _WORDNET_MODULE = None
+    else:
+        WordNetCorpusReader = _WordNetCorpusReader  # type: ignore[assignment]
+else:
+    _WORDNET_MODULE = None
+
+from . import Lexicon
+
+_WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
+_wordnet_ready = False
+
+_VALID_POS: tuple[str, ...] = ("n", "v", "a", "r")
+
+
+def _require_nltk() -> None:
+    """Ensure the NLTK dependency is present before continuing."""
+
+    if nltk is None or find is None:
+        message = (
+            "The NLTK package is required for WordNet-backed lexicons; install "
+            "`nltk` and its WordNet corpus manually to enable this backend."
+        )
+        if '_NLTK_IMPORT_ERROR' in globals() and _NLTK_IMPORT_ERROR is not None:
+            raise RuntimeError(message) from _NLTK_IMPORT_ERROR
+        raise RuntimeError(message)
+
+
+def dependencies_available() -> bool:
+    """Return ``True`` when the runtime NLTK dependency is present."""
+
+    return nltk is not None and find is not None
+
+
+def _load_wordnet_reader() -> WordNetCorpusReader:
+    """Return a WordNet corpus reader from the downloaded corpus files."""
+
+    _require_nltk()
+
+    try:
+        root = find("corpora/wordnet")
+    except LookupError:
+        try:
+            zip_root = find("corpora/wordnet.zip")
+        except LookupError as exc:
+            raise RuntimeError(
+                "The NLTK WordNet corpus is not installed; run `nltk.download('wordnet')`."
+            ) from exc
+        root = zip_root.join("wordnet/")
+
+    return WordNetCorpusReader(root, None)
+
+
+def _wordnet(force_refresh: bool = False) -> WordNetCorpusReader | Any:
+    """Retrieve the active WordNet handle, rebuilding it on demand."""
+
+    global _WORDNET_HANDLE
+
+    if force_refresh:
+        _WORDNET_HANDLE = _WORDNET_MODULE
+
+    if _WORDNET_HANDLE is not None:
+        return _WORDNET_HANDLE
+
+    _WORDNET_HANDLE = _load_wordnet_reader()
+    return _WORDNET_HANDLE
+
+
+def ensure_wordnet() -> None:
+    """Ensure the WordNet corpus is available before use."""
+
+    global _wordnet_ready
+    if _wordnet_ready:
+        return
+
+    _require_nltk()
+
+    resource = _wordnet()
+
+    try:
+        resource.ensure_loaded()
+    except LookupError:
+        nltk.download("wordnet", quiet=True)
+        try:
+            resource = _wordnet(force_refresh=True)
+            resource.ensure_loaded()
+        except LookupError as exc:  # pragma: no cover - only triggered when download fails
+            raise RuntimeError(
+                "Unable to load NLTK WordNet corpus for synonym lookups."
+            ) from exc
+
+    _wordnet_ready = True
+
+
+def _collect_synonyms(word: str, parts_of_speech: tuple[str, ...]) -> list[str]:
+    """Gather deterministic synonym candidates for the supplied word."""
+
+    normalized_word = word.lower()
+    wordnet = _wordnet()
+    synonyms: set[str] = set()
+    for pos_tag in parts_of_speech:
+        synsets = wordnet.synsets(word, pos=pos_tag)
+        if not synsets:
+            continue
+
+        for synset in synsets:
+            lemmas_list = [lemma.name() for lemma in synset.lemmas()]
+            if not lemmas_list:
+                continue
+
+            filtered = []
+            for lemma_str in lemmas_list:
+                cleaned = lemma_str.replace("_", " ")
+                if cleaned.lower() != normalized_word:
+                    filtered.append(cleaned)
+
+            if filtered:
+                synonyms.update(filtered)
+                break
+
+        if synonyms:
+            break
+
+    return sorted(synonyms)
+
+
+class WordNetLexicon(Lexicon):
+    """Lexicon that retrieves synonyms from the NLTK WordNet corpus."""
+
+    def get_synonyms(
+        self, word: str, pos: str | None = None, n: int = 5
+    ) -> list[str]:
+        ensure_wordnet()
+
+        if pos is None:
+            parts: tuple[str, ...] = _VALID_POS
+        else:
+            normalized_pos = pos.lower()
+            if normalized_pos not in _VALID_POS:
+                return []
+            parts = (normalized_pos,)
+
+        synonyms = _collect_synonyms(word, parts)
+        return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
+
+    def supports_pos(self, pos: str | None) -> bool:
+        if pos is None:
+            return True
+        return pos.lower() in _VALID_POS
+
+    def __repr__(self) -> str:  # pragma: no cover - trivial representation
+        return f"WordNetLexicon(seed={self.seed!r})"
+
+
+__all__ = ["WordNetLexicon", "dependencies_available", "ensure_wordnet"]
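For orientation, a minimal usage sketch of the new backend (not part of the diff). It assumes the `Lexicon` base class from `glitchlings/lexicon/__init__.py`, which is not shown here, accepts a `seed` argument, as the `__repr__` above suggests, and that `nltk` plus its WordNet corpus are installed.

from glitchlings.lexicon.wordnet import WordNetLexicon, dependencies_available

if dependencies_available():
    lexicon = WordNetLexicon(seed=151)  # assumed constructor; the base class is not in this diff
    # Valid POS tags are "n", "v", "a", and "r" (_VALID_POS); anything else returns [].
    print(lexicon.get_synonyms("quick", pos="a", n=3))
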
glitchlings/main.py
CHANGED
@@ -8,6 +8,7 @@ from pathlib import Path
 import sys
 
 from . import SAMPLE_TEXT
+from .config import DEFAULT_ATTACK_SEED, build_gaggle, load_attack_config
 from .zoo import (
     Glitchling,
     Gaggle,
@@ -53,7 +54,7 @@ def build_parser() -> argparse.ArgumentParser:
         "-s",
         "--seed",
         type=int,
-        default=151,
+        default=None,
         help="Seed controlling deterministic corruption order (default: 151).",
     )
     parser.add_argument(
@@ -77,9 +78,83 @@ def build_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="List available glitchlings and exit.",
     )
+    parser.add_argument(
+        "-c",
+        "--config",
+        type=Path,
+        help="Load glitchlings from a YAML configuration file.",
+    )
     return parser
 
 
+def build_lexicon_parser() -> argparse.ArgumentParser:
+    builder = argparse.ArgumentParser(
+        prog="glitchlings build-lexicon",
+        description=(
+            "Generate deterministic synonym caches using vector embeddings so "
+            "they can be distributed without bundling large models."
+        ),
+    )
+    builder.add_argument(
+        "--source",
+        required=True,
+        help=(
+            "Vector source specification. Use 'spacy:<model>' for spaCy pipelines "
+            "or provide a path to a gensim KeyedVectors/word2vec file."
+        ),
+    )
+    builder.add_argument(
+        "--output",
+        required=True,
+        type=Path,
+        help="Path to the JSON file that will receive the synonym cache.",
+    )
+    builder.add_argument(
+        "--tokens",
+        type=Path,
+        help="Optional newline-delimited vocabulary file to restrict generation.",
+    )
+    builder.add_argument(
+        "--max-neighbors",
+        type=int,
+        default=50,
+        help="Number of nearest neighbours to cache per token (default: 50).",
+    )
+    builder.add_argument(
+        "--min-similarity",
+        type=float,
+        default=0.0,
+        help="Minimum cosine similarity required to keep a synonym (default: 0.0).",
+    )
+    builder.add_argument(
+        "--seed",
+        type=int,
+        help="Optional deterministic seed to bake into the resulting cache.",
+    )
+    builder.add_argument(
+        "--case-sensitive",
+        action="store_true",
+        help="Preserve original casing instead of lower-casing cache keys.",
+    )
+    builder.add_argument(
+        "--normalizer",
+        choices=["lower", "identity"],
+        default="lower",
+        help="Token normalization strategy for cache keys (default: lower).",
+    )
+    builder.add_argument(
+        "--limit",
+        type=int,
+        help="Optional maximum number of tokens to process.",
+    )
+    builder.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Allow overwriting an existing cache file.",
+    )
+    return builder
+
+
 def list_glitchlings() -> None:
     """Print information about the available built-in glitchlings."""
 
@@ -129,10 +204,27 @@ def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
 
 
 def summon_glitchlings(
-    names: list[str] | None,
+    names: list[str] | None,
+    parser: argparse.ArgumentParser,
+    seed: int | None,
+    *,
+    config_path: Path | None = None,
 ) -> Gaggle:
     """Instantiate the requested glitchlings and bundle them in a ``Gaggle``."""
 
+    if config_path is not None:
+        if names:
+            parser.error("Cannot combine --config with --glitchling.")
+            raise AssertionError("parser.error should exit")
+
+        try:
+            config = load_attack_config(config_path)
+        except (TypeError, ValueError) as exc:
+            parser.error(str(exc))
+            raise AssertionError("parser.error should exit")
+
+        return build_gaggle(config, seed_override=seed)
+
     if names:
         normalized: list[str | Glitchling] = []
         for specification in names:
@@ -144,8 +236,10 @@ def summon_glitchlings(
     else:
        normalized = DEFAULT_GLITCHLING_NAMES
 
+    effective_seed = seed if seed is not None else DEFAULT_ATTACK_SEED
+
     try:
-        return summon(normalized, seed=seed)
+        return summon(normalized, seed=effective_seed)
     except ValueError as exc:
         parser.error(str(exc))
         raise AssertionError("parser.error should exit")
@@ -187,7 +281,12 @@ def run_cli(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
         return 0
 
     text = read_text(args, parser)
-    gaggle = summon_glitchlings(args.glitchlings, parser, args.seed)
+    gaggle = summon_glitchlings(
+        args.glitchlings,
+        parser,
+        args.seed,
+        config_path=args.config,
+    )
 
     corrupted = gaggle(text)
 
@@ -199,6 +298,37 @@ def run_cli(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
     return 0
 
 
+def run_build_lexicon(args: argparse.Namespace) -> int:
+    """Delegate to the vector lexicon cache builder using CLI arguments."""
+
+    from glitchlings.lexicon.vector import main as vector_main
+
+    vector_args = [
+        "--source",
+        args.source,
+        "--output",
+        str(args.output),
+        "--max-neighbors",
+        str(args.max_neighbors),
+        "--min-similarity",
+        str(args.min_similarity),
+        "--normalizer",
+        args.normalizer,
+    ]
+    if args.tokens is not None:
+        vector_args.extend(["--tokens", str(args.tokens)])
+    if args.seed is not None:
+        vector_args.extend(["--seed", str(args.seed)])
+    if args.case_sensitive:
+        vector_args.append("--case-sensitive")
+    if args.limit is not None:
+        vector_args.extend(["--limit", str(args.limit)])
+    if args.overwrite:
+        vector_args.append("--overwrite")
+
+    return vector_main(vector_args)
+
+
 def main(argv: list[str] | None = None) -> int:
     """Entry point for the ``glitchlings`` command line interface.
 
@@ -209,8 +339,18 @@ def main(argv: list[str] | None = None) -> int:
         int: Exit code suitable for use with ``sys.exit``.
     """
 
+    if argv is None:
+        raw_args = sys.argv[1:]
+    else:
+        raw_args = list(argv)
+
+    if raw_args and raw_args[0] == "build-lexicon":
+        builder = build_lexicon_parser()
+        args = builder.parse_args(raw_args[1:])
+        return run_build_lexicon(args)
+
     parser = build_parser()
-    args = parser.parse_args(argv)
+    args = parser.parse_args(raw_args)
    return run_cli(args, parser)
 
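A minimal sketch of the new subcommand dispatch (not part of the diff), driving `main` with an explicit argv list. The spaCy model name and output path are placeholder values, and the optional dependencies of `glitchlings.lexicon.vector` must be installed.

import sys

from glitchlings.main import main

sys.exit(main([
    "build-lexicon",
    "--source", "spacy:en_core_web_md",  # placeholder model name
    "--output", "synonym_cache.json",  # placeholder output path
    "--max-neighbors", "25",
    "--overwrite",
]))
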
glitchlings/zoo/__init__.py
CHANGED
@@ -39,6 +39,7 @@ __all__ = [
     "BUILTIN_GLITCHLINGS",
     "DEFAULT_GLITCHLING_NAMES",
     "parse_glitchling_spec",
+    "get_glitchling_class",
 ]
 
 _HAS_JARGOYLE = _jargoyle_available()
@@ -125,6 +126,20 @@ def parse_glitchling_spec(specification: str) -> Glitchling:
         raise ValueError(f"Failed to instantiate glitchling '{name}': {exc}") from exc
 
 
+def get_glitchling_class(name: str) -> type[Glitchling]:
+    """Look up the glitchling class registered under ``name``."""
+
+    key = name.strip().lower()
+    if not key:
+        raise ValueError("Glitchling name cannot be empty.")
+
+    glitchling_type = _BUILTIN_GLITCHLING_TYPES.get(key)
+    if glitchling_type is None:
+        raise ValueError(f"Glitchling '{name}' not found.")
+
+    return glitchling_type
+
+
 def summon(glitchlings: list[str | Glitchling], seed: int = 151) -> Gaggle:
     """Summon glitchlings by name (using defaults) or instance (to change parameters)."""
 
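A short sketch of the new lookup helper (not part of the diff). The name "typogre" is an assumption based on the module list at the top of this page, and default construction is assumed to work because `summon` accepts bare names.

from glitchlings.zoo import get_glitchling_class

cls = get_glitchling_class(" Typogre ")  # leading/trailing whitespace and case are normalised
glitchling = cls()  # assumption: built-ins construct with default parameters
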
glitchlings/zoo/_sampling.py
NEW

@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+import random
+from typing import Sequence
+
+
+def weighted_sample_without_replacement(
+    population: Sequence[int],
+    weights: Sequence[float],
+    *,
+    k: int,
+    rng: random.Random,
+) -> list[int]:
+    """Sample ``k`` unique indices from ``population`` using ``weights``.
+
+    Mirrors the behaviour used by several glitchlings while centralising error
+    handling and RNG interactions so the Python and Rust implementations remain
+    aligned.
+    """
+
+    if k < 0:
+        raise ValueError("Sample size cannot be negative")
+
+    if len(population) != len(weights):
+        raise ValueError("Population and weight sequences must be the same length")
+
+    items = list(zip(population, weights))
+    count = len(items)
+    if k == 0 or count == 0:
+        return []
+
+    if k > count:
+        raise ValueError("Sample larger than population or is negative")
+
+    selections: list[int] = []
+    for _ in range(k):
+        total_weight = sum(weight for _, weight in items)
+        if total_weight <= 0.0:
+            chosen_index = rng.randrange(len(items))
+        else:
+            threshold = rng.random() * total_weight
+            cumulative = 0.0
+            chosen_index = len(items) - 1
+            for idx, (_, weight) in enumerate(items):
+                cumulative += weight
+                if cumulative >= threshold:
+                    chosen_index = idx
+                    break
+        value, _ = items.pop(chosen_index)
+        selections.append(value)
+
+    return selections
+
+
+__all__ = ["weighted_sample_without_replacement"]
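A deterministic usage sketch (not part of the diff): a fixed seed reproduces the same selection order, and a non-positive weight total falls back to a uniform pick.

import random

from glitchlings.zoo._sampling import weighted_sample_without_replacement

rng = random.Random(151)
picked = weighted_sample_without_replacement(
    [0, 1, 2, 3],  # population of indices
    [0.1, 0.4, 0.4, 0.1],  # per-index selection weights
    k=2,
    rng=rng,
)
print(picked)  # two distinct indices, biased toward 1 and 2
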
glitchlings/zoo/_text_utils.py
CHANGED
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import re
+from dataclasses import dataclass
+from typing import Sequence
 
 _WORD_SPLIT_PATTERN = re.compile(r"(\s+)")
 _TOKEN_EDGES_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$")
@@ -35,8 +37,68 @@ def token_core_length(token: str) -> int:
     return length
 
 
+@dataclass(frozen=True)
+class WordToken:
+    """Metadata describing a non-whitespace token yielded by word splitters."""
+
+    index: int
+    prefix: str
+    core: str
+    suffix: str
+    core_length: int
+
+    @property
+    def has_core(self) -> bool:
+        """Return ``True`` when the token contains at least one core character."""
+
+        return bool(self.core)
+
+
+def collect_word_tokens(
+    tokens: Sequence[str],
+    *,
+    skip_first_word: bool = False,
+) -> list[WordToken]:
+    """Return structured metadata for non-whitespace tokens within ``tokens``.
+
+    Args:
+        tokens: Token sequence produced by :func:`split_preserving_whitespace`.
+        skip_first_word: Exclude the first candidate token (used by Rushmore to
+            preserve leading words).
+    """
+
+    start = 2 if skip_first_word else 0
+    collected: list[WordToken] = []
+    for index in range(start, len(tokens), 2):
+        token = tokens[index]
+        if not token or token.isspace():
+            continue
+
+        prefix, core, suffix = split_token_edges(token)
+        core_length = len(core)
+        if core_length <= 0:
+            stripped = token.strip()
+            core_length = len(stripped) if stripped else len(token)
+            if core_length <= 0:
+                core_length = 1
+
+        collected.append(
+            WordToken(
+                index=index,
+                prefix=prefix,
+                core=core,
+                suffix=suffix,
+                core_length=core_length,
+            )
+        )
+
+    return collected
+
+
 __all__ = [
     "split_preserving_whitespace",
     "split_token_edges",
     "token_core_length",
+    "WordToken",
+    "collect_word_tokens",
 ]
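A sketch of the new helper (not part of the diff). It assumes `split_preserving_whitespace` returns the capturing `re.split` of its input, as `_WORD_SPLIT_PATTERN` suggests, which is why `collect_word_tokens` visits only even indices: words and whitespace alternate in the token list.

from glitchlings.zoo._text_utils import collect_word_tokens, split_preserving_whitespace

tokens = split_preserving_whitespace("Hello, glitchling world!")
for word in collect_word_tokens(tokens, skip_first_word=True):
    # skip_first_word=True leaves "Hello," untouched, mirroring Rushmore.
    print(word.index, word.prefix, word.core, word.suffix, word.core_length)
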