glitchlings 0.4.4__cp310-cp310-macosx_11_0_universal2.whl → 0.5.0__cp310-cp310-macosx_11_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic. Click here for more details.
- glitchlings/__init__.py +4 -0
- glitchlings/_zoo_rust.cpython-310-darwin.so +0 -0
- glitchlings/compat.py +2 -4
- glitchlings/config.py +14 -28
- glitchlings/dev/__init__.py +5 -0
- glitchlings/dev/sync_assets.py +153 -0
- glitchlings/dlc/_shared.py +6 -6
- glitchlings/dlc/huggingface.py +6 -6
- glitchlings/dlc/prime.py +1 -1
- glitchlings/dlc/pytorch.py +3 -3
- glitchlings/dlc/pytorch_lightning.py +4 -10
- glitchlings/lexicon/_cache.py +3 -5
- glitchlings/lexicon/vector.py +6 -5
- glitchlings/lexicon/wordnet.py +4 -8
- glitchlings/util/hokey_generator.py +144 -0
- glitchlings/util/stretch_locator.py +140 -0
- glitchlings/util/stretchability.py +370 -0
- glitchlings/zoo/__init__.py +5 -1
- glitchlings/zoo/_ocr_confusions.py +3 -3
- glitchlings/zoo/_text_utils.py +10 -9
- glitchlings/zoo/adjax.py +3 -18
- glitchlings/zoo/apostrofae.py +2 -5
- glitchlings/zoo/assets/__init__.py +54 -0
- glitchlings/zoo/assets/hokey_assets.json +193 -0
- glitchlings/zoo/hokey.py +173 -0
- glitchlings/zoo/jargoyle.py +2 -16
- glitchlings/zoo/mim1c.py +2 -17
- glitchlings/zoo/redactyl.py +3 -17
- glitchlings/zoo/reduple.py +3 -17
- glitchlings/zoo/rushmore.py +3 -20
- glitchlings/zoo/scannequin.py +3 -20
- glitchlings/zoo/typogre.py +2 -19
- glitchlings/zoo/zeedub.py +2 -13
- {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/METADATA +29 -6
- glitchlings-0.5.0.dist-info/RECORD +53 -0
- glitchlings/zoo/_rate.py +0 -131
- glitchlings-0.4.4.dist-info/RECORD +0 -47
- /glitchlings/zoo/{ocr_confusions.tsv → assets/ocr_confusions.tsv} +0 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/WHEEL +0 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/top_level.txt +0 -0
glitchlings/__init__.py
CHANGED
|
@@ -5,6 +5,7 @@ from .zoo import (
|
|
|
5
5
|
Apostrofae,
|
|
6
6
|
Gaggle,
|
|
7
7
|
Glitchling,
|
|
8
|
+
Hokey,
|
|
8
9
|
Jargoyle,
|
|
9
10
|
Mim1c,
|
|
10
11
|
Redactyl,
|
|
@@ -15,6 +16,7 @@ from .zoo import (
|
|
|
15
16
|
Zeedub,
|
|
16
17
|
adjax,
|
|
17
18
|
apostrofae,
|
|
19
|
+
hokey,
|
|
18
20
|
is_rust_pipeline_enabled,
|
|
19
21
|
is_rust_pipeline_supported,
|
|
20
22
|
jargoyle,
|
|
@@ -42,6 +44,8 @@ __all__ = [
|
|
|
42
44
|
"adjax",
|
|
43
45
|
"Apostrofae",
|
|
44
46
|
"apostrofae",
|
|
47
|
+
"Hokey",
|
|
48
|
+
"hokey",
|
|
45
49
|
"Redactyl",
|
|
46
50
|
"redactyl",
|
|
47
51
|
"Reduple",
|
|
Binary file
|
glitchlings/compat.py
CHANGED
|
@@ -17,16 +17,14 @@ _MISSING = _MissingSentinel()
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
class _MarkerProtocol(Protocol):
|
|
20
|
-
def evaluate(self, environment: dict[str, str]) -> bool:
|
|
21
|
-
...
|
|
20
|
+
def evaluate(self, environment: dict[str, str]) -> bool: ...
|
|
22
21
|
|
|
23
22
|
|
|
24
23
|
class _RequirementProtocol(Protocol):
|
|
25
24
|
marker: _MarkerProtocol | None
|
|
26
25
|
name: str
|
|
27
26
|
|
|
28
|
-
def __init__(self, requirement: str) -> None:
|
|
29
|
-
...
|
|
27
|
+
def __init__(self, requirement: str) -> None: ...
|
|
30
28
|
|
|
31
29
|
|
|
32
30
|
try: # pragma: no cover - packaging is bundled with modern Python environments
|
glitchlings/config.py
CHANGED
|
@@ -4,7 +4,6 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
import importlib
|
|
6
6
|
import os
|
|
7
|
-
import warnings
|
|
8
7
|
from dataclasses import dataclass, field
|
|
9
8
|
from io import TextIOBase
|
|
10
9
|
from pathlib import Path
|
|
@@ -19,8 +18,7 @@ except ModuleNotFoundError: # pragma: no cover - Python < 3.11
|
|
|
19
18
|
|
|
20
19
|
|
|
21
20
|
class _TomllibModule(Protocol):
|
|
22
|
-
def load(self, fp: IO[bytes]) -> Any:
|
|
23
|
-
...
|
|
21
|
+
def load(self, fp: IO[bytes]) -> Any: ...
|
|
24
22
|
|
|
25
23
|
|
|
26
24
|
tomllib = cast(_TomllibModule, _tomllib)
|
|
@@ -29,8 +27,7 @@ tomllib = cast(_TomllibModule, _tomllib)
|
|
|
29
27
|
class _YamlModule(Protocol):
|
|
30
28
|
YAMLError: type[Exception]
|
|
31
29
|
|
|
32
|
-
def safe_load(self, stream: str) -> Any:
|
|
33
|
-
...
|
|
30
|
+
def safe_load(self, stream: str) -> Any: ...
|
|
34
31
|
|
|
35
32
|
|
|
36
33
|
yaml = cast(_YamlModule, importlib.import_module("yaml"))
|
|
@@ -59,17 +56,6 @@ ATTACK_CONFIG_SCHEMA: dict[str, Any] = {
|
|
|
59
56
|
"required": ["name"],
|
|
60
57
|
"properties": {
|
|
61
58
|
"name": {"type": "string", "minLength": 1},
|
|
62
|
-
"type": {"type": "string", "minLength": 1},
|
|
63
|
-
"parameters": {"type": "object"},
|
|
64
|
-
},
|
|
65
|
-
"additionalProperties": True,
|
|
66
|
-
},
|
|
67
|
-
{
|
|
68
|
-
"type": "object",
|
|
69
|
-
"required": ["type"],
|
|
70
|
-
"properties": {
|
|
71
|
-
"name": {"type": "string", "minLength": 1},
|
|
72
|
-
"type": {"type": "string", "minLength": 1},
|
|
73
59
|
"parameters": {"type": "object"},
|
|
74
60
|
},
|
|
75
61
|
"additionalProperties": True,
|
|
@@ -265,7 +251,12 @@ def _validate_attack_config_schema(data: Any, *, source: str) -> Mapping[str, An
|
|
|
265
251
|
|
|
266
252
|
for index, entry in enumerate(raw_glitchlings, start=1):
|
|
267
253
|
if isinstance(entry, Mapping):
|
|
268
|
-
|
|
254
|
+
if "type" in entry:
|
|
255
|
+
raise ValueError(
|
|
256
|
+
f"{source}: glitchling #{index} uses unsupported 'type'; use 'name'."
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
name_candidate = entry.get("name")
|
|
269
260
|
if not isinstance(name_candidate, str) or not name_candidate.strip():
|
|
270
261
|
raise ValueError(f"{source}: glitchling #{index} is missing a 'name'.")
|
|
271
262
|
parameters = entry.get("parameters")
|
|
@@ -328,17 +319,12 @@ def _build_glitchling(entry: Any, source: str, index: int) -> "Glitchling":
|
|
|
328
319
|
raise ValueError(f"{source}: glitchling #{index}: {exc}") from exc
|
|
329
320
|
|
|
330
321
|
if isinstance(entry, Mapping):
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
warnings.warn(
|
|
335
|
-
f"{source}: glitchling #{index} uses 'type'; prefer 'name'.",
|
|
336
|
-
DeprecationWarning,
|
|
337
|
-
stacklevel=2,
|
|
322
|
+
if "type" in entry:
|
|
323
|
+
raise ValueError(
|
|
324
|
+
f"{source}: glitchling #{index} uses unsupported 'type'; use 'name'."
|
|
338
325
|
)
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
name_value = legacy_type
|
|
326
|
+
|
|
327
|
+
name_value = entry.get("name")
|
|
342
328
|
|
|
343
329
|
if not isinstance(name_value, str) or not name_value.strip():
|
|
344
330
|
raise ValueError(f"{source}: glitchling #{index} is missing a 'name'.")
|
|
@@ -354,7 +340,7 @@ def _build_glitchling(entry: Any, source: str, index: int) -> "Glitchling":
|
|
|
354
340
|
kwargs = {
|
|
355
341
|
key: value
|
|
356
342
|
for key, value in entry.items()
|
|
357
|
-
if key not in {"name", "
|
|
343
|
+
if key not in {"name", "parameters"}
|
|
358
344
|
}
|
|
359
345
|
|
|
360
346
|
try:
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""Synchronise canonical glitchling assets with the vendored Rust copies."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import shutil
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Iterator, Sequence
|
|
10
|
+
|
|
11
|
+
RUST_VENDORED_ASSETS: frozenset[str] = frozenset({
|
|
12
|
+
"hokey_assets.json",
|
|
13
|
+
"ocr_confusions.tsv",
|
|
14
|
+
})
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _project_root(default: Path | None = None) -> Path:
|
|
18
|
+
if default is not None:
|
|
19
|
+
return default
|
|
20
|
+
return Path(__file__).resolve().parents[3]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _canonical_asset_dir(project_root: Path) -> Path:
|
|
24
|
+
canonical = project_root / "src" / "glitchlings" / "zoo" / "assets"
|
|
25
|
+
if not canonical.is_dir():
|
|
26
|
+
raise RuntimeError(
|
|
27
|
+
"expected canonical assets under 'src/glitchlings/zoo/assets'; "
|
|
28
|
+
"run this command from the repository root"
|
|
29
|
+
)
|
|
30
|
+
return canonical
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _rust_asset_dir(project_root: Path) -> Path:
|
|
34
|
+
return project_root / "rust" / "zoo" / "assets"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _iter_extraneous_assets(rust_dir: Path) -> Iterator[Path]:
    """Yield files in ``rust_dir`` that are not listed in ``RUST_VENDORED_ASSETS``.

    Args:
        rust_dir: Directory holding the vendored asset copies.

    Yields:
        Each direct child of ``rust_dir`` that is a regular file and whose
        name is not in ``RUST_VENDORED_ASSETS``. Nothing is yielded when
        ``rust_dir`` does not exist.
    """
    if rust_dir.exists():
        for entry in rust_dir.iterdir():
            # Skip non-files and the assets that are meant to be vendored.
            if not entry.is_file() or entry.name in RUST_VENDORED_ASSETS:
                continue
            yield entry
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def sync_assets(
    project_root: Path | None = None,
    *,
    check: bool = False,
    quiet: bool = False,
) -> bool:
    """Synchronise the vendored Rust asset copies with the canonical sources.

    Args:
        project_root: Root directory to operate on; resolved through
            ``_project_root`` when ``None``.
        check: When true, only report divergence; nothing is copied or removed.
        quiet: When true, suppress all status output.

    Returns:
        In check mode, ``True`` when every vendored asset matches its canonical
        copy and no extraneous files exist, otherwise ``False``. In sync mode,
        always ``True`` once copying and pruning have finished.

    Raises:
        RuntimeError: If the canonical asset directory is absent or any asset
            named in ``RUST_VENDORED_ASSETS`` is missing from it.
    """
    root = _project_root(project_root)
    canonical_dir = _canonical_asset_dir(root)
    rust_dir = _rust_asset_dir(root)
    verbose = not quiet

    # Every vendored asset must have a canonical source before anything else.
    absent = sorted(
        asset
        for asset in RUST_VENDORED_ASSETS
        if not canonical_dir.joinpath(asset).is_file()
    )
    if absent:
        missing_list = ", ".join(absent)
        raise RuntimeError(f"missing canonical assets: {missing_list}")

    extraneous = [*_iter_extraneous_assets(rust_dir)]

    # Classify each vendored copy as missing or byte-wise outdated.
    mismatched: list[tuple[str, str]] = []
    for asset in sorted(RUST_VENDORED_ASSETS):
        vendored = rust_dir / asset
        if not vendored.exists():
            mismatched.append((asset, "missing"))
        elif (canonical_dir / asset).read_bytes() != vendored.read_bytes():
            mismatched.append((asset, "outdated"))

    in_sync = not (mismatched or extraneous)

    if check:
        if in_sync:
            if verbose:
                print("Rust asset bundle is up to date.")
            return True
        if verbose:
            # Divergence reports go to stderr.
            for asset, reason in mismatched:
                stale = rust_dir / asset
                print(
                    f"{stale.relative_to(root)} is {reason}; run sync_assets to refresh it",
                    file=sys.stderr,
                )
            for stray in extraneous:
                print(
                    (
                        "unexpected vendored asset "
                        f"{stray.relative_to(root)}; run sync_assets to prune it"
                    ),
                    file=sys.stderr,
                )
        return False

    rust_dir.mkdir(parents=True, exist_ok=True)

    for asset, reason in mismatched:
        origin = canonical_dir / asset
        destination = rust_dir / asset
        shutil.copy2(origin, destination)
        if verbose:
            verb = "Updated" if reason != "missing" else "Copied"
            print(
                f"{verb} {origin.relative_to(root)} -> {destination.relative_to(root)}",
            )

    for stray in extraneous:
        stray.unlink()
        if verbose:
            print(f"Removed extraneous vendored asset {stray.relative_to(root)}")

    if in_sync and verbose:
        print("Rust asset bundle already aligned with canonical copies.")

    return True
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the asset synchronisation tool.

    Returns:
        A parser accepting the boolean flags ``--check`` and ``--quiet`` and
        the ``--project-root`` option, whose value is converted to a ``Path``.
    """
    parser = argparse.ArgumentParser(
        description="Synchronise canonical glitchling assets with the vendored Rust copies.",
    )

    # Both switches are plain store_true flags; register them in order.
    switches = (
        ("--check", "exit with a non-zero status when vendored assets diverge"),
        ("--quiet", "suppress status output"),
    )
    for option, help_text in switches:
        parser.add_argument(option, action="store_true", help=help_text)

    parser.add_argument(
        "--project-root",
        type=Path,
        help="override the detected project root (useful for testing)",
    )
    return parser
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Run the asset synchronisation command.

    Args:
        argv: Argument list handed to the parser; ``None`` lets ``argparse``
            fall back to its default.

    Returns:
        ``0`` when ``sync_assets`` reports success, ``1`` otherwise.
    """
    options = build_parser().parse_args(argv)
    succeeded = sync_assets(
        project_root=options.project_root,
        check=options.check,
        quiet=options.quiet,
    )
    if succeeded:
        return 0
    return 1
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
if __name__ == "__main__": # pragma: no cover - CLI entry point
|
|
153
|
+
raise SystemExit(main())
|
glitchlings/dlc/_shared.py
CHANGED
|
@@ -67,10 +67,10 @@ def resolve_columns(dataset: Any, columns: Sequence[str] | None) -> list[str]:
|
|
|
67
67
|
raise ValueError("Unable to determine which dataset columns to corrupt.")
|
|
68
68
|
|
|
69
69
|
|
|
70
|
-
def
|
|
70
|
+
def normalize_column_spec(
|
|
71
71
|
columns: str | int | Sequence[str | int] | None,
|
|
72
72
|
) -> list[str | int] | None:
|
|
73
|
-
"""
|
|
73
|
+
"""Normalize a column specification into a list of keys or indices.
|
|
74
74
|
|
|
75
75
|
Args:
|
|
76
76
|
columns: Column specification as a single value, sequence of values, or None.
|
|
@@ -87,10 +87,10 @@ def normalise_column_spec(
|
|
|
87
87
|
if isinstance(columns, (str, int)):
|
|
88
88
|
return [columns]
|
|
89
89
|
|
|
90
|
-
|
|
91
|
-
if not
|
|
90
|
+
normalized = list(columns)
|
|
91
|
+
if not normalized:
|
|
92
92
|
raise ValueError("At least one column must be specified")
|
|
93
|
-
return
|
|
93
|
+
return normalized
|
|
94
94
|
|
|
95
95
|
|
|
96
96
|
def is_textual_candidate(value: Any) -> bool:
|
|
@@ -147,7 +147,7 @@ def corrupt_text_value(value: Any, gaggle: Gaggle) -> Any:
|
|
|
147
147
|
__all__ = [
|
|
148
148
|
"corrupt_text_value",
|
|
149
149
|
"is_textual_candidate",
|
|
150
|
-
"
|
|
150
|
+
"normalize_column_spec",
|
|
151
151
|
"resolve_columns",
|
|
152
152
|
"resolve_environment",
|
|
153
153
|
]
|
glitchlings/dlc/huggingface.py
CHANGED
|
@@ -10,15 +10,15 @@ from ..util.adapters import coerce_gaggle
|
|
|
10
10
|
from ..zoo import Gaggle, Glitchling
|
|
11
11
|
|
|
12
12
|
|
|
13
|
-
def
|
|
14
|
-
"""
|
|
13
|
+
def _normalize_columns(column: str | Sequence[str]) -> list[str]:
|
|
14
|
+
"""Normalize a column specification to a list."""
|
|
15
15
|
if isinstance(column, str):
|
|
16
16
|
return [column]
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
if not
|
|
18
|
+
normalized = list(column)
|
|
19
|
+
if not normalized:
|
|
20
20
|
raise ValueError("At least one column must be specified")
|
|
21
|
-
return
|
|
21
|
+
return normalized
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
def _glitch_dataset(
|
|
@@ -29,7 +29,7 @@ def _glitch_dataset(
|
|
|
29
29
|
seed: int = 151,
|
|
30
30
|
) -> Any:
|
|
31
31
|
"""Apply glitchlings to the provided dataset columns."""
|
|
32
|
-
columns =
|
|
32
|
+
columns = _normalize_columns(column)
|
|
33
33
|
gaggle = coerce_gaggle(glitchlings, seed=seed)
|
|
34
34
|
return gaggle.corrupt_dataset(dataset, columns)
|
|
35
35
|
|
glitchlings/dlc/prime.py
CHANGED
|
@@ -117,7 +117,7 @@ def _as_gaggle(
|
|
|
117
117
|
|
|
118
118
|
|
|
119
119
|
def _extract_completion_text(completion: Any) -> str:
|
|
120
|
-
"""
|
|
120
|
+
"""Normalize a completion payload into a plain string."""
|
|
121
121
|
if isinstance(completion, str):
|
|
122
122
|
return completion
|
|
123
123
|
|
glitchlings/dlc/pytorch.py
CHANGED
|
@@ -9,7 +9,7 @@ from ..compat import get_torch_dataloader, require_torch
|
|
|
9
9
|
from ..compat import torch as _torch_dependency
|
|
10
10
|
from ..util.adapters import coerce_gaggle
|
|
11
11
|
from ..zoo import Gaggle, Glitchling
|
|
12
|
-
from ._shared import corrupt_text_value, is_textual_candidate,
|
|
12
|
+
from ._shared import corrupt_text_value, is_textual_candidate, normalize_column_spec
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
def _apply_to_batch(batch: Any, targets: list[str | int] | None, gaggle: Gaggle) -> Any:
|
|
@@ -134,8 +134,8 @@ def _ensure_dataloader_class() -> type[Any]:
|
|
|
134
134
|
) -> _GlitchedDataLoader:
|
|
135
135
|
"""Return a lazily glitched view of the loader's batches."""
|
|
136
136
|
gaggle = coerce_gaggle(glitchlings, seed=seed)
|
|
137
|
-
|
|
138
|
-
return _GlitchedDataLoader(self, gaggle, columns=
|
|
137
|
+
normalized = normalize_column_spec(columns)
|
|
138
|
+
return _GlitchedDataLoader(self, gaggle, columns=normalized)
|
|
139
139
|
|
|
140
140
|
setattr(dataloader_cls, "glitch", glitch)
|
|
141
141
|
|
|
@@ -8,7 +8,7 @@ from typing import Any, cast
|
|
|
8
8
|
from ..compat import get_pytorch_lightning_datamodule, require_pytorch_lightning
|
|
9
9
|
from ..util.adapters import coerce_gaggle
|
|
10
10
|
from ..zoo import Gaggle, Glitchling
|
|
11
|
-
from ._shared import corrupt_text_value,
|
|
11
|
+
from ._shared import corrupt_text_value, normalize_column_spec
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
def _glitch_batch(batch: Any, columns: list[str], gaggle: Gaggle) -> Any:
|
|
@@ -40,10 +40,7 @@ def _wrap_dataloader(dataloader: Any, columns: list[str], gaggle: Gaggle) -> Any
|
|
|
40
40
|
if isinstance(dataloader, Mapping):
|
|
41
41
|
mapping_type = cast(type[Any], dataloader.__class__)
|
|
42
42
|
return mapping_type(
|
|
43
|
-
{
|
|
44
|
-
key: _wrap_dataloader(value, columns, gaggle)
|
|
45
|
-
for key, value in dataloader.items()
|
|
46
|
-
}
|
|
43
|
+
{key: _wrap_dataloader(value, columns, gaggle) for key, value in dataloader.items()}
|
|
47
44
|
)
|
|
48
45
|
|
|
49
46
|
if isinstance(dataloader, list):
|
|
@@ -54,9 +51,7 @@ def _wrap_dataloader(dataloader: Any, columns: list[str], gaggle: Gaggle) -> Any
|
|
|
54
51
|
|
|
55
52
|
if isinstance(dataloader, Sequence) and not isinstance(dataloader, (str, bytes, bytearray)):
|
|
56
53
|
sequence_type = cast(type[Any], dataloader.__class__)
|
|
57
|
-
return sequence_type(
|
|
58
|
-
_wrap_dataloader(value, columns, gaggle) for value in dataloader
|
|
59
|
-
)
|
|
54
|
+
return sequence_type(_wrap_dataloader(value, columns, gaggle) for value in dataloader)
|
|
60
55
|
|
|
61
56
|
return _GlitchedDataLoader(dataloader, columns, gaggle)
|
|
62
57
|
|
|
@@ -89,7 +84,7 @@ def _glitch_datamodule(
|
|
|
89
84
|
) -> Any:
|
|
90
85
|
"""Return a proxy that applies glitchlings to batches from the datamodule."""
|
|
91
86
|
|
|
92
|
-
columns =
|
|
87
|
+
columns = normalize_column_spec(column)
|
|
93
88
|
if columns is None: # pragma: no cover - defensive
|
|
94
89
|
raise ValueError("At least one column must be specified")
|
|
95
90
|
# Lightning datamodules only support string column names (mapping keys)
|
|
@@ -212,4 +207,3 @@ else: # pragma: no cover - optional dependency
|
|
|
212
207
|
|
|
213
208
|
|
|
214
209
|
__all__ = ["LightningDataModule", "install"]
|
|
215
|
-
|
glitchlings/lexicon/_cache.py
CHANGED
|
@@ -19,7 +19,7 @@ class CacheSnapshot:
|
|
|
19
19
|
checksum: str | None = None
|
|
20
20
|
|
|
21
21
|
|
|
22
|
-
def
|
|
22
|
+
def _normalize_entries(payload: Mapping[str, object]) -> CacheEntries:
|
|
23
23
|
"""Convert raw cache payloads into canonical mapping form."""
|
|
24
24
|
entries: CacheEntries = {}
|
|
25
25
|
for key, values in payload.items():
|
|
@@ -75,7 +75,7 @@ def load_cache(path: Path) -> CacheSnapshot:
|
|
|
75
75
|
else:
|
|
76
76
|
entries_payload = payload # legacy format without metadata
|
|
77
77
|
|
|
78
|
-
entries =
|
|
78
|
+
entries = _normalize_entries(entries_payload)
|
|
79
79
|
if checksum is not None:
|
|
80
80
|
expected = compute_checksum(entries)
|
|
81
81
|
if checksum != expected:
|
|
@@ -88,9 +88,7 @@ def load_cache(path: Path) -> CacheSnapshot:
|
|
|
88
88
|
|
|
89
89
|
def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapshot:
|
|
90
90
|
"""Persist ``entries`` to ``path`` with checksum metadata."""
|
|
91
|
-
serialisable: CacheEntries = {
|
|
92
|
-
key: list(values) for key, values in sorted(entries.items())
|
|
93
|
-
}
|
|
91
|
+
serialisable: CacheEntries = {key: list(values) for key, values in sorted(entries.items())}
|
|
94
92
|
checksum = compute_checksum(serialisable)
|
|
95
93
|
payload = {
|
|
96
94
|
"__meta__": {
|
glitchlings/lexicon/vector.py
CHANGED
|
@@ -16,6 +16,9 @@ from ._cache import CacheSnapshot
|
|
|
16
16
|
from ._cache import load_cache as _load_cache_file
|
|
17
17
|
from ._cache import write_cache as _write_cache_file
|
|
18
18
|
|
|
19
|
+
# Minimum number of neighbors to consider for similarity queries
|
|
20
|
+
MIN_NEIGHBORS = 1
|
|
21
|
+
|
|
19
22
|
|
|
20
23
|
def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
|
|
21
24
|
"""Return the cosine similarity between two dense vectors."""
|
|
@@ -304,7 +307,7 @@ class VectorLexicon(LexiconBackend):
|
|
|
304
307
|
"""Initialise the lexicon with an embedding ``source`` and optional cache."""
|
|
305
308
|
super().__init__(seed=seed)
|
|
306
309
|
self._adapter = _resolve_source(source)
|
|
307
|
-
self._max_neighbors = max(
|
|
310
|
+
self._max_neighbors = max(MIN_NEIGHBORS, max_neighbors)
|
|
308
311
|
self._min_similarity = min_similarity
|
|
309
312
|
self._cache: MutableMapping[str, list[str]] = {}
|
|
310
313
|
self._cache_path: Path | None
|
|
@@ -371,7 +374,7 @@ class VectorLexicon(LexiconBackend):
|
|
|
371
374
|
if cache_key in self._cache:
|
|
372
375
|
return self._cache[cache_key]
|
|
373
376
|
|
|
374
|
-
neighbor_limit = self._max_neighbors if limit is None else max(
|
|
377
|
+
neighbor_limit = self._max_neighbors if limit is None else max(MIN_NEIGHBORS, limit)
|
|
375
378
|
neighbors = self._fetch_neighbors(
|
|
376
379
|
original=original, normalized=normalized, limit=neighbor_limit
|
|
377
380
|
)
|
|
@@ -624,9 +627,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|
|
624
627
|
)
|
|
625
628
|
iterator = lexicon.iter_vocabulary()
|
|
626
629
|
if args.limit is not None:
|
|
627
|
-
token_iter = (
|
|
628
|
-
token for index, token in enumerate(iterator) if index < args.limit
|
|
629
|
-
)
|
|
630
|
+
token_iter = (token for index, token in enumerate(iterator) if index < args.limit)
|
|
630
631
|
else:
|
|
631
632
|
token_iter = iterator
|
|
632
633
|
|
glitchlings/lexicon/wordnet.py
CHANGED
|
@@ -13,21 +13,17 @@ from ._cache import CacheSnapshot
|
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
class _LemmaProtocol(Protocol):
|
|
16
|
-
def name(self) -> str:
|
|
17
|
-
...
|
|
16
|
+
def name(self) -> str: ...
|
|
18
17
|
|
|
19
18
|
|
|
20
19
|
class _SynsetProtocol(Protocol):
|
|
21
|
-
def lemmas(self) -> Sequence[_LemmaProtocol]:
|
|
22
|
-
...
|
|
20
|
+
def lemmas(self) -> Sequence[_LemmaProtocol]: ...
|
|
23
21
|
|
|
24
22
|
|
|
25
23
|
class _WordNetResource(Protocol):
|
|
26
|
-
def synsets(self, word: str, pos: str | None = None) -> Sequence[_SynsetProtocol]:
|
|
27
|
-
...
|
|
24
|
+
def synsets(self, word: str, pos: str | None = None) -> Sequence[_SynsetProtocol]: ...
|
|
28
25
|
|
|
29
|
-
def ensure_loaded(self) -> None:
|
|
30
|
-
...
|
|
26
|
+
def ensure_loaded(self) -> None: ...
|
|
31
27
|
|
|
32
28
|
|
|
33
29
|
WordNetCorpusReaderFactory = Callable[[Any, Any], _WordNetResource]
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Hokey expressive lengthening generator."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from .stretch_locator import StretchSite, apply_stretch, find_stretch_site
|
|
8
|
+
from .stretchability import RandomLike, StretchabilityAnalyzer, StretchabilityFeatures
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(slots=True)
|
|
12
|
+
class HokeyConfig:
|
|
13
|
+
rate: float = 0.3
|
|
14
|
+
extension_min: int = 2
|
|
15
|
+
extension_max: int = 5
|
|
16
|
+
base_p: float = 0.45
|
|
17
|
+
word_length_threshold: int = 6
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(slots=True)
|
|
21
|
+
class StretchEvent:
|
|
22
|
+
token_index: int
|
|
23
|
+
original: str
|
|
24
|
+
stretched: str
|
|
25
|
+
repeats: int
|
|
26
|
+
site: StretchSite
|
|
27
|
+
score: float
|
|
28
|
+
features: StretchabilityFeatures
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class NegativeBinomialSampler:
    """Sample stretch lengths from a clipped negative binomial distribution.

    A draw is the sum of several geometric failure counts, offset by the
    requested minimum and clipped to the requested maximum.
    """

    def __init__(self, base_p: float = 0.45) -> None:
        """Store ``base_p``, the success probability used before intensity scaling."""
        self.base_p = base_p

    def sample(
        self,
        rng: RandomLike,
        *,
        intensity: float,
        minimum: int,
        maximum: int,
    ) -> int:
        """Draw a repeat count between the normalised bounds.

        Args:
            rng: Source of uniform random numbers; only ``random()`` is called.
            intensity: Higher values add geometric draws and lower the success
                probability.
            minimum: Lower bound; coerced to ``int`` and floored at 0.
            maximum: Upper bound; coerced to ``int`` and raised to the
                normalised minimum when smaller.

        Returns:
            The bound itself when the normalised bounds coincide (no random
            numbers are consumed); otherwise the minimum plus the summed
            failure counts, clipped to the bounds.
        """
        lower = max(0, int(minimum))
        upper = max(lower, int(maximum))
        # Covers ``upper == 0`` too: ``lower`` is never negative, so a zero
        # upper bound implies equal bounds.
        if upper == lower:
            return upper

        trials = max(1, int(round(1 + 2 * intensity)))
        success_p = self.base_p / (1.0 + 0.75 * max(0.0, intensity))
        # Keep the probability within [0.05, 0.95].
        success_p = max(0.05, min(0.95, success_p))

        failures = 0
        for _ in range(trials):
            failures += self._geometric_sample(rng, success_p)

        return max(lower, min(upper, lower + failures))

    @staticmethod
    def _geometric_sample(rng: RandomLike, p: float) -> int:
        """Count consecutive ``rng.random()`` draws that exceed ``p``."""
        misses = 0
        while True:
            if not rng.random() > p:
                return misses
            misses += 1
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class HokeyGenerator:
    """Full expressive lengthening pipeline."""

    def __init__(
        self,
        analyzer: StretchabilityAnalyzer | None = None,
        sampler: NegativeBinomialSampler | None = None,
    ) -> None:
        """Store the collaborators, creating defaults for any falsy argument."""
        self.analyzer = analyzer if analyzer else StretchabilityAnalyzer()
        self.sampler = sampler if sampler else NegativeBinomialSampler()

    def generate(
        self,
        text: str,
        *,
        rng: RandomLike,
        config: HokeyConfig,
    ) -> tuple[str, list[StretchEvent]]:
        """Stretch selected tokens of ``text`` and report what changed.

        Args:
            text: Input text; returned untouched when empty.
            rng: Random source passed to candidate selection and the sampler.
            config: Settings for this pass. Its ``base_p`` is written onto the
                sampler when it differs from the sampler's current value.

        Returns:
            The text rebuilt by concatenating the (possibly stretched) token
            strings, and one ``StretchEvent`` per stretch applied. When the
            text is empty or no candidate is selected, the original text and
            an empty list are returned.
        """
        if not text:
            return text, []

        if config.base_p != self.sampler.base_p:
            self.sampler.base_p = config.base_p

        tokens = self.analyzer.tokenise(text)
        scored = self.analyzer.analyse_tokens(tokens)
        chosen = self.analyzer.select_candidates(scored, rate=config.rate, rng=rng)
        if not chosen:
            return text, []

        pieces = [token.text for token in tokens]
        events: list[StretchEvent] = []

        for candidate in chosen:
            position = candidate.token.index
            word = pieces[position]
            site = find_stretch_site(word)
            if site is None:
                continue

            intensity = min(1.5, candidate.features.intensity() + 0.35 * candidate.score)
            letters = sum(ch.isalpha() for ch in word)
            threshold = config.word_length_threshold
            if threshold > 0:
                # Words with more than twice the threshold are never stretched.
                if letters > threshold * 2:
                    continue
                if letters > threshold:
                    # Damp intensity in proportion to the excess letters.
                    excess = letters - threshold
                    intensity = intensity / (1.0 + 0.35 * excess)
                    if candidate.score < 0.35 and excess >= 2:
                        continue
                    # NOTE(review): floor assumed to apply only to damped
                    # words — confirm against the upstream module.
                    intensity = max(0.05, intensity)

            repeats = self.sampler.sample(
                rng,
                intensity=intensity,
                minimum=config.extension_min,
                maximum=config.extension_max,
            )
            if repeats <= 0:
                continue

            stretched_word = apply_stretch(word, site, repeats)
            pieces[position] = stretched_word
            events.append(
                StretchEvent(
                    token_index=position,
                    original=word,
                    stretched=stretched_word,
                    repeats=repeats,
                    site=site,
                    score=candidate.score,
                    features=candidate.features,
                )
            )

        return "".join(pieces), events
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
__all__ = ["HokeyGenerator", "HokeyConfig", "StretchEvent", "NegativeBinomialSampler"]
|