glitchlings-0.4.4-cp312-cp312-macosx_11_0_universal2.whl → glitchlings-0.5.0-cp312-cp312-macosx_11_0_universal2.whl

This diff shows the publicly released contents of two package versions as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of glitchlings has been flagged by the registry; see the registry's advisory page for details.

Files changed (42)
  1. glitchlings/__init__.py +4 -0
  2. glitchlings/_zoo_rust.cpython-312-darwin.so +0 -0
  3. glitchlings/compat.py +2 -4
  4. glitchlings/config.py +14 -28
  5. glitchlings/dev/__init__.py +5 -0
  6. glitchlings/dev/sync_assets.py +153 -0
  7. glitchlings/dlc/_shared.py +6 -6
  8. glitchlings/dlc/huggingface.py +6 -6
  9. glitchlings/dlc/prime.py +1 -1
  10. glitchlings/dlc/pytorch.py +3 -3
  11. glitchlings/dlc/pytorch_lightning.py +4 -10
  12. glitchlings/lexicon/_cache.py +3 -5
  13. glitchlings/lexicon/vector.py +6 -5
  14. glitchlings/lexicon/wordnet.py +4 -8
  15. glitchlings/util/hokey_generator.py +144 -0
  16. glitchlings/util/stretch_locator.py +140 -0
  17. glitchlings/util/stretchability.py +370 -0
  18. glitchlings/zoo/__init__.py +5 -1
  19. glitchlings/zoo/_ocr_confusions.py +3 -3
  20. glitchlings/zoo/_text_utils.py +10 -9
  21. glitchlings/zoo/adjax.py +3 -18
  22. glitchlings/zoo/apostrofae.py +2 -5
  23. glitchlings/zoo/assets/__init__.py +54 -0
  24. glitchlings/zoo/assets/hokey_assets.json +193 -0
  25. glitchlings/zoo/hokey.py +173 -0
  26. glitchlings/zoo/jargoyle.py +2 -16
  27. glitchlings/zoo/mim1c.py +2 -17
  28. glitchlings/zoo/redactyl.py +3 -17
  29. glitchlings/zoo/reduple.py +3 -17
  30. glitchlings/zoo/rushmore.py +3 -20
  31. glitchlings/zoo/scannequin.py +3 -20
  32. glitchlings/zoo/typogre.py +2 -19
  33. glitchlings/zoo/zeedub.py +2 -13
  34. {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/METADATA +29 -6
  35. glitchlings-0.5.0.dist-info/RECORD +53 -0
  36. glitchlings/zoo/_rate.py +0 -131
  37. glitchlings-0.4.4.dist-info/RECORD +0 -47
  38. glitchlings/zoo/{ocr_confusions.tsv → assets/ocr_confusions.tsv} +0 -0
  39. {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/WHEEL +0 -0
  40. {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/entry_points.txt +0 -0
  41. {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/licenses/LICENSE +0 -0
  42. {glitchlings-0.4.4.dist-info → glitchlings-0.5.0.dist-info}/top_level.txt +0 -0
glitchlings/__init__.py CHANGED
@@ -5,6 +5,7 @@ from .zoo import (
     Apostrofae,
     Gaggle,
     Glitchling,
+    Hokey,
     Jargoyle,
     Mim1c,
     Redactyl,
@@ -15,6 +16,7 @@ from .zoo import (
     Zeedub,
     adjax,
     apostrofae,
+    hokey,
     is_rust_pipeline_enabled,
     is_rust_pipeline_supported,
     jargoyle,
@@ -42,6 +44,8 @@ __all__ = [
     "adjax",
     "Apostrofae",
     "apostrofae",
+    "Hokey",
+    "hokey",
     "Redactyl",
     "redactyl",
     "Reduple",
glitchlings/_zoo_rust.cpython-312-darwin.so CHANGED
Binary file
glitchlings/compat.py CHANGED
@@ -17,16 +17,14 @@ _MISSING = _MissingSentinel()


 class _MarkerProtocol(Protocol):
-    def evaluate(self, environment: dict[str, str]) -> bool:
-        ...
+    def evaluate(self, environment: dict[str, str]) -> bool: ...


 class _RequirementProtocol(Protocol):
     marker: _MarkerProtocol | None
     name: str

-    def __init__(self, requirement: str) -> None:
-        ...
+    def __init__(self, requirement: str) -> None: ...


 try:  # pragma: no cover - packaging is bundled with modern Python environments
glitchlings/config.py CHANGED
@@ -4,7 +4,6 @@ from __future__ import annotations

 import importlib
 import os
-import warnings
 from dataclasses import dataclass, field
 from io import TextIOBase
 from pathlib import Path
@@ -19,8 +18,7 @@ except ModuleNotFoundError:  # pragma: no cover - Python < 3.11


 class _TomllibModule(Protocol):
-    def load(self, fp: IO[bytes]) -> Any:
-        ...
+    def load(self, fp: IO[bytes]) -> Any: ...


 tomllib = cast(_TomllibModule, _tomllib)
@@ -29,8 +27,7 @@ tomllib = cast(_TomllibModule, _tomllib)
 class _YamlModule(Protocol):
     YAMLError: type[Exception]

-    def safe_load(self, stream: str) -> Any:
-        ...
+    def safe_load(self, stream: str) -> Any: ...


 yaml = cast(_YamlModule, importlib.import_module("yaml"))
@@ -59,17 +56,6 @@ ATTACK_CONFIG_SCHEMA: dict[str, Any] = {
             "required": ["name"],
             "properties": {
                 "name": {"type": "string", "minLength": 1},
-                "type": {"type": "string", "minLength": 1},
-                "parameters": {"type": "object"},
-            },
-            "additionalProperties": True,
-        },
-        {
-            "type": "object",
-            "required": ["type"],
-            "properties": {
-                "name": {"type": "string", "minLength": 1},
-                "type": {"type": "string", "minLength": 1},
                 "parameters": {"type": "object"},
             },
             "additionalProperties": True,
@@ -265,7 +251,12 @@ def _validate_attack_config_schema(data: Any, *, source: str) -> Mapping[str, Any]:

     for index, entry in enumerate(raw_glitchlings, start=1):
         if isinstance(entry, Mapping):
-            name_candidate = entry.get("name") or entry.get("type")
+            if "type" in entry:
+                raise ValueError(
+                    f"{source}: glitchling #{index} uses unsupported 'type'; use 'name'."
+                )
+
+            name_candidate = entry.get("name")
             if not isinstance(name_candidate, str) or not name_candidate.strip():
                 raise ValueError(f"{source}: glitchling #{index} is missing a 'name'.")
             parameters = entry.get("parameters")
@@ -328,17 +319,12 @@ def _build_glitchling(entry: Any, source: str, index: int) -> "Glitchling":
         raise ValueError(f"{source}: glitchling #{index}: {exc}") from exc

     if isinstance(entry, Mapping):
-        name_value = entry.get("name")
-        legacy_type = entry.get("type")
-        if name_value is None and legacy_type is not None:
-            warnings.warn(
-                f"{source}: glitchling #{index} uses 'type'; prefer 'name'.",
-                DeprecationWarning,
-                stacklevel=2,
+        if "type" in entry:
+            raise ValueError(
+                f"{source}: glitchling #{index} uses unsupported 'type'; use 'name'."
             )
-            name_value = legacy_type
-        elif name_value is None:
-            name_value = legacy_type
+
+        name_value = entry.get("name")

         if not isinstance(name_value, str) or not name_value.strip():
             raise ValueError(f"{source}: glitchling #{index} is missing a 'name'.")
@@ -354,7 +340,7 @@ def _build_glitchling(entry: Any, source: str, index: int) -> "Glitchling":
     kwargs = {
         key: value
         for key, value in entry.items()
-        if key not in {"name", "type", "parameters"}
+        if key not in {"name", "parameters"}
     }

     try:
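
This change is breaking for attack configs that still use the legacy 'type' key: both the schema validator and _build_glitchling now raise ValueError, where 0.4.4 fell back to 'type' (with a DeprecationWarning in the builder). A minimal sketch of the entry shapes involved; the glitchling name and parameters are illustrative:

    # Would now be rejected: the validator raises
    # ValueError("... uses unsupported 'type'; use 'name'.")
    legacy_entry = {"type": "typogre"}

    # Accepted form: 'name' is required; 'parameters' remains an optional mapping.
    entry = {"name": "typogre", "parameters": {"rate": 0.1}}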
glitchlings/dev/__init__.py ADDED
@@ -0,0 +1,5 @@
+"""Developer-facing utilities for maintaining the Glitchlings repository."""
+
+from .sync_assets import sync_assets
+
+__all__ = ["sync_assets"]
glitchlings/dev/sync_assets.py ADDED
@@ -0,0 +1,153 @@
+"""Synchronise canonical glitchling assets with the vendored Rust copies."""
+
+from __future__ import annotations
+
+import argparse
+import shutil
+import sys
+from pathlib import Path
+from typing import Iterator, Sequence
+
+RUST_VENDORED_ASSETS: frozenset[str] = frozenset({
+    "hokey_assets.json",
+    "ocr_confusions.tsv",
+})
+
+
+def _project_root(default: Path | None = None) -> Path:
+    if default is not None:
+        return default
+    return Path(__file__).resolve().parents[3]
+
+
+def _canonical_asset_dir(project_root: Path) -> Path:
+    canonical = project_root / "src" / "glitchlings" / "zoo" / "assets"
+    if not canonical.is_dir():
+        raise RuntimeError(
+            "expected canonical assets under 'src/glitchlings/zoo/assets'; "
+            "run this command from the repository root"
+        )
+    return canonical
+
+
+def _rust_asset_dir(project_root: Path) -> Path:
+    return project_root / "rust" / "zoo" / "assets"
+
+
+def _iter_extraneous_assets(rust_dir: Path) -> Iterator[Path]:
+    if not rust_dir.exists():
+        return
+    for path in rust_dir.iterdir():
+        if path.is_file() and path.name not in RUST_VENDORED_ASSETS:
+            yield path
+
+
+def sync_assets(
+    project_root: Path | None = None,
+    *,
+    check: bool = False,
+    quiet: bool = False,
+) -> bool:
+    """Synchronise the vendored Rust asset copies with the canonical sources."""
+
+    root = _project_root(project_root)
+    canonical_dir = _canonical_asset_dir(root)
+    rust_dir = _rust_asset_dir(root)
+
+    missing_sources = [
+        name
+        for name in RUST_VENDORED_ASSETS
+        if not (canonical_dir / name).is_file()
+    ]
+    if missing_sources:
+        missing_list = ", ".join(sorted(missing_sources))
+        raise RuntimeError(f"missing canonical assets: {missing_list}")
+
+    extraneous = list(_iter_extraneous_assets(rust_dir))
+
+    mismatched: list[tuple[str, str]] = []
+    for name in sorted(RUST_VENDORED_ASSETS):
+        source = canonical_dir / name
+        target = rust_dir / name
+        if not target.exists():
+            mismatched.append((name, "missing"))
+            continue
+        if source.read_bytes() != target.read_bytes():
+            mismatched.append((name, "outdated"))
+
+    if check:
+        if mismatched or extraneous:
+            if not quiet:
+                for name, reason in mismatched:
+                    target = rust_dir / name
+                    print(
+                        f"{target.relative_to(root)} is {reason}; run sync_assets to refresh it",
+                        file=sys.stderr,
+                    )
+                for extra in extraneous:
+                    print(
+                        (
+                            "unexpected vendored asset "
+                            f"{extra.relative_to(root)}; run sync_assets to prune it"
+                        ),
+                        file=sys.stderr,
+                    )
+            return False
+        if not quiet:
+            print("Rust asset bundle is up to date.")
+        return True
+
+    rust_dir.mkdir(parents=True, exist_ok=True)
+
+    for name, reason in mismatched:
+        source = canonical_dir / name
+        target = rust_dir / name
+        shutil.copy2(source, target)
+        if not quiet:
+            verb = "Copied" if reason == "missing" else "Updated"
+            print(
+                f"{verb} {source.relative_to(root)} -> {target.relative_to(root)}",
+            )
+
+    for extra in extraneous:
+        extra.unlink()
+        if not quiet:
+            print(f"Removed extraneous vendored asset {extra.relative_to(root)}")
+
+    if not mismatched and not extraneous and not quiet:
+        print("Rust asset bundle already aligned with canonical copies.")
+
+    return True
+
+
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        description="Synchronise canonical glitchling assets with the vendored Rust copies.",
+    )
+    parser.add_argument(
+        "--check",
+        action="store_true",
+        help="exit with a non-zero status when vendored assets diverge",
+    )
+    parser.add_argument(
+        "--quiet",
+        action="store_true",
+        help="suppress status output",
+    )
+    parser.add_argument(
+        "--project-root",
+        type=Path,
+        help="override the detected project root (useful for testing)",
+    )
+    return parser
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    ok = sync_assets(project_root=args.project_root, check=args.check, quiet=args.quiet)
+    return 0 if ok else 1
+
+
+if __name__ == "__main__":  # pragma: no cover - CLI entry point
+    raise SystemExit(main())
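
sync_assets is both importable and runnable as a module (python -m glitchlings.dev.sync_assets --check). A sketch of programmatic use, with an illustrative repository path:

    from pathlib import Path

    from glitchlings.dev import sync_assets

    # Report drift without touching files; returns False when the vendored
    # rust/zoo/assets copies are missing, outdated, or extraneous.
    in_sync = sync_assets(Path("~/src/glitchlings").expanduser(), check=True)

    # Without check=True the call copies, refreshes, and prunes as needed.
    sync_assets(Path("~/src/glitchlings").expanduser())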
glitchlings/dlc/_shared.py CHANGED
@@ -67,10 +67,10 @@ def resolve_columns(dataset: Any, columns: Sequence[str] | None) -> list[str]:
         raise ValueError("Unable to determine which dataset columns to corrupt.")


-def normalise_column_spec(
+def normalize_column_spec(
     columns: str | int | Sequence[str | int] | None,
 ) -> list[str | int] | None:
-    """Normalise a column specification into a list of keys or indices.
+    """Normalize a column specification into a list of keys or indices.

     Args:
         columns: Column specification as a single value, sequence of values, or None.
@@ -87,10 +87,10 @@ def normalise_column_spec(
     if isinstance(columns, (str, int)):
         return [columns]

-    normalised = list(columns)
-    if not normalised:
+    normalized = list(columns)
+    if not normalized:
         raise ValueError("At least one column must be specified")
-    return normalised
+    return normalized


 def is_textual_candidate(value: Any) -> bool:
@@ -147,7 +147,7 @@ def corrupt_text_value(value: Any, gaggle: Gaggle) -> Any:
 __all__ = [
     "corrupt_text_value",
     "is_textual_candidate",
-    "normalise_column_spec",
+    "normalize_column_spec",
     "resolve_columns",
     "resolve_environment",
 ]
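
The rename from normalise_column_spec is spelling-only, but the symbol is exported via __all__, so callers must update their imports. The contract itself is unchanged, as this sketch shows (the None behaviour is inferred from the return annotation and from callers such as pytorch_lightning; column names are illustrative):

    from glitchlings.dlc._shared import normalize_column_spec

    assert normalize_column_spec("text") == ["text"]          # scalar wraps in a list
    assert normalize_column_spec(["text", 0]) == ["text", 0]  # sequences become lists
    assert normalize_column_spec(None) is None                # None defers resolution

    try:
        normalize_column_spec([])
    except ValueError as exc:
        print(exc)  # At least one column must be specified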
glitchlings/dlc/huggingface.py CHANGED
@@ -10,15 +10,15 @@ from ..util.adapters import coerce_gaggle
 from ..zoo import Gaggle, Glitchling


-def _normalise_columns(column: str | Sequence[str]) -> list[str]:
-    """Normalise a column specification to a list."""
+def _normalize_columns(column: str | Sequence[str]) -> list[str]:
+    """Normalize a column specification to a list."""
     if isinstance(column, str):
         return [column]

-    normalised = list(column)
-    if not normalised:
+    normalized = list(column)
+    if not normalized:
         raise ValueError("At least one column must be specified")
-    return normalised
+    return normalized


 def _glitch_dataset(
@@ -29,7 +29,7 @@ def _glitch_dataset(
     seed: int = 151,
 ) -> Any:
     """Apply glitchlings to the provided dataset columns."""
-    columns = _normalise_columns(column)
+    columns = _normalize_columns(column)
     gaggle = coerce_gaggle(glitchlings, seed=seed)
     return gaggle.corrupt_dataset(dataset, columns)

glitchlings/dlc/prime.py CHANGED
@@ -117,7 +117,7 @@ def _as_gaggle(


 def _extract_completion_text(completion: Any) -> str:
-    """Normalise a completion payload into a plain string."""
+    """Normalize a completion payload into a plain string."""
     if isinstance(completion, str):
         return completion

glitchlings/dlc/pytorch.py CHANGED
@@ -9,7 +9,7 @@ from ..compat import get_torch_dataloader, require_torch
 from ..compat import torch as _torch_dependency
 from ..util.adapters import coerce_gaggle
 from ..zoo import Gaggle, Glitchling
-from ._shared import corrupt_text_value, is_textual_candidate, normalise_column_spec
+from ._shared import corrupt_text_value, is_textual_candidate, normalize_column_spec


 def _apply_to_batch(batch: Any, targets: list[str | int] | None, gaggle: Gaggle) -> Any:
@@ -134,8 +134,8 @@ def _ensure_dataloader_class() -> type[Any]:
     ) -> _GlitchedDataLoader:
         """Return a lazily glitched view of the loader's batches."""
         gaggle = coerce_gaggle(glitchlings, seed=seed)
-        normalised = normalise_column_spec(columns)
-        return _GlitchedDataLoader(self, gaggle, columns=normalised)
+        normalized = normalize_column_spec(columns)
+        return _GlitchedDataLoader(self, gaggle, columns=normalized)

     setattr(dataloader_cls, "glitch", glitch)

glitchlings/dlc/pytorch_lightning.py CHANGED
@@ -8,7 +8,7 @@ from typing import Any, cast
 from ..compat import get_pytorch_lightning_datamodule, require_pytorch_lightning
 from ..util.adapters import coerce_gaggle
 from ..zoo import Gaggle, Glitchling
-from ._shared import corrupt_text_value, normalise_column_spec
+from ._shared import corrupt_text_value, normalize_column_spec


 def _glitch_batch(batch: Any, columns: list[str], gaggle: Gaggle) -> Any:
@@ -40,10 +40,7 @@ def _wrap_dataloader(dataloader: Any, columns: list[str], gaggle: Gaggle) -> Any
     if isinstance(dataloader, Mapping):
         mapping_type = cast(type[Any], dataloader.__class__)
         return mapping_type(
-            {
-                key: _wrap_dataloader(value, columns, gaggle)
-                for key, value in dataloader.items()
-            }
+            {key: _wrap_dataloader(value, columns, gaggle) for key, value in dataloader.items()}
         )

     if isinstance(dataloader, list):
@@ -54,9 +51,7 @@ def _wrap_dataloader(dataloader: Any, columns: list[str], gaggle: Gaggle) -> Any

     if isinstance(dataloader, Sequence) and not isinstance(dataloader, (str, bytes, bytearray)):
         sequence_type = cast(type[Any], dataloader.__class__)
-        return sequence_type(
-            _wrap_dataloader(value, columns, gaggle) for value in dataloader
-        )
+        return sequence_type(_wrap_dataloader(value, columns, gaggle) for value in dataloader)

     return _GlitchedDataLoader(dataloader, columns, gaggle)

@@ -89,7 +84,7 @@ def _glitch_datamodule(
 ) -> Any:
     """Return a proxy that applies glitchlings to batches from the datamodule."""

-    columns = normalise_column_spec(column)
+    columns = normalize_column_spec(column)
     if columns is None:  # pragma: no cover - defensive
         raise ValueError("At least one column must be specified")
     # Lightning datamodules only support string column names (mapping keys)
@@ -212,4 +207,3 @@ else:  # pragma: no cover - optional dependency


 __all__ = ["LightningDataModule", "install"]
-
glitchlings/lexicon/_cache.py CHANGED
@@ -19,7 +19,7 @@ class CacheSnapshot:
     checksum: str | None = None


-def _normalise_entries(payload: Mapping[str, object]) -> CacheEntries:
+def _normalize_entries(payload: Mapping[str, object]) -> CacheEntries:
     """Convert raw cache payloads into canonical mapping form."""
     entries: CacheEntries = {}
     for key, values in payload.items():
@@ -75,7 +75,7 @@ def load_cache(path: Path) -> CacheSnapshot:
     else:
         entries_payload = payload  # legacy format without metadata

-    entries = _normalise_entries(entries_payload)
+    entries = _normalize_entries(entries_payload)
     if checksum is not None:
         expected = compute_checksum(entries)
         if checksum != expected:
@@ -88,9 +88,7 @@ def load_cache(path: Path) -> CacheSnapshot:

 def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapshot:
     """Persist ``entries`` to ``path`` with checksum metadata."""
-    serialisable: CacheEntries = {
-        key: list(values) for key, values in sorted(entries.items())
-    }
+    serialisable: CacheEntries = {key: list(values) for key, values in sorted(entries.items())}
     checksum = compute_checksum(serialisable)
     payload = {
         "__meta__": {
glitchlings/lexicon/vector.py CHANGED
@@ -16,6 +16,9 @@ from ._cache import CacheSnapshot
 from ._cache import load_cache as _load_cache_file
 from ._cache import write_cache as _write_cache_file

+# Minimum number of neighbors to consider for similarity queries
+MIN_NEIGHBORS = 1
+

 def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
     """Return the cosine similarity between two dense vectors."""
@@ -304,7 +307,7 @@ class VectorLexicon(LexiconBackend):
         """Initialise the lexicon with an embedding ``source`` and optional cache."""
         super().__init__(seed=seed)
         self._adapter = _resolve_source(source)
-        self._max_neighbors = max(1, max_neighbors)
+        self._max_neighbors = max(MIN_NEIGHBORS, max_neighbors)
         self._min_similarity = min_similarity
         self._cache: MutableMapping[str, list[str]] = {}
         self._cache_path: Path | None
@@ -371,7 +374,7 @@ class VectorLexicon(LexiconBackend):
         if cache_key in self._cache:
             return self._cache[cache_key]

-        neighbor_limit = self._max_neighbors if limit is None else max(1, limit)
+        neighbor_limit = self._max_neighbors if limit is None else max(MIN_NEIGHBORS, limit)
         neighbors = self._fetch_neighbors(
             original=original, normalized=normalized, limit=neighbor_limit
         )
@@ -624,9 +627,7 @@ def main(argv: Sequence[str] | None = None) -> int:
     )
     iterator = lexicon.iter_vocabulary()
     if args.limit is not None:
-        token_iter = (
-            token for index, token in enumerate(iterator) if index < args.limit
-        )
+        token_iter = (token for index, token in enumerate(iterator) if index < args.limit)
     else:
         token_iter = iterator

glitchlings/lexicon/wordnet.py CHANGED
@@ -13,21 +13,17 @@ from ._cache import CacheSnapshot


 class _LemmaProtocol(Protocol):
-    def name(self) -> str:
-        ...
+    def name(self) -> str: ...


 class _SynsetProtocol(Protocol):
-    def lemmas(self) -> Sequence[_LemmaProtocol]:
-        ...
+    def lemmas(self) -> Sequence[_LemmaProtocol]: ...


 class _WordNetResource(Protocol):
-    def synsets(self, word: str, pos: str | None = None) -> Sequence[_SynsetProtocol]:
-        ...
+    def synsets(self, word: str, pos: str | None = None) -> Sequence[_SynsetProtocol]: ...

-    def ensure_loaded(self) -> None:
-        ...
+    def ensure_loaded(self) -> None: ...


 WordNetCorpusReaderFactory = Callable[[Any, Any], _WordNetResource]
glitchlings/util/hokey_generator.py ADDED
@@ -0,0 +1,144 @@
+"""Hokey expressive lengthening generator."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from .stretch_locator import StretchSite, apply_stretch, find_stretch_site
+from .stretchability import RandomLike, StretchabilityAnalyzer, StretchabilityFeatures
+
+
+@dataclass(slots=True)
+class HokeyConfig:
+    rate: float = 0.3
+    extension_min: int = 2
+    extension_max: int = 5
+    base_p: float = 0.45
+    word_length_threshold: int = 6
+
+
+@dataclass(slots=True)
+class StretchEvent:
+    token_index: int
+    original: str
+    stretched: str
+    repeats: int
+    site: StretchSite
+    score: float
+    features: StretchabilityFeatures
+
+
+class NegativeBinomialSampler:
+    """Sample stretch lengths from a clipped negative binomial distribution."""
+
+    def __init__(self, base_p: float = 0.45) -> None:
+        self.base_p = base_p
+
+    def sample(
+        self,
+        rng: RandomLike,
+        *,
+        intensity: float,
+        minimum: int,
+        maximum: int,
+    ) -> int:
+        minimum = max(0, int(minimum))
+        maximum = max(minimum, int(maximum))
+        if maximum == 0:
+            return 0
+        if maximum == minimum:
+            return maximum
+
+        r = max(1, int(round(1 + 2 * intensity)))
+        adjusted_p = self.base_p / (1.0 + 0.75 * max(0.0, intensity))
+        adjusted_p = max(0.05, min(0.95, adjusted_p))
+        failures = sum(self._geometric_sample(rng, adjusted_p) for _ in range(r))
+        extra = minimum + failures
+        return max(minimum, min(maximum, extra))
+
+    @staticmethod
+    def _geometric_sample(rng: RandomLike, p: float) -> int:
+        count = 0
+        while rng.random() > p:
+            count += 1
+        return count
+
+
+class HokeyGenerator:
+    """Full expressive lengthening pipeline."""
+
+    def __init__(
+        self,
+        analyzer: StretchabilityAnalyzer | None = None,
+        sampler: NegativeBinomialSampler | None = None,
+    ) -> None:
+        self.analyzer = analyzer or StretchabilityAnalyzer()
+        self.sampler = sampler or NegativeBinomialSampler()
+
+    def generate(
+        self,
+        text: str,
+        *,
+        rng: RandomLike,
+        config: HokeyConfig,
+    ) -> tuple[str, list[StretchEvent]]:
+        if not text:
+            return text, []
+
+        if config.base_p != self.sampler.base_p:
+            self.sampler.base_p = config.base_p
+
+        tokens = self.analyzer.tokenise(text)
+        candidates = self.analyzer.analyse_tokens(tokens)
+        selected = self.analyzer.select_candidates(candidates, rate=config.rate, rng=rng)
+        if not selected:
+            return text, []
+
+        token_strings = [token.text for token in tokens]
+        events: list[StretchEvent] = []
+
+        for candidate in selected:
+            token_idx = candidate.token.index
+            original = token_strings[token_idx]
+            site = find_stretch_site(original)
+            if site is None:
+                continue
+
+            intensity = min(1.5, candidate.features.intensity() + 0.35 * candidate.score)
+            alpha_count = sum(1 for ch in original if ch.isalpha())
+            if config.word_length_threshold > 0 and alpha_count > config.word_length_threshold * 2:
+                continue
+            if config.word_length_threshold > 0 and alpha_count > config.word_length_threshold:
+                excess = alpha_count - config.word_length_threshold
+                intensity = intensity / (1.0 + 0.35 * excess)
+                if candidate.score < 0.35 and excess >= 2:
+                    continue
+                intensity = max(0.05, intensity)
+
+            repeats = self.sampler.sample(
+                rng,
+                intensity=intensity,
+                minimum=config.extension_min,
+                maximum=config.extension_max,
+            )
+            if repeats <= 0:
+                continue
+
+            stretched_word = apply_stretch(original, site, repeats)
+            token_strings[token_idx] = stretched_word
+            events.append(
+                StretchEvent(
+                    token_index=token_idx,
+                    original=original,
+                    stretched=stretched_word,
+                    repeats=repeats,
+                    site=site,
+                    score=candidate.score,
+                    features=candidate.features,
+                )
+            )
+
+        return "".join(token_strings), events
+
+
+__all__ = ["HokeyGenerator", "HokeyConfig", "StretchEvent", "NegativeBinomialSampler"]