glitchlings 0.4.3-cp311-cp311-macosx_11_0_universal2.whl → 0.4.5-cp311-cp311-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of glitchlings might be problematic.

Files changed (39)
  1. glitchlings/__init__.py +4 -0
  2. glitchlings/_zoo_rust.cpython-311-darwin.so +0 -0
  3. glitchlings/compat.py +2 -4
  4. glitchlings/config.py +2 -4
  5. glitchlings/data/__init__.py +1 -0
  6. glitchlings/data/hokey_assets.json +193 -0
  7. glitchlings/dlc/_shared.py +86 -1
  8. glitchlings/dlc/huggingface.py +6 -6
  9. glitchlings/dlc/prime.py +1 -1
  10. glitchlings/dlc/pytorch.py +9 -59
  11. glitchlings/dlc/pytorch_lightning.py +10 -34
  12. glitchlings/lexicon/__init__.py +5 -1
  13. glitchlings/lexicon/_cache.py +3 -5
  14. glitchlings/lexicon/vector.py +6 -5
  15. glitchlings/lexicon/wordnet.py +4 -8
  16. glitchlings/util/hokey_generator.py +144 -0
  17. glitchlings/util/stretch_locator.py +140 -0
  18. glitchlings/util/stretchability.py +375 -0
  19. glitchlings/zoo/__init__.py +5 -1
  20. glitchlings/zoo/_rate.py +114 -1
  21. glitchlings/zoo/_rust_extensions.py +143 -0
  22. glitchlings/zoo/adjax.py +3 -4
  23. glitchlings/zoo/apostrofae.py +3 -4
  24. glitchlings/zoo/core.py +21 -9
  25. glitchlings/zoo/hokey.py +173 -0
  26. glitchlings/zoo/jargoyle.py +6 -2
  27. glitchlings/zoo/redactyl.py +4 -5
  28. glitchlings/zoo/reduple.py +3 -4
  29. glitchlings/zoo/rushmore.py +3 -4
  30. glitchlings/zoo/scannequin.py +3 -4
  31. glitchlings/zoo/typogre.py +3 -4
  32. glitchlings/zoo/zeedub.py +3 -4
  33. {glitchlings-0.4.3.dist-info → glitchlings-0.4.5.dist-info}/METADATA +32 -8
  34. glitchlings-0.4.5.dist-info/RECORD +53 -0
  35. glitchlings-0.4.3.dist-info/RECORD +0 -46
  36. {glitchlings-0.4.3.dist-info → glitchlings-0.4.5.dist-info}/WHEEL +0 -0
  37. {glitchlings-0.4.3.dist-info → glitchlings-0.4.5.dist-info}/entry_points.txt +0 -0
  38. {glitchlings-0.4.3.dist-info → glitchlings-0.4.5.dist-info}/licenses/LICENSE +0 -0
  39. {glitchlings-0.4.3.dist-info → glitchlings-0.4.5.dist-info}/top_level.txt +0 -0
glitchlings/__init__.py CHANGED
@@ -5,6 +5,7 @@ from .zoo import (
     Apostrofae,
     Gaggle,
     Glitchling,
+    Hokey,
     Jargoyle,
     Mim1c,
     Redactyl,
@@ -15,6 +16,7 @@ from .zoo import (
     Zeedub,
     adjax,
     apostrofae,
+    hokey,
     is_rust_pipeline_enabled,
     is_rust_pipeline_supported,
     jargoyle,
@@ -42,6 +44,8 @@ __all__ = [
     "adjax",
     "Apostrofae",
     "apostrofae",
+    "Hokey",
+    "hokey",
     "Redactyl",
     "redactyl",
     "Reduple",
glitchlings/_zoo_rust.cpython-311-darwin.so CHANGED
Binary file
glitchlings/compat.py CHANGED
@@ -17,16 +17,14 @@ _MISSING = _MissingSentinel()
 
 
 class _MarkerProtocol(Protocol):
-    def evaluate(self, environment: dict[str, str]) -> bool:
-        ...
+    def evaluate(self, environment: dict[str, str]) -> bool: ...
 
 
 class _RequirementProtocol(Protocol):
     marker: _MarkerProtocol | None
     name: str
 
-    def __init__(self, requirement: str) -> None:
-        ...
+    def __init__(self, requirement: str) -> None: ...
 
 
 try: # pragma: no cover - packaging is bundled with modern Python environments
glitchlings/config.py CHANGED
@@ -19,8 +19,7 @@ except ModuleNotFoundError: # pragma: no cover - Python < 3.11
 
 
 class _TomllibModule(Protocol):
-    def load(self, fp: IO[bytes]) -> Any:
-        ...
+    def load(self, fp: IO[bytes]) -> Any: ...
 
 
 tomllib = cast(_TomllibModule, _tomllib)
@@ -29,8 +28,7 @@ tomllib = cast(_TomllibModule, _tomllib)
 class _YamlModule(Protocol):
     YAMLError: type[Exception]
 
-    def safe_load(self, stream: str) -> Any:
-        ...
+    def safe_load(self, stream: str) -> Any: ...
 
 
 yaml = cast(_YamlModule, importlib.import_module("yaml"))
glitchlings/data/__init__.py ADDED
@@ -0,0 +1 @@
+"""Static data assets shared across Glitchlings implementations."""
glitchlings/data/hokey_assets.json ADDED
@@ -0,0 +1,193 @@
+{
+  "lexical_prior": {
+    "so": 0.92,
+    "no": 0.89,
+    "go": 0.72,
+    "yeah": 0.86,
+    "yay": 0.81,
+    "ya": 0.7,
+    "hey": 0.66,
+    "okay": 0.68,
+    "ok": 0.64,
+    "cool": 0.78,
+    "omg": 0.74,
+    "wow": 0.88,
+    "wee": 0.62,
+    "woo": 0.69,
+    "woohoo": 0.74,
+    "whoa": 0.71,
+    "woah": 0.7,
+    "yayyy": 0.75,
+    "yayyyy": 0.76,
+    "yas": 0.79,
+    "yass": 0.8,
+    "yaaas": 0.82,
+    "please": 0.53,
+    "pleaseee": 0.57,
+    "pleaseeee": 0.6,
+    "pleaseeeee": 0.63,
+    "lol": 0.83,
+    "lmao": 0.65,
+    "omggg": 0.75,
+    "omgggg": 0.76,
+    "squee": 0.64,
+    "hahaha": 0.6,
+    "haha": 0.56,
+    "really": 0.58,
+    "very": 0.49,
+    "love": 0.55,
+    "cute": 0.52,
+    "nice": 0.47,
+    "sweet": 0.45,
+    "yayness": 0.44,
+    "ugh": 0.5,
+    "aww": 0.61,
+    "yess": 0.81,
+    "yes": 0.9,
+    "pls": 0.48,
+    "pleeeease": 0.62,
+    "nooo": 0.88,
+    "noooo": 0.89,
+    "dang": 0.41,
+    "geez": 0.39,
+    "danggg": 0.44,
+    "dangit": 0.38,
+    "sick": 0.35,
+    "epic": 0.37,
+    "rad": 0.5,
+    "goal": 0.56,
+    "great": 0.46,
+    "awesome": 0.51,
+    "amazing": 0.52,
+    "perfect": 0.49,
+    "fantastic": 0.5,
+    "stellar": 0.48,
+    "yippee": 0.67,
+    "stoked": 0.48,
+    "yikes": 0.43,
+    "gosh": 0.41,
+    "heck": 0.36
+  },
+  "interjections": [
+    "wow",
+    "omg",
+    "hey",
+    "ugh",
+    "yay",
+    "yayyy",
+    "yayyyy",
+    "woo",
+    "woohoo",
+    "whoa",
+    "woah",
+    "whooo",
+    "ah",
+    "aw",
+    "aww",
+    "hmm",
+    "huh",
+    "yo",
+    "yikes",
+    "gah",
+    "phew",
+    "sheesh"
+  ],
+  "intensifiers": [
+    "so",
+    "very",
+    "really",
+    "super",
+    "mega",
+    "ultra",
+    "too",
+    "way",
+    "crazy",
+    "insanely",
+    "totally",
+    "extremely",
+    "seriously",
+    "absolutely",
+    "completely",
+    "entirely",
+    "utterly",
+    "hella",
+    "wicked",
+    "truly"
+  ],
+  "evaluatives": [
+    "cool",
+    "great",
+    "awesome",
+    "amazing",
+    "perfect",
+    "nice",
+    "sweet",
+    "lovely",
+    "loving",
+    "silly",
+    "wild",
+    "fun",
+    "funny",
+    "adorable",
+    "cute",
+    "fantastic",
+    "fabulous",
+    "brilliant",
+    "stellar",
+    "rad",
+    "epic",
+    "delightful",
+    "gorgeous"
+  ],
+  "positive_lexicon": [
+    "love",
+    "loved",
+    "loving",
+    "like",
+    "liked",
+    "awesome",
+    "amazing",
+    "yay",
+    "great",
+    "good",
+    "fun",
+    "funny",
+    "blessed",
+    "excited",
+    "cool",
+    "best",
+    "beautiful",
+    "happy",
+    "happiest",
+    "joy",
+    "joyful",
+    "thrilled",
+    "ecstatic",
+    "stoked",
+    "pumped",
+    "glad"
+  ],
+  "negative_lexicon": [
+    "bad",
+    "sad",
+    "angry",
+    "annoyed",
+    "mad",
+    "terrible",
+    "awful",
+    "hate",
+    "hated",
+    "crying",
+    "hurt",
+    "tired",
+    "worst",
+    "ugh",
+    "nope",
+    "upset",
+    "frustrated",
+    "drained",
+    "exhausted",
+    "bummed",
+    "grumpy"
+  ]
+}
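
Since the asset file ships inside the wheel, consumers can read it with the standard library alone. A minimal sketch (not part of the release), assuming only that glitchlings/data/hokey_assets.json is packaged as listed above; the real loading code presumably lives in glitchlings/util/hokey_generator.py or glitchlings/zoo/hokey.py, neither of which is shown here:

    import json
    from importlib import resources

    # Read the bundled asset file; the keys mirror the JSON added above.
    payload = json.loads(
        resources.files("glitchlings.data")
        .joinpath("hokey_assets.json")
        .read_text(encoding="utf-8")
    )
    lexical_prior = payload["lexical_prior"]       # word -> prior weight in [0, 1]
    interjections = set(payload["interjections"])
    print(lexical_prior["yay"], "yay" in interjections)  # 0.81 True
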
glitchlings/dlc/_shared.py CHANGED
@@ -5,6 +5,8 @@ from __future__ import annotations
 from collections.abc import Callable, Sequence
 from typing import Any
 
+from ..zoo.core import Gaggle, _is_transcript
+
 
 def resolve_environment(
     env: Any,
@@ -65,4 +67,87 @@ def resolve_columns(dataset: Any, columns: Sequence[str] | None) -> list[str]:
     raise ValueError("Unable to determine which dataset columns to corrupt.")
 
 
-__all__ = ["resolve_columns", "resolve_environment"]
+def normalize_column_spec(
+    columns: str | int | Sequence[str | int] | None,
+) -> list[str | int] | None:
+    """Normalize a column specification into a list of keys or indices.
+
+    Args:
+        columns: Column specification as a single value, sequence of values, or None.
+
+    Returns:
+        A list of column identifiers, or None if input was None.
+
+    Raises:
+        ValueError: If an empty sequence is provided.
+    """
+    if columns is None:
+        return None
+
+    if isinstance(columns, (str, int)):
+        return [columns]
+
+    normalized = list(columns)
+    if not normalized:
+        raise ValueError("At least one column must be specified")
+    return normalized
+
+
+def is_textual_candidate(value: Any) -> bool:
+    """Return ``True`` when ``value`` looks like text that glitchlings can corrupt.
+
+    Args:
+        value: The value to check for textual content.
+
+    Returns:
+        True if the value appears to be textual content.
+    """
+    if isinstance(value, str):
+        return True
+
+    if _is_transcript(value, allow_empty=False, require_all_content=True):
+        return True
+
+    if isinstance(value, Sequence) and not isinstance(value, (bytes, bytearray, str)):
+        if not value:
+            return False
+        if all(isinstance(item, str) for item in value):
+            return True
+        if _is_transcript(list(value), allow_empty=False, require_all_content=True):
+            return True
+
+    return False
+
+
+def corrupt_text_value(value: Any, gaggle: Gaggle) -> Any:
+    """Return ``value`` with glitchlings applied when possible.
+
+    Args:
+        value: The value to corrupt (string, transcript, or sequence of strings).
+        gaggle: The gaggle of glitchlings to apply.
+
+    Returns:
+        The corrupted value, preserving the original type where possible.
+    """
+    if isinstance(value, str):
+        return gaggle.corrupt(value)
+
+    if _is_transcript(value, allow_empty=True):
+        return gaggle.corrupt(value)
+
+    if isinstance(value, list) and value and all(isinstance(item, str) for item in value):
+        return [gaggle.corrupt(item) for item in value]
+
+    if isinstance(value, tuple) and value and all(isinstance(item, str) for item in value):
+        return tuple(gaggle.corrupt(item) for item in value)
+
+    return value
+
+
+__all__ = [
+    "corrupt_text_value",
+    "is_textual_candidate",
+    "normalize_column_spec",
+    "resolve_columns",
+    "resolve_environment",
+]
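
With these helpers promoted to glitchlings.dlc._shared, the PyTorch and Lightning DLCs below can share one implementation. A minimal sketch of the expected behaviour (not part of the diff), using only the signatures added above; corrupt_text_value is omitted because constructing a Gaggle is not shown here, and the final assert assumes _is_transcript rejects a bare integer, as its name suggests:

    from glitchlings.dlc._shared import is_textual_candidate, normalize_column_spec

    # Single keys are wrapped, sequences are copied, None passes through untouched.
    assert normalize_column_spec("text") == ["text"]
    assert normalize_column_spec((0, "label")) == [0, "label"]
    assert normalize_column_spec(None) is None

    # Strings and lists of strings look corruptible; a bare integer does not.
    assert is_textual_candidate(["hello", "world"])
    assert not is_textual_candidate(42)
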
glitchlings/dlc/huggingface.py CHANGED
@@ -10,15 +10,15 @@ from ..util.adapters import coerce_gaggle
 from ..zoo import Gaggle, Glitchling
 
 
-def _normalise_columns(column: str | Sequence[str]) -> list[str]:
-    """Normalise a column specification to a list."""
+def _normalize_columns(column: str | Sequence[str]) -> list[str]:
+    """Normalize a column specification to a list."""
     if isinstance(column, str):
         return [column]
 
-    normalised = list(column)
-    if not normalised:
+    normalized = list(column)
+    if not normalized:
         raise ValueError("At least one column must be specified")
-    return normalised
+    return normalized
 
 
 def _glitch_dataset(
@@ -29,7 +29,7 @@ def _glitch_dataset(
     seed: int = 151,
 ) -> Any:
     """Apply glitchlings to the provided dataset columns."""
-    columns = _normalise_columns(column)
+    columns = _normalize_columns(column)
     gaggle = coerce_gaggle(glitchlings, seed=seed)
     return gaggle.corrupt_dataset(dataset, columns)
 
glitchlings/dlc/prime.py CHANGED
@@ -117,7 +117,7 @@ def _as_gaggle(
 
 
 def _extract_completion_text(completion: Any) -> str:
-    """Normalise a completion payload into a plain string."""
+    """Normalize a completion payload into a plain string."""
     if isinstance(completion, str):
         return completion
 
glitchlings/dlc/pytorch.py CHANGED
@@ -9,63 +9,13 @@ from ..compat import get_torch_dataloader, require_torch
 from ..compat import torch as _torch_dependency
 from ..util.adapters import coerce_gaggle
 from ..zoo import Gaggle, Glitchling
-from ..zoo.core import _is_transcript
-
-
-def _normalise_columns(columns: str | int | Sequence[str | int] | None) -> list[str | int] | None:
-    """Normalise a column specification into a list of keys or indices."""
-    if columns is None:
-        return None
-
-    if isinstance(columns, (str, int)):
-        return [columns]
-
-    normalised = list(columns)
-    if not normalised:
-        raise ValueError("At least one column must be specified")
-    return normalised
-
-
-def _is_textual_candidate(value: Any) -> bool:
-    """Return ``True`` when ``value`` looks like text that glitchlings can corrupt."""
-    if isinstance(value, str):
-        return True
-
-    if _is_transcript(value, allow_empty=False, require_all_content=True):
-        return True
-
-    if isinstance(value, Sequence) and not isinstance(value, (bytes, bytearray, str)):
-        if not value:
-            return False
-        if all(isinstance(item, str) for item in value):
-            return True
-        if _is_transcript(list(value), allow_empty=False, require_all_content=True):
-            return True
-
-    return False
-
-
-def _corrupt_text(value: Any, gaggle: Gaggle) -> Any:
-    """Return ``value`` with glitchlings applied when possible."""
-    if isinstance(value, str):
-        return gaggle.corrupt(value)
-
-    if _is_transcript(value, allow_empty=True):
-        return gaggle.corrupt(value)
-
-    if isinstance(value, list) and value and all(isinstance(item, str) for item in value):
-        return [gaggle.corrupt(item) for item in value]
-
-    if isinstance(value, tuple) and value and all(isinstance(item, str) for item in value):
-        return tuple(gaggle.corrupt(item) for item in value)
-
-    return value
+from ._shared import corrupt_text_value, is_textual_candidate, normalize_column_spec
 
 
 def _apply_to_batch(batch: Any, targets: list[str | int] | None, gaggle: Gaggle) -> Any:
     """Return ``batch`` with glitchlings applied to the specified ``targets``."""
     if targets is None:
-        return _corrupt_text(batch, gaggle)
+        return corrupt_text_value(batch, gaggle)
 
     if isinstance(batch, Mapping):
         mutated = cast(MutableMapping[str, Any], dict(batch))
@@ -74,7 +24,7 @@ def _apply_to_batch(batch: Any, targets: list[str | int] | None, gaggle: Gaggle)
                 raise TypeError("Mapping batches require string column names")
             if key not in mutated:
                 raise ValueError(f"Column '{key}' not found in DataLoader batch")
-            mutated[key] = _corrupt_text(mutated[key], gaggle)
+            mutated[key] = corrupt_text_value(mutated[key], gaggle)
         return mutated
 
     if isinstance(batch, Sequence) and not isinstance(batch, (bytes, bytearray, str)):
@@ -83,7 +33,7 @@ def _apply_to_batch(batch: Any, targets: list[str | int] | None, gaggle: Gaggle)
             if not isinstance(index, int):
                 raise TypeError("Sequence batches require integer column indices")
             try:
-                mutated_sequence[index] = _corrupt_text(mutated_sequence[index], gaggle)
+                mutated_sequence[index] = corrupt_text_value(mutated_sequence[index], gaggle)
             except IndexError as exc: # pragma: no cover - defensive
                 raise IndexError("Column index out of range for DataLoader batch") from exc
         if isinstance(batch, tuple):
@@ -96,20 +46,20 @@ def _apply_to_batch(batch: Any, targets: list[str | int] | None, gaggle: Gaggle)
 def _infer_targets(batch: Any) -> list[str | int] | None:
     """Infer which fields should be glitched from a representative ``batch``."""
     if isinstance(batch, Mapping):
-        inferred = [key for key, value in batch.items() if _is_textual_candidate(value)]
+        inferred = [key for key, value in batch.items() if is_textual_candidate(value)]
         if inferred:
             return inferred
         raise ValueError("Unable to infer which mapping columns contain text")
 
     if isinstance(batch, Sequence) and not isinstance(batch, (bytes, bytearray, str)):
         inferred_indices: list[str | int] = [
-            idx for idx, value in enumerate(batch) if _is_textual_candidate(value)
+            idx for idx, value in enumerate(batch) if is_textual_candidate(value)
         ]
         if inferred_indices:
            return inferred_indices
         raise ValueError("Unable to infer which sequence indices contain text")
 
-    if _is_textual_candidate(batch):
+    if is_textual_candidate(batch):
         return None
 
     raise TypeError("Unsupported DataLoader batch type for glitching")
@@ -184,8 +134,8 @@ def _ensure_dataloader_class() -> type[Any]:
     ) -> _GlitchedDataLoader:
         """Return a lazily glitched view of the loader's batches."""
         gaggle = coerce_gaggle(glitchlings, seed=seed)
-        normalised = _normalise_columns(columns)
-        return _GlitchedDataLoader(self, gaggle, columns=normalised)
+        normalized = normalize_column_spec(columns)
+        return _GlitchedDataLoader(self, gaggle, columns=normalized)
 
     setattr(dataloader_cls, "glitch", glitch)
 
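
In use, the patched glitch method returns a _GlitchedDataLoader that corrupts only the requested columns while the underlying loader stays untouched. A hedged sketch (not part of the diff): the install() entry point, the keyword names columns and seed, and Hokey's default constructor are assumptions here; only the attachment of glitch via setattr above is confirmed by the diff:

    from torch.utils.data import DataLoader

    from glitchlings import Hokey
    from glitchlings.dlc import pytorch as torch_dlc  # assumed module entry point

    torch_dlc.install()  # assumed: attaches DataLoader.glitch as shown above

    loader = DataLoader([{"text": "a clean sample", "label": 0}] * 4, batch_size=2)
    glitched = loader.glitch(Hokey(), columns=["text"], seed=151)

    for batch in glitched:
        print(batch["text"])  # corrupted strings; "label" passes through unchanged
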
glitchlings/dlc/pytorch_lightning.py CHANGED
@@ -8,29 +8,7 @@ from typing import Any, cast
 from ..compat import get_pytorch_lightning_datamodule, require_pytorch_lightning
 from ..util.adapters import coerce_gaggle
 from ..zoo import Gaggle, Glitchling
-from ..zoo.core import _is_transcript
-
-
-def _normalise_columns(column: str | Sequence[str]) -> list[str]:
-    """Normalise a column specification to a list."""
-    if isinstance(column, str):
-        return [column]
-
-    normalised = list(column)
-    if not normalised:
-        raise ValueError("At least one column must be specified")
-    return normalised
-
-
-def _glitch_value(value: Any, gaggle: Gaggle) -> Any:
-    """Apply glitchlings to a value when it contains textual content."""
-    if isinstance(value, str) or _is_transcript(value, allow_empty=False, require_all_content=True):
-        return gaggle.corrupt(value)
-
-    if isinstance(value, Sequence) and value and all(isinstance(item, str) for item in value):
-        return [gaggle.corrupt(item) for item in value]
-
-    return value
+from ._shared import corrupt_text_value, normalize_column_spec
 
 
 def _glitch_batch(batch: Any, columns: list[str], gaggle: Gaggle) -> Any:
@@ -49,7 +27,7 @@ def _glitch_batch(batch: Any, columns: list[str], gaggle: Gaggle) -> Any:
         raise ValueError(f"Columns not found in batch: {missing_str}")
 
     for column in columns:
-        mutated[column] = _glitch_value(mutated[column], gaggle)
+        mutated[column] = corrupt_text_value(mutated[column], gaggle)
 
     return mutated
 
@@ -62,10 +40,7 @@ def _wrap_dataloader(dataloader: Any, columns: list[str], gaggle: Gaggle) -> Any
     if isinstance(dataloader, Mapping):
         mapping_type = cast(type[Any], dataloader.__class__)
         return mapping_type(
-            {
-                key: _wrap_dataloader(value, columns, gaggle)
-                for key, value in dataloader.items()
-            }
+            {key: _wrap_dataloader(value, columns, gaggle) for key, value in dataloader.items()}
         )
 
     if isinstance(dataloader, list):
@@ -76,9 +51,7 @@ def _wrap_dataloader(dataloader: Any, columns: list[str], gaggle: Gaggle) -> Any
 
     if isinstance(dataloader, Sequence) and not isinstance(dataloader, (str, bytes, bytearray)):
         sequence_type = cast(type[Any], dataloader.__class__)
-        return sequence_type(
-            _wrap_dataloader(value, columns, gaggle) for value in dataloader
-        )
+        return sequence_type(_wrap_dataloader(value, columns, gaggle) for value in dataloader)
 
     return _GlitchedDataLoader(dataloader, columns, gaggle)
 
@@ -111,9 +84,13 @@ def _glitch_datamodule(
 ) -> Any:
     """Return a proxy that applies glitchlings to batches from the datamodule."""
 
-    columns = _normalise_columns(column)
+    columns = normalize_column_spec(column)
+    if columns is None: # pragma: no cover - defensive
+        raise ValueError("At least one column must be specified")
+    # Lightning datamodules only support string column names (mapping keys)
+    columns_str = cast(list[str], columns)
     gaggle = coerce_gaggle(glitchlings, seed=seed)
-    return _GlitchedLightningDataModule(datamodule, columns, gaggle)
+    return _GlitchedLightningDataModule(datamodule, columns_str, gaggle)
 
 
 class _GlitchedLightningDataModule:
@@ -230,4 +207,3 @@ else: # pragma: no cover - optional dependency
 
 
 __all__ = ["LightningDataModule", "install"]
-
glitchlings/lexicon/__init__.py CHANGED
@@ -102,7 +102,11 @@ from .vector import VectorLexicon, build_vector_cache # noqa: E402
 _WordNetLexicon: type[LexiconBackend] | None
 try: # pragma: no cover - optional dependency
     from .wordnet import WordNetLexicon as _WordNetLexicon
-except Exception: # pragma: no cover - triggered when nltk unavailable
+except (
+    ImportError,
+    ModuleNotFoundError,
+    AttributeError,
+): # pragma: no cover - triggered when nltk unavailable
     _WordNetLexicon = None
 
 WordNetLexicon: type[LexiconBackend] | None = _WordNetLexicon
glitchlings/lexicon/_cache.py CHANGED
@@ -19,7 +19,7 @@ class CacheSnapshot:
     checksum: str | None = None
 
 
-def _normalise_entries(payload: Mapping[str, object]) -> CacheEntries:
+def _normalize_entries(payload: Mapping[str, object]) -> CacheEntries:
     """Convert raw cache payloads into canonical mapping form."""
     entries: CacheEntries = {}
     for key, values in payload.items():
@@ -75,7 +75,7 @@ def load_cache(path: Path) -> CacheSnapshot:
     else:
         entries_payload = payload # legacy format without metadata
 
-    entries = _normalise_entries(entries_payload)
+    entries = _normalize_entries(entries_payload)
     if checksum is not None:
         expected = compute_checksum(entries)
         if checksum != expected:
@@ -88,9 +88,7 @@ def load_cache(path: Path) -> CacheSnapshot:
 
 
 def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapshot:
     """Persist ``entries`` to ``path`` with checksum metadata."""
-    serialisable: CacheEntries = {
-        key: list(values) for key, values in sorted(entries.items())
-    }
+    serialisable: CacheEntries = {key: list(values) for key, values in sorted(entries.items())}
     checksum = compute_checksum(serialisable)
     payload = {
         "__meta__": {
glitchlings/lexicon/vector.py CHANGED
@@ -16,6 +16,9 @@ from ._cache import CacheSnapshot
 from ._cache import load_cache as _load_cache_file
 from ._cache import write_cache as _write_cache_file
 
+# Minimum number of neighbors to consider for similarity queries
+MIN_NEIGHBORS = 1
+
 
 def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
     """Return the cosine similarity between two dense vectors."""
@@ -304,7 +307,7 @@ class VectorLexicon(LexiconBackend):
         """Initialise the lexicon with an embedding ``source`` and optional cache."""
         super().__init__(seed=seed)
         self._adapter = _resolve_source(source)
-        self._max_neighbors = max(1, max_neighbors)
+        self._max_neighbors = max(MIN_NEIGHBORS, max_neighbors)
         self._min_similarity = min_similarity
         self._cache: MutableMapping[str, list[str]] = {}
         self._cache_path: Path | None
@@ -371,7 +374,7 @@ class VectorLexicon(LexiconBackend):
         if cache_key in self._cache:
             return self._cache[cache_key]
 
-        neighbor_limit = self._max_neighbors if limit is None else max(1, limit)
+        neighbor_limit = self._max_neighbors if limit is None else max(MIN_NEIGHBORS, limit)
         neighbors = self._fetch_neighbors(
             original=original, normalized=normalized, limit=neighbor_limit
         )
@@ -624,9 +627,7 @@ def main(argv: Sequence[str] | None = None) -> int:
     )
     iterator = lexicon.iter_vocabulary()
     if args.limit is not None:
-        token_iter = (
-            token for index, token in enumerate(iterator) if index < args.limit
-        )
+        token_iter = (token for index, token in enumerate(iterator) if index < args.limit)
     else:
         token_iter = iterator