glitchlings 0.4.3__cp312-cp312-win_amd64.whl → 0.4.5__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of glitchlings might be problematic. Click here for more details.
- glitchlings/__init__.py +4 -0
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/compat.py +2 -4
- glitchlings/config.py +2 -4
- glitchlings/data/__init__.py +1 -0
- glitchlings/data/hokey_assets.json +193 -0
- glitchlings/dlc/_shared.py +86 -1
- glitchlings/dlc/huggingface.py +6 -6
- glitchlings/dlc/prime.py +1 -1
- glitchlings/dlc/pytorch.py +9 -59
- glitchlings/dlc/pytorch_lightning.py +10 -34
- glitchlings/lexicon/__init__.py +5 -1
- glitchlings/lexicon/_cache.py +3 -5
- glitchlings/lexicon/vector.py +6 -5
- glitchlings/lexicon/wordnet.py +4 -8
- glitchlings/util/hokey_generator.py +144 -0
- glitchlings/util/stretch_locator.py +140 -0
- glitchlings/util/stretchability.py +375 -0
- glitchlings/zoo/__init__.py +5 -1
- glitchlings/zoo/_rate.py +114 -1
- glitchlings/zoo/_rust_extensions.py +143 -0
- glitchlings/zoo/adjax.py +3 -4
- glitchlings/zoo/apostrofae.py +3 -4
- glitchlings/zoo/core.py +21 -9
- glitchlings/zoo/hokey.py +173 -0
- glitchlings/zoo/jargoyle.py +6 -2
- glitchlings/zoo/redactyl.py +4 -5
- glitchlings/zoo/reduple.py +3 -4
- glitchlings/zoo/rushmore.py +3 -4
- glitchlings/zoo/scannequin.py +3 -4
- glitchlings/zoo/typogre.py +3 -4
- glitchlings/zoo/zeedub.py +3 -4
- {glitchlings-0.4.3.dist-info → glitchlings-0.4.5.dist-info}/METADATA +32 -8
- glitchlings-0.4.5.dist-info/RECORD +53 -0
- glitchlings-0.4.3.dist-info/RECORD +0 -46
- {glitchlings-0.4.3.dist-info → glitchlings-0.4.5.dist-info}/WHEEL +0 -0
- {glitchlings-0.4.3.dist-info → glitchlings-0.4.5.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.4.3.dist-info → glitchlings-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.4.3.dist-info → glitchlings-0.4.5.dist-info}/top_level.txt +0 -0
glitchlings/__init__.py
CHANGED
|
@@ -5,6 +5,7 @@ from .zoo import (
|
|
|
5
5
|
Apostrofae,
|
|
6
6
|
Gaggle,
|
|
7
7
|
Glitchling,
|
|
8
|
+
Hokey,
|
|
8
9
|
Jargoyle,
|
|
9
10
|
Mim1c,
|
|
10
11
|
Redactyl,
|
|
@@ -15,6 +16,7 @@ from .zoo import (
|
|
|
15
16
|
Zeedub,
|
|
16
17
|
adjax,
|
|
17
18
|
apostrofae,
|
|
19
|
+
hokey,
|
|
18
20
|
is_rust_pipeline_enabled,
|
|
19
21
|
is_rust_pipeline_supported,
|
|
20
22
|
jargoyle,
|
|
@@ -42,6 +44,8 @@ __all__ = [
|
|
|
42
44
|
"adjax",
|
|
43
45
|
"Apostrofae",
|
|
44
46
|
"apostrofae",
|
|
47
|
+
"Hokey",
|
|
48
|
+
"hokey",
|
|
45
49
|
"Redactyl",
|
|
46
50
|
"redactyl",
|
|
47
51
|
"Reduple",
|
|
glitchlings/_zoo_rust.cp312-win_amd64.pyd
Binary file
|
glitchlings/compat.py
CHANGED
|
@@ -17,16 +17,14 @@ _MISSING = _MissingSentinel()
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
class _MarkerProtocol(Protocol):
|
|
20
|
-
def evaluate(self, environment: dict[str, str]) -> bool:
|
|
21
|
-
...
|
|
20
|
+
def evaluate(self, environment: dict[str, str]) -> bool: ...
|
|
22
21
|
|
|
23
22
|
|
|
24
23
|
class _RequirementProtocol(Protocol):
|
|
25
24
|
marker: _MarkerProtocol | None
|
|
26
25
|
name: str
|
|
27
26
|
|
|
28
|
-
def __init__(self, requirement: str) -> None:
|
|
29
|
-
...
|
|
27
|
+
def __init__(self, requirement: str) -> None: ...
|
|
30
28
|
|
|
31
29
|
|
|
32
30
|
try: # pragma: no cover - packaging is bundled with modern Python environments
|
glitchlings/config.py
CHANGED
|
@@ -19,8 +19,7 @@ except ModuleNotFoundError: # pragma: no cover - Python < 3.11
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class _TomllibModule(Protocol):
|
|
22
|
-
def load(self, fp: IO[bytes]) -> Any:
|
|
23
|
-
...
|
|
22
|
+
def load(self, fp: IO[bytes]) -> Any: ...
|
|
24
23
|
|
|
25
24
|
|
|
26
25
|
tomllib = cast(_TomllibModule, _tomllib)
|
|
@@ -29,8 +28,7 @@ tomllib = cast(_TomllibModule, _tomllib)
|
|
|
29
28
|
class _YamlModule(Protocol):
|
|
30
29
|
YAMLError: type[Exception]
|
|
31
30
|
|
|
32
|
-
def safe_load(self, stream: str) -> Any:
|
|
33
|
-
...
|
|
31
|
+
def safe_load(self, stream: str) -> Any: ...
|
|
34
32
|
|
|
35
33
|
|
|
36
34
|
yaml = cast(_YamlModule, importlib.import_module("yaml"))
|
glitchlings/data/__init__.py
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Static data assets shared across Glitchlings implementations."""
|
glitchlings/data/hokey_assets.json
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexical_prior": {
|
|
3
|
+
"so": 0.92,
|
|
4
|
+
"no": 0.89,
|
|
5
|
+
"go": 0.72,
|
|
6
|
+
"yeah": 0.86,
|
|
7
|
+
"yay": 0.81,
|
|
8
|
+
"ya": 0.7,
|
|
9
|
+
"hey": 0.66,
|
|
10
|
+
"okay": 0.68,
|
|
11
|
+
"ok": 0.64,
|
|
12
|
+
"cool": 0.78,
|
|
13
|
+
"omg": 0.74,
|
|
14
|
+
"wow": 0.88,
|
|
15
|
+
"wee": 0.62,
|
|
16
|
+
"woo": 0.69,
|
|
17
|
+
"woohoo": 0.74,
|
|
18
|
+
"whoa": 0.71,
|
|
19
|
+
"woah": 0.7,
|
|
20
|
+
"yayyy": 0.75,
|
|
21
|
+
"yayyyy": 0.76,
|
|
22
|
+
"yas": 0.79,
|
|
23
|
+
"yass": 0.8,
|
|
24
|
+
"yaaas": 0.82,
|
|
25
|
+
"please": 0.53,
|
|
26
|
+
"pleaseee": 0.57,
|
|
27
|
+
"pleaseeee": 0.6,
|
|
28
|
+
"pleaseeeee": 0.63,
|
|
29
|
+
"lol": 0.83,
|
|
30
|
+
"lmao": 0.65,
|
|
31
|
+
"omggg": 0.75,
|
|
32
|
+
"omgggg": 0.76,
|
|
33
|
+
"squee": 0.64,
|
|
34
|
+
"hahaha": 0.6,
|
|
35
|
+
"haha": 0.56,
|
|
36
|
+
"really": 0.58,
|
|
37
|
+
"very": 0.49,
|
|
38
|
+
"love": 0.55,
|
|
39
|
+
"cute": 0.52,
|
|
40
|
+
"nice": 0.47,
|
|
41
|
+
"sweet": 0.45,
|
|
42
|
+
"yayness": 0.44,
|
|
43
|
+
"ugh": 0.5,
|
|
44
|
+
"aww": 0.61,
|
|
45
|
+
"yess": 0.81,
|
|
46
|
+
"yes": 0.9,
|
|
47
|
+
"pls": 0.48,
|
|
48
|
+
"pleeeease": 0.62,
|
|
49
|
+
"nooo": 0.88,
|
|
50
|
+
"noooo": 0.89,
|
|
51
|
+
"dang": 0.41,
|
|
52
|
+
"geez": 0.39,
|
|
53
|
+
"danggg": 0.44,
|
|
54
|
+
"dangit": 0.38,
|
|
55
|
+
"sick": 0.35,
|
|
56
|
+
"epic": 0.37,
|
|
57
|
+
"rad": 0.5,
|
|
58
|
+
"goal": 0.56,
|
|
59
|
+
"great": 0.46,
|
|
60
|
+
"awesome": 0.51,
|
|
61
|
+
"amazing": 0.52,
|
|
62
|
+
"perfect": 0.49,
|
|
63
|
+
"fantastic": 0.5,
|
|
64
|
+
"stellar": 0.48,
|
|
65
|
+
"yippee": 0.67,
|
|
66
|
+
"stoked": 0.48,
|
|
67
|
+
"yikes": 0.43,
|
|
68
|
+
"gosh": 0.41,
|
|
69
|
+
"heck": 0.36
|
|
70
|
+
},
|
|
71
|
+
"interjections": [
|
|
72
|
+
"wow",
|
|
73
|
+
"omg",
|
|
74
|
+
"hey",
|
|
75
|
+
"ugh",
|
|
76
|
+
"yay",
|
|
77
|
+
"yayyy",
|
|
78
|
+
"yayyyy",
|
|
79
|
+
"woo",
|
|
80
|
+
"woohoo",
|
|
81
|
+
"whoa",
|
|
82
|
+
"woah",
|
|
83
|
+
"whooo",
|
|
84
|
+
"ah",
|
|
85
|
+
"aw",
|
|
86
|
+
"aww",
|
|
87
|
+
"hmm",
|
|
88
|
+
"huh",
|
|
89
|
+
"yo",
|
|
90
|
+
"yikes",
|
|
91
|
+
"gah",
|
|
92
|
+
"phew",
|
|
93
|
+
"sheesh"
|
|
94
|
+
],
|
|
95
|
+
"intensifiers": [
|
|
96
|
+
"so",
|
|
97
|
+
"very",
|
|
98
|
+
"really",
|
|
99
|
+
"super",
|
|
100
|
+
"mega",
|
|
101
|
+
"ultra",
|
|
102
|
+
"too",
|
|
103
|
+
"way",
|
|
104
|
+
"crazy",
|
|
105
|
+
"insanely",
|
|
106
|
+
"totally",
|
|
107
|
+
"extremely",
|
|
108
|
+
"seriously",
|
|
109
|
+
"absolutely",
|
|
110
|
+
"completely",
|
|
111
|
+
"entirely",
|
|
112
|
+
"utterly",
|
|
113
|
+
"hella",
|
|
114
|
+
"wicked",
|
|
115
|
+
"truly"
|
|
116
|
+
],
|
|
117
|
+
"evaluatives": [
|
|
118
|
+
"cool",
|
|
119
|
+
"great",
|
|
120
|
+
"awesome",
|
|
121
|
+
"amazing",
|
|
122
|
+
"perfect",
|
|
123
|
+
"nice",
|
|
124
|
+
"sweet",
|
|
125
|
+
"lovely",
|
|
126
|
+
"loving",
|
|
127
|
+
"silly",
|
|
128
|
+
"wild",
|
|
129
|
+
"fun",
|
|
130
|
+
"funny",
|
|
131
|
+
"adorable",
|
|
132
|
+
"cute",
|
|
133
|
+
"fantastic",
|
|
134
|
+
"fabulous",
|
|
135
|
+
"brilliant",
|
|
136
|
+
"stellar",
|
|
137
|
+
"rad",
|
|
138
|
+
"epic",
|
|
139
|
+
"delightful",
|
|
140
|
+
"gorgeous"
|
|
141
|
+
],
|
|
142
|
+
"positive_lexicon": [
|
|
143
|
+
"love",
|
|
144
|
+
"loved",
|
|
145
|
+
"loving",
|
|
146
|
+
"like",
|
|
147
|
+
"liked",
|
|
148
|
+
"awesome",
|
|
149
|
+
"amazing",
|
|
150
|
+
"yay",
|
|
151
|
+
"great",
|
|
152
|
+
"good",
|
|
153
|
+
"fun",
|
|
154
|
+
"funny",
|
|
155
|
+
"blessed",
|
|
156
|
+
"excited",
|
|
157
|
+
"cool",
|
|
158
|
+
"best",
|
|
159
|
+
"beautiful",
|
|
160
|
+
"happy",
|
|
161
|
+
"happiest",
|
|
162
|
+
"joy",
|
|
163
|
+
"joyful",
|
|
164
|
+
"thrilled",
|
|
165
|
+
"ecstatic",
|
|
166
|
+
"stoked",
|
|
167
|
+
"pumped",
|
|
168
|
+
"glad"
|
|
169
|
+
],
|
|
170
|
+
"negative_lexicon": [
|
|
171
|
+
"bad",
|
|
172
|
+
"sad",
|
|
173
|
+
"angry",
|
|
174
|
+
"annoyed",
|
|
175
|
+
"mad",
|
|
176
|
+
"terrible",
|
|
177
|
+
"awful",
|
|
178
|
+
"hate",
|
|
179
|
+
"hated",
|
|
180
|
+
"crying",
|
|
181
|
+
"hurt",
|
|
182
|
+
"tired",
|
|
183
|
+
"worst",
|
|
184
|
+
"ugh",
|
|
185
|
+
"nope",
|
|
186
|
+
"upset",
|
|
187
|
+
"frustrated",
|
|
188
|
+
"drained",
|
|
189
|
+
"exhausted",
|
|
190
|
+
"bummed",
|
|
191
|
+
"grumpy"
|
|
192
|
+
]
|
|
193
|
+
}
|
glitchlings/dlc/_shared.py
CHANGED
|
@@ -5,6 +5,8 @@ from __future__ import annotations
|
|
|
5
5
|
from collections.abc import Callable, Sequence
|
|
6
6
|
from typing import Any
|
|
7
7
|
|
|
8
|
+
from ..zoo.core import Gaggle, _is_transcript
|
|
9
|
+
|
|
8
10
|
|
|
9
11
|
def resolve_environment(
|
|
10
12
|
env: Any,
|
|
@@ -65,4 +67,87 @@ def resolve_columns(dataset: Any, columns: Sequence[str] | None) -> list[str]:
|
|
|
65
67
|
raise ValueError("Unable to determine which dataset columns to corrupt.")
|
|
66
68
|
|
|
67
69
|
|
|
68
|
-
|
|
70
|
+
def normalize_column_spec(
|
|
71
|
+
columns: str | int | Sequence[str | int] | None,
|
|
72
|
+
) -> list[str | int] | None:
|
|
73
|
+
"""Normalize a column specification into a list of keys or indices.
|
|
74
|
+
|
|
75
|
+
Args:
|
|
76
|
+
columns: Column specification as a single value, sequence of values, or None.
|
|
77
|
+
|
|
78
|
+
Returns:
|
|
79
|
+
A list of column identifiers, or None if input was None.
|
|
80
|
+
|
|
81
|
+
Raises:
|
|
82
|
+
ValueError: If an empty sequence is provided.
|
|
83
|
+
"""
|
|
84
|
+
if columns is None:
|
|
85
|
+
return None
|
|
86
|
+
|
|
87
|
+
if isinstance(columns, (str, int)):
|
|
88
|
+
return [columns]
|
|
89
|
+
|
|
90
|
+
normalized = list(columns)
|
|
91
|
+
if not normalized:
|
|
92
|
+
raise ValueError("At least one column must be specified")
|
|
93
|
+
return normalized
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def is_textual_candidate(value: Any) -> bool:
|
|
97
|
+
"""Return ``True`` when ``value`` looks like text that glitchlings can corrupt.
|
|
98
|
+
|
|
99
|
+
Args:
|
|
100
|
+
value: The value to check for textual content.
|
|
101
|
+
|
|
102
|
+
Returns:
|
|
103
|
+
True if the value appears to be textual content.
|
|
104
|
+
"""
|
|
105
|
+
if isinstance(value, str):
|
|
106
|
+
return True
|
|
107
|
+
|
|
108
|
+
if _is_transcript(value, allow_empty=False, require_all_content=True):
|
|
109
|
+
return True
|
|
110
|
+
|
|
111
|
+
if isinstance(value, Sequence) and not isinstance(value, (bytes, bytearray, str)):
|
|
112
|
+
if not value:
|
|
113
|
+
return False
|
|
114
|
+
if all(isinstance(item, str) for item in value):
|
|
115
|
+
return True
|
|
116
|
+
if _is_transcript(list(value), allow_empty=False, require_all_content=True):
|
|
117
|
+
return True
|
|
118
|
+
|
|
119
|
+
return False
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def corrupt_text_value(value: Any, gaggle: Gaggle) -> Any:
|
|
123
|
+
"""Return ``value`` with glitchlings applied when possible.
|
|
124
|
+
|
|
125
|
+
Args:
|
|
126
|
+
value: The value to corrupt (string, transcript, or sequence of strings).
|
|
127
|
+
gaggle: The gaggle of glitchlings to apply.
|
|
128
|
+
|
|
129
|
+
Returns:
|
|
130
|
+
The corrupted value, preserving the original type where possible.
|
|
131
|
+
"""
|
|
132
|
+
if isinstance(value, str):
|
|
133
|
+
return gaggle.corrupt(value)
|
|
134
|
+
|
|
135
|
+
if _is_transcript(value, allow_empty=True):
|
|
136
|
+
return gaggle.corrupt(value)
|
|
137
|
+
|
|
138
|
+
if isinstance(value, list) and value and all(isinstance(item, str) for item in value):
|
|
139
|
+
return [gaggle.corrupt(item) for item in value]
|
|
140
|
+
|
|
141
|
+
if isinstance(value, tuple) and value and all(isinstance(item, str) for item in value):
|
|
142
|
+
return tuple(gaggle.corrupt(item) for item in value)
|
|
143
|
+
|
|
144
|
+
return value
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
__all__ = [
|
|
148
|
+
"corrupt_text_value",
|
|
149
|
+
"is_textual_candidate",
|
|
150
|
+
"normalize_column_spec",
|
|
151
|
+
"resolve_columns",
|
|
152
|
+
"resolve_environment",
|
|
153
|
+
]
|
glitchlings/dlc/huggingface.py
CHANGED
|
@@ -10,15 +10,15 @@ from ..util.adapters import coerce_gaggle
|
|
|
10
10
|
from ..zoo import Gaggle, Glitchling
|
|
11
11
|
|
|
12
12
|
|
|
13
|
-
def
|
|
14
|
-
"""
|
|
13
|
+
def _normalize_columns(column: str | Sequence[str]) -> list[str]:
|
|
14
|
+
"""Normalize a column specification to a list."""
|
|
15
15
|
if isinstance(column, str):
|
|
16
16
|
return [column]
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
if not
|
|
18
|
+
normalized = list(column)
|
|
19
|
+
if not normalized:
|
|
20
20
|
raise ValueError("At least one column must be specified")
|
|
21
|
-
return
|
|
21
|
+
return normalized
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
def _glitch_dataset(
|
|
@@ -29,7 +29,7 @@ def _glitch_dataset(
|
|
|
29
29
|
seed: int = 151,
|
|
30
30
|
) -> Any:
|
|
31
31
|
"""Apply glitchlings to the provided dataset columns."""
|
|
32
|
-
columns =
|
|
32
|
+
columns = _normalize_columns(column)
|
|
33
33
|
gaggle = coerce_gaggle(glitchlings, seed=seed)
|
|
34
34
|
return gaggle.corrupt_dataset(dataset, columns)
|
|
35
35
|
|
glitchlings/dlc/prime.py
CHANGED
|
@@ -117,7 +117,7 @@ def _as_gaggle(
|
|
|
117
117
|
|
|
118
118
|
|
|
119
119
|
def _extract_completion_text(completion: Any) -> str:
|
|
120
|
-
"""
|
|
120
|
+
"""Normalize a completion payload into a plain string."""
|
|
121
121
|
if isinstance(completion, str):
|
|
122
122
|
return completion
|
|
123
123
|
|
glitchlings/dlc/pytorch.py
CHANGED
|
@@ -9,63 +9,13 @@ from ..compat import get_torch_dataloader, require_torch
|
|
|
9
9
|
from ..compat import torch as _torch_dependency
|
|
10
10
|
from ..util.adapters import coerce_gaggle
|
|
11
11
|
from ..zoo import Gaggle, Glitchling
|
|
12
|
-
from
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def _normalise_columns(columns: str | int | Sequence[str | int] | None) -> list[str | int] | None:
|
|
16
|
-
"""Normalise a column specification into a list of keys or indices."""
|
|
17
|
-
if columns is None:
|
|
18
|
-
return None
|
|
19
|
-
|
|
20
|
-
if isinstance(columns, (str, int)):
|
|
21
|
-
return [columns]
|
|
22
|
-
|
|
23
|
-
normalised = list(columns)
|
|
24
|
-
if not normalised:
|
|
25
|
-
raise ValueError("At least one column must be specified")
|
|
26
|
-
return normalised
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def _is_textual_candidate(value: Any) -> bool:
|
|
30
|
-
"""Return ``True`` when ``value`` looks like text that glitchlings can corrupt."""
|
|
31
|
-
if isinstance(value, str):
|
|
32
|
-
return True
|
|
33
|
-
|
|
34
|
-
if _is_transcript(value, allow_empty=False, require_all_content=True):
|
|
35
|
-
return True
|
|
36
|
-
|
|
37
|
-
if isinstance(value, Sequence) and not isinstance(value, (bytes, bytearray, str)):
|
|
38
|
-
if not value:
|
|
39
|
-
return False
|
|
40
|
-
if all(isinstance(item, str) for item in value):
|
|
41
|
-
return True
|
|
42
|
-
if _is_transcript(list(value), allow_empty=False, require_all_content=True):
|
|
43
|
-
return True
|
|
44
|
-
|
|
45
|
-
return False
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def _corrupt_text(value: Any, gaggle: Gaggle) -> Any:
|
|
49
|
-
"""Return ``value`` with glitchlings applied when possible."""
|
|
50
|
-
if isinstance(value, str):
|
|
51
|
-
return gaggle.corrupt(value)
|
|
52
|
-
|
|
53
|
-
if _is_transcript(value, allow_empty=True):
|
|
54
|
-
return gaggle.corrupt(value)
|
|
55
|
-
|
|
56
|
-
if isinstance(value, list) and value and all(isinstance(item, str) for item in value):
|
|
57
|
-
return [gaggle.corrupt(item) for item in value]
|
|
58
|
-
|
|
59
|
-
if isinstance(value, tuple) and value and all(isinstance(item, str) for item in value):
|
|
60
|
-
return tuple(gaggle.corrupt(item) for item in value)
|
|
61
|
-
|
|
62
|
-
return value
|
|
12
|
+
from ._shared import corrupt_text_value, is_textual_candidate, normalize_column_spec
|
|
63
13
|
|
|
64
14
|
|
|
65
15
|
def _apply_to_batch(batch: Any, targets: list[str | int] | None, gaggle: Gaggle) -> Any:
|
|
66
16
|
"""Return ``batch`` with glitchlings applied to the specified ``targets``."""
|
|
67
17
|
if targets is None:
|
|
68
|
-
return
|
|
18
|
+
return corrupt_text_value(batch, gaggle)
|
|
69
19
|
|
|
70
20
|
if isinstance(batch, Mapping):
|
|
71
21
|
mutated = cast(MutableMapping[str, Any], dict(batch))
|
|
@@ -74,7 +24,7 @@ def _apply_to_batch(batch: Any, targets: list[str | int] | None, gaggle: Gaggle)
|
|
|
74
24
|
raise TypeError("Mapping batches require string column names")
|
|
75
25
|
if key not in mutated:
|
|
76
26
|
raise ValueError(f"Column '{key}' not found in DataLoader batch")
|
|
77
|
-
mutated[key] =
|
|
27
|
+
mutated[key] = corrupt_text_value(mutated[key], gaggle)
|
|
78
28
|
return mutated
|
|
79
29
|
|
|
80
30
|
if isinstance(batch, Sequence) and not isinstance(batch, (bytes, bytearray, str)):
|
|
@@ -83,7 +33,7 @@ def _apply_to_batch(batch: Any, targets: list[str | int] | None, gaggle: Gaggle)
|
|
|
83
33
|
if not isinstance(index, int):
|
|
84
34
|
raise TypeError("Sequence batches require integer column indices")
|
|
85
35
|
try:
|
|
86
|
-
mutated_sequence[index] =
|
|
36
|
+
mutated_sequence[index] = corrupt_text_value(mutated_sequence[index], gaggle)
|
|
87
37
|
except IndexError as exc: # pragma: no cover - defensive
|
|
88
38
|
raise IndexError("Column index out of range for DataLoader batch") from exc
|
|
89
39
|
if isinstance(batch, tuple):
|
|
@@ -96,20 +46,20 @@ def _apply_to_batch(batch: Any, targets: list[str | int] | None, gaggle: Gaggle)
|
|
|
96
46
|
def _infer_targets(batch: Any) -> list[str | int] | None:
|
|
97
47
|
"""Infer which fields should be glitched from a representative ``batch``."""
|
|
98
48
|
if isinstance(batch, Mapping):
|
|
99
|
-
inferred = [key for key, value in batch.items() if
|
|
49
|
+
inferred = [key for key, value in batch.items() if is_textual_candidate(value)]
|
|
100
50
|
if inferred:
|
|
101
51
|
return inferred
|
|
102
52
|
raise ValueError("Unable to infer which mapping columns contain text")
|
|
103
53
|
|
|
104
54
|
if isinstance(batch, Sequence) and not isinstance(batch, (bytes, bytearray, str)):
|
|
105
55
|
inferred_indices: list[str | int] = [
|
|
106
|
-
idx for idx, value in enumerate(batch) if
|
|
56
|
+
idx for idx, value in enumerate(batch) if is_textual_candidate(value)
|
|
107
57
|
]
|
|
108
58
|
if inferred_indices:
|
|
109
59
|
return inferred_indices
|
|
110
60
|
raise ValueError("Unable to infer which sequence indices contain text")
|
|
111
61
|
|
|
112
|
-
if
|
|
62
|
+
if is_textual_candidate(batch):
|
|
113
63
|
return None
|
|
114
64
|
|
|
115
65
|
raise TypeError("Unsupported DataLoader batch type for glitching")
|
|
@@ -184,8 +134,8 @@ def _ensure_dataloader_class() -> type[Any]:
|
|
|
184
134
|
) -> _GlitchedDataLoader:
|
|
185
135
|
"""Return a lazily glitched view of the loader's batches."""
|
|
186
136
|
gaggle = coerce_gaggle(glitchlings, seed=seed)
|
|
187
|
-
|
|
188
|
-
return _GlitchedDataLoader(self, gaggle, columns=
|
|
137
|
+
normalized = normalize_column_spec(columns)
|
|
138
|
+
return _GlitchedDataLoader(self, gaggle, columns=normalized)
|
|
189
139
|
|
|
190
140
|
setattr(dataloader_cls, "glitch", glitch)
|
|
191
141
|
|
glitchlings/dlc/pytorch_lightning.py
CHANGED
|
@@ -8,29 +8,7 @@ from typing import Any, cast
|
|
|
8
8
|
from ..compat import get_pytorch_lightning_datamodule, require_pytorch_lightning
|
|
9
9
|
from ..util.adapters import coerce_gaggle
|
|
10
10
|
from ..zoo import Gaggle, Glitchling
|
|
11
|
-
from
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def _normalise_columns(column: str | Sequence[str]) -> list[str]:
|
|
15
|
-
"""Normalise a column specification to a list."""
|
|
16
|
-
if isinstance(column, str):
|
|
17
|
-
return [column]
|
|
18
|
-
|
|
19
|
-
normalised = list(column)
|
|
20
|
-
if not normalised:
|
|
21
|
-
raise ValueError("At least one column must be specified")
|
|
22
|
-
return normalised
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def _glitch_value(value: Any, gaggle: Gaggle) -> Any:
|
|
26
|
-
"""Apply glitchlings to a value when it contains textual content."""
|
|
27
|
-
if isinstance(value, str) or _is_transcript(value, allow_empty=False, require_all_content=True):
|
|
28
|
-
return gaggle.corrupt(value)
|
|
29
|
-
|
|
30
|
-
if isinstance(value, Sequence) and value and all(isinstance(item, str) for item in value):
|
|
31
|
-
return [gaggle.corrupt(item) for item in value]
|
|
32
|
-
|
|
33
|
-
return value
|
|
11
|
+
from ._shared import corrupt_text_value, normalize_column_spec
|
|
34
12
|
|
|
35
13
|
|
|
36
14
|
def _glitch_batch(batch: Any, columns: list[str], gaggle: Gaggle) -> Any:
|
|
@@ -49,7 +27,7 @@ def _glitch_batch(batch: Any, columns: list[str], gaggle: Gaggle) -> Any:
|
|
|
49
27
|
raise ValueError(f"Columns not found in batch: {missing_str}")
|
|
50
28
|
|
|
51
29
|
for column in columns:
|
|
52
|
-
mutated[column] =
|
|
30
|
+
mutated[column] = corrupt_text_value(mutated[column], gaggle)
|
|
53
31
|
|
|
54
32
|
return mutated
|
|
55
33
|
|
|
@@ -62,10 +40,7 @@ def _wrap_dataloader(dataloader: Any, columns: list[str], gaggle: Gaggle) -> Any
|
|
|
62
40
|
if isinstance(dataloader, Mapping):
|
|
63
41
|
mapping_type = cast(type[Any], dataloader.__class__)
|
|
64
42
|
return mapping_type(
|
|
65
|
-
{
|
|
66
|
-
key: _wrap_dataloader(value, columns, gaggle)
|
|
67
|
-
for key, value in dataloader.items()
|
|
68
|
-
}
|
|
43
|
+
{key: _wrap_dataloader(value, columns, gaggle) for key, value in dataloader.items()}
|
|
69
44
|
)
|
|
70
45
|
|
|
71
46
|
if isinstance(dataloader, list):
|
|
@@ -76,9 +51,7 @@ def _wrap_dataloader(dataloader: Any, columns: list[str], gaggle: Gaggle) -> Any
|
|
|
76
51
|
|
|
77
52
|
if isinstance(dataloader, Sequence) and not isinstance(dataloader, (str, bytes, bytearray)):
|
|
78
53
|
sequence_type = cast(type[Any], dataloader.__class__)
|
|
79
|
-
return sequence_type(
|
|
80
|
-
_wrap_dataloader(value, columns, gaggle) for value in dataloader
|
|
81
|
-
)
|
|
54
|
+
return sequence_type(_wrap_dataloader(value, columns, gaggle) for value in dataloader)
|
|
82
55
|
|
|
83
56
|
return _GlitchedDataLoader(dataloader, columns, gaggle)
|
|
84
57
|
|
|
@@ -111,9 +84,13 @@ def _glitch_datamodule(
|
|
|
111
84
|
) -> Any:
|
|
112
85
|
"""Return a proxy that applies glitchlings to batches from the datamodule."""
|
|
113
86
|
|
|
114
|
-
columns =
|
|
87
|
+
columns = normalize_column_spec(column)
|
|
88
|
+
if columns is None: # pragma: no cover - defensive
|
|
89
|
+
raise ValueError("At least one column must be specified")
|
|
90
|
+
# Lightning datamodules only support string column names (mapping keys)
|
|
91
|
+
columns_str = cast(list[str], columns)
|
|
115
92
|
gaggle = coerce_gaggle(glitchlings, seed=seed)
|
|
116
|
-
return _GlitchedLightningDataModule(datamodule,
|
|
93
|
+
return _GlitchedLightningDataModule(datamodule, columns_str, gaggle)
|
|
117
94
|
|
|
118
95
|
|
|
119
96
|
class _GlitchedLightningDataModule:
|
|
@@ -230,4 +207,3 @@ else: # pragma: no cover - optional dependency
|
|
|
230
207
|
|
|
231
208
|
|
|
232
209
|
__all__ = ["LightningDataModule", "install"]
|
|
233
|
-
|
glitchlings/lexicon/__init__.py
CHANGED
|
@@ -102,7 +102,11 @@ from .vector import VectorLexicon, build_vector_cache # noqa: E402
|
|
|
102
102
|
_WordNetLexicon: type[LexiconBackend] | None
|
|
103
103
|
try: # pragma: no cover - optional dependency
|
|
104
104
|
from .wordnet import WordNetLexicon as _WordNetLexicon
|
|
105
|
-
except
|
|
105
|
+
except (
|
|
106
|
+
ImportError,
|
|
107
|
+
ModuleNotFoundError,
|
|
108
|
+
AttributeError,
|
|
109
|
+
): # pragma: no cover - triggered when nltk unavailable
|
|
106
110
|
_WordNetLexicon = None
|
|
107
111
|
|
|
108
112
|
WordNetLexicon: type[LexiconBackend] | None = _WordNetLexicon
|
glitchlings/lexicon/_cache.py
CHANGED
|
@@ -19,7 +19,7 @@ class CacheSnapshot:
|
|
|
19
19
|
checksum: str | None = None
|
|
20
20
|
|
|
21
21
|
|
|
22
|
-
def
|
|
22
|
+
def _normalize_entries(payload: Mapping[str, object]) -> CacheEntries:
|
|
23
23
|
"""Convert raw cache payloads into canonical mapping form."""
|
|
24
24
|
entries: CacheEntries = {}
|
|
25
25
|
for key, values in payload.items():
|
|
@@ -75,7 +75,7 @@ def load_cache(path: Path) -> CacheSnapshot:
|
|
|
75
75
|
else:
|
|
76
76
|
entries_payload = payload # legacy format without metadata
|
|
77
77
|
|
|
78
|
-
entries =
|
|
78
|
+
entries = _normalize_entries(entries_payload)
|
|
79
79
|
if checksum is not None:
|
|
80
80
|
expected = compute_checksum(entries)
|
|
81
81
|
if checksum != expected:
|
|
@@ -88,9 +88,7 @@ def load_cache(path: Path) -> CacheSnapshot:
|
|
|
88
88
|
|
|
89
89
|
def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapshot:
|
|
90
90
|
"""Persist ``entries`` to ``path`` with checksum metadata."""
|
|
91
|
-
serialisable: CacheEntries = {
|
|
92
|
-
key: list(values) for key, values in sorted(entries.items())
|
|
93
|
-
}
|
|
91
|
+
serialisable: CacheEntries = {key: list(values) for key, values in sorted(entries.items())}
|
|
94
92
|
checksum = compute_checksum(serialisable)
|
|
95
93
|
payload = {
|
|
96
94
|
"__meta__": {
|
glitchlings/lexicon/vector.py
CHANGED
|
@@ -16,6 +16,9 @@ from ._cache import CacheSnapshot
|
|
|
16
16
|
from ._cache import load_cache as _load_cache_file
|
|
17
17
|
from ._cache import write_cache as _write_cache_file
|
|
18
18
|
|
|
19
|
+
# Minimum number of neighbors to consider for similarity queries
|
|
20
|
+
MIN_NEIGHBORS = 1
|
|
21
|
+
|
|
19
22
|
|
|
20
23
|
def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
|
|
21
24
|
"""Return the cosine similarity between two dense vectors."""
|
|
@@ -304,7 +307,7 @@ class VectorLexicon(LexiconBackend):
|
|
|
304
307
|
"""Initialise the lexicon with an embedding ``source`` and optional cache."""
|
|
305
308
|
super().__init__(seed=seed)
|
|
306
309
|
self._adapter = _resolve_source(source)
|
|
307
|
-
self._max_neighbors = max(
|
|
310
|
+
self._max_neighbors = max(MIN_NEIGHBORS, max_neighbors)
|
|
308
311
|
self._min_similarity = min_similarity
|
|
309
312
|
self._cache: MutableMapping[str, list[str]] = {}
|
|
310
313
|
self._cache_path: Path | None
|
|
@@ -371,7 +374,7 @@ class VectorLexicon(LexiconBackend):
|
|
|
371
374
|
if cache_key in self._cache:
|
|
372
375
|
return self._cache[cache_key]
|
|
373
376
|
|
|
374
|
-
neighbor_limit = self._max_neighbors if limit is None else max(
|
|
377
|
+
neighbor_limit = self._max_neighbors if limit is None else max(MIN_NEIGHBORS, limit)
|
|
375
378
|
neighbors = self._fetch_neighbors(
|
|
376
379
|
original=original, normalized=normalized, limit=neighbor_limit
|
|
377
380
|
)
|
|
@@ -624,9 +627,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|
|
624
627
|
)
|
|
625
628
|
iterator = lexicon.iter_vocabulary()
|
|
626
629
|
if args.limit is not None:
|
|
627
|
-
token_iter = (
|
|
628
|
-
token for index, token in enumerate(iterator) if index < args.limit
|
|
629
|
-
)
|
|
630
|
+
token_iter = (token for index, token in enumerate(iterator) if index < args.limit)
|
|
630
631
|
else:
|
|
631
632
|
token_iter = iterator
|
|
632
633
|
|