glitchlings 1.0.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +101 -0
- glitchlings/__main__.py +8 -0
- glitchlings/_corruption_engine/__init__.py +12 -0
- glitchlings/_corruption_engine.cp313-win_amd64.pyd +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/ocr_confusions.tsv +30 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +184 -0
- glitchlings/attack/analysis.py +1321 -0
- glitchlings/attack/core.py +819 -0
- glitchlings/attack/core_execution.py +378 -0
- glitchlings/attack/core_planning.py +612 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +211 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +338 -0
- glitchlings/attack/tokenizer_metrics.py +373 -0
- glitchlings/auggie.py +285 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +39 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +139 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +21 -0
- glitchlings/dlc/_shared.py +300 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +68 -0
- glitchlings/dlc/langchain.py +147 -0
- glitchlings/dlc/nemo.py +283 -0
- glitchlings/dlc/prime.py +215 -0
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +599 -0
- glitchlings/main.py +426 -0
- glitchlings/protocols.py +91 -0
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +41 -0
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +508 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +161 -0
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +852 -0
- glitchlings/zoo/core_execution.py +154 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +291 -0
- glitchlings/zoo/hokey.py +139 -0
- glitchlings/zoo/jargoyle.py +301 -0
- glitchlings/zoo/mim1c.py +269 -0
- glitchlings/zoo/pedant/__init__.py +109 -0
- glitchlings/zoo/pedant/core.py +99 -0
- glitchlings/zoo/pedant/forms.py +50 -0
- glitchlings/zoo/pedant/stones.py +83 -0
- glitchlings/zoo/redactyl.py +94 -0
- glitchlings/zoo/rng.py +280 -0
- glitchlings/zoo/rushmore.py +416 -0
- glitchlings/zoo/scannequin.py +370 -0
- glitchlings/zoo/transforms.py +331 -0
- glitchlings/zoo/typogre.py +194 -0
- glitchlings/zoo/validation.py +643 -0
- glitchlings/zoo/wherewolf.py +120 -0
- glitchlings/zoo/zeedub.py +165 -0
- glitchlings-1.0.0.dist-info/METADATA +404 -0
- glitchlings-1.0.0.dist-info/RECORD +86 -0
- glitchlings-1.0.0.dist-info/WHEEL +5 -0
- glitchlings-1.0.0.dist-info/entry_points.txt +3 -0
- glitchlings-1.0.0.dist-info/licenses/LICENSE +201 -0
- glitchlings-1.0.0.dist-info/top_level.txt +1 -0
glitchlings/constants.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Centralized defaults and shared configuration constants."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
# ---------------------------------------------------------------------------
# Package-wide defaults
# ---------------------------------------------------------------------------
DEFAULT_ATTACK_SEED = 151
# ``config.toml`` is resolved as a sibling of this module file.
DEFAULT_CONFIG_PATH = Path(__file__).with_name("config.toml")

# ---------------------------------------------------------------------------
# Character-level glitchlings
# ---------------------------------------------------------------------------
DEFAULT_TYPOGRE_RATE = 0.02
DEFAULT_TYPOGRE_KEYBOARD = "CURATOR_QWERTY"
DEFAULT_TYPOGRE_MOTOR_WEIGHTING = "uniform"
DEFAULT_MIM1C_RATE = 0.02
DEFAULT_ZEEDUB_RATE = 0.02

# ---------------------------------------------------------------------------
# Scannequin (OCR simulation)
# ---------------------------------------------------------------------------
# Base rate for character-level OCR confusions.
DEFAULT_SCANNEQUIN_RATE = 0.02
# Burst model parameters (Kanungo et al., 1994); an enter value of 0.0 leaves
# the burst model disabled by default.
DEFAULT_SCANNEQUIN_BURST_ENTER = 0.0
DEFAULT_SCANNEQUIN_BURST_EXIT = 0.3
DEFAULT_SCANNEQUIN_BURST_MULTIPLIER = 3.0
# Document-level bias parameters (UNLV-ISRI, 1995); K = 0 leaves the bias
# disabled by default.
DEFAULT_SCANNEQUIN_BIAS_K = 0
DEFAULT_SCANNEQUIN_BIAS_BETA = 2.0
# Whitespace error parameters (Smith, 2007); both are disabled by default.
DEFAULT_SCANNEQUIN_SPACE_DROP_RATE = 0.0
DEFAULT_SCANNEQUIN_SPACE_INSERT_RATE = 0.0

# Quality presets based on UNLV-ISRI test regimes (Rice et al., 1995).
# Tuple field order:
#   (rate, burst_enter, burst_exit, burst_multiplier,
#    bias_k, bias_beta, space_drop_rate, space_insert_rate)
SCANNEQUIN_PRESETS: dict[str, tuple[float, float, float, float, int, float, float, float]] = {
    # Clean 300dpi scan: minimal errors, good quality baseline.
    "clean_300dpi": (0.01, 0.0, 0.3, 3.0, 0, 2.0, 0.0, 0.0),
    # Newspaper scan: moderate errors, some burst, stroke-loss bias.
    "newspaper": (0.03, 0.05, 0.3, 2.5, 3, 2.0, 0.005, 0.002),
    # Fax quality: high errors, strong burst, heavy l/1/I confusion bias.
    "fax": (0.06, 0.1, 0.2, 3.5, 5, 3.0, 0.02, 0.01),
    # Third-generation photocopy: very degraded, long burst runs.
    "photocopy_3rd_gen": (0.08, 0.15, 0.15, 4.0, 5, 3.5, 0.03, 0.015),
}

# ---------------------------------------------------------------------------
# Word-level glitchlings
# ---------------------------------------------------------------------------
DEFAULT_WHEREWOLF_RATE = 0.02
DEFAULT_WHEREWOLF_WEIGHTING = "flat"
DEFAULT_JARGOYLE_RATE = 0.01
DEFAULT_REDACTYL_RATE = 0.025
DEFAULT_REDACTYL_CHAR = "\u2588"  # FULL BLOCK

# Rushmore default rate for each mode.
RUSHMORE_DEFAULT_RATES = {
    "delete": 0.01,
    "duplicate": 0.01,
    "swap": 0.5,
}

# ---------------------------------------------------------------------------
# Mim1c (homoglyphs)
# ---------------------------------------------------------------------------
# Unicode script classes used by default.
MIM1C_DEFAULT_CLASSES: tuple[str, ...] = ("LATIN", "GREEK", "CYRILLIC", "COMMON")

# Available modes: "single_script", "mixed_script", "compatibility", "aggressive"
DEFAULT_MIM1C_MODE = "mixed_script"
DEFAULT_MIM1C_MAX_CONSECUTIVE = 3

# ---------------------------------------------------------------------------
# Zeedub (zero-width characters): palettes by visibility mode
# ---------------------------------------------------------------------------
ZEEDUB_DEFAULT_ZERO_WIDTHS: tuple[str, ...] = (
    "\u200b",  # ZERO WIDTH SPACE
    "\u200c",  # ZERO WIDTH NON-JOINER
    "\u200d",  # ZERO WIDTH JOINER
    "\ufeff",  # BYTE ORDER MARK (zero-width no-break space)
)

# Glyphless palette: true invisibles only.
ZEEDUB_GLYPHLESS_PALETTE: tuple[str, ...] = (
    "\u200b",  # ZERO WIDTH SPACE
    "\u200c",  # ZERO WIDTH NON-JOINER
    "\u200d",  # ZERO WIDTH JOINER
    "\ufeff",  # BYTE ORDER MARK
    "\u2060",  # WORD JOINER
    "\u034f",  # COMBINING GRAPHEME JOINER
)

# With-joiners palette: the glyphless palette followed by the sixteen
# variation selectors VS1-VS16 (U+FE00 through U+FE0F).
ZEEDUB_WITH_JOINERS_PALETTE: tuple[str, ...] = ZEEDUB_GLYPHLESS_PALETTE + tuple(
    map(chr, range(0xFE00, 0xFE10))
)

# Semi-visible palette: the with-joiners palette followed by thin spaces.
ZEEDUB_SEMI_VISIBLE_PALETTE: tuple[str, ...] = ZEEDUB_WITH_JOINERS_PALETTE + (
    "\u200a",  # HAIR SPACE
    "\u2009",  # THIN SPACE
    "\u202f",  # NARROW NO-BREAK SPACE
)

# Zeedub scalar defaults.
DEFAULT_ZEEDUB_VISIBILITY = "glyphless"
DEFAULT_ZEEDUB_PLACEMENT = "random"
DEFAULT_ZEEDUB_MAX_CONSECUTIVE = 4

# Public export list; the order is kept as originally published.
__all__ = [
    "DEFAULT_ATTACK_SEED",
    "DEFAULT_CONFIG_PATH",
    "DEFAULT_WHEREWOLF_RATE",
    "DEFAULT_WHEREWOLF_WEIGHTING",
    "DEFAULT_JARGOYLE_RATE",
    "DEFAULT_MIM1C_RATE",
    "DEFAULT_REDACTYL_CHAR",
    "DEFAULT_REDACTYL_RATE",
    # Scannequin
    "DEFAULT_SCANNEQUIN_RATE",
    "DEFAULT_SCANNEQUIN_BURST_ENTER",
    "DEFAULT_SCANNEQUIN_BURST_EXIT",
    "DEFAULT_SCANNEQUIN_BURST_MULTIPLIER",
    "DEFAULT_SCANNEQUIN_BIAS_K",
    "DEFAULT_SCANNEQUIN_BIAS_BETA",
    "DEFAULT_SCANNEQUIN_SPACE_DROP_RATE",
    "DEFAULT_SCANNEQUIN_SPACE_INSERT_RATE",
    "SCANNEQUIN_PRESETS",
    # Typogre
    "DEFAULT_TYPOGRE_KEYBOARD",
    "DEFAULT_TYPOGRE_MOTOR_WEIGHTING",
    "DEFAULT_TYPOGRE_RATE",
    "DEFAULT_ZEEDUB_RATE",
    "DEFAULT_ZEEDUB_VISIBILITY",
    "DEFAULT_ZEEDUB_PLACEMENT",
    "DEFAULT_ZEEDUB_MAX_CONSECUTIVE",
    "MIM1C_DEFAULT_CLASSES",
    "DEFAULT_MIM1C_MODE",
    "DEFAULT_MIM1C_MAX_CONSECUTIVE",
    "RUSHMORE_DEFAULT_RATES",
    "ZEEDUB_DEFAULT_ZERO_WIDTHS",
    "ZEEDUB_GLYPHLESS_PALETTE",
    "ZEEDUB_WITH_JOINERS_PALETTE",
    "ZEEDUB_SEMI_VISIBLE_PALETTE",
]
|
glitchlings/dev/docs.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Developer helpers for refreshing generated documentation assets."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import runpy
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
REPO_ROOT = Path(__file__).resolve().parents[3]
|
|
9
|
+
DOCS_DIR = REPO_ROOT / "docs"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _run_script(path: Path) -> None:
    """Execute the Python script at ``path`` via :func:`runpy.run_path`.

    Args:
        path: Location of the documentation helper script to run.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if path.exists():
        runpy.run_path(str(path))
        return
    raise FileNotFoundError(f"Documentation helper not found: {path}")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def refresh_cli_reference() -> None:
    """Rebuild the CLI reference docs page by running ``build_cli_reference.py``."""
    builder = DOCS_DIR / "build_cli_reference.py"
    _run_script(builder)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def refresh_monster_manual() -> None:
    """Rebuild the Monster Manual by running ``build_monster_manual.py``."""
    builder = DOCS_DIR / "build_monster_manual.py"
    _run_script(builder)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def refresh_gallery() -> None:
    """Rebuild the glitchling gallery page by running ``build_glitchling_gallery.py``."""
    builder = DOCS_DIR / "build_glitchling_gallery.py"
    _run_script(builder)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def refresh_all() -> None:
    """Regenerate the CLI reference, Monster Manual, and gallery docs, in that order."""
    steps = (refresh_cli_reference, refresh_monster_manual, refresh_gallery)
    for step in steps:
        step()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def main() -> None:
    """Script entry point: regenerate every documentation asset."""
    refresh_all()


# Run the refresh only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Optional DLC integrations for Glitchlings.
|
|
2
|
+
|
|
3
|
+
This module provides explicit wrapper classes for integrating glitchlings
|
|
4
|
+
with popular ML frameworks:
|
|
5
|
+
|
|
6
|
+
- :class:`~glitchlings.dlc.huggingface.GlitchedDataset`: Wrap Hugging Face datasets
|
|
7
|
+
- :class:`~glitchlings.dlc.pytorch.GlitchedDataLoader`: Wrap PyTorch data loaders
|
|
8
|
+
- :class:`~glitchlings.dlc.pytorch_lightning.GlitchedLightningDataModule`: Wrap
|
|
9
|
+
Lightning data modules
|
|
10
|
+
- :class:`~glitchlings.dlc.gutenberg.GlitchenbergAPI`: Wrap Project Gutenberg API
|
|
11
|
+
- :class:`~glitchlings.dlc.langchain.GlitchedRunnable`: Wrap LangChain runnables
|
|
12
|
+
- :class:`~glitchlings.dlc.nemo.GlitchlingColumnGenerator`: NeMo DataDesigner plugin
|
|
13
|
+
|
|
14
|
+
Example:
|
|
15
|
+
>>> from glitchlings.dlc.huggingface import GlitchedDataset
|
|
16
|
+
>>> from datasets import Dataset
|
|
17
|
+
>>> dataset = Dataset.from_dict({"text": ["hello", "world"]})
|
|
18
|
+
>>> corrupted = GlitchedDataset(dataset, "typogre", column="text")
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
__all__: list[str] = []
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
"""Shared utilities for DLC integrations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Mapping, Sequence
|
|
6
|
+
from typing import Any, cast
|
|
7
|
+
|
|
8
|
+
from ..util.transcripts import is_transcript
|
|
9
|
+
from ..zoo.core import Gaggle
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def resolve_columns(dataset: Any, columns: Sequence[str] | None) -> list[str]:
    """Identify which dataset columns should be corrupted.

    Resolution order:

    1. If ``columns`` is given, it is validated against the dataset's
       ``column_names`` and returned as a list.
    2. Otherwise the first of ``"prompt"`` / ``"question"`` present in
       ``column_names`` is returned.
    3. Otherwise every declared column whose value in the first row is a
       ``str`` is returned.

    Args:
        dataset: Dataset-like object. ``column_names`` is read when present;
            the first row is fetched with ``dataset[0]`` for sized datasets,
            or with ``take(1)`` / iteration for unsized ones.
        columns: Explicit column name(s), or ``None`` to infer them. A bare
            string is treated as a single column name.

    Returns:
        The list of column names to corrupt.

    Raises:
        ValueError: If a requested column is not declared by the dataset, or
            if no textual column can be inferred.
    """
    # ``column_names`` can be missing or ``None``; both mean "no declared
    # columns". NOTE(review): streaming datasets without known features are
    # the presumed source of ``None`` — confirm against the dataset library.
    column_names = tuple(getattr(dataset, "column_names", None) or ())
    available = set(column_names)

    if columns is not None:
        # A bare string is one column name, not a sequence of characters.
        # This matches how ``normalize_column_spec`` treats strings.
        requested = [columns] if isinstance(columns, str) else list(columns)
        missing = sorted(set(requested) - available)
        if missing:
            missing_str = ", ".join(missing)
            raise ValueError(f"Columns not found in dataset: {missing_str}")
        return requested

    for candidate in ("prompt", "question"):
        if candidate in available:
            return [candidate]

    try:
        dataset_length = len(dataset)
    except TypeError:
        # Unsized dataset: peek at a single row instead of indexing.
        preview_rows: list[dict[str, Any]]
        take_fn = getattr(dataset, "take", None)
        if callable(take_fn):
            preview_rows = list(take_fn(1))
        else:
            iterator = iter(dataset)
            try:
                first_row = next(iterator)
            except StopIteration:
                preview_rows = []
            else:
                preview_rows = [first_row]
        sample = dict(preview_rows[0]) if preview_rows else {}
    else:
        sample = dataset[0] if dataset_length else {}

    inferred = [name for name in column_names if isinstance(sample.get(name), str)]
    if inferred:
        return inferred

    raise ValueError("Unable to determine which dataset columns to corrupt.")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def normalize_column_spec(
    columns: str | int | Sequence[str | int] | None,
) -> list[str | int] | None:
    """Turn a column specification into a list of keys or indices.

    Args:
        columns: A single key or index, a sequence of them, or ``None``.

    Returns:
        ``None`` when ``columns`` is ``None``; otherwise a list holding the
        given identifier(s).

    Raises:
        ValueError: If ``columns`` is an empty sequence.
    """
    if columns is None:
        return None

    # Scalars become a one-element list; any other sequence is copied.
    spec = [columns] if isinstance(columns, (str, int)) else list(columns)
    if spec:
        return spec
    raise ValueError("At least one column must be specified")
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def is_textual_candidate(value: Any) -> bool:
    """Report whether ``value`` looks like text that glitchlings can corrupt.

    A value qualifies when it is a ``str``, when ``is_transcript`` accepts it
    (non-empty, all entries carrying content), or when it is a non-empty
    sequence (other than bytes/bytearray/str) that is either all strings or
    is accepted by ``is_transcript`` once converted to a list.

    Args:
        value: The value to inspect.

    Returns:
        ``True`` if the value appears to be textual content, else ``False``.
    """
    if isinstance(value, str):
        return True
    if is_transcript(value, allow_empty=False, require_all_content=True):
        return True

    # Only non-empty, non-bytes sequences remain candidates from here on.
    if not isinstance(value, Sequence) or isinstance(value, (bytes, bytearray, str)):
        return False
    if not value:
        return False

    if all(isinstance(item, str) for item in value):
        return True
    if is_transcript(list(value), allow_empty=False, require_all_content=True):
        return True
    return False
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def corrupt_text_value(value: Any, gaggle: Gaggle) -> Any:
    """Apply ``gaggle`` to ``value`` when it is a supported textual shape.

    Strings and transcripts go through ``gaggle.corrupt``. Non-empty lists
    and tuples made only of strings go through ``gaggle.corrupt_batch``;
    a tuple input yields a tuple result. Anything else is returned as-is.

    Args:
        value: The value to corrupt (string, transcript, or list/tuple of strings).
        gaggle: The gaggle of glitchlings to apply.

    Returns:
        The corrupted value, or ``value`` unchanged when it is not supported.
    """
    if isinstance(value, str) or is_transcript(value, allow_empty=True):
        return gaggle.corrupt(value)

    is_string_batch = (
        isinstance(value, (list, tuple))
        and value
        and all(isinstance(item, str) for item in value)
    )
    if is_string_batch:
        if isinstance(value, list):
            # Lists are handed to the batch API as the same object.
            return gaggle.corrupt_batch(value)
        return tuple(gaggle.corrupt_batch(list(value)))

    return value
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def infer_batch_targets(batch: Any) -> list[str | int] | None:
    """Work out which fields of a representative batch should be glitched.

    Args:
        batch: A batch from a DataLoader (mapping, sequence, or textual value).

    Returns:
        The mapping keys or sequence indices whose values are textual
        candidates, or ``None`` when the batch itself is textual content.

    Raises:
        ValueError: If a mapping or sequence batch has no textual entries.
        TypeError: If the batch is none of the supported shapes.
    """
    if isinstance(batch, Mapping):
        text_keys = [key for key, value in batch.items() if is_textual_candidate(value)]
        if not text_keys:
            raise ValueError("Unable to infer which mapping columns contain text")
        return text_keys

    if isinstance(batch, Sequence) and not isinstance(batch, (bytes, bytearray, str)):
        text_positions: list[str | int] = [
            position for position, value in enumerate(batch) if is_textual_candidate(value)
        ]
        if not text_positions:
            raise ValueError("Unable to infer which sequence indices contain text")
        return text_positions

    if not is_textual_candidate(batch):
        raise TypeError("Unsupported DataLoader batch type for glitching")
    return None
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def corrupt_batch(batch: Any, targets: list[str | int] | None, gaggle: Gaggle) -> Any:
    """Return ``batch`` with glitchlings applied to the specified targets.

    The input batch is not modified: mappings are copied (``batch.copy()``
    when available, otherwise ``dict(batch)``) and sequences are copied into
    a list before any target is replaced.

    Args:
        batch: The batch to corrupt (mapping, sequence, or textual value).
        targets: List of column keys (strings) or indices (ints), or ``None``
            to corrupt the entire batch as textual content.
        gaggle: The gaggle of glitchlings to apply.

    Returns:
        The corrupted batch. Tuple batches come back as tuples; other
        sequences come back as lists.

    Raises:
        TypeError: If the batch type is unsupported, a mapping target is not
            a string, or a sequence target is not an integer.
        ValueError: If a mapping target key is not present in the batch.
        IndexError: If a sequence target index is out of range.
    """
    if targets is None:
        return corrupt_text_value(batch, gaggle)

    if isinstance(batch, Mapping):
        # Use copy() if available, otherwise dict()
        mutated = batch.copy() if hasattr(batch, "copy") else dict(batch)

        for key in targets:
            if not isinstance(key, str):
                raise TypeError("Mapping batches require string column names")
            if key not in mutated:
                raise ValueError(f"Column '{key}' not found in DataLoader batch")
            mutated[key] = corrupt_text_value(mutated[key], gaggle)
        return mutated

    if isinstance(batch, Sequence) and not isinstance(batch, (bytes, bytearray, str)):
        mutated_sequence = list(batch)
        for index in targets:
            if not isinstance(index, int):
                raise TypeError("Sequence batches require integer column indices")
            # Guard only the lookup: an IndexError raised while corrupting
            # the value must not be reported as a bad column index.
            try:
                current = mutated_sequence[index]
            except IndexError as exc:
                raise IndexError("Column index out of range for DataLoader batch") from exc
            mutated_sequence[index] = corrupt_text_value(current, gaggle)
        if isinstance(batch, tuple):
            return tuple(mutated_sequence)
        return mutated_sequence

    raise TypeError("Unsupported DataLoader batch type for glitching")
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
class BaseGlitchedDataLoader:
    """Proxy dataloader that glitches batches produced by the wrapped loader.

    Iterating this object iterates the wrapped dataloader and passes every
    batch through ``corrupt_batch`` with the configured columns and gaggle.
    ``len()`` and any attribute not defined here are forwarded to the
    wrapped dataloader.
    """

    def __init__(self, dataloader: Any, columns: list[str | int], gaggle: Gaggle) -> None:
        """Initialize the glitched dataloader.

        Args:
            dataloader: The underlying dataloader to wrap.
            columns: List of column names (strings) or indices (ints) to corrupt.
            gaggle: The gaggle of glitchlings to apply.
        """
        self._dataloader = dataloader
        self._columns = columns
        self._gaggle = gaggle

    def __iter__(self) -> Any:
        """Yield corrupted batches from the underlying dataloader."""
        for batch in self._dataloader:
            yield corrupt_batch(batch, self._columns, self._gaggle)

    def __len__(self) -> int:
        """Return ``len()`` of the underlying dataloader."""
        return len(self._dataloader)

    def __getattr__(self, attribute: str) -> Any:
        """Proxy attribute access to the underlying dataloader.

        Python calls this only after normal attribute lookup has failed.

        Raises:
            AttributeError: If the wrapped dataloader has no such attribute,
                or if this instance has no wrapped dataloader yet.
        """
        # ``copy`` and ``pickle`` build instances without calling __init__ and
        # then probe them with hasattr(obj, "__setstate__"). Reading
        # ``self._dataloader`` at that point would re-enter __getattr__ and
        # recurse until RecursionError, so read the instance dict directly.
        try:
            dataloader = self.__dict__["_dataloader"]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attribute!r}"
            ) from None
        return getattr(dataloader, attribute)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def wrap_dataloader(dataloader: Any, columns: list[str | int], gaggle: Gaggle) -> Any:
    """Wrap a dataloader, or every dataloader in a nested container, lazily.

    Containers are rebuilt with each member wrapped recursively: mappings and
    generic sequences are rebuilt by calling their own class, while ``list``
    and ``tuple`` instances produce a plain ``list`` / ``tuple``. ``None`` is
    passed through, and any other object is wrapped in
    ``BaseGlitchedDataLoader``.

    Args:
        dataloader: The dataloader or nested structure to wrap.
        columns: List of column names (strings) or indices (ints) to corrupt.
        gaggle: The gaggle of glitchlings to apply.

    Returns:
        The wrapped dataloader or rebuilt structure.
    """
    if dataloader is None:
        return None

    if isinstance(dataloader, Mapping):
        rebuild_mapping = cast(type[Any], dataloader.__class__)
        wrapped_items = {
            key: wrap_dataloader(member, columns, gaggle) for key, member in dataloader.items()
        }
        return rebuild_mapping(wrapped_items)

    if isinstance(dataloader, (list, tuple)):
        wrapped_members = [wrap_dataloader(member, columns, gaggle) for member in dataloader]
        return wrapped_members if isinstance(dataloader, list) else tuple(wrapped_members)

    if isinstance(dataloader, Sequence) and not isinstance(dataloader, (str, bytes, bytearray)):
        rebuild_sequence = cast(type[Any], dataloader.__class__)
        return rebuild_sequence(wrap_dataloader(member, columns, gaggle) for member in dataloader)

    return BaseGlitchedDataLoader(dataloader, columns, gaggle)
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
# Names exported by this module.
__all__ = [
    "BaseGlitchedDataLoader",
    "corrupt_batch",
    "corrupt_text_value",
    "infer_batch_targets",
    "is_textual_candidate",
    "normalize_column_spec",
    "resolve_columns",
    "wrap_dataloader",
]
|