glitchlings 0.2.5__cp312-cp312-win_amd64.whl → 0.9.3__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +36 -17
- glitchlings/__main__.py +0 -1
- glitchlings/_zoo_rust/__init__.py +12 -0
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +53 -0
- glitchlings/attack/compose.py +299 -0
- glitchlings/attack/core.py +465 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +104 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +157 -0
- glitchlings/auggie.py +283 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +41 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +59 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +17 -3
- glitchlings/dlc/_shared.py +296 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +37 -65
- glitchlings/dlc/prime.py +55 -114
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +432 -0
- glitchlings/main.py +123 -32
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +29 -176
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +311 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +47 -24
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +301 -167
- glitchlings/zoo/core_execution.py +98 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +295 -0
- glitchlings/zoo/ekkokin.py +118 -0
- glitchlings/zoo/hokey.py +137 -0
- glitchlings/zoo/jargoyle.py +179 -274
- glitchlings/zoo/mim1c.py +106 -68
- glitchlings/zoo/pedant/__init__.py +107 -0
- glitchlings/zoo/pedant/core.py +105 -0
- glitchlings/zoo/pedant/forms.py +74 -0
- glitchlings/zoo/pedant/stones.py +74 -0
- glitchlings/zoo/redactyl.py +44 -175
- glitchlings/zoo/rng.py +259 -0
- glitchlings/zoo/rushmore.py +359 -116
- glitchlings/zoo/scannequin.py +18 -125
- glitchlings/zoo/transforms.py +386 -0
- glitchlings/zoo/typogre.py +76 -162
- glitchlings/zoo/validation.py +477 -0
- glitchlings/zoo/zeedub.py +33 -86
- glitchlings-0.9.3.dist-info/METADATA +334 -0
- glitchlings-0.9.3.dist-info/RECORD +80 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/entry_points.txt +1 -0
- glitchlings/zoo/_ocr_confusions.py +0 -34
- glitchlings/zoo/_rate.py +0 -21
- glitchlings/zoo/reduple.py +0 -169
- glitchlings-0.2.5.dist-info/METADATA +0 -490
- glitchlings-0.2.5.dist-info/RECORD +0 -27
- /glitchlings/{zoo → assets}/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/WHEEL +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
"""Shared utilities for DLC integrations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Mapping, Sequence
|
|
6
|
+
from typing import Any, cast
|
|
7
|
+
|
|
8
|
+
from ..util.transcripts import is_transcript
|
|
9
|
+
from ..zoo.core import Gaggle
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def resolve_columns(dataset: Any, columns: Sequence[str] | None) -> list[str]:
    """Identify which dataset columns should be corrupted.

    Resolution order:

    1. An explicit ``columns`` argument is validated against the dataset's
       ``column_names`` and returned as a list.
    2. Otherwise the first of ``"prompt"`` / ``"question"`` found in
       ``column_names`` is returned.
    3. Otherwise the first row is sampled and every column whose sampled
       value is a ``str`` is returned.

    A ``column_names`` attribute that is missing or ``None`` is treated as
    "no known columns". In that case step 3 falls back to the keys of the
    sampled row, provided that row is a mapping.

    Args:
        dataset: Dataset-like object. Only ``column_names``, ``len()``,
            ``dataset[0]``, ``take`` and iteration are used, and each only
            when available.
        columns: Explicit column names, or ``None`` to infer them.

    Returns:
        The list of column names to corrupt.

    Raises:
        ValueError: If an explicit column is not among the known column
            names, or if no textual column can be inferred.
    """
    # ``column_names`` may exist but be ``None``; ``set(None)`` would raise a
    # TypeError, so normalise it to an empty list first.
    # NOTE(review): presumably this happens for streaming datasets whose
    # features are unknown - confirm against the dataset library in use.
    column_names = list(getattr(dataset, "column_names", None) or ())
    available = set(column_names)

    if columns is not None:
        missing = sorted(set(columns) - available)
        if missing:
            missing_str = ", ".join(missing)
            raise ValueError(f"Columns not found in dataset: {missing_str}")
        return list(columns)

    # Conventional text column names win over inference.
    for candidate in ("prompt", "question"):
        if candidate in available:
            return [candidate]

    sample = _peek_first_row(dataset)

    # Without declared column names, the sampled row's keys are the only
    # schema information left to work with.
    if not column_names and isinstance(sample, Mapping):
        column_names = list(sample)

    inferred = [name for name in column_names if isinstance(sample.get(name), str)]
    if inferred:
        return inferred

    raise ValueError("Unable to determine which dataset columns to corrupt.")


def _peek_first_row(dataset: Any) -> Any:
    """Return the first row of ``dataset``, or ``{}`` when it has no rows."""
    try:
        dataset_length = len(dataset)
    except TypeError:
        # Unsized dataset: prefer ``take(1)`` when offered, else iterate once.
        take_fn = getattr(dataset, "take", None)
        if callable(take_fn):
            preview_rows = list(take_fn(1))
        else:
            iterator = iter(dataset)
            try:
                preview_rows = [next(iterator)]
            except StopIteration:
                preview_rows = []
        return dict(preview_rows[0]) if preview_rows else {}
    return dataset[0] if dataset_length else {}
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def normalize_column_spec(
    columns: str | int | Sequence[str | int] | None,
) -> list[str | int] | None:
    """Turn a flexible column specification into a list.

    Args:
        columns: ``None``, a single key (``str``) or index (``int``), or a
            sequence of keys/indices.

    Returns:
        ``None`` when ``columns`` is ``None``; a one-element list for a single
        ``str``/``int``; otherwise the sequence copied into a new list.

    Raises:
        ValueError: If ``columns`` is an empty sequence.
    """
    if columns is None:
        return None

    # Anything that is not a lone key/index is treated as a sequence of them.
    if not isinstance(columns, (str, int)):
        as_list = list(columns)
        if as_list:
            return as_list
        raise ValueError("At least one column must be specified")

    return [columns]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def is_textual_candidate(value: Any) -> bool:
    """Report whether ``value`` looks like text that glitchlings can corrupt.

    A value qualifies when it is a ``str``, when ``is_transcript`` accepts it
    (non-empty, all content required), or when it is a non-empty sequence
    (excluding ``bytes``/``bytearray``/``str``) that is either all strings or
    is accepted by ``is_transcript`` once copied into a list.

    Args:
        value: The value to inspect.

    Returns:
        ``True`` if the value appears to be textual content, else ``False``.
    """
    if isinstance(value, str) or is_transcript(
        value, allow_empty=False, require_all_content=True
    ):
        return True

    plain_sequence = isinstance(value, Sequence) and not isinstance(
        value, (bytes, bytearray, str)
    )
    if not plain_sequence or not value:
        return False

    if all(isinstance(item, str) for item in value):
        return True

    # Last chance: a non-list sequence that is a transcript once listified.
    return bool(is_transcript(list(value), allow_empty=False, require_all_content=True))
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def corrupt_text_value(value: Any, gaggle: Gaggle) -> Any:
    """Apply ``gaggle`` to ``value`` when it is a supported textual shape.

    Args:
        value: A string, a transcript (as judged by ``is_transcript`` with
            empty transcripts allowed), or a non-empty list/tuple of strings.
        gaggle: The gaggle whose ``corrupt`` method is applied.

    Returns:
        The corrupted string or transcript, a new ``list``/``tuple`` of
        corrupted strings, or ``value`` unchanged when it matches none of the
        supported shapes.
    """
    # Strings and transcripts are handed to the gaggle whole.
    if isinstance(value, str) or is_transcript(value, allow_empty=True):
        return gaggle.corrupt(value)

    # Non-empty list/tuple of strings: corrupt item by item, keep the container kind.
    if (
        isinstance(value, (list, tuple))
        and value
        and all(isinstance(item, str) for item in value)
    ):
        corrupted_items = [gaggle.corrupt(item) for item in value]
        return corrupted_items if isinstance(value, list) else tuple(corrupted_items)

    return value
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def infer_batch_targets(batch: Any) -> list[str | int] | None:
    """Work out which fields of a representative batch should be glitched.

    Args:
        batch: A mapping, a sequence (other than ``bytes``/``bytearray``/
            ``str``), or a value that is itself textual.

    Returns:
        For a mapping, the keys whose values are textual candidates; for a
        sequence, the indices of textual candidates; ``None`` when the batch
        itself is textual content.

    Raises:
        ValueError: If a mapping or sequence batch has no textual entries.
        TypeError: If the batch is neither a mapping, a sequence, nor textual.
    """
    if isinstance(batch, Mapping):
        text_keys = [key for key, value in batch.items() if is_textual_candidate(value)]
        if not text_keys:
            raise ValueError("Unable to infer which mapping columns contain text")
        return text_keys

    if isinstance(batch, Sequence) and not isinstance(batch, (bytes, bytearray, str)):
        text_positions: list[str | int] = []
        for position, item in enumerate(batch):
            if is_textual_candidate(item):
                text_positions.append(position)
        if not text_positions:
            raise ValueError("Unable to infer which sequence indices contain text")
        return text_positions

    if not is_textual_candidate(batch):
        raise TypeError("Unsupported DataLoader batch type for glitching")

    # The batch as a whole is the text; there is nothing to select.
    return None
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def corrupt_batch(batch: Any, targets: list[str | int] | None, gaggle: Gaggle) -> Any:
    """Return a copy of ``batch`` with glitchlings applied to ``targets``.

    Args:
        batch: A mapping, a sequence (other than ``bytes``/``bytearray``/
            ``str``), or a textual value.
        targets: Column keys (for mappings) or indices (for sequences). When
            ``None`` the whole batch is passed to ``corrupt_text_value``.
        gaggle: The gaggle of glitchlings to apply.

    Returns:
        For a mapping, ``batch.copy()`` (or ``dict(batch)`` when there is no
        ``copy`` method) with the targeted values replaced. For a sequence, a
        ``tuple`` when the input was a tuple, otherwise a ``list``.

    Raises:
        TypeError: If the batch type is unsupported, a mapping target is not
            a string, or a sequence target is not an integer.
        ValueError: If a mapping target is missing from the batch.
        IndexError: If corrupting a sequence target raises ``IndexError``
            (for example when the index is out of range).
    """
    if targets is None:
        return corrupt_text_value(batch, gaggle)

    if isinstance(batch, Mapping):
        # Work on a shallow copy so the caller's batch is left untouched.
        mutated = batch.copy() if hasattr(batch, "copy") else dict(batch)
        for key in targets:
            if not isinstance(key, str):
                raise TypeError("Mapping batches require string column names")
            if key not in mutated:
                raise ValueError(f"Column '{key}' not found in DataLoader batch")
            mutated[key] = corrupt_text_value(mutated[key], gaggle)
        return mutated

    is_plain_sequence = isinstance(batch, Sequence) and not isinstance(
        batch, (bytes, bytearray, str)
    )
    if not is_plain_sequence:
        raise TypeError("Unsupported DataLoader batch type for glitching")

    items = list(batch)
    for index in targets:
        if not isinstance(index, int):
            raise TypeError("Sequence batches require integer column indices")
        try:
            items[index] = corrupt_text_value(items[index], gaggle)
        except IndexError as exc:  # pragma: no cover - defensive
            raise IndexError("Column index out of range for DataLoader batch") from exc

    return tuple(items) if isinstance(batch, tuple) else items
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
class BaseGlitchedDataLoader:
    """Proxy dataloader that glitches batches produced by the wrapped loader.

    Iterating the proxy iterates the wrapped loader and passes every batch
    through ``corrupt_batch`` with the configured columns and gaggle. It
    supports both mapping-based batches (dict-like) and sequence-based batches
    (list/tuple-like). ``len()`` and any attribute not found on the proxy
    itself are forwarded to the wrapped loader.
    """

    def __init__(self, dataloader: Any, columns: list[str | int], gaggle: Gaggle) -> None:
        """Initialize the glitched dataloader.

        Args:
            dataloader: The underlying dataloader to wrap.
            columns: List of column names (strings) or indices (ints) to corrupt.
            gaggle: The gaggle of glitchlings to apply.
        """
        self._dataloader = dataloader
        self._columns = columns
        self._gaggle = gaggle

    def __iter__(self) -> Any:
        """Yield corrupted batches from the underlying dataloader."""
        for batch in self._dataloader:
            yield corrupt_batch(batch, self._columns, self._gaggle)

    def __len__(self) -> int:
        """Return ``len()`` of the underlying dataloader."""
        return len(self._dataloader)

    def __getattr__(self, attribute: str) -> Any:
        """Proxy attribute access to the underlying dataloader.

        Python calls this only after normal lookup has failed.

        Raises:
            AttributeError: If the wrapped loader has not been set on this
                instance, or if it lacks ``attribute``.
        """
        # Read ``_dataloader`` from ``__dict__`` rather than via ``self.``:
        # on an instance where it is not set yet (``copy.copy`` and
        # unpickling create one without calling ``__init__``),
        # ``self._dataloader`` would re-enter ``__getattr__`` and recurse
        # until RecursionError.
        try:
            wrapped = self.__dict__["_dataloader"]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attribute!r}"
            ) from None
        return getattr(wrapped, attribute)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def wrap_dataloader(dataloader: Any, columns: list[str | int], gaggle: Gaggle) -> Any:
    """Wrap a dataloader, or every dataloader in a nested structure, lazily.

    Mappings and sequences are walked recursively and rebuilt with each leaf
    wrapped; ``None`` is passed through untouched; anything else is treated as
    a dataloader and wrapped in ``BaseGlitchedDataLoader``.

    Args:
        dataloader: The dataloader or nested structure to wrap.
        columns: List of column names (strings) or indices (ints) to corrupt.
        gaggle: The gaggle of glitchlings to apply.

    Returns:
        ``None`` for ``None``; a mapping built by calling the input's class on
        a dict of wrapped values; a ``list`` or ``tuple`` for list/tuple
        inputs; for other sequences, the input's class called on the wrapped
        items; otherwise a ``BaseGlitchedDataLoader``.
    """
    if dataloader is None:
        return None

    def _wrap(item: Any) -> Any:
        # Same columns and gaggle at every level of the structure.
        return wrap_dataloader(item, columns, gaggle)

    if isinstance(dataloader, Mapping):
        wrapped_items = {key: _wrap(value) for key, value in dataloader.items()}
        rebuild_mapping = cast(type[Any], dataloader.__class__)
        return rebuild_mapping(wrapped_items)

    if isinstance(dataloader, list):
        return [_wrap(value) for value in dataloader]

    if isinstance(dataloader, tuple):
        return tuple(_wrap(value) for value in dataloader)

    is_other_sequence = isinstance(dataloader, Sequence) and not isinstance(
        dataloader, (str, bytes, bytearray)
    )
    if is_other_sequence:
        rebuild_sequence = cast(type[Any], dataloader.__class__)
        return rebuild_sequence(_wrap(value) for value in dataloader)

    return BaseGlitchedDataLoader(dataloader, columns, gaggle)
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
__all__ = [
|
|
288
|
+
"BaseGlitchedDataLoader",
|
|
289
|
+
"corrupt_batch",
|
|
290
|
+
"corrupt_text_value",
|
|
291
|
+
"infer_batch_targets",
|
|
292
|
+
"is_textual_candidate",
|
|
293
|
+
"normalize_column_spec",
|
|
294
|
+
"resolve_columns",
|
|
295
|
+
"wrap_dataloader",
|
|
296
|
+
]
|
|
@@ -0,0 +1,400 @@
|
|
|
1
|
+
"""Integration helpers for the py-gutenberg library.
|
|
2
|
+
|
|
3
|
+
This module provides a wrapper around the GutenbergAPI that applies
|
|
4
|
+
glitchlings to book text as it's fetched.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from collections.abc import Iterable
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from functools import cached_property
|
|
12
|
+
from typing import Any, Protocol, TypeAlias, cast
|
|
13
|
+
|
|
14
|
+
from ..util.adapters import coerce_gaggle
|
|
15
|
+
from ..zoo import Gaggle, Glitchling
|
|
16
|
+
from ._shared import corrupt_text_value
|
|
17
|
+
|
|
18
|
+
#: Default Gutendex API instance URL (public instance hosted at gutendex.com).
|
|
19
|
+
DEFAULT_GUTENDEX_URL = "https://gutendex.com"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class PersonProtocol(Protocol):
|
|
23
|
+
"""Minimal interface for py-gutenberg Person objects."""
|
|
24
|
+
|
|
25
|
+
name: str
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class BookProtocol(Protocol):
|
|
29
|
+
"""Minimal interface for py-gutenberg Book objects."""
|
|
30
|
+
|
|
31
|
+
id: int
|
|
32
|
+
title: str
|
|
33
|
+
authors: list[PersonProtocol]
|
|
34
|
+
translators: list[PersonProtocol]
|
|
35
|
+
subjects: list[str]
|
|
36
|
+
bookshelves: list[str]
|
|
37
|
+
languages: list[str]
|
|
38
|
+
copyright: bool
|
|
39
|
+
media_type: str
|
|
40
|
+
formats: dict[str, str]
|
|
41
|
+
download_count: int
|
|
42
|
+
|
|
43
|
+
def get_text(self) -> str: ...
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class GutenbergAPIProtocol(Protocol):
|
|
47
|
+
"""Subset of the py-gutenberg API we rely on."""
|
|
48
|
+
|
|
49
|
+
instance_url: str
|
|
50
|
+
|
|
51
|
+
def get_all_books(self) -> Iterable[BookProtocol]: ...
|
|
52
|
+
|
|
53
|
+
def get_public_domain_books(self) -> Iterable[BookProtocol]: ...
|
|
54
|
+
|
|
55
|
+
def get_copyrighted_books(self) -> Iterable[BookProtocol]: ...
|
|
56
|
+
|
|
57
|
+
def get_books_by_author(self, author: str) -> Iterable[BookProtocol]: ...
|
|
58
|
+
|
|
59
|
+
def get_books_by_ids(self, ids: list[int]) -> Iterable[BookProtocol]: ...
|
|
60
|
+
|
|
61
|
+
def get_books_by_language(self, languages: list[str]) -> Iterable[BookProtocol]: ...
|
|
62
|
+
|
|
63
|
+
def get_books_by_search(self, query: str) -> Iterable[BookProtocol]: ...
|
|
64
|
+
|
|
65
|
+
def get_books_by_mime_type(self, mime_type: str) -> Iterable[BookProtocol]: ...
|
|
66
|
+
|
|
67
|
+
def get_books_ascending(self) -> Iterable[BookProtocol]: ...
|
|
68
|
+
|
|
69
|
+
def get_oldest(self) -> Iterable[BookProtocol]: ...
|
|
70
|
+
|
|
71
|
+
def get_latest(self, topic: str = "recent") -> Iterable[BookProtocol]: ...
|
|
72
|
+
|
|
73
|
+
def get_book(self, book_id: int) -> BookProtocol: ...
|
|
74
|
+
|
|
75
|
+
def get_book_metadata(self, book_id: int) -> BookProtocol: ...
|
|
76
|
+
|
|
77
|
+
def get_book_text(self, book_id: int) -> BookProtocol: ...
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
Person: TypeAlias = PersonProtocol
|
|
81
|
+
Book: TypeAlias = BookProtocol
|
|
82
|
+
GutenbergAPI: TypeAlias = GutenbergAPIProtocol
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@dataclass
|
|
86
|
+
class GlitchedBook:
|
|
87
|
+
"""A Book wrapper that corrupts text content via glitchlings.
|
|
88
|
+
|
|
89
|
+
This class wraps a py-gutenberg Book object but provides corrupted text
|
|
90
|
+
when accessed. The original Book attributes are preserved.
|
|
91
|
+
|
|
92
|
+
Attributes:
|
|
93
|
+
id: The Gutenberg book ID.
|
|
94
|
+
title: The corrupted book title.
|
|
95
|
+
original_title: The original (uncorrupted) book title.
|
|
96
|
+
authors: List of book authors.
|
|
97
|
+
translators: List of book translators.
|
|
98
|
+
subjects: List of subject categories.
|
|
99
|
+
bookshelves: List of bookshelf categories.
|
|
100
|
+
languages: List of language codes.
|
|
101
|
+
copyright: Whether the book is under copyright.
|
|
102
|
+
media_type: The media type of the book.
|
|
103
|
+
formats: Dictionary mapping MIME types to download URLs.
|
|
104
|
+
download_count: Number of times the book has been downloaded.
|
|
105
|
+
"""
|
|
106
|
+
|
|
107
|
+
id: int
|
|
108
|
+
title: str
|
|
109
|
+
original_title: str
|
|
110
|
+
authors: list[Person]
|
|
111
|
+
translators: list[Person]
|
|
112
|
+
subjects: list[str]
|
|
113
|
+
bookshelves: list[str]
|
|
114
|
+
languages: list[str]
|
|
115
|
+
copyright: bool
|
|
116
|
+
media_type: str
|
|
117
|
+
formats: dict[str, str]
|
|
118
|
+
download_count: int
|
|
119
|
+
_original_book: Book = field(repr=False)
|
|
120
|
+
_gaggle: Gaggle = field(repr=False)
|
|
121
|
+
|
|
122
|
+
@classmethod
|
|
123
|
+
def from_book(cls, book: Book, gaggle: Gaggle) -> GlitchedBook:
|
|
124
|
+
"""Create a GlitchedBook from a py-gutenberg Book.
|
|
125
|
+
|
|
126
|
+
Args:
|
|
127
|
+
book: The original Book object from py-gutenberg.
|
|
128
|
+
gaggle: The gaggle of glitchlings to apply to text.
|
|
129
|
+
|
|
130
|
+
Returns:
|
|
131
|
+
A GlitchedBook that corrupts text with the provided gaggle.
|
|
132
|
+
"""
|
|
133
|
+
# Use shared utility for consistent corruption; cast tells mypy this is str
|
|
134
|
+
corrupted_title = cast(str, corrupt_text_value(book.title, gaggle))
|
|
135
|
+
return cls(
|
|
136
|
+
id=book.id,
|
|
137
|
+
title=corrupted_title,
|
|
138
|
+
original_title=book.title,
|
|
139
|
+
authors=book.authors,
|
|
140
|
+
translators=book.translators,
|
|
141
|
+
subjects=book.subjects,
|
|
142
|
+
bookshelves=book.bookshelves,
|
|
143
|
+
languages=book.languages,
|
|
144
|
+
copyright=book.copyright,
|
|
145
|
+
media_type=book.media_type,
|
|
146
|
+
formats=book.formats,
|
|
147
|
+
download_count=book.download_count,
|
|
148
|
+
_original_book=book,
|
|
149
|
+
_gaggle=gaggle,
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
@cached_property
|
|
153
|
+
def _text_content(self) -> str:
|
|
154
|
+
"""Lazily fetch and corrupt the full text content of the book."""
|
|
155
|
+
original_text: str = self._original_book.get_text()
|
|
156
|
+
return cast(str, corrupt_text_value(original_text, self._gaggle))
|
|
157
|
+
|
|
158
|
+
def get_text(self) -> str:
|
|
159
|
+
"""Fetch and corrupt the full text content of the book.
|
|
160
|
+
|
|
161
|
+
This method fetches the book's text from Project Gutenberg and applies
|
|
162
|
+
glitchlings corruption to it. The text is fetched fresh on the first call
|
|
163
|
+
and cached for subsequent calls.
|
|
164
|
+
|
|
165
|
+
Returns:
|
|
166
|
+
The corrupted full text of the book.
|
|
167
|
+
|
|
168
|
+
Raises:
|
|
169
|
+
AttributeError: If the underlying Book doesn't support get_text().
|
|
170
|
+
"""
|
|
171
|
+
return self._text_content
|
|
172
|
+
|
|
173
|
+
def __repr__(self) -> str:
|
|
174
|
+
"""Return a concise representation of the GlitchedBook."""
|
|
175
|
+
return (
|
|
176
|
+
f"GlitchedBook(id={self.id}, title={self.title!r}, "
|
|
177
|
+
f"authors={[a.name for a in self.authors]!r})"
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
def __getattr__(self, name: str) -> Any:
|
|
181
|
+
"""Delegate attribute access to the original book."""
|
|
182
|
+
return getattr(self._original_book, name)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class GlitchenbergAPI:
    """A wrapper around GutenbergAPI that corrupts book text with glitchlings.

    This class provides the same interface as GutenbergAPI but wraps every
    returned book in a ``GlitchedBook``. Attributes not defined here are
    looked up on the underlying API object.

    Example:
        >>> from glitchlings.dlc.gutenberg import GlitchenbergAPI
        >>> from glitchlings import Typogre
        >>> api = GlitchenbergAPI(Typogre(rate=0.05), seed=42)
        >>> book = api.get_book(1342)  # Pride and Prejudice
        >>> print(book.title)  # Title will have typos applied
    """

    def __init__(
        self,
        glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
        *,
        seed: int = 151,
        instance_url: str = DEFAULT_GUTENDEX_URL,
    ) -> None:
        """Initialize the GlitchenbergAPI.

        Args:
            glitchlings: A glitchling, gaggle, or specification of glitchlings to apply.
            seed: RNG seed for deterministic corruption (default: 151).
            instance_url: The Gutendex instance URL to use for API requests.
                Defaults to the public instance at gutendex.com. For production use,
                consider self-hosting Gutendex.

        Raises:
            ImportError: If py-gutenberg is not installed.
        """
        self._gaggle = coerce_gaggle(glitchlings, seed=seed)
        self._api = _get_gutenberg_api(instance_url)

    @property
    def instance_url(self) -> str:
        """Return the Gutendex instance URL."""
        return str(self._api.instance_url)

    @property
    def gaggle(self) -> Gaggle:
        """Return the gaggle used for corruption."""
        return self._gaggle

    def _corrupt_book(self, book: Book) -> GlitchedBook:
        """Apply glitchlings to a Book object."""
        return GlitchedBook.from_book(book, self._gaggle)

    def _corrupt_books(self, books: Iterable[Book]) -> list[GlitchedBook]:
        """Apply glitchlings to an iterable of Book objects."""
        return [self._corrupt_book(book) for book in books]

    def corrupt_books(self, books: Iterable[Book]) -> list[GlitchedBook]:
        """Apply glitchlings to a collection of Book objects.

        This method allows batch corruption of books fetched from other sources
        or the underlying API.

        Args:
            books: Iterable of py-gutenberg Book objects to corrupt.

        Returns:
            List of GlitchedBook objects with corrupted text.

        Example:
            >>> # Fetch from underlying API and corrupt separately
            >>> raw_books = api._api.get_books_by_author("Austen")
            >>> glitched = api.corrupt_books(raw_books)
        """
        return self._corrupt_books(books)

    # Methods that return lists of books
    def get_all_books(self) -> list[GlitchedBook]:
        """Get all books with glitchling corruption applied."""
        return self._corrupt_books(self._api.get_all_books())

    def get_public_domain_books(self) -> list[GlitchedBook]:
        """Get public domain books with glitchling corruption applied."""
        return self._corrupt_books(self._api.get_public_domain_books())

    def get_copyrighted_books(self) -> list[GlitchedBook]:
        """Get copyrighted books with glitchling corruption applied."""
        return self._corrupt_books(self._api.get_copyrighted_books())

    def get_books_by_author(self, author: str) -> list[GlitchedBook]:
        """Get books by author with glitchling corruption applied.

        Args:
            author: Author name to search for.

        Returns:
            List of GlitchedBook objects with corrupted text.
        """
        return self._corrupt_books(self._api.get_books_by_author(author))

    def get_books_by_ids(self, ids: list[int]) -> list[GlitchedBook]:
        """Get books by IDs with glitchling corruption applied.

        Args:
            ids: List of Gutenberg book IDs to retrieve.

        Returns:
            List of GlitchedBook objects with corrupted text.
        """
        return self._corrupt_books(self._api.get_books_by_ids(ids))

    def get_books_by_language(self, languages: list[str]) -> list[GlitchedBook]:
        """Get books by language with glitchling corruption applied.

        Args:
            languages: List of language codes (e.g., ["en", "fr"]).

        Returns:
            List of GlitchedBook objects with corrupted text.
        """
        return self._corrupt_books(self._api.get_books_by_language(languages))

    def get_books_by_search(self, query: str) -> list[GlitchedBook]:
        """Search for books with glitchling corruption applied.

        Args:
            query: Search query string.

        Returns:
            List of GlitchedBook objects with corrupted text.
        """
        return self._corrupt_books(self._api.get_books_by_search(query))

    def get_books_by_mime_type(self, mime_type: str) -> list[GlitchedBook]:
        """Get books by MIME type with glitchling corruption applied.

        Args:
            mime_type: MIME type filter (e.g., "text/plain").

        Returns:
            List of GlitchedBook objects with corrupted text.
        """
        return self._corrupt_books(self._api.get_books_by_mime_type(mime_type))

    def get_books_ascending(self) -> list[GlitchedBook]:
        """Get books sorted ascending with glitchling corruption applied."""
        return self._corrupt_books(self._api.get_books_ascending())

    def get_oldest(self) -> list[GlitchedBook]:
        """Get oldest books with glitchling corruption applied."""
        return self._corrupt_books(self._api.get_oldest())

    def get_latest(self, topic: str = "recent") -> list[GlitchedBook]:
        """Get latest books by topic with glitchling corruption applied.

        Args:
            topic: Topic string to filter books by (e.g., "fiction", "science").
                Defaults to "recent".

        Returns:
            List of GlitchedBook objects with corrupted text.
        """
        return self._corrupt_books(self._api.get_latest(topic))

    # Methods that return single books
    def get_book(self, book_id: int) -> GlitchedBook:
        """Get a book by ID with glitchling corruption applied.

        Args:
            book_id: Gutenberg book ID.

        Returns:
            GlitchedBook with corrupted text.
        """
        return self._corrupt_book(self._api.get_book(book_id))

    def get_book_metadata(self, book_id: int) -> GlitchedBook:
        """Get book metadata by ID with glitchling corruption applied.

        Args:
            book_id: Gutenberg book ID.

        Returns:
            GlitchedBook with corrupted metadata.
        """
        return self._corrupt_book(self._api.get_book_metadata(book_id))

    def get_book_text(self, book_id: int) -> GlitchedBook:
        """Get book text by ID with glitchling corruption applied.

        Args:
            book_id: Gutenberg book ID.

        Returns:
            GlitchedBook with corrupted text.
        """
        return self._corrupt_book(self._api.get_book_text(book_id))

    def __getattr__(self, name: str) -> Any:
        """Delegate attribute access to the underlying API.

        Python calls this only after normal lookup has failed.

        Raises:
            AttributeError: If the underlying API has not been set on this
                instance, or if it lacks ``name``.
        """
        # Read ``_api`` from ``__dict__`` rather than via ``self.``: on an
        # instance where it is not set yet (``copy.copy`` and unpickling
        # create one without calling ``__init__``), ``self._api`` would
        # re-enter ``__getattr__`` and recurse until RecursionError.
        try:
            api = self.__dict__["_api"]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None
        return getattr(api, name)
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def _get_gutenberg_api(instance_url: str) -> GutenbergAPI:
|
|
383
|
+
"""Import and return a GutenbergAPI instance.
|
|
384
|
+
|
|
385
|
+
Raises:
|
|
386
|
+
ImportError: If py-gutenberg is not installed.
|
|
387
|
+
"""
|
|
388
|
+
try:
|
|
389
|
+
from gutenberg import GutenbergAPI
|
|
390
|
+
except ImportError as exc:
|
|
391
|
+
raise ImportError(
|
|
392
|
+
"py-gutenberg is required for the GlitchenbergAPI integration. "
|
|
393
|
+
"Install it with: pip install py-gutenberg"
|
|
394
|
+
) from exc
|
|
395
|
+
|
|
396
|
+
api = GutenbergAPI(instance_url=instance_url)
|
|
397
|
+
return cast(GutenbergAPIProtocol, api)
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
__all__ = ["DEFAULT_GUTENDEX_URL", "GlitchenbergAPI", "GlitchedBook"]
|