glitchlings-0.2.5-cp312-cp312-win_amd64.whl → glitchlings-0.9.3-cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +36 -17
- glitchlings/__main__.py +0 -1
- glitchlings/_zoo_rust/__init__.py +12 -0
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +53 -0
- glitchlings/attack/compose.py +299 -0
- glitchlings/attack/core.py +465 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +104 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +157 -0
- glitchlings/auggie.py +283 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +41 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +59 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +17 -3
- glitchlings/dlc/_shared.py +296 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +37 -65
- glitchlings/dlc/prime.py +55 -114
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +432 -0
- glitchlings/main.py +123 -32
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +29 -176
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +311 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +47 -24
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +301 -167
- glitchlings/zoo/core_execution.py +98 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +295 -0
- glitchlings/zoo/ekkokin.py +118 -0
- glitchlings/zoo/hokey.py +137 -0
- glitchlings/zoo/jargoyle.py +179 -274
- glitchlings/zoo/mim1c.py +106 -68
- glitchlings/zoo/pedant/__init__.py +107 -0
- glitchlings/zoo/pedant/core.py +105 -0
- glitchlings/zoo/pedant/forms.py +74 -0
- glitchlings/zoo/pedant/stones.py +74 -0
- glitchlings/zoo/redactyl.py +44 -175
- glitchlings/zoo/rng.py +259 -0
- glitchlings/zoo/rushmore.py +359 -116
- glitchlings/zoo/scannequin.py +18 -125
- glitchlings/zoo/transforms.py +386 -0
- glitchlings/zoo/typogre.py +76 -162
- glitchlings/zoo/validation.py +477 -0
- glitchlings/zoo/zeedub.py +33 -86
- glitchlings-0.9.3.dist-info/METADATA +334 -0
- glitchlings-0.9.3.dist-info/RECORD +80 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/entry_points.txt +1 -0
- glitchlings/zoo/_ocr_confusions.py +0 -34
- glitchlings/zoo/_rate.py +0 -21
- glitchlings/zoo/reduple.py +0 -169
- glitchlings-0.2.5.dist-info/METADATA +0 -490
- glitchlings-0.2.5.dist-info/RECORD +0 -27
- /glitchlings/{zoo → assets}/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/WHEEL +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Pure metric dispatch functions.
|
|
2
|
+
|
|
3
|
+
This module contains pure functions for dispatching metric computations.
|
|
4
|
+
It does not import Rust FFI or perform any IO - it operates on already-
|
|
5
|
+
resolved metric functions.
|
|
6
|
+
|
|
7
|
+
Pure guarantees:
|
|
8
|
+
- No import side effects beyond stdlib
|
|
9
|
+
- No Rust FFI loading
|
|
10
|
+
- Deterministic dispatch logic
|
|
11
|
+
|
|
12
|
+
The impure Rust metric loading lives in metrics.py.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from typing import Sequence, TypeGuard
|
|
18
|
+
|
|
19
|
+
TokenSequence = Sequence[str]
|
|
20
|
+
TokenBatch = Sequence[TokenSequence]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def is_batch(tokens: TokenSequence | TokenBatch) -> TypeGuard[TokenBatch]:
|
|
24
|
+
"""Determine if tokens represent a batch of sequences.
|
|
25
|
+
|
|
26
|
+
An empty list is treated as an empty batch (returning True) so that
|
|
27
|
+
``metric([], [])`` returns ``[]`` rather than ``0.0``. This matches
|
|
28
|
+
the behavior of :meth:`Attack.run` when processing empty transcripts.
|
|
29
|
+
|
|
30
|
+
Args:
|
|
31
|
+
tokens: Either a sequence of token strings or a batch of such sequences.
|
|
32
|
+
|
|
33
|
+
Returns:
|
|
34
|
+
True if tokens is a batch (list of lists), False if a single sequence.
|
|
35
|
+
"""
|
|
36
|
+
if not tokens:
|
|
37
|
+
return True # Empty list is an empty batch
|
|
38
|
+
|
|
39
|
+
first = tokens[0]
|
|
40
|
+
return isinstance(first, Sequence) and not isinstance(first, (str, bytes))
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def validate_batch_consistency(
    original: TokenSequence | TokenBatch,
    corrupted: TokenSequence | TokenBatch,
    metric_name: str,
) -> None:
    """Ensure both inputs are batched, or both are single sequences.

    Args:
        original: Original token sequence or batch.
        corrupted: Corrupted token sequence or batch.
        metric_name: Metric name used in the error message.

    Raises:
        TypeError: If exactly one of the two inputs is a batch.
    """

    def _batched(tokens: TokenSequence | TokenBatch) -> bool:
        # Same convention as ``is_batch``: empty input is an empty batch.
        if not tokens:
            return True
        head = tokens[0]
        return isinstance(head, Sequence) and not isinstance(head, (str, bytes))

    if _batched(original) != _batched(corrupted):
        raise TypeError(f"{metric_name} expects either both batch inputs or both single sequences")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Explicit public surface of this module, kept alphabetical (types first).
__all__ = [
    "TokenBatch",
    "TokenSequence",
    "is_batch",
    "validate_batch_consistency",
]
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib.util
|
|
4
|
+
import zlib
|
|
5
|
+
from typing import Any, Protocol, Sequence
|
|
6
|
+
|
|
7
|
+
DEFAULT_TIKTOKEN_ENCODINGS = ("o200k_base", "cl100k_base")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Tokenizer(Protocol):
    """Structural interface every tokenizer backend must satisfy.

    ``encode`` returns parallel lists of token strings and integer ids;
    ``decode`` reassembles text from token strings.
    """

    def encode(self, text: str) -> tuple[list[str], list[int]]: ...

    def decode(self, tokens: Sequence[str]) -> str: ...
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class WhitespaceTokenizer:
    """Dependency-free fallback tokenizer that splits on whitespace.

    Token ids are synthesised by hashing each token with ``zlib.adler32``,
    so identical tokens always map to the same id across runs.
    """

    def encode(self, text: str) -> tuple[list[str], list[int]]:
        """Split *text* on whitespace and pair each piece with a stable id."""
        pieces = text.split()
        hashed = [zlib.adler32(piece.encode("utf-8")) & 0xFFFFFFFF for piece in pieces]
        return pieces, hashed

    def decode(self, tokens: Sequence[str]) -> str:
        """Rejoin token strings with single spaces."""
        return " ".join(tokens)

    def encode_batch(self, texts: Sequence[str]) -> list[tuple[list[str], list[int]]]:
        """Encode each text independently; no batching optimisation exists."""
        return list(map(self.encode, texts))
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class TiktokenTokenizer:
    """Tokenizer backed by a ``tiktoken`` encoding.

    Accepts either an encoding name (e.g. ``"cl100k_base"``) or a model name
    (e.g. ``"gpt-4"``); the latter is mapped via ``encoding_for_model``.
    """

    def __init__(self, model_name: str):
        # Imported lazily so this module loads without tiktoken installed.
        import tiktoken

        self.name = model_name
        try:
            # First interpret the string as an encoding name...
            self.enc = tiktoken.get_encoding(model_name)
        except ValueError:
            # ...then as a model name. NOTE(review): encoding_for_model
            # raises KeyError (not ValueError) for unknown models, which
            # propagates to the caller here — confirm that is intended.
            self.enc = tiktoken.encoding_for_model(model_name)

    def encode(self, text: str) -> tuple[list[str], list[int]]:
        """Return parallel lists of token strings and token ids for *text*."""
        ids = self.enc.encode(text)
        # Decode each id individually so token strings align 1:1 with ids;
        # undecodable bytes become U+FFFD via errors="replace".
        tokens = [
            self.enc.decode_single_token_bytes(i).decode("utf-8", errors="replace") for i in ids
        ]
        return tokens, ids

    def decode(self, tokens: Sequence[str], sep: str = "") -> str:
        """Concatenate token strings (BPE tokens carry their own spacing)."""
        return sep.join(tokens)

    def encode_batch(self, texts: Sequence[str]) -> list[tuple[list[str], list[int]]]:
        """Batch-encode *texts* using tiktoken's native batch API."""
        id_batches = [list(batch) for batch in self.enc.encode_batch(list(texts))]
        token_batches: list[list[str]] = []
        for ids in id_batches:
            token_batches.append(
                [
                    self.enc.decode_single_token_bytes(i).decode("utf-8", errors="replace")
                    for i in ids
                ]
            )
        return list(zip(token_batches, id_batches))
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class HuggingFaceTokenizerWrapper:
    """Adapter exposing a HuggingFace ``tokenizers.Tokenizer`` through the
    :class:`Tokenizer` protocol used by this package."""

    def __init__(self, tokenizer_obj: Any):
        self.tokenizer = tokenizer_obj

    def encode(self, text: str) -> tuple[list[str], list[int]]:
        """Encode *text*; the underlying encode returns an Encoding object."""
        encoded = self.tokenizer.encode(text)
        return encoded.tokens, encoded.ids

    def decode(self, tokens: Sequence[str]) -> str:
        """Decode token strings back to text.

        Delegates to the tokenizer's own decode so model-specific artefacts
        (WordPiece "##", BPE "Ġ") are stripped correctly: tokens are mapped
        to ids first, then decoded. Falls back to a plain join when the
        tokenizer lacks the needed methods or no token is in-vocabulary.
        """
        try:
            mapped = [self.tokenizer.token_to_id(token) for token in tokens]
            in_vocab = [tid for tid in mapped if tid is not None]
            if in_vocab:
                decoded: str = self.tokenizer.decode(in_vocab)
                return decoded
        except (AttributeError, TypeError):
            pass
        # Fallback: concatenate the raw token strings unchanged.
        return "".join(tokens)

    def encode_batch(self, texts: Sequence[str]) -> list[tuple[list[str], list[int]]]:
        """Batch-encode via the tokenizer's native encode_batch."""
        pairs: list[tuple[list[str], list[int]]] = []
        for encoded in self.tokenizer.encode_batch(list(texts)):
            pairs.append((encoded.tokens, encoded.ids))
        return pairs
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def resolve_tokenizer(tokenizer: str | Tokenizer | None) -> Tokenizer:
    """Resolve a tokenizer specification into a concrete tokenizer.

    Args:
        tokenizer: ``None`` to pick the default backend, a string naming a
            tiktoken encoding/model or a HuggingFace pretrained tokenizer,
            or an already-constructed tokenizer object (returned as-is,
            wrapped if it is a HuggingFace ``Tokenizer``).

    Returns:
        An object satisfying the :class:`Tokenizer` protocol.

    Raises:
        ValueError: If a string name cannot be resolved by any available
            backend.
    """
    if tokenizer is None:
        return _default_tokenizer()

    if isinstance(tokenizer, str):
        if importlib.util.find_spec("tiktoken"):
            import tiktoken

            try:
                # Check if valid tiktoken encoding/model
                try:
                    tiktoken.get_encoding(tokenizer)
                    return TiktokenTokenizer(tokenizer)
                except ValueError:
                    try:
                        tiktoken.encoding_for_model(tokenizer)
                        return TiktokenTokenizer(tokenizer)
                    except (KeyError, ValueError):
                        # Bug fix: encoding_for_model raises KeyError for
                        # unknown model names, not ValueError. Catching only
                        # ValueError let KeyError escape instead of falling
                        # through to the tokenizers backend / final error.
                        pass
            except ImportError:
                pass

        if importlib.util.find_spec("tokenizers"):
            from tokenizers import Tokenizer

            try:
                # from_pretrained may hit the network or raise for unknown
                # names; any failure falls through to the ValueError below.
                return HuggingFaceTokenizerWrapper(Tokenizer.from_pretrained(tokenizer))
            except Exception:
                pass

        raise ValueError(f"Could not resolve tokenizer: {tokenizer}")

    # Check if it is a HuggingFace tokenizer object
    if importlib.util.find_spec("tokenizers"):
        from tokenizers import Tokenizer as HFTokenizer

        if isinstance(tokenizer, HFTokenizer):
            return HuggingFaceTokenizerWrapper(tokenizer)

    # Anything else is assumed to already satisfy the Tokenizer protocol.
    return tokenizer
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _default_tokenizer() -> Tokenizer:
    """Select a modern, lightweight tokenizer with graceful fallbacks.

    Prefers the first available encoding in ``DEFAULT_TIKTOKEN_ENCODINGS``
    when tiktoken is installed; otherwise degrades to whitespace splitting.
    """
    if importlib.util.find_spec("tiktoken"):
        import tiktoken

        for candidate in DEFAULT_TIKTOKEN_ENCODINGS:
            try:
                tiktoken.get_encoding(candidate)
            except ValueError:
                # Encoding not shipped with this tiktoken; try the next one.
                continue
            return TiktokenTokenizer(candidate)

    return WhitespaceTokenizer()
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
# Explicit public surface of this module, kept alphabetical
# (constants and classes first, then functions).
__all__ = [
    "DEFAULT_TIKTOKEN_ENCODINGS",
    "HuggingFaceTokenizerWrapper",
    "TiktokenTokenizer",
    "Tokenizer",
    "WhitespaceTokenizer",
    "resolve_tokenizer",
]
|
glitchlings/auggie.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
"""Laboratory assistant for composing gaggles with behaviour-focused helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Iterable, Sequence
|
|
6
|
+
from typing import Collection, Literal
|
|
7
|
+
|
|
8
|
+
from .zoo.core import Gaggle, Glitchling
|
|
9
|
+
from .zoo.ekkokin import Ekkokin
|
|
10
|
+
from .zoo.hokey import Hokey
|
|
11
|
+
from .zoo.jargoyle import (
|
|
12
|
+
DEFAULT_LEXEMES,
|
|
13
|
+
DEFAULT_MODE,
|
|
14
|
+
Jargoyle,
|
|
15
|
+
JargoyleMode,
|
|
16
|
+
)
|
|
17
|
+
from .zoo.mim1c import Mim1c
|
|
18
|
+
from .zoo.pedant import Pedant
|
|
19
|
+
from .zoo.pedant.stones import PedantStone
|
|
20
|
+
from .zoo.redactyl import FULL_BLOCK, Redactyl
|
|
21
|
+
from .zoo.rushmore import Rushmore, RushmoreMode
|
|
22
|
+
from .zoo.scannequin import Scannequin
|
|
23
|
+
from .zoo.typogre import Typogre
|
|
24
|
+
from .zoo.zeedub import Zeedub
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class Auggie(Gaggle):
    """Assistant that incrementally assembles glitchlings into a gaggle.

    Each builder method (``typo``, ``redact``, ``drift``, ...) constructs a
    configured glitchling, appends it to an internal blueprint, rebuilds the
    execution plan, and returns ``self`` so calls chain fluently.
    """

    def __init__(
        self,
        glitchlings: Iterable[Glitchling] | None = None,
        *,
        seed: int = 151,
    ) -> None:
        """Create an assistant, optionally pre-seeded with glitchlings.

        Args:
            glitchlings: Initial glitchlings to queue; they are cloned so the
                caller's instances are never mutated.
            seed: Seed forwarded to the underlying :class:`Gaggle`.
        """
        self._blueprint: list[Glitchling] = []
        initial = list(glitchlings or [])
        super().__init__(initial, seed=seed)
        if initial:
            # Keep pristine clones so later rebuilds never reuse state that
            # the live gaggle may have mutated.
            self._blueprint = [glitchling.clone() for glitchling in initial]
            self._rebuild_plan()
        else:
            self._blueprint = []

    def _rebuild_plan(self) -> None:
        # Re-derive the live glitchling list from the blueprint: one fresh
        # clone per entry, tagged with its queue position.
        # NOTE(review): assumes Gaggle consumes _clones_by_index and the
        # _gaggle_index attribute, and that sort_glitchlings() orders them —
        # confirm against Gaggle's implementation.
        self._clones_by_index = []
        for index, glitchling in enumerate(self._blueprint):
            clone = glitchling.clone()
            setattr(clone, "_gaggle_index", index)
            self._clones_by_index.append(clone)
        self.sort_glitchlings()

    def _enqueue(self, glitchling: Glitchling) -> "Auggie":
        # Shared tail for every builder method: queue and replan.
        self._blueprint.append(glitchling)
        self._rebuild_plan()
        return self

    def clone(self, seed: int | None = None) -> "Auggie":
        """Return an independent copy of this assistant.

        Args:
            seed: Overrides the current seed when given; 151 (the package
                default) is used if both are ``None``.
        """
        clone_seed = seed if seed is not None else self.seed
        resolved_seed = 151 if clone_seed is None else int(clone_seed)
        blueprint = [glitch.clone() for glitch in self._blueprint]
        return Auggie(blueprint, seed=resolved_seed)

    def typo(
        self,
        *,
        rate: float | None = None,
        keyboard: str = "CURATOR_QWERTY",
        seed: int | None = None,
    ) -> "Auggie":
        """Add :class:`Typogre` using behaviour-driven nomenclature."""

        return self._enqueue(Typogre(rate=rate, keyboard=keyboard, seed=seed))

    def confusable(
        self,
        *,
        rate: float | None = None,
        classes: list[str] | Literal["all"] | None = None,
        banned_characters: Collection[str] | None = None,
        seed: int | None = None,
    ) -> "Auggie":
        """Add :class:`Mim1c` for homoglyph substitutions."""

        return self._enqueue(
            Mim1c(
                rate=rate,
                classes=classes,
                banned_characters=banned_characters,
                seed=seed,
            )
        )

    def curly_quotes(self, *, seed: int | None = None) -> "Auggie":
        """Add :class:`Pedant` evolved with Curlite to smarten punctuation."""

        return self._enqueue(Pedant(stone=PedantStone.CURLITE, seed=seed))

    def stretch(
        self,
        *,
        rate: float = 0.3,
        extension_min: int = 2,
        extension_max: int = 5,
        word_length_threshold: int = 6,
        base_p: float = 0.45,
        seed: int | None = None,
    ) -> "Auggie":
        """Add :class:`Hokey` for elongated, expressive words."""

        return self._enqueue(
            Hokey(
                rate=rate,
                extension_min=extension_min,
                extension_max=extension_max,
                word_length_threshold=word_length_threshold,
                base_p=base_p,
                seed=seed,
            )
        )

    def homophone(
        self,
        *,
        rate: float | None = None,
        seed: int | None = None,
    ) -> "Auggie":
        """Add :class:`Ekkokin` to swap words for homophones."""

        return self._enqueue(Ekkokin(rate=rate, seed=seed))

    def pedantry(
        self,
        *,
        stone: PedantStone | str = PedantStone.COEURITE,
        seed: int | None = None,
    ) -> "Auggie":
        """Add :class:`Pedant` to evolve text via a chosen stone."""

        return self._enqueue(Pedant(stone=stone, seed=seed))

    def remix(
        self,
        *,
        modes: RushmoreMode | str | Iterable[RushmoreMode | str] | None = None,
        rate: float | None = None,
        delete_rate: float | None = None,
        duplicate_rate: float | None = None,
        swap_rate: float | None = None,
        seed: int | None = None,
        unweighted: bool = False,
        delete_unweighted: bool | None = None,
        duplicate_unweighted: bool | None = None,
    ) -> "Auggie":
        """Add :class:`Rushmore` for deletion, duplication, and swap attacks."""

        return self._enqueue(
            Rushmore(
                modes=modes,
                rate=rate,
                delete_rate=delete_rate,
                duplicate_rate=duplicate_rate,
                swap_rate=swap_rate,
                seed=seed,
                unweighted=unweighted,
                delete_unweighted=delete_unweighted,
                duplicate_unweighted=duplicate_unweighted,
            )
        )

    def redact(
        self,
        *,
        replacement_char: str = FULL_BLOCK,
        rate: float | None = None,
        merge_adjacent: bool = False,
        seed: int | None = 151,
        unweighted: bool = False,
    ) -> "Auggie":
        """Add :class:`Redactyl` to blackout words."""

        # NOTE(review): unlike the other builders, seed defaults to 151 here
        # and None is coerced back to 151 — confirm this asymmetry is wanted.
        return self._enqueue(
            Redactyl(
                replacement_char=replacement_char,
                rate=rate,
                merge_adjacent=merge_adjacent,
                seed=seed if seed is not None else 151,
                unweighted=unweighted,
            )
        )

    def recolor(self, *, mode: JargoyleMode = "literal", seed: int | None = None) -> "Auggie":
        """Add :class:`Jargoyle` with ``lexemes="colors"`` to remap colour terms.

        Args:
            mode: "literal" for deterministic first-entry swaps,
                "drift" for random selection from palette.
            seed: Seed for deterministic randomness.

        Returns:
            Self for method chaining.
        """
        return self._enqueue(Jargoyle(lexemes="colors", mode=mode, rate=1.0, seed=seed))

    def drift(
        self,
        *,
        lexemes: str = DEFAULT_LEXEMES,
        mode: JargoyleMode = DEFAULT_MODE,
        rate: float | None = None,
        seed: int | None = None,
    ) -> "Auggie":
        """Add :class:`Jargoyle` for dictionary-based word drift.

        Swaps words with alternatives from the specified lexeme dictionary.

        Args:
            lexemes: Dictionary to use. One of:
                "colors" (color term swapping),
                "synonyms" (general synonyms),
                "corporate" (business jargon),
                "academic" (scholarly terms).
            mode: "literal" for deterministic first-entry swaps,
                "drift" for random selection.
            rate: Probability of transforming each matching word.
            seed: Seed for deterministic randomness.

        Returns:
            Self for method chaining.
        """
        return self._enqueue(Jargoyle(lexemes=lexemes, mode=mode, rate=rate, seed=seed))

    def ocr(
        self,
        *,
        rate: float | None = None,
        seed: int | None = None,
    ) -> "Auggie":
        """Add :class:`Scannequin` to simulate OCR artefacts."""

        return self._enqueue(Scannequin(rate=rate, seed=seed))

    def zero_width(
        self,
        *,
        rate: float | None = None,
        seed: int | None = None,
        characters: Sequence[str] | None = None,
    ) -> "Auggie":
        """Add :class:`Zeedub` to hide zero-width glyphs inside text."""

        return self._enqueue(Zeedub(rate=rate, seed=seed, characters=characters))

    def synonym(
        self,
        *,
        rate: float | None = None,
        seed: int | None = None,
        lexemes: str = "synonyms",
        mode: JargoyleMode = "drift",
    ) -> "Auggie":
        """Add :class:`Jargoyle` for synonym substitutions.

        Args:
            rate: Probability of transforming each matching word.
            seed: Seed for deterministic randomness.
            lexemes: Dictionary to use (default "synonyms").
            mode: "literal" or "drift" (default "drift").

        Returns:
            Self for method chaining.
        """
        return self._enqueue(
            Jargoyle(
                rate=rate,
                seed=seed,
                lexemes=lexemes,
                mode=mode,
            )
        )
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
__all__ = ["Auggie"]
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Compatibility helpers centralising optional dependency imports and extras.
|
|
2
|
+
|
|
3
|
+
For 1.0, this package no longer re-exports loader utilities or type sentinels.
|
|
4
|
+
Import directly from ``glitchlings.compat.loaders`` or ``glitchlings.compat.types``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
__all__: list[str] = []
|