PyPI - glitchlings - Versions diffs - 0.2.3__cp312-cp312-manylinux_2_28_x86_64.whl - Mend

glitchlings 0.2.3__cp312-cp312-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

glitchlings/__init__.py +42 -0
glitchlings/__main__.py +9 -0
glitchlings/_zoo_rust.cpython-312-x86_64-linux-gnu.so +0 -0
glitchlings/dlc/__init__.py +5 -0
glitchlings/dlc/huggingface.py +96 -0
glitchlings/dlc/prime.py +274 -0
glitchlings/main.py +218 -0
glitchlings/util/__init__.py +181 -0
glitchlings/zoo/__init__.py +134 -0
glitchlings/zoo/_ocr_confusions.py +34 -0
glitchlings/zoo/_rate.py +21 -0
glitchlings/zoo/core.py +405 -0
glitchlings/zoo/jargoyle.py +336 -0
glitchlings/zoo/mim1c.py +108 -0
glitchlings/zoo/ocr_confusions.tsv +30 -0
glitchlings/zoo/redactyl.py +165 -0
glitchlings/zoo/reduple.py +128 -0
glitchlings/zoo/rushmore.py +136 -0
glitchlings/zoo/scannequin.py +171 -0
glitchlings/zoo/typogre.py +212 -0
glitchlings-0.2.3.dist-info/METADATA +478 -0
glitchlings-0.2.3.dist-info/RECORD +26 -0
glitchlings-0.2.3.dist-info/WHEEL +5 -0
glitchlings-0.2.3.dist-info/entry_points.txt +2 -0
glitchlings-0.2.3.dist-info/licenses/LICENSE +201 -0
glitchlings-0.2.3.dist-info/top_level.txt +1 -0

glitchlings/__init__.py ADDED Viewed

@@ -0,0 +1,42 @@
+from .zoo import (
+    Typogre,
+    typogre,
+    Mim1c,
+    mim1c,
+    Jargoyle,
+    jargoyle,
+    Redactyl,
+    redactyl,
+    Reduple,
+    reduple,
+    Rushmore,
+    rushmore,
+    Scannequin,
+    scannequin,
+    Glitchling,
+    Gaggle,
+    summon,
+)
+from .util import SAMPLE_TEXT
+# Public API of the package: every name imported above is re-exported here.
+__all__ = [
+    "Typogre",
+    "typogre",
+    "Mim1c",
+    "mim1c",
+    "Jargoyle",
+    "jargoyle",
+    "Redactyl",
+    "redactyl",
+    "Reduple",
+    "reduple",
+    "Rushmore",
+    "rushmore",
+    "Scannequin",
+    "scannequin",
+    "summon",
+    "Glitchling",
+    "Gaggle",
+    "SAMPLE_TEXT",
+]

glitchlings/__main__.py ADDED Viewed

@@ -0,0 +1,9 @@
+from __future__ import annotations
+import sys
+from .main import main
+# Run the CLI and use its return value as the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())

glitchlings/_zoo_rust.cpython-312-x86_64-linux-gnu.so ADDED Viewed

Binary file

glitchlings/dlc/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Optional DLC integrations for Glitchlings."""
+from .huggingface import install as install_huggingface
+# Only the Hugging Face installer is re-exported from this subpackage.
+__all__ = ["install_huggingface"]

glitchlings/dlc/huggingface.py ADDED Viewed

@@ -0,0 +1,96 @@
+"""Integration helpers for the Hugging Face datasets library."""
+from __future__ import annotations
+from collections.abc import Iterable, Sequence
+from typing import Any
+# Keep the import failure in a module-level name of its own: the target of
+# ``except ... as`` is deleted when the ``except`` clause ends, so it cannot be
+# referenced later (for example as the ``from`` cause when re-raising).
+_datasets_error: ModuleNotFoundError | None = None
+try:  # pragma: no cover - optional dependency is required at runtime
+    from datasets import Dataset as _DatasetsDataset
+except ModuleNotFoundError as _import_error:  # pragma: no cover - optional dependency
+    _DatasetsDataset = None  # type: ignore[assignment]
+    _datasets_error = _import_error
+from ..zoo import Gaggle, Glitchling, summon
+def _normalise_columns(column: str | Sequence[str]) -> list[str]:
+    """Return ``column`` as a list of column names.
+    A lone string is wrapped in a single-element list; any other sequence is
+    copied into a new list.
+    Raises:
+        ValueError: If a non-string sequence is empty.
+    """
+    names = [column] if isinstance(column, str) else list(column)
+    if len(names) == 0:
+        raise ValueError("At least one column must be specified")
+    return names
+def _as_gaggle(glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling], seed: int) -> Gaggle:
+    """Turn any accepted glitchling specification into a :class:`Gaggle`.
+    An existing :class:`Gaggle` is returned unchanged (``seed`` is not applied
+    to it). A single glitchling or name is wrapped in a one-element list, and
+    any other iterable is copied into a list, before being handed to
+    :func:`summon` together with ``seed``.
+    """
+    if isinstance(glitchlings, Gaggle):
+        return glitchlings
+    is_single = isinstance(glitchlings, (Glitchling, str))
+    members = [glitchlings] if is_single else list(glitchlings)
+    return summon(members, seed=seed)
+def _glitch_dataset(
+    dataset: Any,
+    glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
+    column: str | Sequence[str],
+    *,
+    seed: int = 151,
+) -> Any:
+    """Corrupt ``column`` of ``dataset``; backs the patched ``Dataset.glitch`` method.
+    The column specification is normalised first, then the glitchlings are
+    coerced into a gaggle whose ``corrupt_dataset`` result is returned.
+    """
+    target_columns = _normalise_columns(column)
+    return _as_gaggle(glitchlings, seed=seed).corrupt_dataset(dataset, target_columns)
+def _ensure_dataset_class() -> Any:
+    """Attach ``.glitch`` to the Hugging Face ``Dataset`` class and return the class.
+    The method is only installed when the class has no non-``None`` ``glitch``
+    attribute yet, so repeated calls leave an existing attribute untouched.
+    Raises:
+        ModuleNotFoundError: If the ``datasets`` import at module load failed.
+    """
+    if _DatasetsDataset is None:  # pragma: no cover - datasets is an install-time dependency
+        raise ModuleNotFoundError("datasets is not installed") from _datasets_error
+    already_patched = getattr(_DatasetsDataset, "glitch", None) is not None
+    if already_patched:
+        return _DatasetsDataset
+    def glitch(  # type: ignore[override]
+        self: Any,
+        glitchlings: Glitchling | Gaggle | str | Iterable[str | Glitchling],
+        *,
+        column: str | Sequence[str],
+        seed: int = 151,
+        **_: Any,
+    ) -> Any:
+        """Return a lazily corrupted copy of the dataset."""
+        # Extra keyword arguments are accepted and ignored.
+        return _glitch_dataset(self, glitchlings, column, seed=seed)
+    _DatasetsDataset.glitch = glitch
+    return _DatasetsDataset
+def install() -> None:
+    """Monkeypatch the Hugging Face :class:`~datasets.Dataset` with ``.glitch``.
+    Delegates to :func:`_ensure_dataset_class` and discards the returned class.
+    Raises:
+        ModuleNotFoundError: Raised by :func:`_ensure_dataset_class` when
+            ``datasets`` is not installed.
+    """
+    _ensure_dataset_class()
+# Patch eagerly at import time when ``datasets`` is available; otherwise expose
+# ``Dataset`` as ``None`` so importing this module still succeeds.
+if _DatasetsDataset is not None:
+    Dataset = _ensure_dataset_class()
+else:  # pragma: no cover - datasets is an install-time dependency
+    Dataset = None  # type: ignore[assignment]
+__all__ = ["Dataset", "install"]

glitchlings/dlc/prime.py ADDED Viewed

@@ -0,0 +1,274 @@
+"""Integration helpers for the optional verifiers prime DLC."""
+from __future__ import annotations
+from collections.abc import Iterable, Sequence
+from enum import Enum
+from typing import Any, Callable
+import verifiers as vf
+from jellyfish import damerau_levenshtein_distance
+# ``Dataset`` is only used in annotations below; fall back to ``object`` when
+# the Hugging Face helper cannot be imported or reports no dataset class.
+try:
+    from .huggingface import Dataset
+except ModuleNotFoundError:  # pragma: no cover - optional dependency
+    Dataset = object  # type: ignore[assignment]
+else:
+    if Dataset is None:  # pragma: no cover - optional dependency
+        Dataset = object  # type: ignore[assignment]
+from ..zoo import Gaggle, Glitchling, Mim1c, Typogre, summon
+def _resolve_environment(env: str | vf.Environment) -> vf.Environment:
+    """Return ``env`` as a ``vf.Environment`` instance.
+    A string is passed to ``vf.load_environment``; anything else is used as is.
+    Raises:
+        TypeError: If the resulting object is not a ``vf.Environment``.
+    """
+    resolved = vf.load_environment(env) if isinstance(env, str) else env
+    if isinstance(resolved, vf.Environment):
+        return resolved
+    raise TypeError("Invalid environment type")
+def _resolve_columns(dataset: Dataset, columns: Sequence[str] | None) -> list[str]:
+    """Work out which columns of ``dataset`` should be corrupted.
+    Explicit ``columns`` are validated against ``dataset.column_names`` and
+    returned as a list. Otherwise the first of ``"prompt"`` / ``"question"``
+    that exists is used, and failing that every column whose value in the
+    first row is a string.
+    Raises:
+        ValueError: If a requested column is missing, or no column can be inferred.
+    """
+    available = set(dataset.column_names)
+    if columns is not None:
+        missing = sorted(name for name in set(columns) if name not in available)
+        if not missing:
+            return list(columns)
+        raise ValueError(f"Columns not found in dataset: {', '.join(missing)}")
+    preferred = next((name for name in ("prompt", "question") if name in available), None)
+    if preferred is not None:
+        return [preferred]
+    # Inspect the first row only; an empty dataset yields no string columns.
+    first_row = dataset[0] if len(dataset) else {}
+    text_columns = [
+        name for name in dataset.column_names if isinstance(first_row.get(name), str)
+    ]
+    if not text_columns:
+        raise ValueError("Unable to determine which dataset columns to corrupt.")
+    return text_columns
+class Difficulty(Enum):
+    """Difficulty levels for tutorial environments.
+    Each member's value is the multiplier that ``tutorial_level`` applies to
+    its base corruption rates.
+    """
+    # Values below 1 lower the base rates; values above 1 raise them.
+    Easy = 0.25
+    Normal = 1.0
+    Hard = 1.75
+    # NOTE(review): the next two values are ints while the others are floats.
+    Extreme = 3
+    Impossible = 9
+def tutorial_level(
+    env: vf.Environment | str,
+    seed: int = 151,
+    difficulty: Difficulty = Difficulty.Normal,
+) -> vf.Environment:
+    """Load ``env`` corrupted by a lightly tuned Mim1c and Typogre pair.
+    The base rates (0.01 for Mim1c, 0.025 for Typogre) are multiplied by
+    ``difficulty.value`` before the environment is built with
+    :func:`load_environment`.
+    """
+    scale = difficulty.value
+    corruptors = [Mim1c(rate=0.01 * scale), Typogre(rate=0.025 * scale)]
+    return load_environment(env, glitchlings=corruptors, seed=seed)
+def load_environment(
+    env: str | vf.Environment,
+    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle | None = None,
+    *,
+    seed: int = 151,
+    columns: Sequence[str] | None = None,
+) -> vf.Environment:
+    """Resolve ``env`` and, when glitchlings are given, corrupt its dataset.
+    With ``glitchlings=None`` the resolved environment is returned untouched.
+    Otherwise the environment's ``dataset`` attribute is replaced in place by
+    the corrupted dataset and the same environment object is returned.
+    Raises:
+        TypeError: If ``env`` does not resolve to a ``vf.Environment``.
+        ValueError: If the columns to corrupt cannot be resolved.
+    """
+    environment = _resolve_environment(env)
+    if glitchlings is None:
+        return environment
+    # Same coercion rules as the module-level helper: Gaggle passes through,
+    # a single glitchling/name is wrapped, any other iterable is listed.
+    gaggle = _as_gaggle(glitchlings, seed=seed)
+    source = environment.dataset
+    targets = _resolve_columns(source, columns)
+    environment.dataset = gaggle.corrupt_dataset(source, targets)
+    return environment
+def _as_gaggle(
+    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
+    *,
+    seed: int,
+) -> Gaggle:
+    """Build a :class:`Gaggle` from any supported glitchling specification.
+    A :class:`Gaggle` is returned as is; a single glitchling or name, or an
+    iterable of them, is summoned into a new gaggle using ``seed``.
+    """
+    if isinstance(glitchlings, Gaggle):
+        return glitchlings
+    if isinstance(glitchlings, (Glitchling, str)):
+        return summon([glitchlings], seed=seed)
+    return summon(list(glitchlings), seed=seed)
+def _extract_completion_text(completion: Any) -> str:
+    """Normalise a completion payload into a plain string.
+    Args:
+        completion: A string, a list whose first item is either a mapping with
+            a ``"content"`` key or any other object, or an arbitrary value.
+    Returns:
+        The string itself; for a non-empty list, the first item's ``"content"``
+        (or the first item) converted with ``str``; ``""`` for ``None`` or an
+        empty list; otherwise ``str(completion)``.
+    """
+    # A missing or empty completion carries no text. Stringifying it would
+    # produce "None" / "[]" and be scored as if the model had written that.
+    if completion is None:
+        return ""
+    if isinstance(completion, str):
+        return completion
+    if isinstance(completion, list):
+        if not completion:
+            return ""
+        first = completion[0]
+        if isinstance(first, dict) and "content" in first:
+            return str(first["content"])
+        return str(first)
+    return str(completion)
+def symmetric_damerau_levenshtein_similarity(
+    _: Any,
+    completion: Any,
+    answer: str,
+) -> float:
+    """Score a completion as ``1 - (distance / max_len)``, clamped to ``[0, 1]``.
+    ``distance`` is the Damerau-Levenshtein distance between the completion
+    text and ``answer`` (a falsy answer is treated as ``""``); ``max_len`` is
+    the longer of the two lengths, but at least 1. The first positional
+    argument is ignored.
+    """
+    produced = _extract_completion_text(completion)
+    expected = answer if answer else ""
+    longest = max(len(produced), len(expected), 1)
+    similarity = 1.0 - damerau_levenshtein_distance(produced, expected) / longest
+    if similarity <= 0.0:
+        return 0.0
+    if similarity >= 1.0:
+        return 1.0
+    return similarity
+# Default system message for ``echo_chamber`` prompts.
+DEFAULT_CLEANUP_INSTRUCTIONS = (
+    "You are a meticulous copy editor. Restore the provided text to its original form."
+)
+def echo_chamber(
+    dataset_id: str,
+    column: str,
+    glitchlings: Iterable[str | Glitchling] | Glitchling | str | Gaggle,
+    *,
+    seed: int = 151,
+    instructions: str = DEFAULT_CLEANUP_INSTRUCTIONS,
+    reward_function: Callable[..., float] | None = None,
+    split: str | None = None,
+    **load_dataset_kwargs: Any,
+) -> vf.Environment:
+    """Create an Echo Chamber Prime environment from a Hugging Face dataset column.
+    Args:
+        dataset_id: Identifier of the Hugging Face dataset to load.
+        column: Name of the column whose text should be glitched.
+        glitchlings: Glitchling specifiers that will corrupt the prompts.
+        seed: RNG seed forwarded to :func:`summon`.
+        instructions: System instructions supplied to the environment prompts.
+        reward_function: Optional callable used to score completions. Defaults to
+            :func:`symmetric_damerau_levenshtein_similarity` when omitted.
+        split: Optional dataset split to load.
+        **load_dataset_kwargs: Extra keyword arguments forwarded to
+            :func:`datasets.load_dataset`.
+    Returns:
+        A ``vf.SingleTurnEnv`` whose dataset has a corrupted ``prompt`` column
+        and an ``answer`` column holding the original text.
+    Raises:
+        ModuleNotFoundError: If ``datasets`` cannot be imported.
+        ValueError: If the dataset has no splits, still loads as a
+            ``DatasetDict``, or yields no rows with a non-``None`` ``column``.
+    """
+    # Imported lazily so the module can be imported without ``datasets``.
+    try:
+        from datasets import Dataset as HFDataset, DatasetDict, load_dataset
+    except ModuleNotFoundError as exc:  # pragma: no cover - optional dependency
+        message = "datasets is required to build an echo chamber"
+        raise ModuleNotFoundError(message) from exc
+    hf_dataset: HFDataset | DatasetDict
+    if split is None:
+        hf_dataset = load_dataset(dataset_id, **load_dataset_kwargs)
+        # Without an explicit split, fall back to the first split in the dict.
+        if isinstance(hf_dataset, DatasetDict):
+            try:
+                hf_dataset = next(iter(hf_dataset.values()))
+            except StopIteration as exc:  # pragma: no cover - defensive
+                raise ValueError("The specified dataset does not contain any splits") from exc
+    else:
+        hf_dataset = load_dataset(dataset_id, split=split, **load_dataset_kwargs)
+    if isinstance(hf_dataset, DatasetDict):
+        raise ValueError(
+            "Specify which split to use when the dataset loads as a DatasetDict."
+        )
+    # Drop rows whose source column is missing or ``None``.
+    filtered_dataset = hf_dataset.filter(
+        lambda row: row.get(column) is not None,
+        load_from_cache_file=False,
+    )
+    source_column_names = list(filtered_dataset.column_names)
+    def _build_prompt(row: dict[str, Any]) -> dict[str, Any]:
+        # The same text goes into the user message and the answer; only the
+        # "prompt" column is corrupted further down.
+        text = str(row[column])
+        prompt = [
+            {"role": "system", "content": instructions},
+            {"role": "user", "content": f"Corrupted text:\n{text}"},
+        ]
+        return {"prompt": prompt, "answer": text}
+    # Replace every source column with the generated prompt/answer pair.
+    base_dataset = filtered_dataset.map(
+        _build_prompt,
+        remove_columns=source_column_names,
+        load_from_cache_file=False,
+    )
+    # Emptiness check: use len() when supported, otherwise peek at one row
+    # (presumably for iterable/streaming datasets, which raise TypeError on len()).
+    try:
+        dataset_length = len(base_dataset)  # type: ignore[arg-type]
+    except TypeError:
+        preview_rows: list[dict[str, Any]]
+        take_fn = getattr(base_dataset, "take", None)
+        if callable(take_fn):
+            preview_rows = list(take_fn(1))
+        else:
+            iterator = iter(base_dataset)
+            try:
+                first_row = next(iterator)
+            except StopIteration:
+                preview_rows = []
+            else:
+                preview_rows = [first_row]
+        if not preview_rows:
+            raise ValueError(
+                f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
+            )
+    else:
+        if dataset_length == 0:
+            raise ValueError(
+                f"Column '{column}' did not yield any textual entries in dataset '{dataset_id}'."
+            )
+    gaggle = _as_gaggle(glitchlings, seed=seed)
+    glitched_dataset = gaggle.corrupt_dataset(base_dataset, ["prompt"])
+    # A single reward function with weight 1.0 scores every completion.
+    rubric_func = reward_function or symmetric_damerau_levenshtein_similarity
+    rubric = vf.Rubric(funcs=[rubric_func], weights=[1.0])
+    return vf.SingleTurnEnv(dataset=glitched_dataset, rubric=rubric)

glitchlings/main.py ADDED Viewed

@@ -0,0 +1,218 @@
+"""Command line interface for summoning and running glitchlings."""
+from __future__ import annotations
+import argparse
+import difflib
+from pathlib import Path
+import sys
+from . import SAMPLE_TEXT
+from .zoo import (
+    Glitchling,
+    Gaggle,
+    BUILTIN_GLITCHLINGS,
+    DEFAULT_GLITCHLING_NAMES,
+    parse_glitchling_spec,
+    summon,
+)
+# Length of the longest built-in glitchling name; used to right-align names in --list output.
+MAX_NAME_WIDTH = max(len(glitchling.name) for glitchling in BUILTIN_GLITCHLINGS.values())
+def build_parser() -> argparse.ArgumentParser:
+    """Build the argument parser for the ``glitchlings`` command line.
+    Returns:
+        argparse.ArgumentParser: Parser with the positional ``text`` argument and
+        the ``--glitchling``, ``--seed``, ``--file``, ``--sample``, ``--diff`` and
+        ``--list`` options registered.
+    """
+    description = (
+        "Summon glitchlings to corrupt text. Provide input text as an argument, "
+        "via --file, or pipe it on stdin."
+    )
+    cli = argparse.ArgumentParser(description=description)
+    # Each entry is (flags, keyword arguments); registration order fixes --help order.
+    argument_table: list[tuple[tuple[str, ...], dict[str, object]]] = [
+        (
+            ("text",),
+            {
+                "nargs": "?",
+                "help": "Text to corrupt. If omitted, stdin is used or --sample provides fallback text.",
+            },
+        ),
+        (
+            ("-g", "--glitchling"),
+            {
+                "dest": "glitchlings",
+                "action": "append",
+                "metavar": "SPEC",
+                "help": (
+                    "Glitchling to apply, optionally with parameters like "
+                    "Typogre(rate=0.05). Repeat for multiples; defaults to all built-ins."
+                ),
+            },
+        ),
+        (
+            ("-s", "--seed"),
+            {
+                "type": int,
+                "default": 151,
+                "help": "Seed controlling deterministic corruption order (default: 151).",
+            },
+        ),
+        (
+            ("-f", "--file"),
+            {
+                "type": Path,
+                "help": "Read input text from a file instead of the command line argument.",
+            },
+        ),
+        (
+            ("--sample",),
+            {
+                "action": "store_true",
+                "help": "Use the included SAMPLE_TEXT when no other input is provided.",
+            },
+        ),
+        (
+            ("--diff",),
+            {
+                "action": "store_true",
+                "help": "Show a unified diff between the original and corrupted text.",
+            },
+        ),
+        (
+            ("--list",),
+            {
+                "action": "store_true",
+                "help": "List available glitchlings and exit.",
+            },
+        ),
+    ]
+    for flags, options in argument_table:
+        cli.add_argument(*flags, **options)
+    return cli
+def list_glitchlings() -> None:
+    """Print one line per default glitchling: right-aligned name, scope and order."""
+    for registry_key in DEFAULT_GLITCHLING_NAMES:
+        entry = BUILTIN_GLITCHLINGS[registry_key]
+        padded_name = entry.name.rjust(MAX_NAME_WIDTH)
+        scope_label = entry.level.name.title()
+        order_label = entry.order.name.lower()
+        print(f"{padded_name} — scope: {scope_label}, order: {order_label}")
+def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
+    """Pick the text to corrupt from the CLI inputs.
+    Sources are tried in order: ``--file``, the positional ``text`` argument,
+    piped stdin, then ``SAMPLE_TEXT`` when ``--sample`` was given.
+    Args:
+        args: Parsed arguments from the CLI.
+        parser: Parser used to report user-facing errors.
+    Returns:
+        str: The text to corrupt.
+    Raises:
+        SystemExit: Raised indirectly via ``parser.error`` when the file cannot
+            be read or no input source is available.
+    """
+    source_path = args.file
+    if source_path is not None:
+        try:
+            return source_path.read_text(encoding="utf-8")
+        except OSError as exc:
+            shown_name = getattr(exc, "filename", None) or args.file
+            detail = exc.strerror or str(exc)
+            parser.error(f"Failed to read file {shown_name}: {detail}")
+    inline_text = args.text
+    if inline_text:
+        return inline_text
+    stdin_is_piped = not sys.stdin.isatty()
+    if stdin_is_piped:
+        return sys.stdin.read()
+    if not args.sample:
+        parser.error(
+            "No input text provided. Supply text as an argument, use --file, pipe input, or pass --sample."
+        )
+        raise AssertionError("parser.error should exit")
+    return SAMPLE_TEXT
+def summon_glitchlings(
+    names: list[str] | None, parser: argparse.ArgumentParser, seed: int
+) -> Gaggle:
+    """Parse the requested specifications and summon them as a ``Gaggle``.
+    When ``names`` is empty or ``None`` the default glitchling names are used.
+    A ``ValueError`` from parsing or summoning is reported through
+    ``parser.error``.
+    """
+    if not names:
+        requested = DEFAULT_GLITCHLING_NAMES
+    else:
+        requested: list[str | Glitchling] = []
+        for spec in names:
+            try:
+                parsed = parse_glitchling_spec(spec)
+            except ValueError as exc:
+                parser.error(str(exc))
+                raise AssertionError("parser.error should exit")
+            requested.append(parsed)
+    try:
+        gaggle = summon(requested, seed=seed)
+    except ValueError as exc:
+        parser.error(str(exc))
+        raise AssertionError("parser.error should exit")
+    return gaggle
+def show_diff(original: str, corrupted: str) -> None:
+    """Display a unified diff between the original and corrupted text.
+    Args:
+        original: Text before corruption.
+        corrupted: Text after corruption.
+    Prints ``No changes detected.`` when the two texts produce no diff lines.
+    """
+    diff_lines = list(
+        difflib.unified_diff(
+            original.splitlines(keepends=True),
+            corrupted.splitlines(keepends=True),
+            fromfile="original",
+            tofile="corrupted",
+            lineterm="",
+        )
+    )
+    if not diff_lines:
+        print("No changes detected.")
+        return
+    for line in diff_lines:
+        # Content lines keep their own line ending (keepends=True) while the
+        # header lines have none; strip it so print() emits exactly one newline
+        # per diff line instead of a blank line after every content line.
+        print(line.rstrip("\r\n"))
+def run_cli(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
+    """Run the CLI for already-parsed arguments.
+    ``--list`` prints the available glitchlings and stops. Otherwise the input
+    text is resolved, corrupted by the summoned gaggle, and either printed or
+    shown as a diff when ``--diff`` is set.
+    Args:
+        args: Parsed CLI arguments.
+        parser: Argument parser used for error reporting.
+    Returns:
+        int: Exit code for the process (``0`` on success).
+    """
+    if args.list:
+        list_glitchlings()
+        return 0
+    source_text = read_text(args, parser)
+    corruptor = summon_glitchlings(args.glitchlings, parser, args.seed)
+    glitched_text = corruptor(source_text)
+    if not args.diff:
+        print(glitched_text)
+    else:
+        show_diff(source_text, glitched_text)
+    return 0
+def main(argv: list[str] | None = None) -> int:
+    """Parse ``argv`` and run the ``glitchlings`` command line interface.
+    Args:
+        argv: Optional list of command line arguments; ``None`` lets
+            ``argparse`` read them from ``sys.argv``.
+    Returns:
+        int: Exit code suitable for use with ``sys.exit``.
+    """
+    cli = build_parser()
+    return run_cli(cli.parse_args(argv), cli)
+# Allow running this module directly as a script.
+if __name__ == "__main__":
+    sys.exit(main())