PyPI - glitchlings - Versions diffs - 0.4.4__cp313-cp313-win_amd64.whl - Mend

glitchlings 0.4.4__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of glitchlings might be problematic. Click here for more details.

Files changed (47) hide show

glitchlings/__init__.py +67 -0
glitchlings/__main__.py +8 -0
glitchlings/_zoo_rust.cp313-win_amd64.pyd +0 -0
glitchlings/compat.py +284 -0
glitchlings/config.py +388 -0
glitchlings/config.toml +3 -0
glitchlings/dlc/__init__.py +7 -0
glitchlings/dlc/_shared.py +153 -0
glitchlings/dlc/huggingface.py +81 -0
glitchlings/dlc/prime.py +254 -0
glitchlings/dlc/pytorch.py +166 -0
glitchlings/dlc/pytorch_lightning.py +215 -0
glitchlings/lexicon/__init__.py +192 -0
glitchlings/lexicon/_cache.py +110 -0
glitchlings/lexicon/data/default_vector_cache.json +82 -0
glitchlings/lexicon/metrics.py +162 -0
glitchlings/lexicon/vector.py +651 -0
glitchlings/lexicon/wordnet.py +232 -0
glitchlings/main.py +364 -0
glitchlings/util/__init__.py +195 -0
glitchlings/util/adapters.py +27 -0
glitchlings/zoo/__init__.py +168 -0
glitchlings/zoo/_ocr_confusions.py +32 -0
glitchlings/zoo/_rate.py +131 -0
glitchlings/zoo/_rust_extensions.py +143 -0
glitchlings/zoo/_sampling.py +54 -0
glitchlings/zoo/_text_utils.py +100 -0
glitchlings/zoo/adjax.py +128 -0
glitchlings/zoo/apostrofae.py +127 -0
glitchlings/zoo/assets/__init__.py +0 -0
glitchlings/zoo/assets/apostrofae_pairs.json +32 -0
glitchlings/zoo/core.py +582 -0
glitchlings/zoo/jargoyle.py +335 -0
glitchlings/zoo/mim1c.py +109 -0
glitchlings/zoo/ocr_confusions.tsv +30 -0
glitchlings/zoo/redactyl.py +193 -0
glitchlings/zoo/reduple.py +148 -0
glitchlings/zoo/rushmore.py +153 -0
glitchlings/zoo/scannequin.py +171 -0
glitchlings/zoo/typogre.py +231 -0
glitchlings/zoo/zeedub.py +185 -0
glitchlings-0.4.4.dist-info/METADATA +627 -0
glitchlings-0.4.4.dist-info/RECORD +47 -0
glitchlings-0.4.4.dist-info/WHEEL +5 -0
glitchlings-0.4.4.dist-info/entry_points.txt +2 -0
glitchlings-0.4.4.dist-info/licenses/LICENSE +201 -0
glitchlings-0.4.4.dist-info/top_level.txt +1 -0

glitchlings/lexicon/wordnet.py ADDED Viewed

@@ -0,0 +1,232 @@
+"""WordNet-backed lexicon implementation."""
+from __future__ import annotations
+from importlib import import_module
+from pathlib import Path
+from types import ModuleType
+from typing import Any, Callable, Protocol, Sequence, cast
+from ..compat import nltk as _nltk_dependency
+from . import LexiconBackend
+from ._cache import CacheSnapshot
+class _LemmaProtocol(Protocol):
+    """Structural type for a lemma object: anything that exposes ``name()``."""
+    def name(self) -> str:
+        """Return the lemma's name as a string."""
+class _SynsetProtocol(Protocol):
+    """Structural type for a synset object: anything that exposes ``lemmas()``."""
+    def lemmas(self) -> Sequence[_LemmaProtocol]:
+        """Return the lemmas that belong to this synset."""
+class _WordNetResource(Protocol):
+    """Structural type for the WordNet handle used by this module."""
+    def synsets(self, word: str, pos: str | None = None) -> Sequence[_SynsetProtocol]:
+        """Return the synsets for ``word``, optionally restricted to ``pos``."""
+    def ensure_loaded(self) -> None:
+        """Load the underlying corpus data if that has not happened yet."""
+# Signature of the corpus-reader constructor as this module invokes it: two positional
+# arguments, returning an object that satisfies ``_WordNetResource``.
+WordNetCorpusReaderFactory = Callable[[Any, Any], _WordNetResource]
+# Optional-dependency handle from ``..compat``; presumably ``get()`` yields the imported
+# module or ``None`` and ``error`` the captured import failure - confirm in compat.py.
+nltk: ModuleType | None = _nltk_dependency.get()
+_NLTK_IMPORT_ERROR: ModuleNotFoundError | None = _nltk_dependency.error
+# These three stay ``None`` unless the guarded imports below succeed.
+WordNetCorpusReader: WordNetCorpusReaderFactory | None = None
+find: Callable[[str], Any] | None = None
+_WORDNET_MODULE: _WordNetResource | None = None
+if nltk is not None:  # pragma: no cover - guarded by import success
+    try:
+        corpus_reader_module = import_module("nltk.corpus.reader")
+    except ModuleNotFoundError as exc:  # pragma: no cover - triggered when corpus missing
+        # Record only the first failure so an earlier import error is never overwritten.
+        if _NLTK_IMPORT_ERROR is None:
+            _NLTK_IMPORT_ERROR = exc
+    else:
+        reader_candidate = getattr(corpus_reader_module, "WordNetCorpusReader", None)
+        if reader_candidate is not None:
+            WordNetCorpusReader = cast(WordNetCorpusReaderFactory, reader_candidate)
+        # NOTE: ``nltk.data`` is only attempted once ``nltk.corpus.reader`` imported, so
+        # ``find`` remains ``None`` whenever the reader module is missing.
+        try:
+            data_module = import_module("nltk.data")
+        except ModuleNotFoundError as exc:  # pragma: no cover - triggered when data missing
+            if _NLTK_IMPORT_ERROR is None:
+                _NLTK_IMPORT_ERROR = exc
+        else:
+            locator = getattr(data_module, "find", None)
+            if callable(locator):
+                find = cast(Callable[[str], Any], locator)
+    # Independently of the reader import, try to pick up ``nltk.corpus.wordnet`` itself.
+    try:
+        module_candidate = import_module("nltk.corpus.wordnet")
+    except ModuleNotFoundError:  # pragma: no cover - only hit on namespace packages
+        _WORDNET_MODULE = None
+    else:
+        _WORDNET_MODULE = cast(_WordNetResource, module_candidate)
+else:
+    # No import was attempted; these assignments restate the defaults set above.
+    nltk = None
+    find = None
+    _WORDNET_MODULE = None
+# Handle handed out by ``_wordnet()``; starts as the imported module, which may be ``None``.
+_WORDNET_HANDLE: _WordNetResource | None = _WORDNET_MODULE
+# Flipped to ``True`` by ``ensure_wordnet()`` after a successful load check.
+_wordnet_ready = False
+# Part-of-speech tags that ``WordNetLexicon`` accepts; anything else yields no synonyms.
+_VALID_POS: tuple[str, ...] = ("n", "v", "a", "r")
+def _require_nltk() -> None:
+    """Abort unless both the NLTK module and its data locator were imported.
+    Raises:
+        RuntimeError: When ``nltk`` or ``find`` is ``None``; chained to the recorded
+            import error when one is available.
+    """
+    if not (nltk is None or find is None):
+        return
+    message = (
+        "The NLTK package is required for WordNet-backed lexicons; install "
+        "`nltk` and its WordNet corpus manually to enable this backend."
+    )
+    # Defensive lookup: tolerate the global having been removed from the module.
+    cause = globals().get("_NLTK_IMPORT_ERROR")
+    if cause is None:
+        raise RuntimeError(message)
+    raise RuntimeError(message) from cause
+def dependencies_available() -> bool:
+    """Report whether both the NLTK module and its ``find`` locator were imported."""
+    return not (nltk is None or find is None)
+def _load_wordnet_reader() -> _WordNetResource:
+    """Build a WordNet corpus reader rooted at the locally installed corpus.
+    The unpacked ``corpora/wordnet`` resource is tried first; the zipped corpus is the
+    fallback.
+    Raises:
+        RuntimeError: When NLTK, the reader class, the data locator, or the corpus
+            itself is unavailable.
+    """
+    _require_nltk()
+    if WordNetCorpusReader is None:
+        raise RuntimeError("The NLTK WordNet corpus reader is unavailable.")
+    lookup = find
+    if lookup is None:
+        raise RuntimeError("The NLTK data locator is unavailable.")
+    try:
+        corpus_root = lookup("corpora/wordnet")
+    except LookupError:
+        # The unpacked corpus is missing; fall back to the zip archive.
+        try:
+            archive = lookup("corpora/wordnet.zip")
+        except LookupError as missing:
+            raise RuntimeError(
+                "The NLTK WordNet corpus is not installed; run `nltk.download('wordnet')`."
+            ) from missing
+        corpus_root = archive.join("wordnet/")
+    return WordNetCorpusReader(corpus_root, None)
+def _wordnet(force_refresh: bool = False) -> _WordNetResource:
+    """Return the cached WordNet handle, loading a corpus reader when none is cached.
+    Args:
+        force_refresh: When true, reset the cache to the imported module handle
+            before resolving.
+    """
+    global _WORDNET_HANDLE
+    if force_refresh:
+        _WORDNET_HANDLE = _WORDNET_MODULE
+    handle = _WORDNET_HANDLE
+    if handle is None:
+        handle = _load_wordnet_reader()
+        _WORDNET_HANDLE = handle
+    return handle
+def ensure_wordnet() -> None:
+    """Verify that the WordNet corpus can be loaded, downloading it once if needed.
+    A successful check is remembered in ``_wordnet_ready`` so later calls return
+    immediately.
+    Raises:
+        RuntimeError: When NLTK is missing or the corpus still fails to load after the
+            download attempt.
+    """
+    global _wordnet_ready
+    if not _wordnet_ready:
+        _require_nltk()
+        handle = _wordnet()
+        toolkit = nltk
+        if toolkit is None:
+            raise RuntimeError("The NLTK dependency is unexpectedly unavailable.")
+        try:
+            handle.ensure_loaded()
+        except LookupError:
+            # First load failed: fetch the corpus, then retry on a refreshed handle.
+            toolkit.download("wordnet", quiet=True)
+            try:
+                handle = _wordnet(force_refresh=True)
+                handle.ensure_loaded()
+            except LookupError as exc:  # pragma: no cover - only triggered when download fails
+                raise RuntimeError(
+                    "Unable to load NLTK WordNet corpus for synonym lookups."
+                ) from exc
+        _wordnet_ready = True
+def _collect_synonyms(word: str, parts_of_speech: tuple[str, ...]) -> list[str]:
+    """Return the sorted synonyms from the first synset that offers any.
+    Parts of speech are tried in the order given and synsets in the order the WordNet
+    handle returns them. Underscores in lemma names become spaces, and lemmas equal
+    to ``word`` (ignoring case) are skipped.
+    Args:
+        word: The word to look up.
+        parts_of_speech: Part-of-speech tags to try, in priority order.
+    Returns:
+        list[str]: Unique synonyms in sorted order, or an empty list when none exist.
+    """
+    target = word.lower()
+    corpus = _wordnet()
+    for tag in parts_of_speech:
+        for synset in corpus.synsets(word, pos=tag) or ():
+            names = (lemma.name().replace("_", " ") for lemma in synset.lemmas())
+            candidates = {name for name in names if name.lower() != target}
+            if candidates:
+                # The first synset with a usable lemma wins; later ones are ignored.
+                return sorted(candidates)
+    return []
+class WordNetLexicon(LexiconBackend):
+    """Synonym backend that reads from the NLTK WordNet corpus."""
+    def get_synonyms(self, word: str, pos: str | None = None, n: int = 5) -> list[str]:
+        """Return up to ``n`` WordNet synonyms for ``word``.
+        Args:
+            word: The word to look up.
+            pos: Optional part-of-speech tag (case-insensitive); an unrecognised tag
+                yields an empty list.
+            n: Maximum number of synonyms to return.
+        """
+        ensure_wordnet()
+        parts: tuple[str, ...] = _VALID_POS
+        if pos is not None:
+            tag = pos.lower()
+            if tag not in _VALID_POS:
+                return []
+            parts = (tag,)
+        candidates = _collect_synonyms(word, parts)
+        return self._deterministic_sample(candidates, limit=n, word=word, pos=pos)
+    def supports_pos(self, pos: str | None) -> bool:
+        """Return ``True`` when ``pos`` is ``None`` or one of the accepted tags."""
+        return pos is None or pos.lower() in _VALID_POS
+    @classmethod
+    def load_cache(cls, path: str | Path) -> CacheSnapshot:
+        """Always raise: this backend keeps no cache that could be loaded."""
+        raise RuntimeError("WordNetLexicon does not persist or load caches.")
+    def save_cache(self, path: str | Path | None = None) -> Path | None:
+        """Always raise: this backend keeps no cache that could be saved."""
+        raise RuntimeError("WordNetLexicon does not persist or load caches.")
+    def __repr__(self) -> str:  # pragma: no cover - trivial representation
+        return f"WordNetLexicon(seed={self.seed!r})"
+# Public API of this module; the underscore-prefixed helpers stay private.
+__all__ = ["WordNetLexicon", "dependencies_available", "ensure_wordnet"]

glitchlings/main.py ADDED Viewed

@@ -0,0 +1,364 @@
+"""Command line interface for summoning and running glitchlings."""
+from __future__ import annotations
+import argparse
+import difflib
+import sys
+from collections.abc import Sequence
+from pathlib import Path
+from typing import cast
+from . import SAMPLE_TEXT
+from .config import DEFAULT_ATTACK_SEED, build_gaggle, load_attack_config
+from .zoo import (
+    BUILTIN_GLITCHLINGS,
+    DEFAULT_GLITCHLING_NAMES,
+    Gaggle,
+    Glitchling,
+    parse_glitchling_spec,
+    summon,
+)
+# Column width used to right-align names in ``list_glitchlings``. ``default=0`` keeps the
+# import from failing with ``ValueError`` should the built-in registry ever be empty.
+MAX_NAME_WIDTH = max(
+    (len(glitchling.name) for glitchling in BUILTIN_GLITCHLINGS.values()),
+    default=0,
+)
+def build_parser() -> argparse.ArgumentParser:
+    """Create and configure the CLI argument parser.
+    The ``--seed`` help text is derived from ``DEFAULT_ATTACK_SEED`` so it cannot drift
+    from the fallback that is applied when no seed is given.
+    Returns:
+        argparse.ArgumentParser: The configured argument parser instance.
+    """
+    parser = argparse.ArgumentParser(
+        description=(
+            "Summon glitchlings to corrupt text. Provide input text as an argument, "
+            "via --file, or pipe it on stdin."
+        )
+    )
+    parser.add_argument(
+        "text",
+        nargs="?",
+        help="Text to corrupt. If omitted, stdin is used or --sample provides fallback text.",
+    )
+    parser.add_argument(
+        "-g",
+        "--glitchling",
+        dest="glitchlings",
+        action="append",
+        metavar="SPEC",
+        help=(
+            "Glitchling to apply, optionally with parameters like "
+            "Typogre(rate=0.05). Repeat for multiples; defaults to all built-ins."
+        ),
+    )
+    parser.add_argument(
+        "-s",
+        "--seed",
+        type=int,
+        # Stay ``None`` here so callers can tell "not given" apart from an explicit seed.
+        default=None,
+        help=(
+            "Seed controlling deterministic corruption order "
+            f"(default: {DEFAULT_ATTACK_SEED})."
+        ),
+    )
+    parser.add_argument(
+        "-f",
+        "--file",
+        type=Path,
+        help="Read input text from a file instead of the command line argument.",
+    )
+    parser.add_argument(
+        "--sample",
+        action="store_true",
+        help="Use the included SAMPLE_TEXT when no other input is provided.",
+    )
+    parser.add_argument(
+        "--diff",
+        action="store_true",
+        help="Show a unified diff between the original and corrupted text.",
+    )
+    parser.add_argument(
+        "--list",
+        action="store_true",
+        help="List available glitchlings and exit.",
+    )
+    parser.add_argument(
+        "-c",
+        "--config",
+        type=Path,
+        help="Load glitchlings from a YAML configuration file.",
+    )
+    return parser
+def build_lexicon_parser() -> argparse.ArgumentParser:
+    """Build the parser for the ``build-lexicon`` subcommand.
+    Returns:
+        argparse.ArgumentParser: Parser exposing the vector-cache generation options.
+    """
+    parser = argparse.ArgumentParser(
+        prog="glitchlings build-lexicon",
+        description=(
+            "Generate deterministic synonym caches using vector embeddings "
+            "so they can be distributed without bundling large models."
+        ),
+    )
+    # Required inputs: where the vectors come from and where the cache is written.
+    parser.add_argument(
+        "--source",
+        required=True,
+        help=(
+            "Vector source specification. Use 'spacy:<model>' for spaCy pipelines or "
+            "provide a path to a gensim KeyedVectors/word2vec file."
+        ),
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        required=True,
+        help="Path to the JSON file that will receive the synonym cache.",
+    )
+    # Optional tuning knobs.
+    parser.add_argument(
+        "--tokens",
+        type=Path,
+        help="Optional newline-delimited vocabulary file to restrict generation.",
+    )
+    parser.add_argument(
+        "--max-neighbors",
+        default=50,
+        type=int,
+        help="Number of nearest neighbours to cache per token (default: 50).",
+    )
+    parser.add_argument(
+        "--min-similarity",
+        default=0.0,
+        type=float,
+        help="Minimum cosine similarity required to keep a synonym (default: 0.0).",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        help="Optional deterministic seed to bake into the resulting cache.",
+    )
+    parser.add_argument(
+        "--case-sensitive",
+        action="store_true",
+        help="Preserve original casing instead of lower-casing cache keys.",
+    )
+    parser.add_argument(
+        "--normalizer",
+        default="lower",
+        choices=["lower", "identity"],
+        help="Token normalization strategy for cache keys (default: lower).",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        help="Optional maximum number of tokens to process.",
+    )
+    parser.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Allow overwriting an existing cache file.",
+    )
+    return parser
+def list_glitchlings() -> None:
+    """Print one line per default glitchling: right-aligned name, scope, and order."""
+    for registry_key in DEFAULT_GLITCHLING_NAMES:
+        entry = BUILTIN_GLITCHLINGS[registry_key]
+        label = entry.name
+        scope_label = entry.level.name.title()
+        order_label = entry.order.name.lower()
+        print(f"{label:>{MAX_NAME_WIDTH}} — scope: {scope_label}, order: {order_label}")
+def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
+    """Resolve the input text based on CLI arguments.
+    Sources are tried in priority order: ``--file``, the positional ``text`` argument,
+    piped stdin, and finally ``--sample``.
+    Args:
+        args: Parsed arguments from the CLI.
+        parser: The argument parser used for emitting user-facing errors.
+    Returns:
+        str: The text to corrupt.
+    Raises:
+        SystemExit: Raised indirectly via ``parser.error`` on failure.
+    """
+    file_path = cast(Path | None, getattr(args, "file", None))
+    if file_path is not None:
+        try:
+            return file_path.read_text(encoding="utf-8")
+        except (OSError, UnicodeDecodeError) as exc:
+            # Undecodable files raise ``UnicodeDecodeError`` (a ``ValueError``), which has
+            # neither ``filename`` nor ``strerror``; report it like any other read failure.
+            filename = getattr(exc, "filename", None) or file_path
+            reason = getattr(exc, "strerror", None) or str(exc)
+            parser.error(f"Failed to read file {filename}: {reason}")
+            raise AssertionError("parser.error should exit")
+    text_argument = cast(str | None, getattr(args, "text", None))
+    if text_argument:
+        return text_argument
+    use_sample = bool(getattr(args, "sample", False))
+    if not sys.stdin.isatty():
+        piped = sys.stdin.read()
+        # An empty non-interactive stdin (cron, CI, ``< /dev/null``) is not real input and
+        # must not shadow an explicit ``--sample``; without ``--sample`` the empty text is
+        # still returned as before.
+        if piped or not use_sample:
+            return piped
+    if use_sample:
+        return SAMPLE_TEXT
+    parser.error(
+        "No input text provided. Supply text as an argument, use --file, pipe input, or "
+        "pass --sample."
+    )
+    raise AssertionError("parser.error should exit")
+def summon_glitchlings(
+    names: list[str] | None,
+    parser: argparse.ArgumentParser,
+    seed: int | None,
+    *,
+    config_path: Path | None = None,
+) -> Gaggle:
+    """Build the ``Gaggle`` requested on the command line.
+    Args:
+        names: Glitchling specifications from ``--glitchling``, if any.
+        parser: Argument parser used to report user-facing errors.
+        seed: Explicit seed, or ``None`` to fall back to the default.
+        config_path: Optional attack configuration file; mutually exclusive with
+            ``names``.
+    Returns:
+        Gaggle: The assembled glitchlings.
+    """
+    if config_path is not None:
+        if names:
+            parser.error("Cannot combine --config with --glitchling.")
+            raise AssertionError("parser.error should exit")
+        try:
+            attack_config = load_attack_config(config_path)
+        except (TypeError, ValueError) as exc:
+            parser.error(str(exc))
+            raise AssertionError("parser.error should exit")
+        else:
+            return build_gaggle(attack_config, seed_override=seed)
+    requested: list[str | Glitchling]
+    if not names:
+        # Nothing requested explicitly: fall back to every default glitchling.
+        requested = list(DEFAULT_GLITCHLING_NAMES)
+    else:
+        requested = []
+        for spec in names:
+            try:
+                requested.append(parse_glitchling_spec(spec))
+            except ValueError as exc:
+                parser.error(str(exc))
+                raise AssertionError("parser.error should exit")
+    effective_seed = DEFAULT_ATTACK_SEED if seed is None else seed
+    try:
+        return summon(list(requested), seed=effective_seed)
+    except ValueError as exc:
+        parser.error(str(exc))
+        raise AssertionError("parser.error should exit")
+def show_diff(original: str, corrupted: str) -> None:
+    """Display a unified diff between the original and corrupted text.
+    Line endings are kept while comparing, so a change that only touches a trailing
+    newline is still reported, and stripped while printing, so every diff line occupies
+    exactly one output line.
+    Args:
+        original: Text before corruption.
+        corrupted: Text after corruption.
+    """
+    diff_lines = list(
+        difflib.unified_diff(
+            original.splitlines(keepends=True),
+            corrupted.splitlines(keepends=True),
+            fromfile="original",
+            tofile="corrupted",
+            lineterm="",
+        )
+    )
+    if not diff_lines:
+        print("No changes detected.")
+        return
+    for line in diff_lines:
+        # Context and change lines still carry their own terminator, and ``print`` adds
+        # one as well; strip it so no blank line follows each of them.
+        print(line.rstrip("\r\n"))
+def run_cli(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
+    """Run the main CLI workflow for already-parsed arguments.
+    Args:
+        args: Parsed CLI arguments.
+        parser: Argument parser used for error reporting.
+    Returns:
+        int: Exit code for the process (``0`` on success).
+    Raises:
+        TypeError: When the gaggle returns something other than a string.
+    """
+    if args.list:
+        list_glitchlings()
+        return 0
+    source_text = read_text(args, parser)
+    gaggle = summon_glitchlings(args.glitchlings, parser, args.seed, config_path=args.config)
+    result = gaggle.corrupt(source_text)
+    if not isinstance(result, str):
+        raise TypeError("Gaggle returned non-string output for string input")
+    # Either show what changed or emit the corrupted text itself.
+    if args.diff:
+        show_diff(source_text, result)
+    else:
+        print(result)
+    return 0
+def run_build_lexicon(args: argparse.Namespace) -> int:
+    """Translate parsed ``build-lexicon`` options into an argv list for the vector builder.
+    Args:
+        args: Namespace produced by ``build_lexicon_parser``.
+    Returns:
+        int: Whatever the vector lexicon ``main`` returns.
+    """
+    from glitchlings.lexicon.vector import main as vector_main
+    # Options that are always forwarded.
+    forwarded = ["--source", args.source]
+    forwarded += ["--output", str(args.output)]
+    forwarded += ["--max-neighbors", str(args.max_neighbors)]
+    forwarded += ["--min-similarity", str(args.min_similarity)]
+    forwarded += ["--normalizer", args.normalizer]
+    # Options forwarded only when the user supplied them.
+    if args.tokens is not None:
+        forwarded += ["--tokens", str(args.tokens)]
+    if args.seed is not None:
+        forwarded += ["--seed", str(args.seed)]
+    if args.case_sensitive:
+        forwarded += ["--case-sensitive"]
+    if args.limit is not None:
+        forwarded += ["--limit", str(args.limit)]
+    if args.overwrite:
+        forwarded += ["--overwrite"]
+    return vector_main(forwarded)
+def main(argv: list[str] | None = None) -> int:
+    """Run the ``glitchlings`` command line interface.
+    Args:
+        argv: Optional list of command line arguments. Defaults to ``sys.argv``.
+    Returns:
+        int: Exit code suitable for use with ``sys.exit``.
+    """
+    raw_args = sys.argv[1:] if argv is None else list(argv)
+    # ``build-lexicon`` is dispatched by hand as a leading token, not via subparsers.
+    if raw_args[:1] == ["build-lexicon"]:
+        lexicon_args = build_lexicon_parser().parse_args(raw_args[1:])
+        return run_build_lexicon(lexicon_args)
+    parser = build_parser()
+    return run_cli(parser.parse_args(raw_args), parser)
+# Script entry point: run the CLI and hand its return value to ``sys.exit``.
+if __name__ == "__main__":
+    sys.exit(main())