glitchlings-0.3.0-cp312-cp312-macosx_11_0_universal2.whl → glitchlings-0.4.0-cp312-cp312-macosx_11_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +4 -0
- glitchlings/_zoo_rust.cpython-312-darwin.so +0 -0
- glitchlings/config.py +258 -0
- glitchlings/config.toml +3 -0
- glitchlings/lexicon/__init__.py +191 -0
- glitchlings/lexicon/data/default_vector_cache.json +16 -0
- glitchlings/lexicon/graph.py +303 -0
- glitchlings/lexicon/metrics.py +169 -0
- glitchlings/lexicon/vector.py +610 -0
- glitchlings/lexicon/wordnet.py +182 -0
- glitchlings/main.py +145 -5
- glitchlings/zoo/__init__.py +15 -0
- glitchlings/zoo/_sampling.py +55 -0
- glitchlings/zoo/_text_utils.py +62 -0
- glitchlings/zoo/jargoyle.py +190 -200
- glitchlings/zoo/redactyl.py +26 -54
- glitchlings/zoo/reduple.py +10 -21
- glitchlings/zoo/rushmore.py +15 -21
- glitchlings/zoo/typogre.py +22 -1
- glitchlings/zoo/zeedub.py +40 -1
- {glitchlings-0.3.0.dist-info → glitchlings-0.4.0.dist-info}/METADATA +30 -8
- glitchlings-0.4.0.dist-info/RECORD +38 -0
- glitchlings-0.3.0.dist-info/RECORD +0 -29
- {glitchlings-0.3.0.dist-info → glitchlings-0.4.0.dist-info}/WHEEL +0 -0
- {glitchlings-0.3.0.dist-info → glitchlings-0.4.0.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.3.0.dist-info → glitchlings-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.3.0.dist-info → glitchlings-0.4.0.dist-info}/top_level.txt +0 -0
glitchlings/lexicon/wordnet.py
NEW

@@ -0,0 +1,182 @@
+"""WordNet-backed lexicon implementation."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+try:  # pragma: no cover - exercised when NLTK is available
+    import nltk  # type: ignore[import]
+except ModuleNotFoundError as exc:  # pragma: no cover - triggered when NLTK missing
+    nltk = None  # type: ignore[assignment]
+    find = None  # type: ignore[assignment]
+    _NLTK_IMPORT_ERROR = exc
+else:  # pragma: no cover - executed when NLTK is present
+    from nltk.corpus.reader import WordNetCorpusReader as _WordNetCorpusReader  # type: ignore[import]
+    from nltk.data import find as _nltk_find  # type: ignore[import]
+
+    find = _nltk_find
+    _NLTK_IMPORT_ERROR = None
+
+if TYPE_CHECKING:  # pragma: no cover - typing aid only
+    from nltk.corpus.reader import WordNetCorpusReader  # type: ignore[import]
+else:  # pragma: no cover - runtime fallback to avoid hard dependency
+    WordNetCorpusReader = Any
+
+if nltk is not None:  # pragma: no cover - guarded by import success
+    try:
+        from nltk.corpus import wordnet as _WORDNET_MODULE  # type: ignore[import]
+    except ModuleNotFoundError:  # pragma: no cover - only hit on namespace packages
+        _WORDNET_MODULE = None
+    else:
+        WordNetCorpusReader = _WordNetCorpusReader  # type: ignore[assignment]
+else:
+    _WORDNET_MODULE = None
+
+from . import Lexicon
+
+_WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
+_wordnet_ready = False
+
+_VALID_POS: tuple[str, ...] = ("n", "v", "a", "r")
+
+
+def _require_nltk() -> None:
+    """Ensure the NLTK dependency is present before continuing."""
+
+    if nltk is None or find is None:
+        message = (
+            "The NLTK package is required for WordNet-backed lexicons; install "
+            "`nltk` and its WordNet corpus manually to enable this backend."
+        )
+        if '_NLTK_IMPORT_ERROR' in globals() and _NLTK_IMPORT_ERROR is not None:
+            raise RuntimeError(message) from _NLTK_IMPORT_ERROR
+        raise RuntimeError(message)
+
+
+def dependencies_available() -> bool:
+    """Return ``True`` when the runtime NLTK dependency is present."""
+
+    return nltk is not None and find is not None
+
+
+def _load_wordnet_reader() -> WordNetCorpusReader:
+    """Return a WordNet corpus reader from the downloaded corpus files."""
+
+    _require_nltk()
+
+    try:
+        root = find("corpora/wordnet")
+    except LookupError:
+        try:
+            zip_root = find("corpora/wordnet.zip")
+        except LookupError as exc:
+            raise RuntimeError(
+                "The NLTK WordNet corpus is not installed; run `nltk.download('wordnet')`."
+            ) from exc
+        root = zip_root.join("wordnet/")
+
+    return WordNetCorpusReader(root, None)
+
+
+def _wordnet(force_refresh: bool = False) -> WordNetCorpusReader | Any:
+    """Retrieve the active WordNet handle, rebuilding it on demand."""
+
+    global _WORDNET_HANDLE
+
+    if force_refresh:
+        _WORDNET_HANDLE = _WORDNET_MODULE
+
+    if _WORDNET_HANDLE is not None:
+        return _WORDNET_HANDLE
+
+    _WORDNET_HANDLE = _load_wordnet_reader()
+    return _WORDNET_HANDLE
+
+
+def ensure_wordnet() -> None:
+    """Ensure the WordNet corpus is available before use."""
+
+    global _wordnet_ready
+    if _wordnet_ready:
+        return
+
+    _require_nltk()
+
+    resource = _wordnet()
+
+    try:
+        resource.ensure_loaded()
+    except LookupError:
+        nltk.download("wordnet", quiet=True)
+        try:
+            resource = _wordnet(force_refresh=True)
+            resource.ensure_loaded()
+        except LookupError as exc:  # pragma: no cover - only triggered when download fails
+            raise RuntimeError(
+                "Unable to load NLTK WordNet corpus for synonym lookups."
+            ) from exc
+
+    _wordnet_ready = True
+
+
+def _collect_synonyms(word: str, parts_of_speech: tuple[str, ...]) -> list[str]:
+    """Gather deterministic synonym candidates for the supplied word."""
+
+    normalized_word = word.lower()
+    wordnet = _wordnet()
+    synonyms: set[str] = set()
+    for pos_tag in parts_of_speech:
+        synsets = wordnet.synsets(word, pos=pos_tag)
+        if not synsets:
+            continue
+
+        for synset in synsets:
+            lemmas_list = [lemma.name() for lemma in synset.lemmas()]
+            if not lemmas_list:
+                continue
+
+            filtered = []
+            for lemma_str in lemmas_list:
+                cleaned = lemma_str.replace("_", " ")
+                if cleaned.lower() != normalized_word:
+                    filtered.append(cleaned)
+
+            if filtered:
+                synonyms.update(filtered)
+                break
+
+        if synonyms:
+            break
+
+    return sorted(synonyms)
+
+
+class WordNetLexicon(Lexicon):
+    """Lexicon that retrieves synonyms from the NLTK WordNet corpus."""
+
+    def get_synonyms(
+        self, word: str, pos: str | None = None, n: int = 5
+    ) -> list[str]:
+        ensure_wordnet()
+
+        if pos is None:
+            parts: tuple[str, ...] = _VALID_POS
+        else:
+            normalized_pos = pos.lower()
+            if normalized_pos not in _VALID_POS:
+                return []
+            parts = (normalized_pos,)
+
+        synonyms = _collect_synonyms(word, parts)
+        return self._deterministic_sample(synonyms, limit=n, word=word, pos=pos)
+
+    def supports_pos(self, pos: str | None) -> bool:
+        if pos is None:
+            return True
+        return pos.lower() in _VALID_POS
+
+    def __repr__(self) -> str:  # pragma: no cover - trivial representation
+        return f"WordNetLexicon(seed={self.seed!r})"
+
+
+__all__ = ["WordNetLexicon", "dependencies_available", "ensure_wordnet"]
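For orientation, a minimal usage sketch of the new backend (not part of the diff). It assumes the `Lexicon` base class from `glitchlings/lexicon/__init__.py`, which is not shown here, accepts a `seed` argument, as the `__repr__` above suggests, and that `nltk` plus its WordNet corpus are installed.

from glitchlings.lexicon.wordnet import WordNetLexicon, dependencies_available

if dependencies_available():
    lexicon = WordNetLexicon(seed=151)  # assumed constructor; the base class is not in this diff
    # Valid POS tags are "n", "v", "a", and "r" (_VALID_POS); anything else returns [].
    print(lexicon.get_synonyms("quick", pos="a", n=3))
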
glitchlings/main.py
CHANGED
@@ -8,6 +8,7 @@ from pathlib import Path
 import sys
 
 from . import SAMPLE_TEXT
+from .config import DEFAULT_ATTACK_SEED, build_gaggle, load_attack_config
 from .zoo import (
     Glitchling,
     Gaggle,
@@ -53,7 +54,7 @@ def build_parser() -> argparse.ArgumentParser:
         "-s",
         "--seed",
         type=int,
-        default=151,
+        default=None,
         help="Seed controlling deterministic corruption order (default: 151).",
     )
     parser.add_argument(
@@ -77,9 +78,83 @@ def build_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="List available glitchlings and exit.",
     )
+    parser.add_argument(
+        "-c",
+        "--config",
+        type=Path,
+        help="Load glitchlings from a YAML configuration file.",
+    )
     return parser
 
 
+def build_lexicon_parser() -> argparse.ArgumentParser:
+    builder = argparse.ArgumentParser(
+        prog="glitchlings build-lexicon",
+        description=(
+            "Generate deterministic synonym caches using vector embeddings so "
+            "they can be distributed without bundling large models."
+        ),
+    )
+    builder.add_argument(
+        "--source",
+        required=True,
+        help=(
+            "Vector source specification. Use 'spacy:<model>' for spaCy pipelines "
+            "or provide a path to a gensim KeyedVectors/word2vec file."
+        ),
+    )
+    builder.add_argument(
+        "--output",
+        required=True,
+        type=Path,
+        help="Path to the JSON file that will receive the synonym cache.",
+    )
+    builder.add_argument(
+        "--tokens",
+        type=Path,
+        help="Optional newline-delimited vocabulary file to restrict generation.",
+    )
+    builder.add_argument(
+        "--max-neighbors",
+        type=int,
+        default=50,
+        help="Number of nearest neighbours to cache per token (default: 50).",
+    )
+    builder.add_argument(
+        "--min-similarity",
+        type=float,
+        default=0.0,
+        help="Minimum cosine similarity required to keep a synonym (default: 0.0).",
+    )
+    builder.add_argument(
+        "--seed",
+        type=int,
+        help="Optional deterministic seed to bake into the resulting cache.",
+    )
+    builder.add_argument(
+        "--case-sensitive",
+        action="store_true",
+        help="Preserve original casing instead of lower-casing cache keys.",
+    )
+    builder.add_argument(
+        "--normalizer",
+        choices=["lower", "identity"],
+        default="lower",
+        help="Token normalization strategy for cache keys (default: lower).",
+    )
+    builder.add_argument(
+        "--limit",
+        type=int,
+        help="Optional maximum number of tokens to process.",
+    )
+    builder.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Allow overwriting an existing cache file.",
+    )
+    return builder
+
+
 def list_glitchlings() -> None:
     """Print information about the available built-in glitchlings."""
 
@@ -129,10 +204,27 @@ def read_text(args: argparse.Namespace, parser: argparse.ArgumentParser) -> str:
 
 
 def summon_glitchlings(
-    names: list[str] | None,
+    names: list[str] | None,
+    parser: argparse.ArgumentParser,
+    seed: int | None,
+    *,
+    config_path: Path | None = None,
 ) -> Gaggle:
     """Instantiate the requested glitchlings and bundle them in a ``Gaggle``."""
 
+    if config_path is not None:
+        if names:
+            parser.error("Cannot combine --config with --glitchling.")
+            raise AssertionError("parser.error should exit")
+
+        try:
+            config = load_attack_config(config_path)
+        except (TypeError, ValueError) as exc:
+            parser.error(str(exc))
+            raise AssertionError("parser.error should exit")
+
+        return build_gaggle(config, seed_override=seed)
+
     if names:
         normalized: list[str | Glitchling] = []
         for specification in names:
@@ -144,8 +236,10 @@ def summon_glitchlings(
     else:
        normalized = DEFAULT_GLITCHLING_NAMES
 
+    effective_seed = seed if seed is not None else DEFAULT_ATTACK_SEED
+
     try:
-        return summon(normalized, seed=seed)
+        return summon(normalized, seed=effective_seed)
     except ValueError as exc:
         parser.error(str(exc))
         raise AssertionError("parser.error should exit")
@@ -187,7 +281,12 @@ def run_cli(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
         return 0
 
     text = read_text(args, parser)
-    gaggle = summon_glitchlings(args.glitchlings, parser, args.seed)
+    gaggle = summon_glitchlings(
+        args.glitchlings,
+        parser,
+        args.seed,
+        config_path=args.config,
+    )
 
     corrupted = gaggle(text)
 
@@ -199,6 +298,37 @@ def run_cli(args: argparse.Namespace, parser: argparse.ArgumentParser) -> int:
     return 0
 
 
+def run_build_lexicon(args: argparse.Namespace) -> int:
+    """Delegate to the vector lexicon cache builder using CLI arguments."""
+
+    from glitchlings.lexicon.vector import main as vector_main
+
+    vector_args = [
+        "--source",
+        args.source,
+        "--output",
+        str(args.output),
+        "--max-neighbors",
+        str(args.max_neighbors),
+        "--min-similarity",
+        str(args.min_similarity),
+        "--normalizer",
+        args.normalizer,
+    ]
+    if args.tokens is not None:
+        vector_args.extend(["--tokens", str(args.tokens)])
+    if args.seed is not None:
+        vector_args.extend(["--seed", str(args.seed)])
+    if args.case_sensitive:
+        vector_args.append("--case-sensitive")
+    if args.limit is not None:
+        vector_args.extend(["--limit", str(args.limit)])
+    if args.overwrite:
+        vector_args.append("--overwrite")
+
+    return vector_main(vector_args)
+
+
 def main(argv: list[str] | None = None) -> int:
     """Entry point for the ``glitchlings`` command line interface.
 
@@ -209,8 +339,18 @@ def main(argv: list[str] | None = None) -> int:
         int: Exit code suitable for use with ``sys.exit``.
     """
 
+    if argv is None:
+        raw_args = sys.argv[1:]
+    else:
+        raw_args = list(argv)
+
+    if raw_args and raw_args[0] == "build-lexicon":
+        builder = build_lexicon_parser()
+        args = builder.parse_args(raw_args[1:])
+        return run_build_lexicon(args)
+
     parser = build_parser()
-    args = parser.parse_args(argv)
+    args = parser.parse_args(raw_args)
    return run_cli(args, parser)
 
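A minimal sketch of the new subcommand dispatch (not part of the diff), driving `main` with an explicit argv list. The spaCy model name and output path are placeholder values, and the optional dependencies of `glitchlings.lexicon.vector` must be installed.

import sys

from glitchlings.main import main

sys.exit(main([
    "build-lexicon",
    "--source", "spacy:en_core_web_md",  # placeholder model name
    "--output", "synonym_cache.json",  # placeholder output path
    "--max-neighbors", "25",
    "--overwrite",
]))
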
glitchlings/zoo/__init__.py
CHANGED
@@ -39,6 +39,7 @@ __all__ = [
     "BUILTIN_GLITCHLINGS",
     "DEFAULT_GLITCHLING_NAMES",
     "parse_glitchling_spec",
+    "get_glitchling_class",
 ]
 
 _HAS_JARGOYLE = _jargoyle_available()
@@ -125,6 +126,20 @@ def parse_glitchling_spec(specification: str) -> Glitchling:
         raise ValueError(f"Failed to instantiate glitchling '{name}': {exc}") from exc
 
 
+def get_glitchling_class(name: str) -> type[Glitchling]:
+    """Look up the glitchling class registered under ``name``."""
+
+    key = name.strip().lower()
+    if not key:
+        raise ValueError("Glitchling name cannot be empty.")
+
+    glitchling_type = _BUILTIN_GLITCHLING_TYPES.get(key)
+    if glitchling_type is None:
+        raise ValueError(f"Glitchling '{name}' not found.")
+
+    return glitchling_type
+
+
 def summon(glitchlings: list[str | Glitchling], seed: int = 151) -> Gaggle:
     """Summon glitchlings by name (using defaults) or instance (to change parameters)."""
 
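A short sketch of the new lookup helper (not part of the diff). The name "typogre" is an assumption based on the module list at the top of this page, and default construction is assumed to work because `summon` accepts bare names.

from glitchlings.zoo import get_glitchling_class

cls = get_glitchling_class(" Typogre ")  # leading/trailing whitespace and case are normalised
glitchling = cls()  # assumption: built-ins construct with default parameters
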
glitchlings/zoo/_sampling.py
NEW

@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+import random
+from typing import Sequence
+
+
+def weighted_sample_without_replacement(
+    population: Sequence[int],
+    weights: Sequence[float],
+    *,
+    k: int,
+    rng: random.Random,
+) -> list[int]:
+    """Sample ``k`` unique indices from ``population`` using ``weights``.
+
+    Mirrors the behaviour used by several glitchlings while centralising error
+    handling and RNG interactions so the Python and Rust implementations remain
+    aligned.
+    """
+
+    if k < 0:
+        raise ValueError("Sample size cannot be negative")
+
+    if len(population) != len(weights):
+        raise ValueError("Population and weight sequences must be the same length")
+
+    items = list(zip(population, weights))
+    count = len(items)
+    if k == 0 or count == 0:
+        return []
+
+    if k > count:
+        raise ValueError("Sample larger than population or is negative")
+
+    selections: list[int] = []
+    for _ in range(k):
+        total_weight = sum(weight for _, weight in items)
+        if total_weight <= 0.0:
+            chosen_index = rng.randrange(len(items))
+        else:
+            threshold = rng.random() * total_weight
+            cumulative = 0.0
+            chosen_index = len(items) - 1
+            for idx, (_, weight) in enumerate(items):
+                cumulative += weight
+                if cumulative >= threshold:
+                    chosen_index = idx
+                    break
+        value, _ = items.pop(chosen_index)
+        selections.append(value)
+
+    return selections
+
+
+__all__ = ["weighted_sample_without_replacement"]
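A deterministic usage sketch (not part of the diff): a fixed seed reproduces the same selection order, and a non-positive weight total falls back to a uniform pick.

import random

from glitchlings.zoo._sampling import weighted_sample_without_replacement

rng = random.Random(151)
picked = weighted_sample_without_replacement(
    [0, 1, 2, 3],  # population of indices
    [0.1, 0.4, 0.4, 0.1],  # per-index selection weights
    k=2,
    rng=rng,
)
print(picked)  # two distinct indices, biased toward 1 and 2
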
glitchlings/zoo/_text_utils.py
CHANGED
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import re
+from dataclasses import dataclass
+from typing import Sequence
 
 _WORD_SPLIT_PATTERN = re.compile(r"(\s+)")
 _TOKEN_EDGES_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$")
@@ -35,8 +37,68 @@ def token_core_length(token: str) -> int:
     return length
 
 
+@dataclass(frozen=True)
+class WordToken:
+    """Metadata describing a non-whitespace token yielded by word splitters."""
+
+    index: int
+    prefix: str
+    core: str
+    suffix: str
+    core_length: int
+
+    @property
+    def has_core(self) -> bool:
+        """Return ``True`` when the token contains at least one core character."""
+
+        return bool(self.core)
+
+
+def collect_word_tokens(
+    tokens: Sequence[str],
+    *,
+    skip_first_word: bool = False,
+) -> list[WordToken]:
+    """Return structured metadata for non-whitespace tokens within ``tokens``.
+
+    Args:
+        tokens: Token sequence produced by :func:`split_preserving_whitespace`.
+        skip_first_word: Exclude the first candidate token (used by Rushmore to
+            preserve leading words).
+    """
+
+    start = 2 if skip_first_word else 0
+    collected: list[WordToken] = []
+    for index in range(start, len(tokens), 2):
+        token = tokens[index]
+        if not token or token.isspace():
+            continue
+
+        prefix, core, suffix = split_token_edges(token)
+        core_length = len(core)
+        if core_length <= 0:
+            stripped = token.strip()
+            core_length = len(stripped) if stripped else len(token)
+            if core_length <= 0:
+                core_length = 1
+
+        collected.append(
+            WordToken(
+                index=index,
+                prefix=prefix,
+                core=core,
+                suffix=suffix,
+                core_length=core_length,
+            )
+        )
+
+    return collected
+
+
 __all__ = [
     "split_preserving_whitespace",
     "split_token_edges",
     "token_core_length",
+    "WordToken",
+    "collect_word_tokens",
 ]
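A sketch of the new helper (not part of the diff). It assumes `split_preserving_whitespace` returns the capturing `re.split` of its input, as `_WORD_SPLIT_PATTERN` suggests, which is why `collect_word_tokens` visits only even indices: words and whitespace alternate in the token list.

from glitchlings.zoo._text_utils import collect_word_tokens, split_preserving_whitespace

tokens = split_preserving_whitespace("Hello, glitchling world!")
for word in collect_word_tokens(tokens, skip_first_word=True):
    # skip_first_word=True leaves "Hello," untouched, mirroring Rushmore.
    print(word.index, word.prefix, word.core, word.suffix, word.core_length)
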