PyPI - glitchlings - Versions diffs - 0.2.5__cp312-cp312-win_amd64.whl → 0.9.3__cp312-cp312-win_amd64.whl - Mend

glitchlings 0.2.5__cp312-cp312-win_amd64.whl → 0.9.3__cp312-cp312-win_amd64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (85) hide show

glitchlings/__init__.py +36 -17
glitchlings/__main__.py +0 -1
glitchlings/_zoo_rust/__init__.py +12 -0
glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
glitchlings/assets/__init__.py +180 -0
glitchlings/assets/apostrofae_pairs.json +32 -0
glitchlings/assets/ekkokin_homophones.json +2014 -0
glitchlings/assets/hokey_assets.json +193 -0
glitchlings/assets/lexemes/academic.json +1049 -0
glitchlings/assets/lexemes/colors.json +1333 -0
glitchlings/assets/lexemes/corporate.json +716 -0
glitchlings/assets/lexemes/cyberpunk.json +22 -0
glitchlings/assets/lexemes/lovecraftian.json +23 -0
glitchlings/assets/lexemes/synonyms.json +3354 -0
glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
glitchlings/assets/pipeline_assets.json +29 -0
glitchlings/attack/__init__.py +53 -0
glitchlings/attack/compose.py +299 -0
glitchlings/attack/core.py +465 -0
glitchlings/attack/encode.py +114 -0
glitchlings/attack/metrics.py +104 -0
glitchlings/attack/metrics_dispatch.py +70 -0
glitchlings/attack/tokenization.py +157 -0
glitchlings/auggie.py +283 -0
glitchlings/compat/__init__.py +9 -0
glitchlings/compat/loaders.py +355 -0
glitchlings/compat/types.py +41 -0
glitchlings/conf/__init__.py +41 -0
glitchlings/conf/loaders.py +331 -0
glitchlings/conf/schema.py +156 -0
glitchlings/conf/types.py +72 -0
glitchlings/config.toml +2 -0
glitchlings/constants.py +59 -0
glitchlings/dev/__init__.py +3 -0
glitchlings/dev/docs.py +45 -0
glitchlings/dlc/__init__.py +17 -3
glitchlings/dlc/_shared.py +296 -0
glitchlings/dlc/gutenberg.py +400 -0
glitchlings/dlc/huggingface.py +37 -65
glitchlings/dlc/prime.py +55 -114
glitchlings/dlc/pytorch.py +98 -0
glitchlings/dlc/pytorch_lightning.py +173 -0
glitchlings/internal/__init__.py +16 -0
glitchlings/internal/rust.py +159 -0
glitchlings/internal/rust_ffi.py +432 -0
glitchlings/main.py +123 -32
glitchlings/runtime_config.py +24 -0
glitchlings/util/__init__.py +29 -176
glitchlings/util/adapters.py +65 -0
glitchlings/util/keyboards.py +311 -0
glitchlings/util/transcripts.py +108 -0
glitchlings/zoo/__init__.py +47 -24
glitchlings/zoo/assets/__init__.py +29 -0
glitchlings/zoo/core.py +301 -167
glitchlings/zoo/core_execution.py +98 -0
glitchlings/zoo/core_planning.py +451 -0
glitchlings/zoo/corrupt_dispatch.py +295 -0
glitchlings/zoo/ekkokin.py +118 -0
glitchlings/zoo/hokey.py +137 -0
glitchlings/zoo/jargoyle.py +179 -274
glitchlings/zoo/mim1c.py +106 -68
glitchlings/zoo/pedant/__init__.py +107 -0
glitchlings/zoo/pedant/core.py +105 -0
glitchlings/zoo/pedant/forms.py +74 -0
glitchlings/zoo/pedant/stones.py +74 -0
glitchlings/zoo/redactyl.py +44 -175
glitchlings/zoo/rng.py +259 -0
glitchlings/zoo/rushmore.py +359 -116
glitchlings/zoo/scannequin.py +18 -125
glitchlings/zoo/transforms.py +386 -0
glitchlings/zoo/typogre.py +76 -162
glitchlings/zoo/validation.py +477 -0
glitchlings/zoo/zeedub.py +33 -86
glitchlings-0.9.3.dist-info/METADATA +334 -0
glitchlings-0.9.3.dist-info/RECORD +80 -0
{glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/entry_points.txt +1 -0
glitchlings/zoo/_ocr_confusions.py +0 -34
glitchlings/zoo/_rate.py +0 -21
glitchlings/zoo/reduple.py +0 -169
glitchlings-0.2.5.dist-info/METADATA +0 -490
glitchlings-0.2.5.dist-info/RECORD +0 -27
/glitchlings/{zoo → assets}/ocr_confusions.tsv +0 -0
{glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/WHEEL +0 -0
{glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/licenses/LICENSE +0 -0
{glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/top_level.txt +0 -0

glitchlings/zoo/jargoyle.py CHANGED Viewed

@@ -1,336 +1,241 @@
-import random
-import re
-from collections.abc import Iterable
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Literal, cast
-try:  # pragma: no cover - exercised in environments with NLTK installed
-    import nltk  # type: ignore[import]
-except ModuleNotFoundError as exc:  # pragma: no cover - triggered when NLTK missing
-    nltk = None  # type: ignore[assignment]
-    find = None  # type: ignore[assignment]
-    _NLTK_IMPORT_ERROR = exc
-else:  # pragma: no cover - executed when NLTK is available
-    from nltk.corpus.reader import WordNetCorpusReader as _WordNetCorpusReader  # type: ignore[import]
-    from nltk.data import find as _nltk_find  # type: ignore[import]
-    find = _nltk_find
-    _NLTK_IMPORT_ERROR = None
-if TYPE_CHECKING:  # pragma: no cover - typing aid only
-    from nltk.corpus.reader import WordNetCorpusReader  # type: ignore[import]
-else:  # Use ``Any`` at runtime to avoid hard dependency when NLTK missing
-    WordNetCorpusReader = Any
-if nltk is not None:  # pragma: no cover - guarded by import success
-    try:
-        from nltk.corpus import wordnet as _WORDNET_MODULE  # type: ignore[import]
-    except ModuleNotFoundError:  # pragma: no cover - only hit on namespace packages
-        _WORDNET_MODULE = None
-    else:
-        WordNetCorpusReader = _WordNetCorpusReader  # type: ignore[assignment]
-else:
-    _WORDNET_MODULE = None
-from .core import AttackWave, Glitchling
-from ._rate import resolve_rate
-_WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
+"""Jargoyle glitchling: Dictionary-based word drift.
-_wordnet_ready = False
+Jargoyle swaps words with alternatives from bundled lexeme dictionaries.
+Multiple dictionaries are supported:
+- "colors": Color term swapping
+- "synonyms": General synonym substitution
+- "corporate": Business jargon alternatives
+- "academic": Scholarly word substitutions
+- "cyberpunk": Neon cyberpunk slang and gadgetry
+- "lovecraftian": Cosmic horror terminology
+You can also drop additional dictionaries into ``assets/lexemes`` to make
+them available without modifying the code. The backend discovers any
+``*.json`` file in that directory at runtime.
+Two modes are available:
+- "literal": First entry in each word's alternatives (deterministic mapping)
+- "drift": Random selection from alternatives (probabilistic)
+"""
-def _require_nltk() -> None:
-    """Ensure the NLTK dependency is present before continuing."""
-    if nltk is None or find is None:
-        message = (
-            "The NLTK package is required for the jargoyle glitchling; install "
-            "the 'wordnet' extra via `pip install glitchlings[wordnet]`."
-        )
-        if '_NLTK_IMPORT_ERROR' in globals() and _NLTK_IMPORT_ERROR is not None:
-            raise RuntimeError(message) from _NLTK_IMPORT_ERROR
-        raise RuntimeError(message)
+from __future__ import annotations
+import os
+from importlib import resources
+from pathlib import Path
+from typing import Literal, cast
-def dependencies_available() -> bool:
-    """Return ``True`` when the runtime NLTK dependency is present."""
+from glitchlings.constants import DEFAULT_JARGOYLE_RATE
+from glitchlings.internal.rust_ffi import (
+    jargoyle_drift_rust,
+    list_lexeme_dictionaries_rust,
+    resolve_seed,
+)
-    return nltk is not None and find is not None
+from .core import AttackOrder, AttackWave, Glitchling, PipelineOperationPayload
+_LEXEME_ENV_VAR = "GLITCHLINGS_LEXEME_DIR"
-def _load_wordnet_reader() -> WordNetCorpusReader:
-    """Return a WordNet corpus reader from the downloaded corpus files."""
-    _require_nltk()
+def _configure_lexeme_directory() -> Path | None:
+    """Expose the bundled lexeme directory to the Rust backend via an env var."""
     try:
-        root = find("corpora/wordnet")
-    except LookupError:
-        try:
-            zip_root = find("corpora/wordnet.zip")
-        except LookupError as exc:
-            raise RuntimeError(
-                "The NLTK WordNet corpus is not installed; run `nltk.download('wordnet')`."
-            ) from exc
-        root = zip_root.join("wordnet/")
-    return WordNetCorpusReader(root, None)
-def _wordnet(force_refresh: bool = False) -> WordNetCorpusReader | Any:
-    """Retrieve the active WordNet handle, rebuilding it on demand."""
-    global _WORDNET_HANDLE
-    if force_refresh:
-        _WORDNET_HANDLE = _WORDNET_MODULE
-    if _WORDNET_HANDLE is not None:
-        return _WORDNET_HANDLE
-    _WORDNET_HANDLE = _load_wordnet_reader()
-    return _WORDNET_HANDLE
+        lexeme_root = resources.files("glitchlings.assets.lexemes")
+    except (ModuleNotFoundError, AttributeError):
+        return None
+    try:
+        with resources.as_file(lexeme_root) as resolved:
+            path = Path(resolved)
+    except FileNotFoundError:
+        return None
-def ensure_wordnet() -> None:
-    """Ensure the WordNet corpus is available before use."""
+    if not path.is_dir():
+        return None
-    global _wordnet_ready
-    if _wordnet_ready:
-        return
+    os.environ.setdefault(_LEXEME_ENV_VAR, str(path))
+    return path
-    _require_nltk()
-    resource = _wordnet()
+_configure_lexeme_directory()
-    try:
-        resource.ensure_loaded()
-    except LookupError:
-        nltk.download("wordnet", quiet=True)
-        try:
-            resource = _wordnet(force_refresh=True)
-            resource.ensure_loaded()
-        except LookupError as exc:  # pragma: no cover - only triggered when download fails
-            raise RuntimeError(
-                "Unable to load NLTK WordNet corpus for the jargoyle glitchling."
-            ) from exc
+DEFAULT_LEXEMES = "synonyms"
-    _wordnet_ready = True
+# Valid modes
+JargoyleMode = Literal["literal", "drift"]
+VALID_MODES = ("literal", "drift")
+DEFAULT_MODE: JargoyleMode = "drift"
-# Backwards compatibility for callers relying on the previous private helper name.
-_ensure_wordnet = ensure_wordnet
+def _available_lexemes() -> list[str]:
+    return sorted({name.lower() for name in list_lexeme_dictionaries_rust()})
-PartOfSpeech = Literal["n", "v", "a", "r"]
-PartOfSpeechInput = PartOfSpeech | Iterable[PartOfSpeech] | Literal["any"]
-NormalizedPartsOfSpeech = tuple[PartOfSpeech, ...]
+def _validate_lexemes(name: str) -> str:
+    normalized = name.lower()
+    available = _available_lexemes()
+    if normalized not in available:
+        raise ValueError(f"Invalid lexemes '{name}'. Must be one of: {', '.join(available)}")
+    return normalized
-_VALID_POS: tuple[PartOfSpeech, ...] = ("n", "v", "a", "r")
+def _validate_mode(mode: JargoyleMode | str) -> JargoyleMode:
+    normalized = mode.lower()
+    if normalized not in VALID_MODES:
+        raise ValueError(f"Invalid mode '{mode}'. Must be one of: {', '.join(VALID_MODES)}")
+    return cast(JargoyleMode, normalized)
-def _split_token(token: str) -> tuple[str, str, str]:
-    """Split a token into leading punctuation, core word, and trailing punctuation."""
-    match = re.match(r"^(\W*)(.*?)(\W*)$", token)
-    if not match:
-        return "", token, ""
-    prefix, core, suffix = match.groups()
-    return prefix, core, suffix
+VALID_LEXEMES = tuple(_available_lexemes())
-def _normalize_parts_of_speech(part_of_speech: PartOfSpeechInput) -> NormalizedPartsOfSpeech:
-    """Coerce user input into a tuple of valid WordNet POS tags."""
+def list_lexeme_dictionaries() -> list[str]:
+    """Return the list of available lexeme dictionaries.
-    if isinstance(part_of_speech, str):
-        lowered = part_of_speech.lower()
-        if lowered == "any":
-            return _VALID_POS
-        if lowered not in _VALID_POS:
-            raise ValueError(
-                "part_of_speech must be one of 'n', 'v', 'a', 'r', or 'any'"
-            )
-        return (cast(PartOfSpeech, lowered),)
-    normalized: list[PartOfSpeech] = []
-    for pos in part_of_speech:
-        if pos not in _VALID_POS:
-            raise ValueError(
-                "part_of_speech entries must be one of 'n', 'v', 'a', or 'r'"
-            )
-        if pos not in normalized:
-            normalized.append(pos)
-    if not normalized:
-        raise ValueError("part_of_speech iterable may not be empty")
-    return tuple(normalized)
+    Returns:
+        List of dictionary names that can be used with Jargoyle.
+    """
+    return _available_lexemes()
-@dataclass(frozen=True)
-class CandidateInfo:
-    """Metadata for a candidate token that may be replaced."""
+def jargoyle_drift(
+    text: str,
+    *,
+    lexemes: str = DEFAULT_LEXEMES,
+    mode: JargoyleMode = DEFAULT_MODE,
+    rate: float | None = None,
+    seed: int | None = None,
+) -> str:
+    """Apply dictionary-based word drift to text.
-    prefix: str
-    core_word: str
-    suffix: str
-    parts_of_speech: NormalizedPartsOfSpeech
+    Args:
+        text: Input text to transform.
+        lexemes: Name of the dictionary to use.
+        mode: "literal" for deterministic first-entry swaps,
+              "drift" for random selection from alternatives.
+        rate: Probability of transforming each matching word (0.0 to 1.0).
+        seed: Seed for deterministic randomness (only used in "drift" mode).
+    Returns:
+        Text with word substitutions applied.
-def _collect_synonyms(
-    word: str, parts_of_speech: NormalizedPartsOfSpeech
-) -> list[str]:
-    """Gather deterministic synonym candidates for the supplied word."""
+    Raises:
+        ValueError: If lexemes or mode is invalid.
+    """
+    normalized_lexemes = _validate_lexemes(lexemes)
+    normalized_mode = _validate_mode(mode)
+    effective_rate = DEFAULT_JARGOYLE_RATE if rate is None else float(rate)
+    resolved_seed = resolve_seed(seed, None) if normalized_mode == "drift" else None
+    return jargoyle_drift_rust(
+        text,
+        normalized_lexemes,
+        normalized_mode,
+        effective_rate,
+        resolved_seed,
+    )
-    normalized_word = word.lower()
-    wordnet = _wordnet()
-    synonyms: set[str] = set()
-    for pos_tag in parts_of_speech:
-        synsets = wordnet.synsets(word, pos=pos_tag)
-        if not synsets:
-            continue
-        for synset in synsets:
-            lemmas_list = [lemma.name() for lemma in cast(Any, synset).lemmas()]
-            if not lemmas_list:
-                continue
+class Jargoyle(Glitchling):
+    """Glitchling that swaps words using bundled lexeme dictionaries.
-            filtered = []
-            for lemma_str in lemmas_list:
-                cleaned = lemma_str.replace("_", " ")
-                if cleaned.lower() != normalized_word:
-                    filtered.append(cleaned)
+    Jargoyle replaces words with alternatives from one of several dictionaries:
-            if filtered:
-                synonyms.update(filtered)
-                break
+    - **colors**: Swap color terms (e.g., "red" -> "blue").
+    - **synonyms**: General synonym substitution (e.g., "fast" -> "rapid").
+    - **corporate**: Business jargon alternatives.
+    - **academic**: Scholarly word substitutions.
+    - **cyberpunk**: Neon cyberpunk slang and gadgetry.
+    - **lovecraftian**: Cosmic horror terminology.
+    - **custom**: Any ``*.json`` dictionary placed in ``assets/lexemes``.
-        if synonyms:
-            break
+    Two modes are supported:
-    return sorted(synonyms)
+    - **literal**: Use the first (canonical) entry for each word.
+    - **drift**: Randomly select from available alternatives.
+    Example:
+        >>> from glitchlings import Jargoyle
+        >>> jargoyle = Jargoyle(lexemes="colors", mode="literal")
+        >>> jargoyle("The red balloon floated away.")
+        'The blue balloon floated away.'
-def substitute_random_synonyms(
-    text: str,
-    rate: float | None = None,
-    part_of_speech: PartOfSpeechInput = "n",
-    seed: int | None = None,
-    rng: random.Random | None = None,
-    *,
-    replacement_rate: float | None = None,
-) -> str:
-    """Replace words with random WordNet synonyms.
-    Parameters
-    - text: Input text.
-    - rate: Max proportion of candidate words to replace (default 0.1).
-    - part_of_speech: WordNet POS tag(s) to target. Accepts "n", "v", "a", "r",
-      any iterable of those tags, or "any" to include all four.
-    - rng: Optional RNG instance used for deterministic sampling.
-    - seed: Optional seed if `rng` not provided.
-    Determinism
-    - Candidates collected in left-to-right order; no set() reordering.
-    - Replacement positions chosen via rng.sample.
-    - Synonyms sorted before rng.choice to fix ordering.
-    - For each POS, the first synset containing alternate lemmas is used for stability.
+        >>> jargoyle = Jargoyle(lexemes="synonyms", mode="drift", rate=0.5, seed=42)
+        >>> jargoyle("The quick fox jumps fast.")
+        'The swift fox jumps rapid.'
     """
-    effective_rate = resolve_rate(
-        rate=rate,
-        legacy_value=replacement_rate,
-        default=0.1,
-        legacy_name="replacement_rate",
-    )
-    ensure_wordnet()
-    wordnet = _wordnet()
-    active_rng: random.Random
-    if rng is not None:
-        active_rng = rng
-    else:
-        active_rng = random.Random(seed)
-    target_pos = _normalize_parts_of_speech(part_of_speech)
-    # Split but keep whitespace separators so we can rebuild easily
-    tokens = re.split(r"(\s+)", text)
-    # Collect indices of candidate tokens (even positions 0,2,.. are words given our split design)
-    candidate_indices: list[int] = []
-    candidate_metadata: dict[int, CandidateInfo] = {}
-    for idx, tok in enumerate(tokens):
-        if idx % 2 == 0 and tok and not tok.isspace():
-            prefix, core_word, suffix = _split_token(tok)
-            if not core_word:
-                continue
-            available_pos: NormalizedPartsOfSpeech = tuple(
-                pos for pos in target_pos if wordnet.synsets(core_word, pos=pos)
-            )
-            if available_pos:
-                candidate_indices.append(idx)
-                candidate_metadata[idx] = CandidateInfo(
-                    prefix=prefix,
-                    core_word=core_word,
-                    suffix=suffix,
-                    parts_of_speech=available_pos,
-                )
-    if not candidate_indices:
-        return text
-    clamped_rate = max(0.0, effective_rate)
-    max_replacements = int(len(candidate_indices) * clamped_rate)
-    if max_replacements <= 0:
-        return text
-    # Choose which positions to replace deterministically via rng.sample
-    replace_positions = active_rng.sample(candidate_indices, k=max_replacements)
-    # Process in ascending order to avoid affecting later indices
-    replace_positions.sort()
-    for pos in replace_positions:
-        metadata = candidate_metadata[pos]
-        synonyms = _collect_synonyms(metadata.core_word, metadata.parts_of_speech)
-        if not synonyms:
-            continue
-        replacement = active_rng.choice(synonyms)
-        tokens[pos] = f"{metadata.prefix}{replacement}{metadata.suffix}"
-    return "".join(tokens)
-class Jargoyle(Glitchling):
-    """Glitchling that swaps words with random WordNet synonyms."""
+    flavor = "Oh no... The worst person you know just bought a thesaurus..."
     def __init__(
         self,
         *,
+        lexemes: str = DEFAULT_LEXEMES,
+        mode: JargoyleMode = DEFAULT_MODE,
         rate: float | None = None,
-        replacement_rate: float | None = None,
-        part_of_speech: PartOfSpeechInput = "n",
         seed: int | None = None,
     ) -> None:
-        self._param_aliases = {"replacement_rate": "rate"}
-        effective_rate = resolve_rate(
-            rate=rate,
-            legacy_value=replacement_rate,
-            default=0.1,
-            legacy_name="replacement_rate",
-        )
+        """Initialize Jargoyle with the specified dictionary and mode.
+        Args:
+            lexemes: Name of the dictionary to use. See ``list_lexeme_dictionaries()``
+                for the full, dynamic list (including any custom ``*.json`` files).
+            mode: Transformation mode. "literal" for deterministic swaps,
+                "drift" for random selection.
+            rate: Probability of transforming each matching word (0.0 to 1.0).
+                Defaults to 0.01.
+            seed: Seed for deterministic randomness.
+        """
+        # Validate inputs
+        normalized_lexemes = _validate_lexemes(lexemes)
+        normalized_mode = _validate_mode(mode)
+        effective_rate = DEFAULT_JARGOYLE_RATE if rate is None else rate
         super().__init__(
             name="Jargoyle",
-            corruption_function=substitute_random_synonyms,
+            corruption_function=jargoyle_drift,
             scope=AttackWave.WORD,
+            order=AttackOrder.NORMAL,
             seed=seed,
+            lexemes=normalized_lexemes,
+            mode=normalized_mode,
             rate=effective_rate,
-            part_of_speech=part_of_speech,
+            # Pass seed explicitly to kwargs so corruption_function receives it
+            # (seed is stored separately in base class but needed by jargoyle_drift)
+        )
+        # Ensure seed is in kwargs for the corruption function
+        self.kwargs["seed"] = seed
+    def pipeline_operation(self) -> PipelineOperationPayload:
+        """Return the pipeline descriptor for the Rust backend."""
+        lexemes = self.kwargs.get("lexemes", DEFAULT_LEXEMES)
+        mode = self.kwargs.get("mode", DEFAULT_MODE)
+        rate = self.kwargs.get("rate", DEFAULT_JARGOYLE_RATE)
+        return cast(
+            PipelineOperationPayload,
+            {
+                "type": "jargoyle",
+                "lexemes": str(lexemes),
+                "mode": str(mode),
+                "rate": float(rate),
+            },
         )
+# Module-level singleton for convenience
 jargoyle = Jargoyle()
-__all__ = ["Jargoyle", "dependencies_available", "ensure_wordnet", "jargoyle"]
+__all__ = [
+    "DEFAULT_LEXEMES",
+    "DEFAULT_MODE",
+    "Jargoyle",
+    "JargoyleMode",
+    "VALID_LEXEMES",
+    "VALID_MODES",
+    "jargoyle",
+    "jargoyle_drift",
+    "list_lexeme_dictionaries",
+]