glitchlings-0.2.6-cp310-cp310-macosx_11_0_universal2.whl → glitchlings-0.4.0-cp310-cp310-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,131 @@
+from __future__ import annotations
+
+import random
+from typing import Any
+
+from ._rate import resolve_rate
+from ._text_utils import split_preserving_whitespace, split_token_edges
+from .core import AttackWave, Glitchling
+
+try:
+    from glitchlings._zoo_rust import swap_adjacent_words as _swap_adjacent_words_rust
+except ImportError:  # pragma: no cover - optional acceleration
+    _swap_adjacent_words_rust = None
+
+
+def _python_swap_adjacent_words(
+    text: str,
+    *,
+    rate: float,
+    rng: random.Random,
+) -> str:
+    """Swap the cores of adjacent words while keeping affixes and spacing intact."""
+
+    tokens = split_preserving_whitespace(text)
+    if len(tokens) < 2:
+        return text
+
+    word_indices: list[int] = []
+    for index in range(len(tokens)):
+        token = tokens[index]
+        if not token or token.isspace():
+            continue
+        if index % 2 == 0:
+            word_indices.append(index)
+
+    if len(word_indices) < 2:
+        return text
+
+    clamped = max(0.0, min(rate, 1.0))
+    if clamped <= 0.0:
+        return text
+
+    for cursor in range(0, len(word_indices) - 1, 2):
+        left_index = word_indices[cursor]
+        right_index = word_indices[cursor + 1]
+
+        left_token = tokens[left_index]
+        right_token = tokens[right_index]
+
+        left_prefix, left_core, left_suffix = split_token_edges(left_token)
+        right_prefix, right_core, right_suffix = split_token_edges(right_token)
+
+        if not left_core or not right_core:
+            continue
+
+        should_swap = clamped >= 1.0 or rng.random() < clamped
+        if not should_swap:
+            continue
+
+        tokens[left_index] = f"{left_prefix}{right_core}{left_suffix}"
+        tokens[right_index] = f"{right_prefix}{left_core}{right_suffix}"
+
+    return "".join(tokens)
+
+
+def swap_adjacent_words(
+    text: str,
+    rate: float | None = None,
+    seed: int | None = None,
+    rng: random.Random | None = None,
+    *,
+    swap_rate: float | None = None,
+) -> str:
+    """Swap adjacent word cores while preserving spacing and punctuation."""
+
+    effective_rate = resolve_rate(
+        rate=rate,
+        legacy_value=swap_rate,
+        default=0.5,
+        legacy_name="swap_rate",
+    )
+    clamped_rate = max(0.0, min(effective_rate, 1.0))
+
+    if rng is None:
+        rng = random.Random(seed)
+
+    if _swap_adjacent_words_rust is not None:
+        return _swap_adjacent_words_rust(text, clamped_rate, rng)
+
+    return _python_swap_adjacent_words(text, rate=clamped_rate, rng=rng)
+
+
+class Adjax(Glitchling):
+    """Glitchling that swaps adjacent words to scramble local semantics."""
+
+    def __init__(
+        self,
+        *,
+        rate: float | None = None,
+        swap_rate: float | None = None,
+        seed: int | None = None,
+    ) -> None:
+        self._param_aliases = {"swap_rate": "rate"}
+        effective_rate = resolve_rate(
+            rate=rate,
+            legacy_value=swap_rate,
+            default=0.5,
+            legacy_name="swap_rate",
+        )
+        super().__init__(
+            name="Adjax",
+            corruption_function=swap_adjacent_words,
+            scope=AttackWave.WORD,
+            seed=seed,
+            rate=effective_rate,
+        )
+
+    def pipeline_operation(self) -> dict[str, Any] | None:
+        rate = self.kwargs.get("rate")
+        if rate is None:
+            return None
+        return {
+            "type": "swap_adjacent",
+            "swap_rate": float(rate),
+        }
+
+
+adjax = Adjax()
+
+
+__all__ = ["Adjax", "adjax", "swap_adjacent_words"]
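For orientation, here is a minimal usage sketch of the new module. The import path is an assumption (the diff does not name the added file); the signatures come from the hunk above.

import random

# Import path assumed; the diff omits the new file's name.
from glitchlings.zoo.adjax import swap_adjacent_words

# rate=1.0 swaps every adjacent pair of word cores; spacing and punctuation stay put.
print(swap_adjacent_words("alpha beta, gamma delta.", rate=1.0, seed=0))

# A seed and an equivalently seeded RNG produce the same corruption.
a = swap_adjacent_words("one two three four", rate=0.5, seed=42)
b = swap_adjacent_words("one two three four", rate=0.5, rng=random.Random(42))
assert a == b

The Adjax class wraps the same function with a default rate of 0.5, so Adjax(seed=7).corrupt(text) presumably applies it through the base Glitchling machinery shown in core.py below.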
glitchlings/zoo/core.py CHANGED
@@ -59,18 +59,26 @@ else:
     def with_transform(self, function: Any) -> "Dataset": ...


-def _is_transcript(value: Any) -> bool:
-    """Return True when the value resembles a chat transcript."""
+def _is_transcript(
+    value: Any,
+    *,
+    allow_empty: bool = True,
+    require_all_content: bool = False,
+) -> bool:
+    """Return `True` when `value` appears to be a chat transcript."""

     if not isinstance(value, list):
         return False

     if not value:
-        return True
+        return allow_empty

     if not all(isinstance(turn, dict) for turn in value):
         return False

+    if require_all_content:
+        return all("content" in turn for turn in value)
+
     return "content" in value[-1]


@@ -233,21 +241,15 @@ class Glitchling:
             message = "datasets is not installed"
             raise ModuleNotFoundError(message) from _datasets_error

-        def _is_transcript(value: Any) -> bool:
-            """Return ``True`` when the value resembles a chat transcript."""
-
-            if not isinstance(value, list) or not value:
-                return False
-
-            return all(
-                isinstance(turn, dict) and "content" in turn for turn in value
-            )
-
         def __corrupt_row(row: dict[str, Any]) -> dict[str, Any]:
             row = dict(row)
             for column in columns:
                 value = row[column]
-                if _is_transcript(value):
+                if _is_transcript(
+                    value,
+                    allow_empty=False,
+                    require_all_content=True,
+                ):
                     row[column] = self.corrupt(value)
                 elif isinstance(value, list):
                     row[column] = [self.corrupt(item) for item in value]
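The net effect of the two core.py hunks: the module-level _is_transcript gains two keyword flags, and the dataset row corrupter drops its private copy in favor of calling the shared helper with strict settings. A behavioral sketch, restating the helper verbatim from the hunk above with illustrative asserts:

from typing import Any

def _is_transcript(
    value: Any,
    *,
    allow_empty: bool = True,
    require_all_content: bool = False,
) -> bool:
    if not isinstance(value, list):
        return False
    if not value:
        return allow_empty
    if not all(isinstance(turn, dict) for turn in value):
        return False
    if require_all_content:
        return all("content" in turn for turn in value)
    return "content" in value[-1]

chat = [{"role": "system"}, {"role": "user", "content": "hi"}]
assert _is_transcript([]) is True                       # default: empty lists still count
assert _is_transcript([], allow_empty=False) is False   # dataset rows reject empties
assert _is_transcript(chat) is True                     # default: only the last turn needs "content"
assert _is_transcript(chat, require_all_content=True) is False  # dataset rows require it on every turn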
@@ -2,121 +2,47 @@ import random
 import re
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Literal, cast
-
-try:  # pragma: no cover - exercised in environments with NLTK installed
-    import nltk  # type: ignore[import]
-except ModuleNotFoundError as exc:  # pragma: no cover - triggered when NLTK missing
-    nltk = None  # type: ignore[assignment]
-    find = None  # type: ignore[assignment]
-    _NLTK_IMPORT_ERROR = exc
-else:  # pragma: no cover - executed when NLTK is available
-    from nltk.corpus.reader import WordNetCorpusReader as _WordNetCorpusReader  # type: ignore[import]
-    from nltk.data import find as _nltk_find  # type: ignore[import]
-
-    find = _nltk_find
-    _NLTK_IMPORT_ERROR = None
-
-if TYPE_CHECKING:  # pragma: no cover - typing aid only
-    from nltk.corpus.reader import WordNetCorpusReader  # type: ignore[import]
-else:  # Use ``Any`` at runtime to avoid hard dependency when NLTK missing
-    WordNetCorpusReader = Any
-
-if nltk is not None:  # pragma: no cover - guarded by import success
-    try:
-        from nltk.corpus import wordnet as _WORDNET_MODULE  # type: ignore[import]
-    except ModuleNotFoundError:  # pragma: no cover - only hit on namespace packages
-        _WORDNET_MODULE = None
-    else:
-        WordNetCorpusReader = _WordNetCorpusReader  # type: ignore[assignment]
-else:
-    _WORDNET_MODULE = None
+from typing import Any, Literal, cast

-from .core import AttackWave, Glitchling
-from ._rate import resolve_rate
-
-_WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
-
-_wordnet_ready = False
+from glitchlings.lexicon import Lexicon, get_default_lexicon

+try:  # pragma: no cover - optional WordNet dependency
+    from glitchlings.lexicon.wordnet import (
+        WordNetLexicon,
+        dependencies_available as _lexicon_dependencies_available,
+        ensure_wordnet as _lexicon_ensure_wordnet,
+    )
+except Exception:  # pragma: no cover - triggered when nltk unavailable
+    WordNetLexicon = None  # type: ignore[assignment]

-def _require_nltk() -> None:
-    """Ensure the NLTK dependency is present before continuing."""
+    def _lexicon_dependencies_available() -> bool:
+        return False

-    if nltk is None or find is None:
-        message = (
-            "The NLTK package is required for the jargoyle glitchling; install "
-            "the 'wordnet' extra via `pip install glitchlings[wordnet]`."
+    def _lexicon_ensure_wordnet() -> None:
+        raise RuntimeError(
+            "The WordNet backend is no longer bundled by default. Install NLTK "
+            "and download its WordNet corpus manually if you need legacy synonyms."
         )
-        if '_NLTK_IMPORT_ERROR' in globals() and _NLTK_IMPORT_ERROR is not None:
-            raise RuntimeError(message) from _NLTK_IMPORT_ERROR
-        raise RuntimeError(message)
-
-
-def dependencies_available() -> bool:
-    """Return ``True`` when the runtime NLTK dependency is present."""
-
-    return nltk is not None and find is not None
-
-
-def _load_wordnet_reader() -> WordNetCorpusReader:
-    """Return a WordNet corpus reader from the downloaded corpus files."""

-    _require_nltk()
-
-    try:
-        root = find("corpora/wordnet")
-    except LookupError:
-        try:
-            zip_root = find("corpora/wordnet.zip")
-        except LookupError as exc:
-            raise RuntimeError(
-                "The NLTK WordNet corpus is not installed; run `nltk.download('wordnet')`."
-            ) from exc
-        root = zip_root.join("wordnet/")
-
-    return WordNetCorpusReader(root, None)
-
-
-def _wordnet(force_refresh: bool = False) -> WordNetCorpusReader | Any:
-    """Retrieve the active WordNet handle, rebuilding it on demand."""
-
-    global _WORDNET_HANDLE
-
-    if force_refresh:
-        _WORDNET_HANDLE = _WORDNET_MODULE
-
-    if _WORDNET_HANDLE is not None:
-        return _WORDNET_HANDLE
-
-    _WORDNET_HANDLE = _load_wordnet_reader()
-    return _WORDNET_HANDLE

+from ._rate import resolve_rate
+from .core import AttackWave, Glitchling

-def ensure_wordnet() -> None:
-    """Ensure the WordNet corpus is available before use."""
+ensure_wordnet = _lexicon_ensure_wordnet

-    global _wordnet_ready
-    if _wordnet_ready:
-        return

-    _require_nltk()
+def dependencies_available() -> bool:
+    """Return ``True`` when a synonym backend is accessible."""

-    resource = _wordnet()
+    if _lexicon_dependencies_available():
+        return True

     try:
-        resource.ensure_loaded()
-    except LookupError:
-        nltk.download("wordnet", quiet=True)
-        try:
-            resource = _wordnet(force_refresh=True)
-            resource.ensure_loaded()
-        except LookupError as exc:  # pragma: no cover - only triggered when download fails
-            raise RuntimeError(
-                "Unable to load NLTK WordNet corpus for the jargoyle glitchling."
-            ) from exc
-
-    _wordnet_ready = True
+        # Fall back to the configured default lexicon (typically the bundled vector cache).
+        get_default_lexicon(seed=None)
+    except Exception:
+        return False
+    return True


 # Backwards compatibility for callers relying on the previous private helper name.
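Read together, this hunk replaces eager NLTK plumbing with a two-step probe. A restated sketch of the new availability check, for illustration only (it mirrors the logic above rather than adding anything):

def dependencies_available_sketch() -> bool:
    # Step 1: the optional WordNet backend, when its import succeeded.
    try:
        from glitchlings.lexicon.wordnet import dependencies_available
        if dependencies_available():
            return True
    except Exception:
        pass
    # Step 2: otherwise try to construct the configured default lexicon
    # (typically the bundled vector cache, per the comment in the hunk).
    try:
        from glitchlings.lexicon import get_default_lexicon
        get_default_lexicon(seed=None)
    except Exception:
        return False
    return True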
@@ -140,7 +66,9 @@ def _split_token(token: str) -> tuple[str, str, str]:
     return prefix, core, suffix


-def _normalize_parts_of_speech(part_of_speech: PartOfSpeechInput) -> NormalizedPartsOfSpeech:
+def _normalize_parts_of_speech(
+    part_of_speech: PartOfSpeechInput,
+) -> NormalizedPartsOfSpeech:
     """Coerce user input into a tuple of valid WordNet POS tags."""

     if isinstance(part_of_speech, str):
@@ -173,41 +101,8 @@ class CandidateInfo:
     prefix: str
     core_word: str
     suffix: str
-    parts_of_speech: NormalizedPartsOfSpeech
-
-
-def _collect_synonyms(
-    word: str, parts_of_speech: NormalizedPartsOfSpeech
-) -> list[str]:
-    """Gather deterministic synonym candidates for the supplied word."""
-
-    normalized_word = word.lower()
-    wordnet = _wordnet()
-    synonyms: set[str] = set()
-    for pos_tag in parts_of_speech:
-        synsets = wordnet.synsets(word, pos=pos_tag)
-        if not synsets:
-            continue
-
-        for synset in synsets:
-            lemmas_list = [lemma.name() for lemma in cast(Any, synset).lemmas()]
-            if not lemmas_list:
-                continue
-
-            filtered = []
-            for lemma_str in lemmas_list:
-                cleaned = lemma_str.replace("_", " ")
-                if cleaned.lower() != normalized_word:
-                    filtered.append(cleaned)
-
-            if filtered:
-                synonyms.update(filtered)
-                break
-
-        if synonyms:
-            break
-
-    return sorted(synonyms)
+    part_of_speech: str | None
+    synonyms: list[str]


 def substitute_random_synonyms(
@@ -218,22 +113,27 @@ def substitute_random_synonyms(
     rng: random.Random | None = None,
     *,
     replacement_rate: float | None = None,
+    lexicon: Lexicon | None = None,
 ) -> str:
-    """Replace words with random WordNet synonyms.
+    """Replace words with random lexicon-driven synonyms.

     Parameters
     - text: Input text.
-    - rate: Max proportion of candidate words to replace (default 0.1).
+    - rate: Max proportion of candidate words to replace (default 0.01).
     - part_of_speech: WordNet POS tag(s) to target. Accepts "n", "v", "a", "r",
-      any iterable of those tags, or "any" to include all four.
+      any iterable of those tags, or "any" to include all four. Backends that do
+      not differentiate parts of speech simply ignore the setting.
     - rng: Optional RNG instance used for deterministic sampling.
     - seed: Optional seed if `rng` not provided.
+    - lexicon: Optional :class:`~glitchlings.lexicon.Lexicon` implementation to
+      supply synonyms. Defaults to the configured lexicon priority, typically the
+      packaged vector cache.

     Determinism
     - Candidates collected in left-to-right order; no set() reordering.
     - Replacement positions chosen via rng.sample.
-    - Synonyms sorted before rng.choice to fix ordering.
-    - For each POS, the first synset containing alternate lemmas is used for stability.
+    - Synonyms sourced through the lexicon; the default backend derives
+      deterministic subsets per word and part-of-speech using the active seed.
     """
     effective_rate = resolve_rate(
         rate=rate,
@@ -242,68 +142,106 @@ def substitute_random_synonyms(
         legacy_name="replacement_rate",
     )

-    ensure_wordnet()
-    wordnet = _wordnet()
-
     active_rng: random.Random
     if rng is not None:
         active_rng = rng
     else:
         active_rng = random.Random(seed)

-    target_pos = _normalize_parts_of_speech(part_of_speech)
+    active_lexicon: Lexicon
+    restore_lexicon_seed = False
+    original_lexicon_seed: int | None = None

-    # Split but keep whitespace separators so we can rebuild easily
-    tokens = re.split(r"(\s+)", text)
+    if lexicon is None:
+        active_lexicon = get_default_lexicon(seed=seed)
+    else:
+        active_lexicon = lexicon
+        if seed is not None:
+            original_lexicon_seed = active_lexicon.seed
+            if original_lexicon_seed != seed:
+                active_lexicon.reseed(seed)
+                restore_lexicon_seed = True

-    # Collect indices of candidate tokens (even positions 0,2,.. are words given our split design)
-    candidate_indices: list[int] = []
-    candidate_metadata: dict[int, CandidateInfo] = {}
-    for idx, tok in enumerate(tokens):
-        if idx % 2 == 0 and tok and not tok.isspace():
-            prefix, core_word, suffix = _split_token(tok)
-            if not core_word:
+    try:
+        target_pos = _normalize_parts_of_speech(part_of_speech)
+
+        # Split but keep whitespace separators so we can rebuild easily
+        tokens = re.split(r"(\s+)", text)
+
+        # Collect indices of candidate tokens (even positions 0,2,.. are words given our split design)
+        candidate_indices: list[int] = []
+        candidate_metadata: dict[int, CandidateInfo] = {}
+        for idx, tok in enumerate(tokens):
+            if idx % 2 == 0 and tok and not tok.isspace():
+                prefix, core_word, suffix = _split_token(tok)
+                if not core_word:
+                    continue
+
+                chosen_pos: str | None = None
+                synonyms: list[str] = []
+
+                for pos in target_pos:
+                    if not active_lexicon.supports_pos(pos):
+                        continue
+                    synonyms = active_lexicon.get_synonyms(core_word, pos=pos)
+                    if synonyms:
+                        chosen_pos = pos
+                        break
+
+                if not synonyms and active_lexicon.supports_pos(None):
+                    synonyms = active_lexicon.get_synonyms(core_word, pos=None)
+
+                if synonyms:
+                    candidate_indices.append(idx)
+                    candidate_metadata[idx] = CandidateInfo(
+                        prefix=prefix,
+                        core_word=core_word,
+                        suffix=suffix,
+                        part_of_speech=chosen_pos,
+                        synonyms=synonyms,
+                    )
+
+        if not candidate_indices:
+            return text
+
+        clamped_rate = max(0.0, effective_rate)
+        if clamped_rate == 0.0:
+            return text
+
+        population = len(candidate_indices)
+        effective_fraction = min(clamped_rate, 1.0)
+        expected_replacements = population * effective_fraction
+        max_replacements = int(expected_replacements)
+        remainder = expected_replacements - max_replacements
+        if remainder > 0.0 and active_rng.random() < remainder:
+            max_replacements += 1
+        if clamped_rate >= 1.0:
+            max_replacements = population
+        max_replacements = min(population, max_replacements)
+        if max_replacements <= 0:
+            return text
+
+        # Choose which positions to replace deterministically via rng.sample
+        replace_positions = active_rng.sample(candidate_indices, k=max_replacements)
+        # Process in ascending order to avoid affecting later indices
+        replace_positions.sort()
+
+        for pos in replace_positions:
+            metadata = candidate_metadata[pos]
+            if not metadata.synonyms:
                 continue

-            available_pos: NormalizedPartsOfSpeech = tuple(
-                pos for pos in target_pos if wordnet.synsets(core_word, pos=pos)
-            )
-            if available_pos:
-                candidate_indices.append(idx)
-                candidate_metadata[idx] = CandidateInfo(
-                    prefix=prefix,
-                    core_word=core_word,
-                    suffix=suffix,
-                    parts_of_speech=available_pos,
-                )
-
-    if not candidate_indices:
-        return text
-
-    clamped_rate = max(0.0, effective_rate)
-    max_replacements = int(len(candidate_indices) * clamped_rate)
-    if max_replacements <= 0:
-        return text
-
-    # Choose which positions to replace deterministically via rng.sample
-    replace_positions = active_rng.sample(candidate_indices, k=max_replacements)
-    # Process in ascending order to avoid affecting later indices
-    replace_positions.sort()
+            replacement = active_rng.choice(metadata.synonyms)
+            tokens[pos] = f"{metadata.prefix}{replacement}{metadata.suffix}"

-    for pos in replace_positions:
-        metadata = candidate_metadata[pos]
-        synonyms = _collect_synonyms(metadata.core_word, metadata.parts_of_speech)
-        if not synonyms:
-            continue
-
-        replacement = active_rng.choice(synonyms)
-        tokens[pos] = f"{metadata.prefix}{replacement}{metadata.suffix}"
-
-    return "".join(tokens)
+        return "".join(tokens)
+    finally:
+        if restore_lexicon_seed:
+            active_lexicon.reseed(original_lexicon_seed)


 class Jargoyle(Glitchling):
-    """Glitchling that swaps words with random WordNet synonyms."""
+    """Glitchling that swaps words with lexicon-driven synonyms."""

     def __init__(
         self,
@@ -312,22 +250,74 @@ class Jargoyle(Glitchling):
         replacement_rate: float | None = None,
         part_of_speech: PartOfSpeechInput = "n",
         seed: int | None = None,
+        lexicon: Lexicon | None = None,
     ) -> None:
         self._param_aliases = {"replacement_rate": "rate"}
+        self._owns_lexicon = lexicon is None
+        self._external_lexicon_original_seed = (
+            lexicon.seed if isinstance(lexicon, Lexicon) else None
+        )
+        self._initializing = True
         effective_rate = resolve_rate(
             rate=rate,
             legacy_value=replacement_rate,
-            default=0.1,
+            default=0.01,
             legacy_name="replacement_rate",
         )
-        super().__init__(
-            name="Jargoyle",
-            corruption_function=substitute_random_synonyms,
-            scope=AttackWave.WORD,
-            seed=seed,
-            rate=effective_rate,
-            part_of_speech=part_of_speech,
-        )
+        prepared_lexicon = lexicon or get_default_lexicon(seed=seed)
+        if lexicon is not None and seed is not None:
+            prepared_lexicon.reseed(seed)
+        try:
+            super().__init__(
+                name="Jargoyle",
+                corruption_function=substitute_random_synonyms,
+                scope=AttackWave.WORD,
+                seed=seed,
+                rate=effective_rate,
+                part_of_speech=part_of_speech,
+                lexicon=prepared_lexicon,
+            )
+        finally:
+            self._initializing = False
+
+    def set_param(self, key: str, value: Any) -> None:
+        super().set_param(key, value)
+
+        aliases = getattr(self, "_param_aliases", {})
+        canonical = aliases.get(key, key)
+
+        if canonical == "seed":
+            current_lexicon = getattr(self, "lexicon", None)
+            if isinstance(current_lexicon, Lexicon):
+                if getattr(self, "_owns_lexicon", False):
+                    current_lexicon.reseed(self.seed)
+                else:
+                    if self.seed is not None:
+                        current_lexicon.reseed(self.seed)
+                    else:
+                        if hasattr(self, "_external_lexicon_original_seed"):
+                            original_seed = getattr(
+                                self, "_external_lexicon_original_seed", None
+                            )
+                            current_lexicon.reseed(original_seed)
+        elif canonical == "lexicon" and isinstance(value, Lexicon):
+            if getattr(self, "_initializing", False):
+                if getattr(self, "_owns_lexicon", False):
+                    if self.seed is not None:
+                        value.reseed(self.seed)
+                else:
+                    if getattr(self, "_external_lexicon_original_seed", None) is None:
+                        self._external_lexicon_original_seed = value.seed
+                    if self.seed is not None:
+                        value.reseed(self.seed)
+                return
+
+            self._owns_lexicon = False
+            self._external_lexicon_original_seed = value.seed
+            if self.seed is not None:
+                value.reseed(self.seed)
+            elif value.seed != self._external_lexicon_original_seed:
+                value.reseed(self._external_lexicon_original_seed)


 jargoyle = Jargoyle()
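Taken together, the jargoyle hunks trade direct WordNet calls for a pluggable lexicon. From the call sites above, substitute_random_synonyms touches exactly four members of the object: a seed attribute plus reseed(), supports_pos(), and get_synonyms(). A hedged sketch of a toy backend under those assumptions (duck-typed, which suffices for the function; Jargoyle itself performs isinstance(..., Lexicon) checks, so a real Lexicon subclass would be needed there, and the module path in the import is assumed):

from glitchlings.zoo.jargoyle import substitute_random_synonyms  # module path assumed

class TinyLexicon:
    """Toy POS-agnostic backend with a fixed synonym table."""

    def __init__(self, seed: int | None = None) -> None:
        self.seed = seed
        self._table = {"quick": ["fast", "speedy"], "dog": ["hound"]}

    def reseed(self, seed: int | None) -> None:
        self.seed = seed

    def supports_pos(self, pos: str | None) -> bool:
        return True  # accepts "n"/"v"/"a"/"r" as well as the POS-agnostic None probe

    def get_synonyms(self, word: str, pos: str | None = None) -> list[str]:
        return list(self._table.get(word.lower(), []))

# rate=1.0 makes max_replacements equal the candidate count; the seed fixes
# rng.sample and rng.choice, so the output is reproducible.
print(substitute_random_synonyms("the quick brown dog", rate=1.0, seed=1, lexicon=TinyLexicon()))

Note the new stochastic rounding of the replacement budget: at the new default rate of 0.01 with 50 candidates the expected count is 0.5, so a replacement now happens on roughly half of the seeds, where the old int(len(candidates) * rate) truncation would always have produced zero.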