glitchlings-0.2.5-cp312-cp312-win_amd64.whl → glitchlings-0.9.3-cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +36 -17
- glitchlings/__main__.py +0 -1
- glitchlings/_zoo_rust/__init__.py +12 -0
- glitchlings/_zoo_rust.cp312-win_amd64.pyd +0 -0
- glitchlings/assets/__init__.py +180 -0
- glitchlings/assets/apostrofae_pairs.json +32 -0
- glitchlings/assets/ekkokin_homophones.json +2014 -0
- glitchlings/assets/hokey_assets.json +193 -0
- glitchlings/assets/lexemes/academic.json +1049 -0
- glitchlings/assets/lexemes/colors.json +1333 -0
- glitchlings/assets/lexemes/corporate.json +716 -0
- glitchlings/assets/lexemes/cyberpunk.json +22 -0
- glitchlings/assets/lexemes/lovecraftian.json +23 -0
- glitchlings/assets/lexemes/synonyms.json +3354 -0
- glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
- glitchlings/assets/pipeline_assets.json +29 -0
- glitchlings/attack/__init__.py +53 -0
- glitchlings/attack/compose.py +299 -0
- glitchlings/attack/core.py +465 -0
- glitchlings/attack/encode.py +114 -0
- glitchlings/attack/metrics.py +104 -0
- glitchlings/attack/metrics_dispatch.py +70 -0
- glitchlings/attack/tokenization.py +157 -0
- glitchlings/auggie.py +283 -0
- glitchlings/compat/__init__.py +9 -0
- glitchlings/compat/loaders.py +355 -0
- glitchlings/compat/types.py +41 -0
- glitchlings/conf/__init__.py +41 -0
- glitchlings/conf/loaders.py +331 -0
- glitchlings/conf/schema.py +156 -0
- glitchlings/conf/types.py +72 -0
- glitchlings/config.toml +2 -0
- glitchlings/constants.py +59 -0
- glitchlings/dev/__init__.py +3 -0
- glitchlings/dev/docs.py +45 -0
- glitchlings/dlc/__init__.py +17 -3
- glitchlings/dlc/_shared.py +296 -0
- glitchlings/dlc/gutenberg.py +400 -0
- glitchlings/dlc/huggingface.py +37 -65
- glitchlings/dlc/prime.py +55 -114
- glitchlings/dlc/pytorch.py +98 -0
- glitchlings/dlc/pytorch_lightning.py +173 -0
- glitchlings/internal/__init__.py +16 -0
- glitchlings/internal/rust.py +159 -0
- glitchlings/internal/rust_ffi.py +432 -0
- glitchlings/main.py +123 -32
- glitchlings/runtime_config.py +24 -0
- glitchlings/util/__init__.py +29 -176
- glitchlings/util/adapters.py +65 -0
- glitchlings/util/keyboards.py +311 -0
- glitchlings/util/transcripts.py +108 -0
- glitchlings/zoo/__init__.py +47 -24
- glitchlings/zoo/assets/__init__.py +29 -0
- glitchlings/zoo/core.py +301 -167
- glitchlings/zoo/core_execution.py +98 -0
- glitchlings/zoo/core_planning.py +451 -0
- glitchlings/zoo/corrupt_dispatch.py +295 -0
- glitchlings/zoo/ekkokin.py +118 -0
- glitchlings/zoo/hokey.py +137 -0
- glitchlings/zoo/jargoyle.py +179 -274
- glitchlings/zoo/mim1c.py +106 -68
- glitchlings/zoo/pedant/__init__.py +107 -0
- glitchlings/zoo/pedant/core.py +105 -0
- glitchlings/zoo/pedant/forms.py +74 -0
- glitchlings/zoo/pedant/stones.py +74 -0
- glitchlings/zoo/redactyl.py +44 -175
- glitchlings/zoo/rng.py +259 -0
- glitchlings/zoo/rushmore.py +359 -116
- glitchlings/zoo/scannequin.py +18 -125
- glitchlings/zoo/transforms.py +386 -0
- glitchlings/zoo/typogre.py +76 -162
- glitchlings/zoo/validation.py +477 -0
- glitchlings/zoo/zeedub.py +33 -86
- glitchlings-0.9.3.dist-info/METADATA +334 -0
- glitchlings-0.9.3.dist-info/RECORD +80 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/entry_points.txt +1 -0
- glitchlings/zoo/_ocr_confusions.py +0 -34
- glitchlings/zoo/_rate.py +0 -21
- glitchlings/zoo/reduple.py +0 -169
- glitchlings-0.2.5.dist-info/METADATA +0 -490
- glitchlings-0.2.5.dist-info/RECORD +0 -27
- glitchlings/{zoo → assets}/ocr_confusions.tsv +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/WHEEL +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.2.5.dist-info → glitchlings-0.9.3.dist-info}/top_level.txt +0 -0
glitchlings/zoo/scannequin.py
CHANGED
@@ -1,98 +1,10 @@
-import re
 import random
-from typing import
+from typing import cast
 
-from .
-from .
-from ._rate import resolve_rate
+from glitchlings.constants import DEFAULT_SCANNEQUIN_RATE
+from glitchlings.internal.rust_ffi import ocr_artifacts_rust, resolve_seed
 
-try:
-    from glitchlings._zoo_rust import ocr_artifacts as _ocr_artifacts_rust
-except ImportError:  # pragma: no cover - compiled extension not present
-    _ocr_artifacts_rust = None
-
-
-def _python_ocr_artifacts(
-    text: str,
-    *,
-    rate: float,
-    rng: random.Random,
-) -> str:
-    """Introduce OCR-like artifacts into text.
-
-    Parameters
-    - text: Input text to corrupt.
-    - rate: Max proportion of eligible confusion matches to replace (default 0.02).
-    - seed: Optional seed if `rng` not provided.
-    - rng: Optional RNG; overrides seed.
-
-    Notes
-    - Uses a curated set of common OCR confusions (rn↔m, cl↔d, O↔0, l/I/1, etc.).
-    - Collects all non-overlapping candidate spans in reading order, then samples
-      a subset deterministically with the provided RNG.
-    - Replacements can change length (e.g., m→rn), so edits are applied from left
-      to right using precomputed spans to avoid index drift.
-    """
-    if not text:
-        return text
-
-    # Keep the confusion definitions in a shared data file so both the Python
-    # and Rust implementations stay in sync.
-    confusion_table = load_confusion_table()
-
-    # Build candidate matches as (start, end, choices)
-    candidates: list[tuple[int, int, list[str]]] = []
-
-    # To avoid double-counting overlapping patterns (like 'l' inside 'li'),
-    # we will scan longer patterns first by sorting by len(src) desc.
-    for src, choices in sorted(confusion_table, key=lambda p: -len(p[0])):
-        pattern = re.escape(src)
-        for m in re.finditer(pattern, text):
-            start, end = m.span()
-            candidates.append((start, end, choices))
-
-    if not candidates:
-        return text
-
-    # Decide how many to replace
-    k = int(len(candidates) * rate)
-    if k <= 0:
-        return text
-
-    # Shuffle deterministically and select non-overlapping k spans
-    rng.shuffle(candidates)
-    chosen: list[tuple[int, int, str]] = []
-    occupied: list[tuple[int, int]] = []
-
-    def overlaps(a: tuple[int, int], b: tuple[int, int]) -> bool:
-        return not (a[1] <= b[0] or b[1] <= a[0])
-
-    for start, end, choices in candidates:
-        if len(chosen) >= k:
-            break
-        span = (start, end)
-        if any(overlaps(span, occ) for occ in occupied):
-            continue
-        replacement = rng.choice(choices)
-        chosen.append((start, end, replacement))
-        occupied.append(span)
-
-    if not chosen:
-        return text
-
-    # Apply edits from left to right
-    chosen.sort(key=lambda t: t[0])
-    out_parts = []
-    cursor = 0
-    for start, end, rep in chosen:
-        if cursor < start:
-            out_parts.append(text[cursor:start])
-        out_parts.append(rep)
-        cursor = end
-    if cursor < len(text):
-        out_parts.append(text[cursor:])
-
-    return "".join(out_parts)
+from .core import AttackOrder, AttackWave, Glitchling, PipelineOperationPayload
 
 
 def ocr_artifacts(
@@ -100,52 +12,33 @@ def ocr_artifacts(
     rate: float | None = None,
     seed: int | None = None,
     rng: random.Random | None = None,
-    *,
-    error_rate: float | None = None,
 ) -> str:
     """Introduce OCR-like artifacts into text.
 
-
+    Uses the Rust implementation for performance and determinism.
     """
-
     if not text:
         return text
 
-    effective_rate = resolve_rate(
-        rate=rate,
-        legacy_value=error_rate,
-        default=0.02,
-        legacy_name="error_rate",
-    )
-
-    if rng is None:
-        rng = random.Random(seed)
+    effective_rate = DEFAULT_SCANNEQUIN_RATE if rate is None else rate
 
     clamped_rate = max(0.0, effective_rate)
 
-    if _ocr_artifacts_rust is not None:
-        return _ocr_artifacts_rust(text, clamped_rate, rng)
-
-    return _python_ocr_artifacts(text, rate=clamped_rate, rng=rng)
+    return ocr_artifacts_rust(text, clamped_rate, resolve_seed(seed, rng))
 
 
 class Scannequin(Glitchling):
     """Glitchling that simulates OCR artifacts using common confusions."""
 
+    flavor = "Isn't it weird how the word 'bed' looks like a bed?"
+
     def __init__(
         self,
         *,
         rate: float | None = None,
-        error_rate: float | None = None,
         seed: int | None = None,
     ) -> None:
-
-        effective_rate = resolve_rate(
-            rate=rate,
-            legacy_value=error_rate,
-            default=0.02,
-            legacy_name="error_rate",
-        )
+        effective_rate = DEFAULT_SCANNEQUIN_RATE if rate is None else rate
         super().__init__(
             name="Scannequin",
            corruption_function=ocr_artifacts,
@@ -155,17 +48,17 @@ class Scannequin(Glitchling):
             rate=effective_rate,
         )
 
-    def pipeline_operation(self) ->
-
-        if
-            rate = self.kwargs.get("error_rate")
-        if rate is None:
-            return None
-        return {"type": "ocr", "error_rate": float(rate)}
+    def pipeline_operation(self) -> PipelineOperationPayload:
+        rate_value = self.kwargs.get("rate", DEFAULT_SCANNEQUIN_RATE)
+        rate = DEFAULT_SCANNEQUIN_RATE if rate_value is None else float(rate_value)
 
+        return cast(
+            PipelineOperationPayload,
+            {"type": "ocr", "rate": rate},
+        )
 
 
 scannequin = Scannequin()
 
 
-__all__ = ["Scannequin", "scannequin"]
+__all__ = ["Scannequin", "scannequin", "ocr_artifacts"]
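Net effect of the Scannequin changes: the pure-Python fallback and the deprecated `error_rate` keyword are gone, and `ocr_artifacts` always routes through the Rust extension via `ocr_artifacts_rust`, with the seed resolved by `resolve_seed`. A minimal usage sketch of the new 0.9.3 surface, using only names visible in this diff (the corrupted output is illustrative, not a recorded result):

```python
from glitchlings.zoo.scannequin import Scannequin, ocr_artifacts

# Functional form: rate falls back to DEFAULT_SCANNEQUIN_RATE when None;
# a fixed seed makes the corruption reproducible.
noisy = ocr_artifacts("The model learned to read.", rate=0.05, seed=7)

# Class form: assuming Glitchling records its keyword arguments in
# self.kwargs (as pipeline_operation() above implies), the pipeline
# payload for this instance would be {"type": "ocr", "rate": 0.05}.
glitch = Scannequin(rate=0.05, seed=7)
op = glitch.pipeline_operation()
```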
glitchlings/zoo/transforms.py
ADDED
@@ -0,0 +1,386 @@
+"""Pure text transformation functions.
+
+This module contains text manipulation functions that are:
+- **Pure**: Output depends only on inputs, no side effects
+- **Deterministic**: Same inputs always produce same outputs
+- **Self-contained**: No RNG, no Rust FFI, no config loading
+
+These functions receive pre-validated inputs from boundary layers
+(see validation.py) and trust that inputs are already checked.
+Core transformation code should NOT re-validate parameters.
+
+Design Philosophy
+-----------------
+This module implements the innermost layer of the purity architecture:
+
+    CLI/API → validation.py → transforms.py → Rust FFI
+    (boundary)  (boundary)     (pure core)     (impure)
+
+Functions here should:
+- Accept concrete types (not Optional unless semantically required)
+- Not log, print, or mutate external state
+- Not import impure modules (internal.rust, config loaders, etc.)
+- Document any preconditions callers must satisfy
+
+See AGENTS.md "Functional Purity Architecture" for full details.
+"""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Iterable, Mapping, Sequence
+from dataclasses import dataclass
+from typing import TypeVar, cast
+
+# ---------------------------------------------------------------------------
+# Text Tokenization
+# ---------------------------------------------------------------------------
+
+_WORD_SPLIT_PATTERN = re.compile(r"(\s+)")
+_TOKEN_EDGES_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$", re.DOTALL)
+
+
+def split_preserving_whitespace(text: str) -> list[str]:
+    """Split text while keeping whitespace tokens for stable reconstruction.
+
+    Returns alternating [word, whitespace, word, whitespace, ...] tokens.
+    Joining the result reconstructs the original text exactly.
+
+    Args:
+        text: Input text to tokenize.
+
+    Returns:
+        List of tokens alternating between non-whitespace and whitespace.
+
+    Example:
+        >>> split_preserving_whitespace("hello world")
+        ['hello', ' ', 'world']
+    """
+    return _WORD_SPLIT_PATTERN.split(text)
+
+
+def split_token_edges(token: str) -> tuple[str, str, str]:
+    """Decompose a token into leading punctuation, core, and trailing punctuation.
+
+    Args:
+        token: A non-whitespace token.
+
+    Returns:
+        Tuple of (prefix, core, suffix) where:
+        - prefix: leading non-word characters
+        - core: central word characters
+        - suffix: trailing non-word characters
+
+    Example:
+        >>> split_token_edges('"Hello!"')
+        ('"', 'Hello', '!"')
+    """
+    match = cast(re.Match[str], _TOKEN_EDGES_PATTERN.match(token))
+    prefix, core, suffix = match.groups()
+    return prefix, core, suffix
+
+
+def compute_core_length(token: str) -> int:
+    """Compute the effective length of a token's core for weighting heuristics.
+
+    Used by weighted sampling algorithms to prioritize longer words.
+    Always returns at least 1 to avoid zero-weight issues.
+
+    Args:
+        token: A non-whitespace token.
+
+    Returns:
+        Positive integer representing the token's effective length.
+    """
+    _, core, _ = split_token_edges(token)
+    if core:
+        return len(core)
+    stripped = token.strip()
+    if stripped:
+        return len(stripped)
+    if token:
+        return len(token)
+    return 1
+
+
+@dataclass(frozen=True)
+class WordToken:
+    """Metadata describing a non-whitespace token from text tokenization.
+
+    Attributes:
+        index: Position in the parent token sequence.
+        prefix: Leading non-word characters (punctuation).
+        core: Central word characters.
+        suffix: Trailing non-word characters (punctuation).
+        core_length: Effective length for weighting (always >= 1).
+    """
+
+    index: int
+    prefix: str
+    core: str
+    suffix: str
+    core_length: int
+
+    @property
+    def has_core(self) -> bool:
+        """Return True when the token contains at least one core character."""
+        return bool(self.core)
+
+
+def collect_word_tokens(
+    tokens: Sequence[str],
+    *,
+    skip_first_word: bool = False,
+) -> list[WordToken]:
+    """Extract structured metadata for non-whitespace tokens.
+
+    Args:
+        tokens: Token sequence from split_preserving_whitespace.
+        skip_first_word: If True, exclude the first content token
+            (useful for preserving leading words in delete operations).
+
+    Returns:
+        List of WordToken instances for each non-whitespace token.
+    """
+    start = 2 if skip_first_word else 0
+    collected: list[WordToken] = []
+
+    for index in range(start, len(tokens), 2):
+        token = tokens[index]
+        if not token or token.isspace():
+            continue
+
+        prefix, core, suffix = split_token_edges(token)
+        core_length = compute_core_length(token)
+
+        collected.append(
+            WordToken(
+                index=index,
+                prefix=prefix,
+                core=core,
+                suffix=suffix,
+                core_length=core_length,
+            )
+        )
+
+    return collected
+
+
+def reassemble_tokens(tokens: Sequence[str]) -> str:
+    """Join tokens back into text, preserving original structure.
+
+    Args:
+        tokens: Token sequence (typically modified from split_preserving_whitespace).
+
+    Returns:
+        Reassembled text string.
+    """
+    return "".join(tokens)
+
+
+# ---------------------------------------------------------------------------
+# Keyboard Layout Processing
+# ---------------------------------------------------------------------------
+
+
+KeyNeighborMap = dict[str, list[str]]
+
+
+def build_keyboard_neighbor_map(rows: Iterable[str]) -> KeyNeighborMap:
+    """Derive 8-neighbour adjacency lists from keyboard layout rows.
+
+    Each row represents a keyboard row with characters positioned by index.
+    Spaces are treated as empty positions. Characters are normalized to lowercase.
+
+    Args:
+        rows: Iterable of strings representing keyboard rows, with
+            characters positioned to reflect their physical layout.
+
+    Returns:
+        Dictionary mapping each lowercase character to its adjacent characters.
+
+    Example:
+        >>> rows = ["qwerty", " asdfg"]  # 'a' offset by 1
+        >>> neighbors = build_keyboard_neighbor_map(rows)
+        >>> neighbors['s']  # keys bordering 's' in this grid
+        ['w', 'e', 'r', 'a', 'd']
+    """
+    grid: dict[tuple[int, int], str] = {}
+    for y, row in enumerate(rows):
+        for x, char in enumerate(row):
+            if char == " ":
+                continue
+            grid[(x, y)] = char.lower()
+
+    neighbors: KeyNeighborMap = {}
+    for (x, y), char in grid.items():
+        seen: list[str] = []
+        for dy in (-1, 0, 1):
+            for dx in (-1, 0, 1):
+                if dx == 0 and dy == 0:
+                    continue
+                candidate = grid.get((x + dx, y + dy))
+                if candidate is None:
+                    continue
+                seen.append(candidate)
+        # Preserve encounter order but drop duplicates for determinism
+        deduped = list(dict.fromkeys(seen))
+        neighbors[char] = deduped
+
+    return neighbors
+
+
+# ---------------------------------------------------------------------------
+# String Difference Computation
+# ---------------------------------------------------------------------------
+
+
+def compute_string_diffs(
+    original: str,
+    modified: str,
+) -> list[list[tuple[str, str, str]]]:
+    """Compare two strings and return grouped adjacent change operations.
+
+    Uses difflib's SequenceMatcher to identify changes between strings.
+    Consecutive changes are grouped together; equal regions are skipped.
+
+    Args:
+        original: The original string.
+        modified: The modified string.
+
+    Returns:
+        List of change groups. Each group is a list of (tag, old_text, new_text)
+        tuples where tag is 'replace', 'delete', or 'insert'.
+
+    Example:
+        >>> compute_string_diffs("hello world", "helo worlds")
+        [[('delete', 'l', '')], [('insert', '', 's')]]
+    """
+    import difflib
+
+    sm = difflib.SequenceMatcher(None, original, modified)
+    ops: list[list[tuple[str, str, str]]] = []
+    buffer: list[tuple[str, str, str]] = []
+
+    for tag, i1, i2, j1, j2 in sm.get_opcodes():
+        if tag == "equal":
+            if buffer:
+                ops.append(buffer)
+                buffer = []
+            continue
+        buffer.append((tag, original[i1:i2], modified[j1:j2]))
+
+    if buffer:
+        ops.append(buffer)
+
+    return ops
+
+
+# ---------------------------------------------------------------------------
+# Sequence Operations
+# ---------------------------------------------------------------------------
+
+T = TypeVar("T")
+
+
+def stable_deduplicate(items: Iterable[T]) -> list[T]:
+    """Remove duplicates while preserving original order.
+
+    Args:
+        items: Iterable of hashable items.
+
+    Returns:
+        List with duplicates removed, first occurrence preserved.
+
+    Example:
+        >>> stable_deduplicate([3, 1, 4, 1, 5, 9, 2, 6, 5])
+        [3, 1, 4, 5, 9, 2, 6]
+    """
+    seen: set[T] = set()
+    result: list[T] = []
+    for item in items:
+        if item not in seen:
+            seen.add(item)
+            result.append(item)
+    return result
+
+
+def interleave_lists(
+    primary: Sequence[T],
+    secondary: Sequence[T],
+    *,
+    secondary_first: bool = False,
+) -> list[T]:
+    """Interleave two sequences, appending leftover items from the longer one.
+
+    Args:
+        primary: First sequence.
+        secondary: Second sequence.
+        secondary_first: If True, start with secondary element.
+
+    Returns:
+        Interleaved list [p0, s0, p1, s1, ...] or [s0, p0, s1, p1, ...].
+    """
+    result: list[T] = []
+    max_len = max(len(primary), len(secondary))
+
+    for i in range(max_len):
+        if secondary_first:
+            if i < len(secondary):
+                result.append(secondary[i])
+            if i < len(primary):
+                result.append(primary[i])
+        else:
+            if i < len(primary):
+                result.append(primary[i])
+            if i < len(secondary):
+                result.append(secondary[i])
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Mapping Helpers
+# ---------------------------------------------------------------------------
+
+
+def invert_mapping(
+    mapping: Mapping[str, Sequence[str]],
+) -> dict[str, str]:
+    """Invert a one-to-many mapping into a many-to-one lookup.
+
+    Given {key: [val1, val2]}, returns {val1: key, val2: key}.
+    Later keys overwrite earlier ones if values collide.
+
+    Args:
+        mapping: Dictionary mapping keys to sequences of values.
+
+    Returns:
+        Inverted dictionary mapping each value to its key.
+    """
+    inverted: dict[str, str] = {}
+    for key, values in mapping.items():
+        for value in values:
+            inverted[value] = key
+    return inverted
+
+
+__all__ = [
+    # Tokenization
+    "split_preserving_whitespace",
+    "split_token_edges",
+    "compute_core_length",
+    "WordToken",
+    "collect_word_tokens",
+    "reassemble_tokens",
+    # Keyboard
+    "KeyNeighborMap",
+    "build_keyboard_neighbor_map",
+    # Diffs
+    "compute_string_diffs",
+    # Sequences
+    "stable_deduplicate",
+    "interleave_lists",
+    # Mappings
+    "invert_mapping",
+]
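The transforms helpers are designed to compose: tokenize, edit the word tokens, and reassemble losslessly. A small round-trip sketch built only from the functions and regexes shown in this diff (the printed values follow from the docstrings and patterns above):

```python
from glitchlings.zoo.transforms import (
    build_keyboard_neighbor_map,
    reassemble_tokens,
    split_preserving_whitespace,
    split_token_edges,
)

text = 'Hello,  "world"!'
tokens = split_preserving_whitespace(text)   # ['Hello,', '  ', '"world"!']
assert reassemble_tokens(tokens) == text     # whitespace-preserving round trip

print(split_token_edges('"world"!'))         # ('"', 'world', '"!')

# Grid rows mimic physical key positions; 's' borders w/e/r above, a/d beside,
# and z/x below in this particular grid.
rows = ["qwertyuiop", " asdfghjkl", "  zxcvbnm"]
print(build_keyboard_neighbor_map(rows)["s"])  # ['w', 'e', 'r', 'a', 'd', 'z', 'x']
```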