glitchlings 1.0.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86):
  1. glitchlings/__init__.py +101 -0
  2. glitchlings/__main__.py +8 -0
  3. glitchlings/_corruption_engine/__init__.py +12 -0
  4. glitchlings/_corruption_engine.cp313-win_amd64.pyd +0 -0
  5. glitchlings/assets/__init__.py +180 -0
  6. glitchlings/assets/apostrofae_pairs.json +32 -0
  7. glitchlings/assets/ekkokin_homophones.json +2014 -0
  8. glitchlings/assets/hokey_assets.json +193 -0
  9. glitchlings/assets/lexemes/academic.json +1049 -0
  10. glitchlings/assets/lexemes/colors.json +1333 -0
  11. glitchlings/assets/lexemes/corporate.json +716 -0
  12. glitchlings/assets/lexemes/cyberpunk.json +22 -0
  13. glitchlings/assets/lexemes/lovecraftian.json +23 -0
  14. glitchlings/assets/lexemes/synonyms.json +3354 -0
  15. glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
  16. glitchlings/assets/ocr_confusions.tsv +30 -0
  17. glitchlings/assets/pipeline_assets.json +29 -0
  18. glitchlings/attack/__init__.py +184 -0
  19. glitchlings/attack/analysis.py +1321 -0
  20. glitchlings/attack/core.py +819 -0
  21. glitchlings/attack/core_execution.py +378 -0
  22. glitchlings/attack/core_planning.py +612 -0
  23. glitchlings/attack/encode.py +114 -0
  24. glitchlings/attack/metrics.py +211 -0
  25. glitchlings/attack/metrics_dispatch.py +70 -0
  26. glitchlings/attack/tokenization.py +338 -0
  27. glitchlings/attack/tokenizer_metrics.py +373 -0
  28. glitchlings/auggie.py +285 -0
  29. glitchlings/compat/__init__.py +9 -0
  30. glitchlings/compat/loaders.py +355 -0
  31. glitchlings/compat/types.py +41 -0
  32. glitchlings/conf/__init__.py +39 -0
  33. glitchlings/conf/loaders.py +331 -0
  34. glitchlings/conf/schema.py +156 -0
  35. glitchlings/conf/types.py +72 -0
  36. glitchlings/config.toml +2 -0
  37. glitchlings/constants.py +139 -0
  38. glitchlings/dev/__init__.py +3 -0
  39. glitchlings/dev/docs.py +45 -0
  40. glitchlings/dlc/__init__.py +21 -0
  41. glitchlings/dlc/_shared.py +300 -0
  42. glitchlings/dlc/gutenberg.py +400 -0
  43. glitchlings/dlc/huggingface.py +68 -0
  44. glitchlings/dlc/langchain.py +147 -0
  45. glitchlings/dlc/nemo.py +283 -0
  46. glitchlings/dlc/prime.py +215 -0
  47. glitchlings/dlc/pytorch.py +98 -0
  48. glitchlings/dlc/pytorch_lightning.py +173 -0
  49. glitchlings/internal/__init__.py +16 -0
  50. glitchlings/internal/rust.py +159 -0
  51. glitchlings/internal/rust_ffi.py +599 -0
  52. glitchlings/main.py +426 -0
  53. glitchlings/protocols.py +91 -0
  54. glitchlings/runtime_config.py +24 -0
  55. glitchlings/util/__init__.py +41 -0
  56. glitchlings/util/adapters.py +65 -0
  57. glitchlings/util/keyboards.py +508 -0
  58. glitchlings/util/transcripts.py +108 -0
  59. glitchlings/zoo/__init__.py +161 -0
  60. glitchlings/zoo/assets/__init__.py +29 -0
  61. glitchlings/zoo/core.py +852 -0
  62. glitchlings/zoo/core_execution.py +154 -0
  63. glitchlings/zoo/core_planning.py +451 -0
  64. glitchlings/zoo/corrupt_dispatch.py +291 -0
  65. glitchlings/zoo/hokey.py +139 -0
  66. glitchlings/zoo/jargoyle.py +301 -0
  67. glitchlings/zoo/mim1c.py +269 -0
  68. glitchlings/zoo/pedant/__init__.py +109 -0
  69. glitchlings/zoo/pedant/core.py +99 -0
  70. glitchlings/zoo/pedant/forms.py +50 -0
  71. glitchlings/zoo/pedant/stones.py +83 -0
  72. glitchlings/zoo/redactyl.py +94 -0
  73. glitchlings/zoo/rng.py +280 -0
  74. glitchlings/zoo/rushmore.py +416 -0
  75. glitchlings/zoo/scannequin.py +370 -0
  76. glitchlings/zoo/transforms.py +331 -0
  77. glitchlings/zoo/typogre.py +194 -0
  78. glitchlings/zoo/validation.py +643 -0
  79. glitchlings/zoo/wherewolf.py +120 -0
  80. glitchlings/zoo/zeedub.py +165 -0
  81. glitchlings-1.0.0.dist-info/METADATA +404 -0
  82. glitchlings-1.0.0.dist-info/RECORD +86 -0
  83. glitchlings-1.0.0.dist-info/WHEEL +5 -0
  84. glitchlings-1.0.0.dist-info/entry_points.txt +3 -0
  85. glitchlings-1.0.0.dist-info/licenses/LICENSE +201 -0
  86. glitchlings-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,114 @@
1
+ """Pure encoding utilities for tokenization.
2
+
3
+ This module contains pure functions for encoding text using tokenizers.
4
+ The functions here do not resolve tokenizers or perform IO - they operate
5
+ on already-resolved Tokenizer instances.
6
+
7
+ Pure guarantees:
8
+ - No import side effects beyond stdlib
9
+ - No file IO or network calls
10
+ - No environment variable access
11
+ - Deterministic output for given inputs
12
+
13
+ The impure tokenizer resolution lives in tokenization.py.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from typing import TYPE_CHECKING, Sequence
19
+
20
+ if TYPE_CHECKING: # pragma: no cover - typing only
21
+ from .tokenization import Tokenizer
22
+
23
+
24
def encode_single(
    tokenizer: "Tokenizer",
    text: str,
) -> tuple[list[str], list[int]]:
    """Encode one text with *tokenizer*, normalising the result to lists.

    Args:
        tokenizer: An already-resolved tokenizer instance.
        text: The string to encode.

    Returns:
        A ``(tokens, token_ids)`` pair, both materialised as lists.
    """
    encoded_tokens, encoded_ids = tokenizer.encode(text)
    return list(encoded_tokens), list(encoded_ids)
41
+
42
+
43
def encode_batch(
    tokenizer: "Tokenizer",
    texts: Sequence[str],
) -> tuple[list[list[str]], list[list[int]]]:
    """Encode several texts, preferring the tokenizer's native batch path.

    When the tokenizer exposes a callable ``encode_batch`` attribute it is
    invoked once for the whole batch; otherwise each text is encoded
    individually via ``tokenizer.encode``.

    Args:
        tokenizer: An already-resolved tokenizer instance.
        texts: The strings to encode.

    Returns:
        A ``(token_batches, id_batches)`` pair of nested lists, parallel
        to ``texts``.
    """
    token_batches: list[list[str]] = []
    id_batches: list[list[int]] = []

    native = getattr(tokenizer, "encode_batch", None)
    if callable(native):
        # Native batch path: normalise every entry to list types.
        for tokens, ids in native(texts):
            token_batches.append(list(tokens))
            id_batches.append(list(ids))
        return token_batches, id_batches

    # Fallback path: one encode call per text.
    for text in texts:
        tokens, ids = tokenizer.encode(text)
        token_batches.append(list(tokens))
        id_batches.append(list(ids))
    return token_batches, id_batches
78
+
79
+
80
def describe_tokenizer(
    tokenizer: "Tokenizer",
    raw_spec: "str | Tokenizer | None",
) -> str:
    """Produce a human-readable label for a resolved tokenizer.

    Preference order: a string spec is returned as-is; otherwise a
    non-empty ``name`` attribute on the tokenizer; otherwise the class
    name when no spec was given; finally ``str(raw_spec)``.

    Args:
        tokenizer: The resolved tokenizer instance.
        raw_spec: The original specification the tokenizer came from.

    Returns:
        A descriptive string identifying the tokenizer.
    """
    if isinstance(raw_spec, str):
        # A string spec is already the best description available.
        return raw_spec

    tokenizer_name = getattr(tokenizer, "name", None)
    if isinstance(tokenizer_name, str) and tokenizer_name:
        return tokenizer_name

    if raw_spec is None:
        return type(tokenizer).__name__

    return str(raw_spec)
108
+
109
+
110
# Public API of this module, alphabetically ordered.
__all__ = [
    "describe_tokenizer",
    "encode_batch",
    "encode_single",
]
@@ -0,0 +1,211 @@
1
+ from __future__ import annotations
2
+
3
+ from enum import Enum
4
+ from typing import TYPE_CHECKING, Protocol, cast
5
+
6
+ from ..internal.rust import get_rust_operation
7
+ from .metrics_dispatch import TokenBatch, TokenSequence, is_batch, validate_batch_consistency
8
+
9
+ if TYPE_CHECKING:
10
+ from collections.abc import Callable
11
+
12
+
13
class Metric(Protocol):
    """Structural type for a metric callable.

    Implementations accept either two single token sequences (returning a
    ``float``) or two batches of sequences (returning a ``list[float]``).
    """

    def __call__(
        self,
        original_tokens: TokenSequence | TokenBatch,
        corrupted_tokens: TokenSequence | TokenBatch,
    ) -> float | list[float]: ...
19
+
20
+
21
class BatchMetric(Protocol):
    """Structural type for a batch-only metric: one score per input pair."""

    def __call__(self, inputs: TokenBatch, outputs: TokenBatch) -> list[float]: ...
23
+
24
+
25
# Rust-backed metric callables resolved through get_rust_operation when this
# module is imported.  The cast() calls only narrow the static type for the
# type checker; they are no-ops at runtime.
# NOTE(review): an earlier comment said these are "loaded on first use", but
# the get_rust_operation calls execute at import time — any laziness must
# live inside get_rust_operation itself; confirm against internal.rust.
_single_jsd = cast(Metric, get_rust_operation("jensen_shannon_divergence"))
_single_ned = cast(Metric, get_rust_operation("normalized_edit_distance"))
_single_sr = cast(Metric, get_rust_operation("subsequence_retention"))
_single_ed = cast(Metric, get_rust_operation("entropy_delta"))
_single_msi = cast(Metric, get_rust_operation("merge_split_index"))
_batch_jsd = cast(BatchMetric, get_rust_operation("batch_jensen_shannon_divergence"))
_batch_ned = cast(BatchMetric, get_rust_operation("batch_normalized_edit_distance"))
_batch_sr = cast(BatchMetric, get_rust_operation("batch_subsequence_retention"))
_batch_ed = cast(BatchMetric, get_rust_operation("batch_entropy_delta"))
_batch_msi = cast(BatchMetric, get_rust_operation("batch_merge_split_index"))
36
+
37
+
38
+ def _dispatch_metric(
39
+ original: TokenSequence | TokenBatch,
40
+ corrupted: TokenSequence | TokenBatch,
41
+ *,
42
+ single: Metric,
43
+ batch: BatchMetric,
44
+ name: str,
45
+ ) -> float | list[float]:
46
+ """Dispatch metric computation to single or batch implementation.
47
+
48
+ Uses the pure is_batch function to determine which implementation to call.
49
+ """
50
+ validate_batch_consistency(original, corrupted, name)
51
+
52
+ if is_batch(original):
53
+ return batch(original, corrupted)
54
+
55
+ return single(original, corrupted)
56
+
57
+
58
+ def jensen_shannon_divergence(
59
+ original_tokens: TokenSequence | TokenBatch,
60
+ corrupted_tokens: TokenSequence | TokenBatch,
61
+ ) -> float | list[float]:
62
+ return _dispatch_metric(
63
+ original_tokens,
64
+ corrupted_tokens,
65
+ single=_single_jsd,
66
+ batch=_batch_jsd,
67
+ name="jensen_shannon_divergence",
68
+ )
69
+
70
+
71
+ def normalized_edit_distance(
72
+ original_tokens: TokenSequence | TokenBatch,
73
+ corrupted_tokens: TokenSequence | TokenBatch,
74
+ ) -> float | list[float]:
75
+ return _dispatch_metric(
76
+ original_tokens,
77
+ corrupted_tokens,
78
+ single=_single_ned,
79
+ batch=_batch_ned,
80
+ name="normalized_edit_distance",
81
+ )
82
+
83
+
84
+ def subsequence_retention(
85
+ original_tokens: TokenSequence | TokenBatch,
86
+ corrupted_tokens: TokenSequence | TokenBatch,
87
+ ) -> float | list[float]:
88
+ return _dispatch_metric(
89
+ original_tokens,
90
+ corrupted_tokens,
91
+ single=_single_sr,
92
+ batch=_batch_sr,
93
+ name="subsequence_retention",
94
+ )
95
+
96
+
97
+ def entropy_delta(
98
+ original_tokens: TokenSequence | TokenBatch,
99
+ corrupted_tokens: TokenSequence | TokenBatch,
100
+ ) -> float | list[float]:
101
+ """Compute normalized entropy delta between original and corrupted tokens.
102
+
103
+ Measures the change in token distribution entropy:
104
+ ΔH = H(corrupted) - H(original), normalized to [-1, 1].
105
+
106
+ Positive values indicate the corrupted text has higher entropy
107
+ (more uniform/diverse token distribution). Negative values indicate
108
+ lower entropy (more concentrated distribution).
109
+
110
+ Args:
111
+ original_tokens: Original token sequence(s).
112
+ corrupted_tokens: Corrupted token sequence(s).
113
+
114
+ Returns:
115
+ Normalized entropy delta in [-1, 1], or list for batches.
116
+ """
117
+ return _dispatch_metric(
118
+ original_tokens,
119
+ corrupted_tokens,
120
+ single=_single_ed,
121
+ batch=_batch_ed,
122
+ name="entropy_delta",
123
+ )
124
+
125
+
126
+ def merge_split_index(
127
+ original_tokens: TokenSequence | TokenBatch,
128
+ corrupted_tokens: TokenSequence | TokenBatch,
129
+ ) -> float | list[float]:
130
+ """Compute merge-split index measuring subword restructuring.
131
+
132
+ Estimates 1→k (split) and k→1 (merge) token events from alignment.
133
+ Higher values indicate more dramatic tokenization changes.
134
+
135
+ MSI = (splits + merges) / max(m, n) ∈ [0, 1]
136
+
137
+ Args:
138
+ original_tokens: Original token sequence(s).
139
+ corrupted_tokens: Corrupted token sequence(s).
140
+
141
+ Returns:
142
+ Merge-split index in [0, 1], or list for batches.
143
+ """
144
+ return _dispatch_metric(
145
+ original_tokens,
146
+ corrupted_tokens,
147
+ single=_single_msi,
148
+ batch=_batch_msi,
149
+ name="merge_split_index",
150
+ )
151
+
152
+
153
+ # ---------------------------------------------------------------------------
154
+ # MetricName Enum
155
+ # ---------------------------------------------------------------------------
156
+
157
+
158
+ class MetricName(str, Enum):
159
+ """Built-in metric names.
160
+
161
+ Use these instead of string literals to avoid typos and enable IDE completion.
162
+
163
+ Example:
164
+ >>> attack = Attack(Typogre(), metrics={MetricName.NED: normalized_edit_distance})
165
+ >>> # or get all defaults:
166
+ >>> attack = Attack(Typogre(), metrics=MetricName.defaults())
167
+ """
168
+
169
+ JSD = "jensen_shannon_divergence"
170
+ NED = "normalized_edit_distance"
171
+ SR = "subsequence_retention"
172
+ HD = "entropy_delta"
173
+ MSI = "merge_split_index"
174
+
175
+ @property
176
+ def func(self) -> "Callable[..., float | list[float]]":
177
+ """Get the metric function for this name."""
178
+ return _METRIC_FUNCTIONS[self]
179
+
180
+ @classmethod
181
+ def defaults(cls) -> dict[str, "Callable[..., float | list[float]]"]:
182
+ """Get all built-in metrics as a dictionary.
183
+
184
+ Returns:
185
+ Dictionary mapping metric names to functions.
186
+ """
187
+ return {m.value: m.func for m in cls}
188
+
189
+
190
# Mapping from enum member to implementation.  Defined after the metric
# functions so every value is already bound; MetricName.func and
# MetricName.defaults() read from this table.
_METRIC_FUNCTIONS: dict[MetricName, "Callable[..., float | list[float]]"] = {
    MetricName.JSD: jensen_shannon_divergence,
    MetricName.NED: normalized_edit_distance,
    MetricName.SR: subsequence_retention,
    MetricName.HD: entropy_delta,
    MetricName.MSI: merge_split_index,
}
198
+
199
+
200
# Public API: protocols and enum first, then the metric functions.
__all__ = [
    "Metric",
    "BatchMetric",
    "MetricName",
    "TokenBatch",
    "TokenSequence",
    "jensen_shannon_divergence",
    "normalized_edit_distance",
    "subsequence_retention",
    "entropy_delta",
    "merge_split_index",
]
@@ -0,0 +1,70 @@
1
+ """Pure metric dispatch functions.
2
+
3
+ This module contains pure functions for dispatching metric computations.
4
+ It does not import Rust FFI or perform any IO - it operates on already-
5
+ resolved metric functions.
6
+
7
+ Pure guarantees:
8
+ - No import side effects beyond stdlib
9
+ - No Rust FFI loading
10
+ - Deterministic dispatch logic
11
+
12
+ The impure Rust metric loading lives in metrics.py.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from typing import Sequence, TypeGuard
18
+
19
# Type aliases shared by the metric dispatch helpers.
TokenSequence = Sequence[str]  # one tokenized text: an ordered run of token strings
TokenBatch = Sequence[TokenSequence]  # several token sequences processed together
21
+
22
+
23
+ def is_batch(tokens: TokenSequence | TokenBatch) -> TypeGuard[TokenBatch]:
24
+ """Determine if tokens represent a batch of sequences.
25
+
26
+ An empty list is treated as an empty batch (returning True) so that
27
+ ``metric([], [])`` returns ``[]`` rather than ``0.0``. This matches
28
+ the behavior of :meth:`Attack.run` when processing empty transcripts.
29
+
30
+ Args:
31
+ tokens: Either a sequence of token strings or a batch of such sequences.
32
+
33
+ Returns:
34
+ True if tokens is a batch (list of lists), False if a single sequence.
35
+ """
36
+ if not tokens:
37
+ return True # Empty list is an empty batch
38
+
39
+ first = tokens[0]
40
+ return isinstance(first, Sequence) and not isinstance(first, (str, bytes))
41
+
42
+
43
def validate_batch_consistency(
    original: TokenSequence | TokenBatch,
    corrupted: TokenSequence | TokenBatch,
    metric_name: str,
) -> None:
    """Ensure both inputs are batched, or both are single sequences.

    Args:
        original: Original token sequence or batch.
        corrupted: Corrupted token sequence or batch.
        metric_name: Metric name interpolated into the error message.

    Raises:
        TypeError: When exactly one of the two inputs is batched.
    """
    if is_batch(original) != is_batch(corrupted):
        raise TypeError(f"{metric_name} expects either both batch inputs or both single sequences")
63
+
64
+
65
# Public API: type aliases first, then the dispatch helpers.
__all__ = [
    "TokenBatch",
    "TokenSequence",
    "is_batch",
    "validate_batch_consistency",
]