glitchlings-1.0.0-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. glitchlings/__init__.py +101 -0
  2. glitchlings/__main__.py +8 -0
  3. glitchlings/_corruption_engine/__init__.py +12 -0
  4. glitchlings/_corruption_engine.cp313-win_amd64.pyd +0 -0
  5. glitchlings/assets/__init__.py +180 -0
  6. glitchlings/assets/apostrofae_pairs.json +32 -0
  7. glitchlings/assets/ekkokin_homophones.json +2014 -0
  8. glitchlings/assets/hokey_assets.json +193 -0
  9. glitchlings/assets/lexemes/academic.json +1049 -0
  10. glitchlings/assets/lexemes/colors.json +1333 -0
  11. glitchlings/assets/lexemes/corporate.json +716 -0
  12. glitchlings/assets/lexemes/cyberpunk.json +22 -0
  13. glitchlings/assets/lexemes/lovecraftian.json +23 -0
  14. glitchlings/assets/lexemes/synonyms.json +3354 -0
  15. glitchlings/assets/mim1c_homoglyphs.json.gz.b64 +1064 -0
  16. glitchlings/assets/ocr_confusions.tsv +30 -0
  17. glitchlings/assets/pipeline_assets.json +29 -0
  18. glitchlings/attack/__init__.py +184 -0
  19. glitchlings/attack/analysis.py +1321 -0
  20. glitchlings/attack/core.py +819 -0
  21. glitchlings/attack/core_execution.py +378 -0
  22. glitchlings/attack/core_planning.py +612 -0
  23. glitchlings/attack/encode.py +114 -0
  24. glitchlings/attack/metrics.py +211 -0
  25. glitchlings/attack/metrics_dispatch.py +70 -0
  26. glitchlings/attack/tokenization.py +338 -0
  27. glitchlings/attack/tokenizer_metrics.py +373 -0
  28. glitchlings/auggie.py +285 -0
  29. glitchlings/compat/__init__.py +9 -0
  30. glitchlings/compat/loaders.py +355 -0
  31. glitchlings/compat/types.py +41 -0
  32. glitchlings/conf/__init__.py +39 -0
  33. glitchlings/conf/loaders.py +331 -0
  34. glitchlings/conf/schema.py +156 -0
  35. glitchlings/conf/types.py +72 -0
  36. glitchlings/config.toml +2 -0
  37. glitchlings/constants.py +139 -0
  38. glitchlings/dev/__init__.py +3 -0
  39. glitchlings/dev/docs.py +45 -0
  40. glitchlings/dlc/__init__.py +21 -0
  41. glitchlings/dlc/_shared.py +300 -0
  42. glitchlings/dlc/gutenberg.py +400 -0
  43. glitchlings/dlc/huggingface.py +68 -0
  44. glitchlings/dlc/langchain.py +147 -0
  45. glitchlings/dlc/nemo.py +283 -0
  46. glitchlings/dlc/prime.py +215 -0
  47. glitchlings/dlc/pytorch.py +98 -0
  48. glitchlings/dlc/pytorch_lightning.py +173 -0
  49. glitchlings/internal/__init__.py +16 -0
  50. glitchlings/internal/rust.py +159 -0
  51. glitchlings/internal/rust_ffi.py +599 -0
  52. glitchlings/main.py +426 -0
  53. glitchlings/protocols.py +91 -0
  54. glitchlings/runtime_config.py +24 -0
  55. glitchlings/util/__init__.py +41 -0
  56. glitchlings/util/adapters.py +65 -0
  57. glitchlings/util/keyboards.py +508 -0
  58. glitchlings/util/transcripts.py +108 -0
  59. glitchlings/zoo/__init__.py +161 -0
  60. glitchlings/zoo/assets/__init__.py +29 -0
  61. glitchlings/zoo/core.py +852 -0
  62. glitchlings/zoo/core_execution.py +154 -0
  63. glitchlings/zoo/core_planning.py +451 -0
  64. glitchlings/zoo/corrupt_dispatch.py +291 -0
  65. glitchlings/zoo/hokey.py +139 -0
  66. glitchlings/zoo/jargoyle.py +301 -0
  67. glitchlings/zoo/mim1c.py +269 -0
  68. glitchlings/zoo/pedant/__init__.py +109 -0
  69. glitchlings/zoo/pedant/core.py +99 -0
  70. glitchlings/zoo/pedant/forms.py +50 -0
  71. glitchlings/zoo/pedant/stones.py +83 -0
  72. glitchlings/zoo/redactyl.py +94 -0
  73. glitchlings/zoo/rng.py +280 -0
  74. glitchlings/zoo/rushmore.py +416 -0
  75. glitchlings/zoo/scannequin.py +370 -0
  76. glitchlings/zoo/transforms.py +331 -0
  77. glitchlings/zoo/typogre.py +194 -0
  78. glitchlings/zoo/validation.py +643 -0
  79. glitchlings/zoo/wherewolf.py +120 -0
  80. glitchlings/zoo/zeedub.py +165 -0
  81. glitchlings-1.0.0.dist-info/METADATA +404 -0
  82. glitchlings-1.0.0.dist-info/RECORD +86 -0
  83. glitchlings-1.0.0.dist-info/WHEEL +5 -0
  84. glitchlings-1.0.0.dist-info/entry_points.txt +3 -0
  85. glitchlings-1.0.0.dist-info/licenses/LICENSE +201 -0
  86. glitchlings-1.0.0.dist-info/top_level.txt +1 -0
glitchlings/attack/tokenization.py
@@ -0,0 +1,338 @@
+ from __future__ import annotations
+
+ import importlib.util
+ import threading
+ import zlib
+ from typing import Any, Protocol, Sequence
+
+ DEFAULT_TIKTOKEN_ENCODINGS = ("o200k_base", "cl100k_base")
+
+ # ---------------------------------------------------------------------------
+ # Tokenizer Cache
+ # ---------------------------------------------------------------------------
+
+ _TOKENIZER_CACHE: dict[str, "Tokenizer"] = {}
+ _TOKENIZER_CACHE_LOCK = threading.Lock()
+ _TOKENIZER_CACHE_MAX_SIZE = 16
+
+ # Sentinel for default tokenizer cache key (avoids collision with user names)
+ _DEFAULT_TOKENIZER_KEY = object()
+
+
+ def _get_cache_key(key: str | object) -> str:
+     """Convert cache key to string, handling sentinel."""
+     return "__default__" if key is _DEFAULT_TOKENIZER_KEY else str(key)
+
+
+ def _get_cached_tokenizer(key: str | object) -> "Tokenizer | None":
+     """Thread-safe lookup of cached tokenizer with LRU refresh."""
+     cache_key = _get_cache_key(key)
+     with _TOKENIZER_CACHE_LOCK:
+         if cache_key in _TOKENIZER_CACHE:
+             # Move to end to mark as recently used (true LRU)
+             tokenizer = _TOKENIZER_CACHE.pop(cache_key)
+             _TOKENIZER_CACHE[cache_key] = tokenizer
+             return tokenizer
+         return None
+
+
+ def _cache_tokenizer(key: str | object, tokenizer: "Tokenizer") -> "Tokenizer":
+     """Thread-safe caching of tokenizer with LRU eviction."""
+     cache_key = _get_cache_key(key)
+     with _TOKENIZER_CACHE_LOCK:
+         # Remove if already exists (will re-add at end)
+         if cache_key in _TOKENIZER_CACHE:
+             del _TOKENIZER_CACHE[cache_key]
+         # Evict oldest if at capacity
+         elif len(_TOKENIZER_CACHE) >= _TOKENIZER_CACHE_MAX_SIZE:
+             oldest_key = next(iter(_TOKENIZER_CACHE))
+             del _TOKENIZER_CACHE[oldest_key]
+         _TOKENIZER_CACHE[cache_key] = tokenizer
+         return tokenizer
+
+
+ def clear_tokenizer_cache() -> int:
+     """Clear the tokenizer cache and return the number of entries cleared.
+
+     Useful for testing or when memory is constrained.
+
+     Returns:
+         Number of cached tokenizers that were cleared.
+     """
+     with _TOKENIZER_CACHE_LOCK:
+         count = len(_TOKENIZER_CACHE)
+         _TOKENIZER_CACHE.clear()
+         return count
+
+
+ def get_tokenizer_cache_info() -> dict[str, Any]:
+     """Get information about the tokenizer cache.
+
+     Returns:
+         Dictionary with cache stats: size, max_size, cached_keys.
+     """
+     with _TOKENIZER_CACHE_LOCK:
+         return {
+             "size": len(_TOKENIZER_CACHE),
+             "max_size": _TOKENIZER_CACHE_MAX_SIZE,
+             "cached_keys": list(_TOKENIZER_CACHE.keys()),
+         }
+
+
+ class Tokenizer(Protocol):
+     def encode(self, text: str) -> tuple[list[str], list[int]]: ...
+
+     def decode(self, tokens: Sequence[str]) -> str: ...
+
+
+ class WhitespaceTokenizer:
+     def encode(self, text: str) -> tuple[list[str], list[int]]:
+         tokens = text.split()
+         # Synthetic IDs based on adler32 hash for stability
+         ids = [zlib.adler32(t.encode("utf-8")) & 0xFFFFFFFF for t in tokens]
+         return tokens, ids
+
+     def decode(self, tokens: Sequence[str]) -> str:
+         return " ".join(tokens)
+
+     def encode_batch(self, texts: Sequence[str]) -> list[tuple[list[str], list[int]]]:
+         return [self.encode(text) for text in texts]
+
+
+ class TiktokenTokenizer:
+     def __init__(self, model_name: str):
+         import tiktoken
+
+         self.name = model_name
+         try:
+             self.enc = tiktoken.get_encoding(model_name)
+         except ValueError:
+             self.enc = tiktoken.encoding_for_model(model_name)
+
+     def encode(self, text: str) -> tuple[list[str], list[int]]:
+         ids = self.enc.encode(text)
+         tokens = [
+             self.enc.decode_single_token_bytes(i).decode("utf-8", errors="replace") for i in ids
+         ]
+         return tokens, ids
+
+     def decode(self, tokens: Sequence[str], sep: str = "") -> str:
+         return sep.join(tokens)
+
+     def encode_batch(self, texts: Sequence[str]) -> list[tuple[list[str], list[int]]]:
+         id_batches = [list(batch) for batch in self.enc.encode_batch(list(texts))]
+         token_batches: list[list[str]] = []
+         for ids in id_batches:
+             token_batches.append(
+                 [
+                     self.enc.decode_single_token_bytes(i).decode("utf-8", errors="replace")
+                     for i in ids
+                 ]
+             )
+         return list(zip(token_batches, id_batches))
+
+
+ class HuggingFaceTokenizerWrapper:
+     def __init__(self, tokenizer_obj: Any, *, unknown_token: str = "[UNK]"):
+         self.tokenizer = tokenizer_obj
+         self.unknown_token = unknown_token
+
+     def encode(self, text: str) -> tuple[list[str], list[int]]:
+         # tokenizers.Tokenizer.encode returns an Encoding object
+         encoding = self.tokenizer.encode(text)
+         return encoding.tokens, encoding.ids
+
+     def decode(self, tokens: Sequence[str]) -> str:
+         # Use the tokenizer's decode method to properly handle model-specific
+         # artifacts (e.g., "##" for WordPiece, "Ġ" for BPE).
+         # Convert tokens to IDs first, then decode.
+         try:
+             token_ids = [self.tokenizer.token_to_id(token) for token in tokens]
+             # Filter out None values (tokens not in vocabulary)
+             valid_ids = [tid for tid in token_ids if tid is not None]
+             if valid_ids:
+                 result: str = self.tokenizer.decode(valid_ids)
+                 return result
+         except (AttributeError, TypeError):
+             pass
+         # Fallback: decode each token individually to handle artifacts properly
+         decoded_tokens = []
+         for token in tokens:
+             token_id = None
+             try:
+                 token_id = self.tokenizer.token_to_id(token)
+             except (AttributeError, TypeError):
+                 pass
+             if token_id is None:
+                 decoded_tokens.append(self.unknown_token)
+             else:
+                 # Decode the single token ID to properly handle artifacts
+                 try:
+                     decoded = self.tokenizer.decode([token_id])
+                     decoded_tokens.append(decoded)
+                 except (AttributeError, TypeError):
+                     # Last resort: strip common prefixes and use token as-is
+                     clean_token = token.lstrip("Ġ").lstrip("##").lstrip("▁")
+                     decoded_tokens.append(clean_token if clean_token else token)
+         return " ".join(decoded_tokens)
+
+     def encode_batch(self, texts: Sequence[str]) -> list[tuple[list[str], list[int]]]:
+         encodings = self.tokenizer.encode_batch(list(texts))
+         return [(encoding.tokens, encoding.ids) for encoding in encodings]
+
+
+ def list_available_tokenizers() -> list[str]:
+     """List tokenizer names that can be resolved.
+
+     Returns a list of known tokenizer names including:
+     - Tiktoken encodings (if tiktoken is installed)
+     - A note about HuggingFace tokenizers (if tokenizers is installed)
+     - 'whitespace' (always available)
+
+     Returns:
+         List of available tokenizer names/descriptions.
+     """
+     available: list[str] = []
+
+     if importlib.util.find_spec("tiktoken"):
+         import tiktoken
+
+         # Add known tiktoken encodings
+         for encoding in DEFAULT_TIKTOKEN_ENCODINGS:
+             try:
+                 tiktoken.get_encoding(encoding)
+                 available.append(encoding)
+             except ValueError:
+                 pass
+         # Add common model names
+         available.extend(["gpt-4", "gpt-4o", "gpt-3.5-turbo"])
+
+     if importlib.util.find_spec("tokenizers"):
+         available.append("<any HuggingFace tokenizer name>")
+
+     available.append("whitespace")
+     return available
+
+
+ def resolve_tokenizer(
+     tokenizer: str | Tokenizer | None,
+     *,
+     use_cache: bool = True,
+ ) -> Tokenizer:
+     """Resolve a tokenizer specification to a Tokenizer instance.
+
+     Tokenizers resolved from string specifications are cached by default for
+     efficient reuse across multiple Attack instances.
+
+     Args:
+         tokenizer: One of:
+             - None: Use default tokenizer (tiktoken o200k_base, or whitespace)
+             - str: Tokenizer name (tiktoken encoding, model name, or HF tokenizer)
+             - Tokenizer: Pass through as-is
+         use_cache: Whether to use the tokenizer cache for string specs.
+             Defaults to True. Set to False to always create fresh instances.
+
+     Returns:
+         A Tokenizer instance.
+
+     Raises:
+         ValueError: If string tokenizer cannot be resolved.
+     """
+     if tokenizer is None:
+         # Default tokenizer resolution is also cached
+         return _resolve_default_tokenizer(use_cache=use_cache)
+
+     if isinstance(tokenizer, str):
+         # Check cache first
+         if use_cache:
+             cached = _get_cached_tokenizer(tokenizer)
+             if cached is not None:
+                 return cached
+
+         resolved = _resolve_string_tokenizer(tokenizer)
+         if use_cache:
+             return _cache_tokenizer(tokenizer, resolved)
+         return resolved
+
+     # Check if it is a HuggingFace tokenizer object
+     if importlib.util.find_spec("tokenizers"):
+         from tokenizers import Tokenizer as HFTokenizer
+
+         if isinstance(tokenizer, HFTokenizer):
+             return HuggingFaceTokenizerWrapper(tokenizer)
+
+     return tokenizer
+
+
+ def _resolve_string_tokenizer(tokenizer: str) -> Tokenizer:
+     """Resolve a string tokenizer specification (no caching)."""
+     if importlib.util.find_spec("tiktoken"):
+         import tiktoken
+
+         try:
+             # Check if valid tiktoken encoding/model
+             try:
+                 tiktoken.get_encoding(tokenizer)
+                 return TiktokenTokenizer(tokenizer)
+             except ValueError:
+                 try:
+                     tiktoken.encoding_for_model(tokenizer)
+                     return TiktokenTokenizer(tokenizer)
+                 except (ValueError, KeyError):
+                     pass
+         except ImportError:
+             pass
+
+     if importlib.util.find_spec("tokenizers"):
+         from tokenizers import Tokenizer
+
+         try:
+             return HuggingFaceTokenizerWrapper(Tokenizer.from_pretrained(tokenizer))
+         except Exception:
+             pass
+
+     available = list_available_tokenizers()
+     raise ValueError(
+         f"Could not resolve tokenizer: {tokenizer!r}. Available: {', '.join(available)}"
+     )
+
+
+ def _resolve_default_tokenizer(*, use_cache: bool = True) -> Tokenizer:
+     """Resolve the default tokenizer with optional caching."""
+     if use_cache:
+         cached = _get_cached_tokenizer(_DEFAULT_TOKENIZER_KEY)
+         if cached is not None:
+             return cached
+
+     resolved = _default_tokenizer()
+     if use_cache:
+         return _cache_tokenizer(_DEFAULT_TOKENIZER_KEY, resolved)
+     return resolved
+
+
+ def _default_tokenizer() -> Tokenizer:
+     """Select a modern, lightweight tokenizer with graceful fallbacks."""
+     if importlib.util.find_spec("tiktoken"):
+         import tiktoken
+
+         for encoding in DEFAULT_TIKTOKEN_ENCODINGS:
+             try:
+                 tiktoken.get_encoding(encoding)
+                 return TiktokenTokenizer(encoding)
+             except ValueError:
+                 continue
+
+     return WhitespaceTokenizer()
+
+
+ __all__ = [
+     "DEFAULT_TIKTOKEN_ENCODINGS",
+     "HuggingFaceTokenizerWrapper",
+     "TiktokenTokenizer",
+     "Tokenizer",
+     "WhitespaceTokenizer",
+     "clear_tokenizer_cache",
+     "get_tokenizer_cache_info",
+     "list_available_tokenizers",
+     "resolve_tokenizer",
+ ]
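
For orientation, a minimal usage sketch of the tokenization module above. It is illustrative only and not part of the published wheel; it assumes the wheel is installed and that tiktoken is available (with no string spec and no tiktoken, resolve_tokenizer(None) falls back to the always-available WhitespaceTokenizer).

    # Hypothetical usage sketch; assumes tiktoken is installed.
    from glitchlings.attack.tokenization import (
        clear_tokenizer_cache,
        get_tokenizer_cache_info,
        resolve_tokenizer,
    )

    tokenizer = resolve_tokenizer("o200k_base")  # resolved via tiktoken, then cached
    tokens, ids = tokenizer.encode("Hello, world!")

    again = resolve_tokenizer("o200k_base")  # served from the LRU cache (max 16 entries)
    assert again is tokenizer

    print(get_tokenizer_cache_info())  # e.g. {'size': 1, 'max_size': 16, 'cached_keys': ['o200k_base']}
    print(clear_tokenizer_cache())     # number of entries cleared, e.g. 1

Caching string specs behind a lock keeps repeated Attack setups cheap while staying safe under concurrent resolution; pass use_cache=False to force a fresh instance.
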
glitchlings/attack/tokenizer_metrics.py
@@ -0,0 +1,373 @@
+ """Tokenizer analysis metrics for evaluating tokenizer behavior.
+
+ This module provides functions for analyzing how a tokenizer encodes text.
+ Unlike the corruption metrics in metrics.py which compare before/after token
+ sequences, these metrics evaluate the tokenizer's encoding of a single text.
+
+ These metrics are implemented in Rust for performance. The functions here
+ provide a Python API with documentation and type hints.
+
+ Example:
+     >>> from glitchlings.attack.tokenizer_metrics import compression_ratio
+     >>> from glitchlings.attack.tokenization import resolve_tokenizer
+     >>> tokenizer = resolve_tokenizer("o200k_base")
+     >>> text = "Hello, world!"
+     >>> tokens, token_ids = tokenizer.encode(text)
+     >>> ratio = compression_ratio(text, tokens)
+     >>> print(f"Bytes per token: {ratio:.2f}")
+ """
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Sequence, cast
+
+ from ..internal.rust import get_rust_operation
+
+ if TYPE_CHECKING:
+     from .tokenization import Tokenizer
+
+ # Rust function references (loaded on first use via get_rust_operation)
+ _compression_ratio = get_rust_operation("compression_ratio")
+ _batch_compression_ratio = get_rust_operation("batch_compression_ratio")
+ _characters_per_token = get_rust_operation("characters_per_token")
+ _batch_characters_per_token = get_rust_operation("batch_characters_per_token")
+ _token_entropy = get_rust_operation("token_entropy")
+ _batch_token_entropy = get_rust_operation("batch_token_entropy")
+ _vocabulary_utilization = get_rust_operation("vocabulary_utilization")
+ _batch_vocabulary_utilization = get_rust_operation("batch_vocabulary_utilization")
+ _unknown_token_rate = get_rust_operation("unknown_token_rate")
+ _batch_unknown_token_rate = get_rust_operation("batch_unknown_token_rate")
+
+
+ # ---------------------------------------------------------------------------
+ # Compression Metrics
+ # ---------------------------------------------------------------------------
+
+
+ def compression_ratio(text: str, tokens: Sequence[str]) -> float:
+     """Compute bytes per token - measures encoding efficiency.
+
+     Lower values indicate the tokenizer represents the text more compactly.
+     Useful for comparing tokenizer suitability across domains.
+
+     Args:
+         text: Input text to measure.
+         tokens: Token strings from encoding the text.
+
+     Returns:
+         Ratio of UTF-8 bytes to token count. Returns inf for empty output.
+
+     Example:
+         >>> text = "Hello, world!"
+         >>> tokens, _ = tokenizer.encode(text)
+         >>> ratio = compression_ratio(text, tokens)
+     """
+     return cast(float, _compression_ratio(text, list(tokens)))
+
+
+ def batch_compression_ratio(
+     texts: Sequence[str],
+     token_batches: Sequence[Sequence[str]],
+ ) -> list[float]:
+     """Compute compression ratios for a batch of texts.
+
+     Args:
+         texts: Input texts to measure.
+         token_batches: Token sequences from encoding each text.
+
+     Returns:
+         List of compression ratios for each text.
+     """
+     return cast(
+         list[float],
+         _batch_compression_ratio(list(texts), [list(tokens) for tokens in token_batches]),
+     )
+
+
+ def characters_per_token(text: str, tokens: Sequence[str]) -> float:
+     """Compute average characters per token - simpler efficiency measure.
+
+     Higher values mean fewer tokens needed. Unlike compression_ratio,
+     this ignores UTF-8 encoding costs, so it's more intuitive for
+     ASCII-heavy text but less accurate for multilingual content.
+
+     Args:
+         text: Input text to measure.
+         tokens: Token strings from encoding the text.
+
+     Returns:
+         Ratio of character count to token count. Returns inf for empty output.
+     """
+     return cast(float, _characters_per_token(text, list(tokens)))
+
+
+ def batch_characters_per_token(
+     texts: Sequence[str],
+     token_batches: Sequence[Sequence[str]],
+ ) -> list[float]:
+     """Compute characters per token for a batch of texts.
+
+     Args:
+         texts: Input texts to measure.
+         token_batches: Token sequences from encoding each text.
+
+     Returns:
+         List of characters-per-token ratios for each text.
+     """
+     return cast(
+         list[float],
+         _batch_characters_per_token(list(texts), [list(tokens) for tokens in token_batches]),
+     )
+
+
+ # ---------------------------------------------------------------------------
+ # Token Distribution Metrics
+ # ---------------------------------------------------------------------------
+
+
+ def token_entropy(tokens: Sequence[str]) -> float:
+     """Compute Shannon entropy of token distribution.
+
+     Higher entropy means more uniform token usage (less repetition).
+     Useful for understanding how "spread out" the vocabulary usage is.
+
+     Args:
+         tokens: Token sequence to analyze.
+
+     Returns:
+         Entropy in bits. Returns 0.0 for empty input.
+
+     Example:
+         >>> tokens = ["the", "cat", "sat", "on", "the", "mat"]
+         >>> entropy = token_entropy(tokens)
+     """
+     return cast(float, _token_entropy(list(tokens)))
+
+
+ def batch_token_entropy(token_batches: Sequence[Sequence[str]]) -> list[float]:
+     """Compute token entropy for a batch of token sequences.
+
+     Args:
+         token_batches: Token sequences to analyze.
+
+     Returns:
+         List of entropy values for each sequence.
+     """
+     return cast(
+         list[float],
+         _batch_token_entropy([list(tokens) for tokens in token_batches]),
+     )
+
+
+ # ---------------------------------------------------------------------------
+ # Vocabulary Analysis
+ # ---------------------------------------------------------------------------
+
+
+ def vocabulary_utilization(
+     tokens: Sequence[str],
+     token_ids: Sequence[int],
+ ) -> dict[str, float]:
+     """Analyze vocabulary usage patterns.
+
+     Provides insights into how the tokenizer uses its vocabulary for a
+     given text. Useful for identifying domain mismatches where the
+     tokenizer may be using unusual or sparse regions of its vocabulary.
+
+     Args:
+         tokens: Token strings from encoding.
+         token_ids: Corresponding token IDs.
+
+     Returns:
+         Dictionary with:
+         - unique_ratio: fraction of tokens that are unique (type/token ratio)
+         - repetition_rate: 1 - unique_ratio (how much token reuse)
+         - max_id: highest token ID used (hints at vocabulary region)
+         - id_spread: stddev of IDs (are we using clustered or spread vocab?)
+
+     Example:
+         >>> tokens, ids = tokenizer.encode("The quick brown fox")
+         >>> stats = vocabulary_utilization(tokens, ids)
+         >>> print(f"Unique ratio: {stats['unique_ratio']:.2%}")
+     """
+     result = _vocabulary_utilization(list(tokens), list(token_ids))
+     return dict(result)
+
+
+ def batch_vocabulary_utilization(
+     token_batches: Sequence[Sequence[str]],
+     token_id_batches: Sequence[Sequence[int]],
+ ) -> list[dict[str, float]]:
+     """Analyze vocabulary usage patterns for a batch of token sequences.
+
+     Args:
+         token_batches: Token string sequences from encoding multiple texts.
+         token_id_batches: Corresponding token ID sequences.
+
+     Returns:
+         List of dictionaries, each with:
+         - unique_ratio: fraction of tokens that are unique
+         - repetition_rate: 1 - unique_ratio
+         - max_id: highest token ID used
+         - id_spread: stddev of IDs
+     """
+     results = _batch_vocabulary_utilization(
+         [list(tokens) for tokens in token_batches],
+         [list(ids) for ids in token_id_batches],
+     )
+     return [dict(r) for r in results]
+
+
+ # ---------------------------------------------------------------------------
+ # Unknown Token Detection
+ # ---------------------------------------------------------------------------
+
+
+ DEFAULT_UNKNOWN_MARKERS = ("[UNK]", "<unk>", "�", "\ufffd")
+
+
+ def unknown_token_rate(
+     tokens: Sequence[str],
+     *,
+     unknown_markers: tuple[str, ...] | None = None,
+ ) -> float:
+     """Compute fraction of tokens that appear to be unknown/fallback tokens.
+
+     Different tokenizers use different markers for OOV (out-of-vocabulary)
+     handling. High rates suggest the tokenizer's vocabulary doesn't cover
+     this domain well.
+
+     Also detects byte fallback tokens (e.g., "<0xFF>") which indicate
+     characters that couldn't be represented by the vocabulary.
+
+     Args:
+         tokens: Token sequence to analyze.
+         unknown_markers: Tuple of strings that indicate unknown tokens.
+             Defaults to common markers like "[UNK]", "<unk>", "�".
+
+     Returns:
+         Fraction of tokens that are unknown/fallback tokens.
+
+     Example:
+         >>> tokens, _ = tokenizer.encode("日本語テスト")
+         >>> rate = unknown_token_rate(tokens)
+         >>> if rate > 0.1:
+         ...     print("Warning: high unknown token rate")
+     """
+     markers = list(unknown_markers) if unknown_markers is not None else None
+     return cast(float, _unknown_token_rate(list(tokens), markers))
+
+
+ def batch_unknown_token_rate(
+     token_batches: Sequence[Sequence[str]],
+     *,
+     unknown_markers: tuple[str, ...] | None = None,
+ ) -> list[float]:
+     """Compute unknown token rates for a batch of token sequences.
+
+     Args:
+         token_batches: Token sequences to analyze.
+         unknown_markers: Tuple of strings that indicate unknown tokens.
+
+     Returns:
+         List of unknown token rates for each sequence.
+     """
+     markers = list(unknown_markers) if unknown_markers is not None else None
+     return cast(
+         list[float],
+         _batch_unknown_token_rate([list(tokens) for tokens in token_batches], markers),
+     )
+
+
+ # ---------------------------------------------------------------------------
+ # Convenience Functions (using Tokenizer directly)
+ # ---------------------------------------------------------------------------
+
+
+ def analyze_tokenizer(
+     text: str,
+     tokenizer: "Tokenizer",
+     *,
+     unknown_markers: tuple[str, ...] | None = None,
+ ) -> dict[str, float]:
+     """Comprehensive tokenizer analysis for a text.
+
+     Convenience function that encodes the text and computes all tokenizer
+     metrics at once.
+
+     Args:
+         text: Input text to analyze.
+         tokenizer: Tokenizer to evaluate.
+         unknown_markers: Tuple of strings that indicate unknown tokens.
+
+     Returns:
+         Dictionary with all tokenizer metrics:
+         - compression_ratio: bytes per token
+         - characters_per_token: chars per token
+         - token_entropy: Shannon entropy of token distribution
+         - unknown_token_rate: fraction of unknown tokens
+         - unique_ratio: type/token ratio
+         - repetition_rate: 1 - unique_ratio
+         - max_id: highest token ID
+         - id_spread: standard deviation of token IDs
+         - token_count: total number of tokens
+
+     Example:
+         >>> from glitchlings.attack.tokenization import resolve_tokenizer
+         >>> tokenizer = resolve_tokenizer("o200k_base")
+         >>> stats = analyze_tokenizer("Hello, world!", tokenizer)
+         >>> for key, value in stats.items():
+         ...     print(f"{key}: {value:.4f}")
+     """
+     if not text:
+         return {
+             "compression_ratio": 0.0,
+             "characters_per_token": 0.0,
+             "token_entropy": 0.0,
+             "unknown_token_rate": 0.0,
+             "unique_ratio": 0.0,
+             "repetition_rate": 0.0,
+             "max_id": 0.0,
+             "id_spread": 0.0,
+             "token_count": 0.0,
+         }
+
+     tokens, token_ids = tokenizer.encode(text)
+
+     # Compute all metrics
+     comp_ratio = compression_ratio(text, tokens)
+     chars_per_token = characters_per_token(text, tokens)
+     entropy = token_entropy(tokens)
+     unk_rate = unknown_token_rate(tokens, unknown_markers=unknown_markers)
+     vocab_stats = vocabulary_utilization(tokens, token_ids)
+
+     return {
+         "compression_ratio": comp_ratio,
+         "characters_per_token": chars_per_token,
+         "token_entropy": entropy,
+         "unknown_token_rate": unk_rate,
+         "unique_ratio": vocab_stats["unique_ratio"],
+         "repetition_rate": vocab_stats["repetition_rate"],
+         "max_id": vocab_stats["max_id"],
+         "id_spread": vocab_stats["id_spread"],
+         "token_count": float(len(tokens)),
+     }
+
+
+ __all__ = [
+     # Core metrics
+     "compression_ratio",
+     "batch_compression_ratio",
+     "characters_per_token",
+     "batch_characters_per_token",
+     "token_entropy",
+     "batch_token_entropy",
+     "vocabulary_utilization",
+     "batch_vocabulary_utilization",
+     "unknown_token_rate",
+     "batch_unknown_token_rate",
+     # Convenience
+     "analyze_tokenizer",
+     # Constants
+     "DEFAULT_UNKNOWN_MARKERS",
+ ]
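
A corresponding sketch for the tokenizer metrics module, mirroring its own docstring examples. It is illustrative only; it relies on the bundled Rust extension (loaded through glitchlings.internal.rust) and on a resolvable tokenizer such as tiktoken's o200k_base.

    # Hypothetical usage sketch; requires the compiled extension and tiktoken.
    from glitchlings.attack.tokenization import resolve_tokenizer
    from glitchlings.attack.tokenizer_metrics import analyze_tokenizer, compression_ratio

    tokenizer = resolve_tokenizer("o200k_base")
    text = "The quick brown fox jumps over the lazy dog."

    tokens, token_ids = tokenizer.encode(text)
    print(f"bytes per token: {compression_ratio(text, tokens):.2f}")

    stats = analyze_tokenizer(text, tokenizer)  # all metrics in one dict
    for key, value in sorted(stats.items()):
        print(f"{key}: {value:.4f}")
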