glitchlings-0.4.4-cp313-cp313-manylinux_2_28_x86_64.whl → glitchlings-0.4.5-cp313-cp313-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glitchlings/__init__.py +4 -0
- glitchlings/_zoo_rust.cpython-313-x86_64-linux-gnu.so +0 -0
- glitchlings/compat.py +2 -4
- glitchlings/config.py +2 -4
- glitchlings/data/__init__.py +1 -0
- glitchlings/data/hokey_assets.json +193 -0
- glitchlings/dlc/_shared.py +6 -6
- glitchlings/dlc/huggingface.py +6 -6
- glitchlings/dlc/prime.py +1 -1
- glitchlings/dlc/pytorch.py +3 -3
- glitchlings/dlc/pytorch_lightning.py +4 -10
- glitchlings/lexicon/_cache.py +3 -5
- glitchlings/lexicon/vector.py +6 -5
- glitchlings/lexicon/wordnet.py +4 -8
- glitchlings/util/hokey_generator.py +144 -0
- glitchlings/util/stretch_locator.py +140 -0
- glitchlings/util/stretchability.py +375 -0
- glitchlings/zoo/__init__.py +5 -1
- glitchlings/zoo/hokey.py +173 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.4.5.dist-info}/METADATA +26 -5
- {glitchlings-0.4.4.dist-info → glitchlings-0.4.5.dist-info}/RECORD +25 -19
- {glitchlings-0.4.4.dist-info → glitchlings-0.4.5.dist-info}/WHEEL +0 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.4.5.dist-info}/entry_points.txt +0 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {glitchlings-0.4.4.dist-info → glitchlings-0.4.5.dist-info}/top_level.txt +0 -0
glitchlings/__init__.py
CHANGED
@@ -5,6 +5,7 @@ from .zoo import (
     Apostrofae,
     Gaggle,
     Glitchling,
+    Hokey,
     Jargoyle,
     Mim1c,
     Redactyl,
@@ -15,6 +16,7 @@ from .zoo import (
     Zeedub,
     adjax,
     apostrofae,
+    hokey,
     is_rust_pipeline_enabled,
     is_rust_pipeline_supported,
     jargoyle,
@@ -42,6 +44,8 @@ __all__ = [
     "adjax",
     "Apostrofae",
     "apostrofae",
+    "Hokey",
+    "hokey",
     "Redactyl",
     "redactyl",
     "Reduple",
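The two new exports give callers both a class-based and a helper-function entry point. A minimal sketch of the expanded surface (only the exported names are confirmed by these hunks; the no-argument constructor is an assumption):

from glitchlings import Hokey, hokey

# Hypothetical usage: Hokey is the expressive-lengthening glitchling added in
# 0.4.5; the lowercase helper mirrors the existing Apostrofae/apostrofae pairing.
stretchy = Hokey()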
glitchlings/_zoo_rust.cpython-313-x86_64-linux-gnu.so
CHANGED
Binary file (no textual diff).
glitchlings/compat.py
CHANGED
@@ -17,16 +17,14 @@ _MISSING = _MissingSentinel()
 
 
 class _MarkerProtocol(Protocol):
-    def evaluate(self, environment: dict[str, str]) -> bool:
-        ...
+    def evaluate(self, environment: dict[str, str]) -> bool: ...
 
 
 class _RequirementProtocol(Protocol):
     marker: _MarkerProtocol | None
     name: str
 
-    def __init__(self, requirement: str) -> None:
-        ...
+    def __init__(self, requirement: str) -> None: ...
 
 
 try:  # pragma: no cover - packaging is bundled with modern Python environments
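These hunks only collapse Protocol stub bodies onto one line; a literal `...` as the whole body is the conventional spelling for typing stubs and is behaviourally identical. A standalone illustration of the idiom (names here are illustrative, not from the package):

from typing import Protocol


class SupportsEvaluate(Protocol):
    # One-line ellipsis bodies are pure stubs; type checkers treat this
    # exactly like the two-line form the diff removes.
    def evaluate(self, environment: dict[str, str]) -> bool: ...


class AlwaysTrue:
    def evaluate(self, environment: dict[str, str]) -> bool:
        return True


checker: SupportsEvaluate = AlwaysTrue()  # structural match; no inheritance needed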
glitchlings/config.py
CHANGED
@@ -19,8 +19,7 @@ except ModuleNotFoundError:  # pragma: no cover - Python < 3.11
 
 
 class _TomllibModule(Protocol):
-    def load(self, fp: IO[bytes]) -> Any:
-        ...
+    def load(self, fp: IO[bytes]) -> Any: ...
 
 
 tomllib = cast(_TomllibModule, _tomllib)
@@ -29,8 +28,7 @@ tomllib = cast(_TomllibModule, _tomllib)
 class _YamlModule(Protocol):
     YAMLError: type[Exception]
 
-    def safe_load(self, stream: str) -> Any:
-        ...
+    def safe_load(self, stream: str) -> Any: ...
 
 
 yaml = cast(_YamlModule, importlib.import_module("yaml"))
glitchlings/data/__init__.py
ADDED
@@ -0,0 +1 @@
+"""Static data assets shared across Glitchlings implementations."""
glitchlings/data/hokey_assets.json
ADDED
@@ -0,0 +1,193 @@
+{
+  "lexical_prior": {
+    "so": 0.92,
+    "no": 0.89,
+    "go": 0.72,
+    "yeah": 0.86,
+    "yay": 0.81,
+    "ya": 0.7,
+    "hey": 0.66,
+    "okay": 0.68,
+    "ok": 0.64,
+    "cool": 0.78,
+    "omg": 0.74,
+    "wow": 0.88,
+    "wee": 0.62,
+    "woo": 0.69,
+    "woohoo": 0.74,
+    "whoa": 0.71,
+    "woah": 0.7,
+    "yayyy": 0.75,
+    "yayyyy": 0.76,
+    "yas": 0.79,
+    "yass": 0.8,
+    "yaaas": 0.82,
+    "please": 0.53,
+    "pleaseee": 0.57,
+    "pleaseeee": 0.6,
+    "pleaseeeee": 0.63,
+    "lol": 0.83,
+    "lmao": 0.65,
+    "omggg": 0.75,
+    "omgggg": 0.76,
+    "squee": 0.64,
+    "hahaha": 0.6,
+    "haha": 0.56,
+    "really": 0.58,
+    "very": 0.49,
+    "love": 0.55,
+    "cute": 0.52,
+    "nice": 0.47,
+    "sweet": 0.45,
+    "yayness": 0.44,
+    "ugh": 0.5,
+    "aww": 0.61,
+    "yess": 0.81,
+    "yes": 0.9,
+    "pls": 0.48,
+    "pleeeease": 0.62,
+    "nooo": 0.88,
+    "noooo": 0.89,
+    "dang": 0.41,
+    "geez": 0.39,
+    "danggg": 0.44,
+    "dangit": 0.38,
+    "sick": 0.35,
+    "epic": 0.37,
+    "rad": 0.5,
+    "goal": 0.56,
+    "great": 0.46,
+    "awesome": 0.51,
+    "amazing": 0.52,
+    "perfect": 0.49,
+    "fantastic": 0.5,
+    "stellar": 0.48,
+    "yippee": 0.67,
+    "stoked": 0.48,
+    "yikes": 0.43,
+    "gosh": 0.41,
+    "heck": 0.36
+  },
+  "interjections": [
+    "wow",
+    "omg",
+    "hey",
+    "ugh",
+    "yay",
+    "yayyy",
+    "yayyyy",
+    "woo",
+    "woohoo",
+    "whoa",
+    "woah",
+    "whooo",
+    "ah",
+    "aw",
+    "aww",
+    "hmm",
+    "huh",
+    "yo",
+    "yikes",
+    "gah",
+    "phew",
+    "sheesh"
+  ],
+  "intensifiers": [
+    "so",
+    "very",
+    "really",
+    "super",
+    "mega",
+    "ultra",
+    "too",
+    "way",
+    "crazy",
+    "insanely",
+    "totally",
+    "extremely",
+    "seriously",
+    "absolutely",
+    "completely",
+    "entirely",
+    "utterly",
+    "hella",
+    "wicked",
+    "truly"
+  ],
+  "evaluatives": [
+    "cool",
+    "great",
+    "awesome",
+    "amazing",
+    "perfect",
+    "nice",
+    "sweet",
+    "lovely",
+    "loving",
+    "silly",
+    "wild",
+    "fun",
+    "funny",
+    "adorable",
+    "cute",
+    "fantastic",
+    "fabulous",
+    "brilliant",
+    "stellar",
+    "rad",
+    "epic",
+    "delightful",
+    "gorgeous"
+  ],
+  "positive_lexicon": [
+    "love",
+    "loved",
+    "loving",
+    "like",
+    "liked",
+    "awesome",
+    "amazing",
+    "yay",
+    "great",
+    "good",
+    "fun",
+    "funny",
+    "blessed",
+    "excited",
+    "cool",
+    "best",
+    "beautiful",
+    "happy",
+    "happiest",
+    "joy",
+    "joyful",
+    "thrilled",
+    "ecstatic",
+    "stoked",
+    "pumped",
+    "glad"
+  ],
+  "negative_lexicon": [
+    "bad",
+    "sad",
+    "angry",
+    "annoyed",
+    "mad",
+    "terrible",
+    "awful",
+    "hate",
+    "hated",
+    "crying",
+    "hurt",
+    "tired",
+    "worst",
+    "ugh",
+    "nope",
+    "upset",
+    "frustrated",
+    "drained",
+    "exhausted",
+    "bummed",
+    "grumpy"
+  ]
+}
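Since the asset file ships inside the wheel, it can be read with importlib.resources. A sketch of one plausible way to load it (the package's own loader is not shown in this diff):

import json
from importlib import resources

# Read the bundled asset file from the installed glitchlings package.
with resources.files("glitchlings.data").joinpath("hokey_assets.json").open("rb") as fh:
    assets = json.load(fh)

print(assets["lexical_prior"]["so"])  # 0.92
print(len(assets["interjections"]))   # 22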
glitchlings/dlc/_shared.py
CHANGED
@@ -67,10 +67,10 @@ def resolve_columns(dataset: Any, columns: Sequence[str] | None) -> list[str]:
     raise ValueError("Unable to determine which dataset columns to corrupt.")
 
 
-def normalise_column_spec(
+def normalize_column_spec(
     columns: str | int | Sequence[str | int] | None,
 ) -> list[str | int] | None:
-    """Normalise a column specification into a list of keys or indices.
+    """Normalize a column specification into a list of keys or indices.
 
     Args:
         columns: Column specification as a single value, sequence of values, or None.
@@ -87,10 +87,10 @@ def normalise_column_spec(
     if isinstance(columns, (str, int)):
         return [columns]
 
-    normalised = list(columns)
-    if not normalised:
+    normalized = list(columns)
+    if not normalized:
         raise ValueError("At least one column must be specified")
-    return normalised
+    return normalized
 
 
 def is_textual_candidate(value: Any) -> bool:
@@ -147,7 +147,7 @@ def corrupt_text_value(value: Any, gaggle: Gaggle) -> Any:
 __all__ = [
     "corrupt_text_value",
     "is_textual_candidate",
-    "normalise_column_spec",
+    "normalize_column_spec",
     "resolve_columns",
     "resolve_environment",
 ]
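The renamed helper keeps the behaviour visible in the hunks above: scalars are wrapped in a list, sequences are copied, and empty sequences are rejected. Expected results, per the code shown (the None passthrough is implied by the return annotation and by the `columns is None` check in pytorch_lightning.py below, not by these hunks):

from glitchlings.dlc._shared import normalize_column_spec

normalize_column_spec("text")      # ["text"]
normalize_column_spec(0)           # [0]
normalize_column_spec(("a", "b"))  # ["a", "b"]
normalize_column_spec([])          # raises ValueError: At least one column must be specified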
glitchlings/dlc/huggingface.py
CHANGED
@@ -10,15 +10,15 @@ from ..util.adapters import coerce_gaggle
 from ..zoo import Gaggle, Glitchling
 
 
-def _normalise_columns(column: str | Sequence[str]) -> list[str]:
-    """Normalise a column specification to a list."""
+def _normalize_columns(column: str | Sequence[str]) -> list[str]:
+    """Normalize a column specification to a list."""
     if isinstance(column, str):
         return [column]
 
-    normalised = list(column)
-    if not normalised:
+    normalized = list(column)
+    if not normalized:
         raise ValueError("At least one column must be specified")
-    return normalised
+    return normalized
 
 
 def _glitch_dataset(
@@ -29,7 +29,7 @@ def _glitch_dataset(
     seed: int = 151,
 ) -> Any:
     """Apply glitchlings to the provided dataset columns."""
-    columns = _normalise_columns(column)
+    columns = _normalize_columns(column)
     gaggle = coerce_gaggle(glitchlings, seed=seed)
     return gaggle.corrupt_dataset(dataset, columns)
 
glitchlings/dlc/prime.py
CHANGED
@@ -117,7 +117,7 @@ def _as_gaggle(
 
 
 def _extract_completion_text(completion: Any) -> str:
-    """Normalise a completion payload into a plain string."""
+    """Normalize a completion payload into a plain string."""
     if isinstance(completion, str):
         return completion
 
glitchlings/dlc/pytorch.py
CHANGED
@@ -9,7 +9,7 @@ from ..compat import get_torch_dataloader, require_torch
 from ..compat import torch as _torch_dependency
 from ..util.adapters import coerce_gaggle
 from ..zoo import Gaggle, Glitchling
-from ._shared import corrupt_text_value, is_textual_candidate, normalise_column_spec
+from ._shared import corrupt_text_value, is_textual_candidate, normalize_column_spec
 
 
 def _apply_to_batch(batch: Any, targets: list[str | int] | None, gaggle: Gaggle) -> Any:
@@ -134,8 +134,8 @@ def _ensure_dataloader_class() -> type[Any]:
     ) -> _GlitchedDataLoader:
         """Return a lazily glitched view of the loader's batches."""
         gaggle = coerce_gaggle(glitchlings, seed=seed)
-        normalised = normalise_column_spec(columns)
-        return _GlitchedDataLoader(self, gaggle, columns=normalised)
+        normalized = normalize_column_spec(columns)
+        return _GlitchedDataLoader(self, gaggle, columns=normalized)
 
     setattr(dataloader_cls, "glitch", glitch)
 
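The second hunk lives inside the `glitch` method that this module attaches to torch's DataLoader via `setattr`. A hypothetical call, assuming the DLC's install hook has already patched the class (the parameter names glitchlings, columns, and seed are taken from the visible code; the dataset shape and Hokey's constructor are assumptions):

from torch.utils.data import DataLoader

from glitchlings import Hokey

loader = DataLoader([{"text": "so cool"}, {"text": "oh no"}], batch_size=1)
glitched = loader.glitch(Hokey(), columns=["text"], seed=151)  # method added by the DLC
for batch in glitched:
    print(batch["text"])  # corrupted lazily, batch by batch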
glitchlings/dlc/pytorch_lightning.py
CHANGED
@@ -8,7 +8,7 @@ from typing import Any, cast
 from ..compat import get_pytorch_lightning_datamodule, require_pytorch_lightning
 from ..util.adapters import coerce_gaggle
 from ..zoo import Gaggle, Glitchling
-from ._shared import corrupt_text_value, normalise_column_spec
+from ._shared import corrupt_text_value, normalize_column_spec
 
 
 def _glitch_batch(batch: Any, columns: list[str], gaggle: Gaggle) -> Any:
@@ -40,10 +40,7 @@ def _wrap_dataloader(dataloader: Any, columns: list[str], gaggle: Gaggle) -> Any
     if isinstance(dataloader, Mapping):
         mapping_type = cast(type[Any], dataloader.__class__)
         return mapping_type(
-            {
-                key: _wrap_dataloader(value, columns, gaggle)
-                for key, value in dataloader.items()
-            }
+            {key: _wrap_dataloader(value, columns, gaggle) for key, value in dataloader.items()}
         )
 
     if isinstance(dataloader, list):
@@ -54,9 +51,7 @@ def _wrap_dataloader(dataloader: Any, columns: list[str], gaggle: Gaggle) -> Any
 
     if isinstance(dataloader, Sequence) and not isinstance(dataloader, (str, bytes, bytearray)):
         sequence_type = cast(type[Any], dataloader.__class__)
-        return sequence_type(
-            _wrap_dataloader(value, columns, gaggle) for value in dataloader
-        )
+        return sequence_type(_wrap_dataloader(value, columns, gaggle) for value in dataloader)
 
     return _GlitchedDataLoader(dataloader, columns, gaggle)
 
@@ -89,7 +84,7 @@ def _glitch_datamodule(
 ) -> Any:
     """Return a proxy that applies glitchlings to batches from the datamodule."""
 
-    columns = normalise_column_spec(column)
+    columns = normalize_column_spec(column)
     if columns is None:  # pragma: no cover - defensive
         raise ValueError("At least one column must be specified")
     # Lightning datamodules only support string column names (mapping keys)
@@ -212,4 +207,3 @@ else:  # pragma: no cover - optional dependency
 
 
 __all__ = ["LightningDataModule", "install"]
-
glitchlings/lexicon/_cache.py
CHANGED
@@ -19,7 +19,7 @@ class CacheSnapshot:
     checksum: str | None = None
 
 
-def _normalise_entries(payload: Mapping[str, object]) -> CacheEntries:
+def _normalize_entries(payload: Mapping[str, object]) -> CacheEntries:
    """Convert raw cache payloads into canonical mapping form."""
    entries: CacheEntries = {}
    for key, values in payload.items():
@@ -75,7 +75,7 @@ def load_cache(path: Path) -> CacheSnapshot:
     else:
         entries_payload = payload  # legacy format without metadata
 
-    entries = _normalise_entries(entries_payload)
+    entries = _normalize_entries(entries_payload)
     if checksum is not None:
         expected = compute_checksum(entries)
         if checksum != expected:
@@ -88,9 +88,7 @@ def load_cache(path: Path) -> CacheSnapshot:
 
 def write_cache(path: Path, entries: Mapping[str, Sequence[str]]) -> CacheSnapshot:
     """Persist ``entries`` to ``path`` with checksum metadata."""
-    serialisable: CacheEntries = {
-        key: list(values) for key, values in sorted(entries.items())
-    }
+    serialisable: CacheEntries = {key: list(values) for key, values in sorted(entries.items())}
     checksum = compute_checksum(serialisable)
     payload = {
         "__meta__": {
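The tail of the last hunk shows write_cache emitting a payload with a `__meta__` block, while load_cache also accepts a legacy layout where the entries sit at the top level. A rough sketch of the two shapes (field names inside `__meta__` beyond a checksum, and where the entries live in the new format, are assumptions; the diff cuts off mid-payload):

# Legacy format: the payload is the entries mapping itself.
legacy_payload = {"word": ["synonym_a", "synonym_b"]}

# Checksum-carrying format: metadata travels under "__meta__".
metadata_payload = {
    "__meta__": {"checksum": "…"},  # verified against compute_checksum(entries)
}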
glitchlings/lexicon/vector.py
CHANGED
@@ -16,6 +16,9 @@ from ._cache import CacheSnapshot
 from ._cache import load_cache as _load_cache_file
 from ._cache import write_cache as _write_cache_file
 
+# Minimum number of neighbors to consider for similarity queries
+MIN_NEIGHBORS = 1
+
 
 def _cosine_similarity(vector_a: Sequence[float], vector_b: Sequence[float]) -> float:
     """Return the cosine similarity between two dense vectors."""
@@ -304,7 +307,7 @@ class VectorLexicon(LexiconBackend):
         """Initialise the lexicon with an embedding ``source`` and optional cache."""
         super().__init__(seed=seed)
         self._adapter = _resolve_source(source)
-        self._max_neighbors = max(1, max_neighbors)
+        self._max_neighbors = max(MIN_NEIGHBORS, max_neighbors)
         self._min_similarity = min_similarity
         self._cache: MutableMapping[str, list[str]] = {}
         self._cache_path: Path | None
@@ -371,7 +374,7 @@ class VectorLexicon(LexiconBackend):
         if cache_key in self._cache:
             return self._cache[cache_key]
 
-        neighbor_limit = self._max_neighbors if limit is None else max(1, limit)
+        neighbor_limit = self._max_neighbors if limit is None else max(MIN_NEIGHBORS, limit)
         neighbors = self._fetch_neighbors(
             original=original, normalized=normalized, limit=neighbor_limit
         )
@@ -624,9 +627,7 @@ def main(argv: Sequence[str] | None = None) -> int:
     )
     iterator = lexicon.iter_vocabulary()
     if args.limit is not None:
-        token_iter = (
-            token for index, token in enumerate(iterator) if index < args.limit
-        )
+        token_iter = (token for index, token in enumerate(iterator) if index < args.limit)
     else:
         token_iter = iterator
 
glitchlings/lexicon/wordnet.py
CHANGED
@@ -13,21 +13,17 @@ from ._cache import CacheSnapshot
 
 
 class _LemmaProtocol(Protocol):
-    def name(self) -> str:
-        ...
+    def name(self) -> str: ...
 
 
 class _SynsetProtocol(Protocol):
-    def lemmas(self) -> Sequence[_LemmaProtocol]:
-        ...
+    def lemmas(self) -> Sequence[_LemmaProtocol]: ...
 
 
 class _WordNetResource(Protocol):
-    def synsets(self, word: str, pos: str | None = None) -> Sequence[_SynsetProtocol]:
-        ...
+    def synsets(self, word: str, pos: str | None = None) -> Sequence[_SynsetProtocol]: ...
 
-    def ensure_loaded(self) -> None:
-        ...
+    def ensure_loaded(self) -> None: ...
 
 
 WordNetCorpusReaderFactory = Callable[[Any, Any], _WordNetResource]
glitchlings/util/hokey_generator.py
ADDED
@@ -0,0 +1,144 @@
+"""Hokey expressive lengthening generator."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from .stretch_locator import StretchSite, apply_stretch, find_stretch_site
+from .stretchability import RandomLike, StretchabilityAnalyzer, StretchabilityFeatures
+
+
+@dataclass(slots=True)
+class HokeyConfig:
+    rate: float = 0.3
+    extension_min: int = 2
+    extension_max: int = 5
+    base_p: float = 0.45
+    word_length_threshold: int = 6
+
+
+@dataclass(slots=True)
+class StretchEvent:
+    token_index: int
+    original: str
+    stretched: str
+    repeats: int
+    site: StretchSite
+    score: float
+    features: StretchabilityFeatures
+
+
+class NegativeBinomialSampler:
+    """Sample stretch lengths from a clipped negative binomial distribution."""
+
+    def __init__(self, base_p: float = 0.45) -> None:
+        self.base_p = base_p
+
+    def sample(
+        self,
+        rng: RandomLike,
+        *,
+        intensity: float,
+        minimum: int,
+        maximum: int,
+    ) -> int:
+        minimum = max(0, int(minimum))
+        maximum = max(minimum, int(maximum))
+        if maximum == 0:
+            return 0
+        if maximum == minimum:
+            return maximum
+
+        r = max(1, int(round(1 + 2 * intensity)))
+        adjusted_p = self.base_p / (1.0 + 0.75 * max(0.0, intensity))
+        adjusted_p = max(0.05, min(0.95, adjusted_p))
+        failures = sum(self._geometric_sample(rng, adjusted_p) for _ in range(r))
+        extra = minimum + failures
+        return max(minimum, min(maximum, extra))
+
+    @staticmethod
+    def _geometric_sample(rng: RandomLike, p: float) -> int:
+        count = 0
+        while rng.random() > p:
+            count += 1
+        return count
+
+
+class HokeyGenerator:
+    """Full expressive lengthening pipeline."""
+
+    def __init__(
+        self,
+        analyzer: StretchabilityAnalyzer | None = None,
+        sampler: NegativeBinomialSampler | None = None,
+    ) -> None:
+        self.analyzer = analyzer or StretchabilityAnalyzer()
+        self.sampler = sampler or NegativeBinomialSampler()
+
+    def generate(
+        self,
+        text: str,
+        *,
+        rng: RandomLike,
+        config: HokeyConfig,
+    ) -> tuple[str, list[StretchEvent]]:
+        if not text:
+            return text, []
+
+        if config.base_p != self.sampler.base_p:
+            self.sampler.base_p = config.base_p
+
+        tokens = self.analyzer.tokenise(text)
+        candidates = self.analyzer.analyse_tokens(tokens)
+        selected = self.analyzer.select_candidates(candidates, rate=config.rate, rng=rng)
+        if not selected:
+            return text, []
+
+        token_strings = [token.text for token in tokens]
+        events: list[StretchEvent] = []
+
+        for candidate in selected:
+            token_idx = candidate.token.index
+            original = token_strings[token_idx]
+            site = find_stretch_site(original)
+            if site is None:
+                continue
+
+            intensity = min(1.5, candidate.features.intensity() + 0.35 * candidate.score)
+            alpha_count = sum(1 for ch in original if ch.isalpha())
+            if config.word_length_threshold > 0 and alpha_count > config.word_length_threshold * 2:
+                continue
+            if config.word_length_threshold > 0 and alpha_count > config.word_length_threshold:
+                excess = alpha_count - config.word_length_threshold
+                intensity = intensity / (1.0 + 0.35 * excess)
+                if candidate.score < 0.35 and excess >= 2:
+                    continue
+            intensity = max(0.05, intensity)
+
+            repeats = self.sampler.sample(
+                rng,
+                intensity=intensity,
+                minimum=config.extension_min,
+                maximum=config.extension_max,
+            )
+            if repeats <= 0:
+                continue
+
+            stretched_word = apply_stretch(original, site, repeats)
+            token_strings[token_idx] = stretched_word
+            events.append(
+                StretchEvent(
+                    token_index=token_idx,
+                    original=original,
+                    stretched=stretched_word,
+                    repeats=repeats,
+                    site=site,
+                    score=candidate.score,
+                    features=candidate.features,
+                )
+            )
+
+        return "".join(token_strings), events
+
+
+__all__ = ["HokeyGenerator", "HokeyConfig", "StretchEvent", "NegativeBinomialSampler"]
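NegativeBinomialSampler only needs an object exposing random(), so the stdlib Random should satisfy the RandomLike protocol it is typed against (an assumption; the protocol's definition lives in stretchability.py, outside this excerpt). A quick self-contained check of the clipping behaviour shown above (assuming the 0.4.5 wheel is installed; printed values vary with the seed):

import random

from glitchlings.util.hokey_generator import NegativeBinomialSampler

rng = random.Random(151)
sampler = NegativeBinomialSampler(base_p=0.45)

# Higher intensity raises r and lowers p, skewing draws longer, but every
# draw is clipped into [minimum, maximum].
draws = [sampler.sample(rng, intensity=1.0, minimum=2, maximum=5) for _ in range(8)]
assert all(2 <= d <= 5 for d in draws)
print(draws)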