PyPI - morph-hrr - Versions diffs - 0.1.0__py3-none-any.whl - Mend

morph-hrr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

morph_hrr/__init__.py +37 -0
morph_hrr/hrr.py +64 -0
morph_hrr/morphemes.py +64 -0
morph_hrr/tokenizer.py +97 -0
morph_hrr-0.1.0.dist-info/METADATA +117 -0
morph_hrr-0.1.0.dist-info/RECORD +8 -0
morph_hrr-0.1.0.dist-info/WHEEL +4 -0
morph_hrr-0.1.0.dist-info/licenses/LICENSE +21 -0

morph_hrr/__init__.py ADDED Viewed

@@ -0,0 +1,37 @@
+"""morph-hrr: compositional HRR morpheme tokenizer for Apple MLX.
+A word becomes one fixed-width vector via circular-convolution composition of its
+morphemes (prefix (x) root (x) suffix), giving algebraically manipulable,
+out-of-vocabulary-friendly token representations.
+"""
+from __future__ import annotations
+__version__ = "0.1.0"
+from .hrr import (
+    bind,
+    bundle,
+    cosine_similarity,
+    make_unitary,
+    normalize,
+    unbind,
+    update_context,
+)
+from .morphemes import PREFIXES, SUFFIXES, segment
+from .tokenizer import HolographicMorphemeTokenizer, MorphemeTokenizer
+__all__ = [
+    "MorphemeTokenizer",
+    "HolographicMorphemeTokenizer",
+    "segment",
+    "PREFIXES",
+    "SUFFIXES",
+    "bind",
+    "unbind",
+    "bundle",
+    "normalize",
+    "make_unitary",
+    "cosine_similarity",
+    "update_context",
+    "__version__",
+]

morph_hrr/hrr.py ADDED Viewed

@@ -0,0 +1,64 @@
+"""Holographic Reduced Representation (HRR) primitives on Apple MLX.
+Binding/unbinding are circular (de)convolution in the FFT domain (Plate, 1995).
+These are the building blocks the morpheme tokenizer uses to compose
+``prefix (x) root (x) suffix`` into a single fixed-width "holistic" token vector.
+"""
+from __future__ import annotations
+import mlx.core as mx
+import numpy as np
def normalize(x: mx.array, eps: float = 1e-6) -> mx.array:
    """Scale ``x`` to (approximately) unit L2 length along its last axis.

    The input is cast to float32 before any arithmetic. ``eps`` is added to the
    squared norm *inside* the square root, so with the default ``eps`` an
    all-zero input produces zeros rather than a division by zero.
    """
    as_f32 = x.astype(mx.float32)
    squared_norm = mx.sum(as_f32 * as_f32, axis=-1, keepdims=True)
    return as_f32 / mx.sqrt(squared_norm + eps)
def _bind(a: mx.array, b: mx.array) -> mx.array:
    """Circular convolution of ``a`` and ``b``: real(IFFT(FFT(a) * FFT(b))).

    Both operands are cast to float32 before the transform.
    """
    spectrum_a = mx.fft.fft(a.astype(mx.float32))
    spectrum_b = mx.fft.fft(b.astype(mx.float32))
    # Element-wise product in the frequency domain == circular convolution.
    return mx.real(mx.fft.ifft(spectrum_a * spectrum_b))
def _unbind(bound: mx.array, key: mx.array) -> mx.array:
    """Circular correlation of ``bound`` with ``key`` (approximate inverse of bind).

    Computed as real(IFFT(FFT(bound) * conj(FFT(key)))); both operands are cast
    to float32 before the transform.
    """
    bound_spectrum = mx.fft.fft(bound.astype(mx.float32))
    key_spectrum = mx.fft.fft(key.astype(mx.float32))
    # Multiplying by the conjugate spectrum of the key undoes its phase shifts.
    return mx.real(mx.fft.ifft(bound_spectrum * mx.conj(key_spectrum)))
# Public hot-path entry points: the private primitives wrapped with mx.compile.
bind, unbind = mx.compile(_bind), mx.compile(_unbind)
def bundle(*vectors: mx.array) -> mx.array:
    """Superpose ``vectors``: add them up in float32 and normalize the sum.

    This is the VSA "add" operation.

    Raises:
        ValueError: if called without any vector.
    """
    if len(vectors) == 0:
        raise ValueError("bundle requires at least one vector")
    # Left fold starting from 0, exactly like the built-in ``sum``.
    total = 0
    for vec in vectors:
        total = total + vec.astype(mx.float32)
    return normalize(total)
def cosine_similarity(a: mx.array, b: mx.array) -> mx.array:
    """Return the cosine similarity of ``a`` and ``b`` along the last axis.

    Both inputs are passed through ``normalize`` first, so they do not need to
    be unit length already.
    """
    unit_a = normalize(a)
    unit_b = normalize(b)
    return mx.sum(unit_a * unit_b, axis=-1)
def make_unitary(dim: int, seed: int = 0) -> mx.array:
    """Return a deterministic real unitary vector of length ``dim``.

    "Unitary" means every FFT coefficient has magnitude 1, so binding with the
    vector is exactly invertible (unbind perfectly recovers the bound value).

    The spectrum is built with random phases on the positive frequencies and
    their complex conjugates mirrored onto the negative frequencies (Hermitian
    symmetry), which makes the inverse FFT real. The DC bin, and the Nyquist
    bin when ``dim`` is even, are fixed to 1. Both even and odd ``dim`` are
    supported; for even ``dim`` the result is the same as it has always been
    for a given ``seed``.

    Args:
        dim: Length of the vector; must be at least 1.
        seed: Seed for the NumPy generator that draws the phases.

    Returns:
        A float32 ``mx.array`` of shape ``(dim,)``.

    Raises:
        ValueError: if ``dim`` is smaller than 1.
    """
    if dim < 1:
        raise ValueError(f"dim must be a positive integer, got {dim!r}")
    rng = np.random.default_rng(seed)
    spectrum = np.ones(dim, dtype=np.complex64)
    # Number of positive-frequency bins that have a distinct mirror image:
    # everything except DC and (for even dim) the Nyquist bin.
    n_free = (dim - 1) // 2
    phases = rng.uniform(0, 2 * np.pi, n_free)
    spectrum[1 : n_free + 1] = np.exp(1j * phases)
    # Bin dim - k must be the conjugate of bin k for the signal to be real.
    spectrum[dim - n_free :] = np.conj(spectrum[1 : n_free + 1][::-1])
    return mx.array(np.fft.ifft(spectrum).real.astype(np.float32))
def update_context(context: mx.array, token: mx.array, position_key: mx.array) -> mx.array:
    """Fold ``token`` into a running ``context`` vector.

    Returns normalize(context + token (x) position_key), where ``(x)`` is
    ``bind`` and ``context`` is cast to float32 before the addition.
    """
    running = context.astype(mx.float32)
    contribution = bind(token, position_key)
    return normalize(running + contribution)

morph_hrr/morphemes.py ADDED Viewed

@@ -0,0 +1,64 @@
+"""Dependency-free English morphological segmentation.
+Splits a word into ``(prefix, root, suffix)`` using curated affix lists with
+longest-match and minimum-root guards. Deliberately simple and predictable:
+no dictionary, no learned model, no external deps. Imperfect (it won't undo
+consonant doubling like ``running -> run``) — documented as future work — but it
+handles the common derivational/inflectional cases the tokenizer relies on.
+"""
+from __future__ import annotations
# Conservative, high-precision derivational prefixes. ``segment`` tries them
# longest-first, so the listing order here does not affect matching.
PREFIXES: tuple[str, ...] = (
    "anti", "auto", "circum", "contra", "counter", "dis", "extra", "fore",
    "hyper", "hypo", "in", "im", "il", "ir", "inter", "intra", "mal", "mid",
    "mis", "multi", "non", "out", "over", "poly", "post", "pre", "proto",
    "pseudo", "re", "retro", "semi", "sub", "super", "supra", "sur", "trans",
    "tri", "ultra", "un", "under",
)

# Inflectional + common derivational suffixes, grouped by length for
# readability only: ``segment`` sorts them longest-first itself.
SUFFIXES: tuple[str, ...] = (
    # 6 chars
    "ically", "iosity", "itious", "aceous", "acious",
    # 5 chars
    "ation", "ality", "ative", "fully", "ially", "ables", "ibles", "iness",
    # 4 chars
    "able", "ably", "ance", "ator", "ence", "ency", "hood", "ible", "ical",
    "iest", "isms", "ists", "less", "ment", "ness", "ship", "tion", "wise",
    # 3 chars
    "acy", "age", "ant", "ate", "dom", "ees", "ers", "est", "ful", "ial",
    "ies", "ily", "ing", "ion", "ish", "ism", "ist", "ity", "ive", "ize",
    "ous",
    # 2 chars
    "al", "ed", "en", "er", "es", "ic", "ly", "or", "ty",
    # 1 char (high-precision only)
    "s",
)

# Keep roots at least this long after affix removal (avoids eating short words).
MIN_ROOT = 3


def _longest_first(affixes: tuple[str, ...]) -> tuple[str, ...]:
    """Return the unique, non-empty affixes ordered longest-first (ties A-Z)."""
    return tuple(sorted({a for a in affixes if a}, key=lambda a: (-len(a), a)))


_PREFIXES_BY_LEN = _longest_first(PREFIXES)
_SUFFIXES_BY_LEN = _longest_first(SUFFIXES)


def segment(word: str) -> tuple[str, str, str]:
    """Return ``(prefix, root, suffix)`` for ``word``; affixes are "" when absent.

    The word is stripped and lower-cased first. Anything that is not purely
    alphabetic (including the empty string) is returned unsplit as the root.
    Otherwise the longest listed prefix is removed, then the longest listed
    suffix, each only if at least ``MIN_ROOT`` characters of root remain.
    """
    w = (word or "").strip().lower()
    if not w.isalpha():
        return ("", w, "")

    prefix = ""
    # Longest-first so that e.g. "inter" wins over "in" and "under" over "un".
    for candidate in _PREFIXES_BY_LEN:
        if w.startswith(candidate) and len(w) - len(candidate) >= MIN_ROOT:
            prefix = candidate
            break
    root = w[len(prefix):]

    suffix = ""
    for candidate in _SUFFIXES_BY_LEN:
        if root.endswith(candidate) and len(root) - len(candidate) >= MIN_ROOT:
            suffix = candidate
            root = root[: len(root) - len(candidate)]
            break
    return (prefix, root, suffix)

morph_hrr/tokenizer.py ADDED Viewed

@@ -0,0 +1,97 @@
+"""Compositional HRR morpheme tokenizer.
+Each word is represented as a single fixed-width "holistic" vector:
+    word_vec(word) = bundle(
+        prefix_role (x) bytes(prefix),
+        root_role   (x) bytes(root),
+        suffix_role (x) bytes(suffix),
+    )
+where ``(x)`` is circular convolution (``hrr.bind``) and ``bundle`` is normalized
+superposition. Because the roles are unitary, ``unbind(word_vec, role)`` recovers
+that role's filler — enabling algebraic morpheme manipulation (strip a prefix,
+swap a suffix, build a vector for an out-of-vocabulary word from its pieces).
+NOTE: this is an input **representation / embedding**, not a HuggingFace
+text<->ID tokenizer. It emits dense vectors (one per word), intended to feed
+HRR-native or experimental models.
+"""
+from __future__ import annotations
+import re
+import mlx.core as mx
+import numpy as np
+from .hrr import bind, bundle, make_unitary, normalize
+from .morphemes import segment as _segment
# A token is a maximal run of ASCII letters, or any single non-whitespace character.
_WORD_RE = re.compile(r"[A-Za-z]+|\S")


class MorphemeTokenizer:
    """Map text -> HRR morpheme vectors of width ``dim`` (deterministic given ``seed``).

    Each word is segmented into ``(prefix, root, suffix)``. Every non-empty
    morpheme is turned into a filler vector by ``bytes_vector``, bound to the
    matching role vector, and the bound pairs are bundled into one vector.
    """

    def __init__(self, dim: int = 2048, seed: int = 0):
        """Build the character alphabet and the three role vectors.

        Args:
            dim: Width of every vector produced; must be at least 1.
            seed: Seed for the character vectors; the role vectors use
                ``seed + 1001``, ``seed + 1002`` and ``seed + 1003``.

        Raises:
            ValueError: if ``dim`` is smaller than 1.
        """
        if dim < 1:
            raise ValueError(f"dim must be a positive integer, got {dim!r}")
        self.dim = dim
        self.seed = seed
        # Fixed random base vectors, one per slot 0..255; the "filler" alphabet.
        rng = np.random.default_rng(seed)
        byte_vectors = rng.normal(size=(256, dim)).astype(np.float32)
        self.byte_vectors = normalize(mx.array(byte_vectors))
        # Unitary role vectors for prefix / root / suffix slots.
        self.prefix_role = make_unitary(dim, seed=seed + 1_001)
        self.root_role = make_unitary(dim, seed=seed + 1_002)
        self.suffix_role = make_unitary(dim, seed=seed + 1_003)

    # -- morphemes ---------------------------------------------------------
    def segment(self, word: str) -> tuple[str, str, str]:
        """Return ``(prefix, root, suffix)`` for ``word``."""
        return _segment(word)

    def bytes_vector(self, text: str) -> mx.array:
        """Fixed vector for a string: normalized fold-bind of its character vectors.

        Each character selects slot ``ord(ch) % 256``, so code points of 256
        and above share a slot with a lower code point. An empty ``text``
        yields an all-zero vector.

        NOTE: the fold uses circular convolution, which is commutative, so
        strings that are permutations of one another map to the same vector
        (up to floating-point rounding).
        """
        if not text:
            return mx.zeros((self.dim,), dtype=mx.float32)
        acc = self.byte_vectors[ord(text[0]) % 256]
        for ch in text[1:]:
            acc = bind(acc, self.byte_vectors[ord(ch) % 256])
        return normalize(acc)

    # -- composition -------------------------------------------------------
    def word_vector(self, word: str) -> mx.array:
        """Holistic HRR vector for ``word`` (composes its morphemes by role).

        Segmentation goes through ``self.segment`` so that a subclass which
        overrides it also changes how words are composed.
        """
        prefix, root, suffix = self.segment(word)
        slots = (
            (self.prefix_role, prefix),
            (self.root_role, root),
            (self.suffix_role, suffix),
        )
        # Only non-empty morphemes contribute a role (x) filler pair.
        pieces: list[mx.array] = [
            bind(role, self.bytes_vector(morpheme)) for role, morpheme in slots if morpheme
        ]
        if not pieces:
            # Nothing to compose (e.g. empty input): fall back to the raw string.
            return self.bytes_vector(word)
        return bundle(*pieces)

    # -- text -> vectors ---------------------------------------------------
    def iter_vectors(self, text: str):
        """Yield one ``float16`` word vector per token in ``text`` (word or punct)."""
        for word in _WORD_RE.findall(text):
            vector = self.word_vector(word).astype(mx.float16)
            # Force evaluation now so each yielded vector is already computed.
            mx.eval(vector)
            yield mx.stop_gradient(vector)

    def encode(self, text: str) -> mx.array:
        """Stacked word vectors, shape ``(n_words, dim)`` in ``float16``.

        Text without any token gives an empty ``(0, dim)`` array.
        """
        vectors = list(self.iter_vectors(text))
        if not vectors:
            return mx.zeros((0, self.dim), dtype=mx.float16)
        return mx.stack(vectors)


# Backwards-compatible alias.
HolographicMorphemeTokenizer = MorphemeTokenizer

morph_hrr-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,117 @@
+Metadata-Version: 2.4
+Name: morph-hrr
+Version: 0.1.0
+Summary: Compositional HRR morpheme tokenizer/embeddings: circular-convolution prefix (x) root (x) suffix, for Apple MLX.
+Project-URL: Repository, https://huggingface.co/<user>/morph-hrr
+Author: BB Claude
+License-Expression: MIT
+License-File: LICENSE
+Keywords: circular-convolution,embeddings,hrr,hyperdimensional,mlx,morphology,tokenizer,vsa
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: MacOS
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.11
+Requires-Dist: mlx
+Provides-Extra: test
+Requires-Dist: pytest; extra == 'test'
+Description-Content-Type: text/markdown
+# morph-hrr
+A **compositional morpheme tokenizer built on Holographic Reduced Representations (HRR)**, for Apple MLX.
+Each word becomes one fixed-width dense vector: each morpheme (prefix, root, suffix) is *bound* to its own role vector by circular convolution (role ⊗ filler), and the bound pairs are bundled into a single "holistic" superposition. Because the binding is circular convolution with **unitary role vectors**, you can algebraically pull a morpheme back out: `unbind(word_vec, prefix_role)` recovers a noisy copy of the prefix's vector. Words that share morphology land near each other in vector space, and an out-of-vocabulary word built from its pieces still neighbors its root family.
+## What this is — and isn't
+**It is:** an **input representation / embedding layer** for HRR-native or experimental models. It emits dense vectors, one per word, and exposes the HRR algebra (`bind`, `unbind`, `bundle`, `make_unitary`) behind them.
+**It is not:** a HuggingFace text↔ID tokenizer. It does not produce token IDs, has no vocabulary, and will not drop into `transformers`. Think of it as an alternative to a learned embedding table — one whose vectors are *compositional by construction* rather than arbitrary.
+**It is not:** a way to make a pretrained Qwen/Gemma smaller or faster. HRR input is a different representation than what pretrained weights expect; you cannot swap it into an existing model without retraining from scratch. (See *Scope & honesty* below.)
+## Install
+```bash
+pip install morph-hrr        # Apple Silicon (mlx is the only dependency)
+```
+Development:
+```bash
+git clone <repo> && cd morph-hrr
+pip install -e ".[test]"
+pytest -q
+```
+## Quickstart
+```python
+from morph_hrr import MorphemeTokenizer, unbind, cosine_similarity
+tok = MorphemeTokenizer(dim=2048)        # deterministic given seed
+# A word is one fixed-width vector, composed from its morphemes by role.
+v = tok.word_vector("unhappy")           # mx.array, shape (2048,)
+print(tok.segment("unhappy"))            # ('un', 'happy', '')
+# Composition is algebraic: recover the prefix filler by unbinding its role.
+recovered = unbind(v, tok.prefix_role)
+print(cosine_similarity(recovered, tok.bytes_vector("un")))   # ~0.6 (vs ~0 control)
+# Shared morphology => shared neighborhood.
+print(cosine_similarity(tok.word_vector("unhappy"),
+                        tok.word_vector("happy")))            # > 0.2 (vs ~0 unrelated)
+# Encode a sentence: one float16 vector per word.
+mat = tok.encode("the quick brown fox")   # mx.array, shape (4, 2048), float16
+```
+## The math (one paragraph)
+`bind(a, b)` is circular convolution, computed in the Fourier domain as `real(IFFT(FFT(a) * FFT(b)))`. `unbind(bound, b)` is circular correlation, `real(IFFT(FFT(bound) * conj(FFT(b))))`; it is an approximate inverse of binding with `b` in general and an exact inverse when `b` is unitary. A **unitary** vector has all FFT magnitudes equal to 1, so binding with it is a perfect, lossless rotation — unbinding recovers the original to >0.99 cosine. A word vector bundles three role⊗filler pairs — `prefix_role ⊗ bytes(prefix)`, `root_role ⊗ bytes(root)`, `suffix_role ⊗ bytes(suffix)` — into one normalized superposition; unbinding a role pulls its filler back out of the superposition (the other two pairs act as noise, which is why larger dims recover more cleanly).
+## Compositionality demo (regression-tested, dim=2048)
+| Property | HRR cosine | Random control | Test |
+|---|---|---|---|
+| Prefix recovery (`unbind(unhappy, prefix_role)` ≈ `bytes("un")`) | > 0.50 | ≈ 0 | `test_prefix_recovery_beats_control` |
+| Shared-root clustering (`unhappy` ~ `happy`) | > 0.20 | ≈ 0 | `test_shared_root_clusters` |
+| Shared-suffix clustering (`running` ~ `walking`) | > 0.15 | ≈ 0 | `test_shared_suffix_clusters` |
+| OOV root family (`unbind(unkind, root_role)` ≈ `bytes("kind")`) | > 0.50 | ≈ 0 | `test_oov_root_family` |
+| Role vectors near-orthogonal | < 0.10 | — | `test_roles_are_near_orthogonal` |
+Each HRR measurement is paired against a random-vector control of the same dimension, asserting the structured signal is meaningfully stronger than noise.
+## Public API
+| Name | What |
+|---|---|
+| `MorphemeTokenizer(dim=2048, seed=0)` | Map text → HRR morpheme vectors |
+| `.segment(word)` | `(prefix, root, suffix)` |
+| `.word_vector(word)` / `.bytes_vector(text)` | Holistic / byte-fold word vector |
+| `.encode(text)` / `.iter_vectors(text)` | Stacked `(n, dim)` float16 / lazy yield |
+| `.prefix_role` / `.root_role` / `.suffix_role` | Exposed unitary role vectors (for `unbind`) |
+| `bind`, `unbind`, `bundle`, `normalize`, `make_unitary`, `cosine_similarity`, `update_context` | HRR primitives |
+| `segment`, `PREFIXES`, `SUFFIXES` | Standalone morphological segmentation |
+`HolographicMorphemeTokenizer` is kept as a backwards-compatible alias for `MorphemeTokenizer`.
+## Segmentation: deliberately simple
+`morphemes.segment` is dependency-free, longest-match affix stripping over curated prefix/suffix lists, with a minimum-root guard. It is **imperfect by design**: it does not undo consonant doubling (`running → runn + ing`, not `run + ing`) and cannot tell a real root from a suffixable tail (`preorder → pre + ord + er`). No dictionary, no learned model, no `nltk`. This keeps the package lightweight; better segmentation is future work and is orthogonal to the HRR representation itself.
+## Scope & honesty
+- **mlx-only.** Apple Silicon target audience; a numpy/JAX backend is a documented future option, not this release.
+- **Representation, not a drop-in tokenizer.** No token IDs, no HF integration.
+- **Can't retrofit pretrained models.** Swapping HRR input into a real model throws away its pretrained weights — you must train from scratch. On consumer hardware (e.g. 16 GB) that means small research-scale models, not deployable ones. The value of this package is the *compositional input representation* and the HRR algebra, demonstrated on small models.
+- v0.1 ships the tokenizer + primitives + tests. A demo Space, a trained reference model, and a portable backend are future work.
+## License
+MIT. See `LICENSE`.
+## Cite / priority
+If you build on this representation in published work, please cite this repository. The compositional HRR morpheme representation (role⊗filler binding of prefix/root/suffix via circular convolution, with unitary roles enabling algebraic morpheme manipulation) is, to our knowledge, novel as a tokenization scheme; we'd appreciate attribution.

morph_hrr-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,8 @@
+morph_hrr/__init__.py,sha256=oTvvhqBRWT-r97txJxzZCU-jL18WFaT2fSrCwXViUG8,855
+morph_hrr/hrr.py,sha256=rgQGhcXKzuX6IWGsA78KMSE8wL3Yp6qfKtodZnGTe_k,2523
+morph_hrr/morphemes.py,sha256=F00Kc-oMnCB1Ift9ERtdEXJ7VyyqR2_YTxP8oKPri_Q,2475
+morph_hrr/tokenizer.py,sha256=vePQUnNmqJEfcMb_Yi-lC5MHBJGoDJBKOf4AkhSkC-g,3775
+morph_hrr-0.1.0.dist-info/METADATA,sha256=JWEyvYAlCBL27IhE784S4FHnbnRF6sV0VXV72q32h7g,7013
+morph_hrr-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+morph_hrr-0.1.0.dist-info/licenses/LICENSE,sha256=r8hvcnlc5fQIW8wMWifOEa6vCAbLFbadC3lnuLxhSfQ,1066
+morph_hrr-0.1.0.dist-info/RECORD,,

morph_hrr-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any

morph_hrr-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 BB Claude
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.