morph-hrr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
morph_hrr/__init__.py ADDED
@@ -0,0 +1,37 @@
1
+ """morph-hrr: compositional HRR morpheme tokenizer for Apple MLX.
2
+
3
+ A word becomes one fixed-width vector via circular-convolution composition of its
4
+ morphemes (prefix (x) root (x) suffix), giving algebraically manipulable,
5
+ out-of-vocabulary-friendly token representations.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ __version__ = "0.1.0"
10
+
11
+ from .hrr import (
12
+ bind,
13
+ bundle,
14
+ cosine_similarity,
15
+ make_unitary,
16
+ normalize,
17
+ unbind,
18
+ update_context,
19
+ )
20
+ from .morphemes import PREFIXES, SUFFIXES, segment
21
+ from .tokenizer import HolographicMorphemeTokenizer, MorphemeTokenizer
22
+
23
+ __all__ = [
24
+ "MorphemeTokenizer",
25
+ "HolographicMorphemeTokenizer",
26
+ "segment",
27
+ "PREFIXES",
28
+ "SUFFIXES",
29
+ "bind",
30
+ "unbind",
31
+ "bundle",
32
+ "normalize",
33
+ "make_unitary",
34
+ "cosine_similarity",
35
+ "update_context",
36
+ "__version__",
37
+ ]
morph_hrr/hrr.py ADDED
@@ -0,0 +1,64 @@
1
+ """Holographic Reduced Representation (HRR) primitives on Apple MLX.
2
+
3
+ Binding/unbinding are circular (de)convolution in the FFT domain (Plate, 1995).
4
+ These are the building blocks the morpheme tokenizer uses to compose
5
+ ``prefix (x) root (x) suffix`` into a single fixed-width "holistic" token vector.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import mlx.core as mx
10
+ import numpy as np
11
+
12
+
13
def normalize(x: mx.array, eps: float = 1e-6) -> mx.array:
    """Scale ``x`` to (approximately) unit L2 norm along its last axis.

    The input is cast to float32 before any arithmetic. ``eps`` is added to
    the squared norm *inside* the square root, so with the default ``eps`` an
    all-zero input comes back as zeros rather than NaNs.
    """
    values = x.astype(mx.float32)
    squared_norm = mx.sum(values * values, axis=-1, keepdims=True)
    return values / mx.sqrt(squared_norm + eps)
17
+
18
+
19
def _bind(a: mx.array, b: mx.array) -> mx.array:
    """Bind two vectors by circular convolution.

    Both operands are cast to float32, multiplied element-wise in the Fourier
    domain, and the real part of the inverse transform is returned:
    ``real(IFFT(FFT(a) * FFT(b)))``.
    """
    spectrum_a = mx.fft.fft(a.astype(mx.float32))
    spectrum_b = mx.fft.fft(b.astype(mx.float32))
    return mx.real(mx.fft.ifft(spectrum_a * spectrum_b))
22
+
23
+
24
def _unbind(bound: mx.array, key: mx.array) -> mx.array:
    """Unbind ``key`` from ``bound`` by circular correlation.

    This is the approximate inverse of ``_bind``: the spectrum of ``bound`` is
    multiplied by the complex conjugate of the spectrum of ``key`` and the real
    part of the inverse transform is returned:
    ``real(IFFT(FFT(bound) * conj(FFT(key))))``.
    """
    bound_spectrum = mx.fft.fft(bound.astype(mx.float32))
    key_spectrum = mx.fft.fft(key.astype(mx.float32))
    return mx.real(mx.fft.ifft(bound_spectrum * mx.conj(key_spectrum)))
29
+
30
+
31
# Public entry points: the private implementations above wrapped in
# ``mx.compile`` because they sit on the hot path.
bind, unbind = mx.compile(_bind), mx.compile(_unbind)
34
+
35
+
36
def bundle(*vectors: mx.array) -> mx.array:
    """Superpose vectors (the VSA "add"): sum them in float32, then normalize.

    Raises:
        ValueError: if called with no vectors.
    """
    if len(vectors) == 0:
        raise ValueError("bundle requires at least one vector")
    total = 0
    for vector in vectors:
        total = total + vector.astype(mx.float32)
    return normalize(total)
41
+
42
+
43
def cosine_similarity(a: mx.array, b: mx.array) -> mx.array:
    """Cosine similarity of ``a`` and ``b``, reduced over the last axis.

    Both inputs are passed through ``normalize`` first, so they do not need to
    be unit-length already.
    """
    unit_a = normalize(a)
    unit_b = normalize(b)
    return mx.sum(unit_a * unit_b, axis=-1)
46
+
47
+
48
def make_unitary(dim: int, seed: int = 0) -> mx.array:
    """Return a deterministic real unitary vector of length ``dim``.

    "Unitary" means every FFT coefficient has magnitude 1, so binding with the
    vector is invertible (up to floating-point error): ``unbind`` with the same
    vector recovers the bound value.

    The spectrum is built with Hermitian symmetry (``S[dim - k] == conj(S[k])``)
    so that its inverse FFT is real. Only bins ``1 .. (dim - 1) // 2`` get a
    random phase; the DC bin, and the Nyquist bin when ``dim`` is even, must be
    real and are left at 1. Sizing the mirrored half from ``(dim - 1) // 2``
    rather than ``dim // 2`` is what makes odd ``dim`` work; for even ``dim``
    the two agree, so results for a given seed are unchanged.

    Args:
        dim: Vector length; must be at least 1.
        seed: Seed for the NumPy generator that draws the phases.

    Returns:
        A float32 ``mx.array`` of shape ``(dim,)``.

    Raises:
        ValueError: if ``dim`` is less than 1.
    """
    if dim < 1:
        raise ValueError(f"dim must be a positive integer, got {dim!r}")

    rng = np.random.default_rng(seed)
    spectrum = np.ones(dim, dtype=np.complex64)

    # Number of bins that have a distinct conjugate partner.
    n_free = (dim - 1) // 2
    phases = rng.uniform(0, 2 * np.pi, n_free)
    spectrum[1 : n_free + 1] = np.exp(1j * phases)
    # Mirror the free bins into the upper part of the spectrum, conjugated.
    spectrum[dim - n_free :] = np.conj(spectrum[1 : n_free + 1][::-1])

    return mx.array(np.fft.ifft(spectrum).real.astype(np.float32))
60
+
61
+
62
def update_context(context: mx.array, token: mx.array, position_key: mx.array) -> mx.array:
    """Fold one token into a running context vector.

    Computes ``normalize(context + token (x) position_key)``: the token is
    bound to its position key, added to the float32 context, and the sum is
    re-normalized.
    """
    running = context.astype(mx.float32)
    positioned_token = bind(token, position_key)
    return normalize(running + positioned_token)
morph_hrr/morphemes.py ADDED
@@ -0,0 +1,64 @@
1
+ """Dependency-free English morphological segmentation.
2
+
3
+ Splits a word into ``(prefix, root, suffix)`` using curated affix lists with
4
+ longest-match and minimum-root guards. Deliberately simple and predictable:
5
+ no dictionary, no learned model, no external deps. Imperfect (it won't undo
6
+ consonant doubling like ``running -> run``) — documented as future work — but it
7
+ handles the common derivational/inflectional cases the tokenizer relies on.
8
+ """
9
+ from __future__ import annotations
10
+
11
# Conservative, high-precision derivational prefixes (min root kept >= MIN_ROOT).
PREFIXES: tuple[str, ...] = (
    "anti", "auto", "circum", "contra", "counter", "dis", "extra", "fore",
    "hyper", "hypo", "in", "im", "il", "ir", "inter", "intra", "mal", "mid",
    "mis", "multi", "non", "out", "over", "poly", "post", "pre", "proto",
    "pseudo", "re", "retro", "semi", "sub", "super", "supra", "sur", "trans",
    "tri", "ultra", "un", "under",
)

# Inflectional + common derivational suffixes, grouped by length. The grouping
# is for readers only: matching order comes from _SUFFIXES_BY_LEN below.
SUFFIXES: tuple[str, ...] = (
    # 5+ chars
    "ically", "iosity", "itious", "aceous", "acious",
    "ation", "ality", "ative", "fully", "ially", "ables", "ibles", "iness",
    # 4 chars
    "able", "ably", "ance", "ator", "ence", "ency", "hood", "ible", "ical",
    "iest", "isms", "ists", "less", "ment", "ness", "ship", "tion", "wise",
    # 3 chars
    "acy", "age", "ant", "ate", "dom", "ees", "ers", "est", "ful", "ial",
    "ies", "ily", "ing", "ion", "ish", "ism", "ist", "ity", "ive", "ize",
    "ous",
    # 2 chars
    "al", "ed", "en", "er", "es", "ic", "ly", "or", "ty",
    # 1 char (high-precision only)
    "s",
)

# Keep roots at least this long after affix removal (avoids eating short words).
MIN_ROOT = 3


def _longest_first(affixes: tuple[str, ...]) -> tuple[str, ...]:
    """De-duplicate ``affixes`` and order them longest-first (ties alphabetical)."""
    return tuple(sorted({a for a in affixes if a}, key=lambda a: (-len(a), a)))


# Affixes in the order they are tried. Longest-first matters for prefixes too:
# otherwise "in", "un" and "re" would always win over "inter"/"intra", "under"
# and "retro", which could then never be selected.
_PREFIXES_BY_LEN = _longest_first(PREFIXES)
_SUFFIXES_BY_LEN = _longest_first(SUFFIXES)


def segment(word: str) -> tuple[str, str, str]:
    """Return ``(prefix, root, suffix)`` for ``word``; affixes are "" when absent.

    The word is stripped and lower-cased first. Anything that is not purely
    alphabetic (including the empty string) is returned unsegmented as
    ``("", cleaned_word, "")``. Otherwise the longest matching prefix is
    removed, then the longest matching suffix; an affix is only removed if at
    least ``MIN_ROOT`` characters remain afterwards.
    """
    w = (word or "").strip().lower()
    if not w.isalpha():
        return ("", w, "")

    prefix = next(
        (p for p in _PREFIXES_BY_LEN
         if w.startswith(p) and len(w) - len(p) >= MIN_ROOT),
        "",
    )
    root = w[len(prefix):]

    suffix = next(
        (s for s in _SUFFIXES_BY_LEN
         if root.endswith(s) and len(root) - len(s) >= MIN_ROOT),
        "",
    )
    if suffix:
        root = root[: -len(suffix)]

    return (prefix, root, suffix)
morph_hrr/tokenizer.py ADDED
@@ -0,0 +1,97 @@
1
+ """Compositional HRR morpheme tokenizer.
2
+
3
+ Each word is represented as a single fixed-width "holistic" vector:
4
+
5
+ word_vec(word) = bundle(
6
+ prefix_role (x) bytes(prefix),
7
+ root_role (x) bytes(root),
8
+ suffix_role (x) bytes(suffix),
9
+ )
10
+
11
+ where ``(x)`` is circular convolution (``hrr.bind``) and ``bundle`` is normalized
12
+ superposition. Because the roles are unitary, ``unbind(word_vec, role)`` recovers
13
+ that role's filler — enabling algebraic morpheme manipulation (strip a prefix,
14
+ swap a suffix, build a vector for an out-of-vocabulary word from its pieces).
15
+
16
+ NOTE: this is an input **representation / embedding**, not a HuggingFace
17
+ text<->ID tokenizer. It emits dense vectors (one per word), intended to feed
18
+ HRR-native or experimental models.
19
+ """
20
+ from __future__ import annotations
21
+
22
+ import re
23
+
24
+ import mlx.core as mx
25
+ import numpy as np
26
+
27
+ from .hrr import bind, bundle, make_unitary, normalize
28
+ from .morphemes import segment as _segment
29
+
30
# A token is a maximal run of ASCII letters, or any single non-whitespace character.
_WORD_RE = re.compile(r"[A-Za-z]+|\S")


class MorphemeTokenizer:
    """Map text -> HRR morpheme vectors of width ``dim`` (deterministic given ``seed``).

    Attributes are plain ``mx.array`` values so callers can use them with
    ``unbind``: ``byte_vectors`` holds one normalized row per byte value, and
    ``prefix_role`` / ``root_role`` / ``suffix_role`` are the unitary role
    vectors for the three morpheme slots.
    """

    def __init__(self, dim: int = 2048, seed: int = 0):
        """Build the byte alphabet and the three role vectors.

        Args:
            dim: Width of every vector produced; must be at least 1.
            seed: Base seed. The byte alphabet uses ``seed`` itself and the
                roles use ``seed + 1001``, ``+ 1002`` and ``+ 1003``.

        Raises:
            ValueError: if ``dim`` is less than 1.
        """
        if dim < 1:
            raise ValueError(f"dim must be a positive integer, got {dim!r}")
        self.dim = dim
        self.seed = seed
        # Fixed random base vectors for each byte value; the "filler" alphabet.
        rng = np.random.default_rng(seed)
        byte_vectors = rng.normal(size=(256, dim)).astype(np.float32)
        self.byte_vectors = normalize(mx.array(byte_vectors))
        # Unitary role vectors for prefix / root / suffix slots.
        self.prefix_role = make_unitary(dim, seed=seed + 1_001)
        self.root_role = make_unitary(dim, seed=seed + 1_002)
        self.suffix_role = make_unitary(dim, seed=seed + 1_003)

    # -- morphemes ---------------------------------------------------------

    def segment(self, word: str) -> tuple[str, str, str]:
        """Return ``(prefix, root, suffix)`` for ``word``."""
        return _segment(word)

    def bytes_vector(self, text: str) -> mx.array:
        """Fixed vector for a string: normalized fold-bind of its byte vectors.

        ``text`` is encoded as UTF-8 and the vectors of its bytes are bound
        together left to right. Encoding first (rather than reducing each code
        point modulo 256) keeps non-ASCII characters from aliasing onto
        unrelated byte vectors; ASCII text gives the same result either way.
        An empty string maps to the all-zero vector.

        NOTE: binding is an element-wise product in the Fourier domain, which is
        commutative, so this fold does not encode byte order — strings that are
        permutations of the same bytes map to the same vector up to
        floating-point error.
        """
        # "surrogatepass" keeps lone surrogates from raising during encoding.
        data = text.encode("utf-8", errors="surrogatepass")
        if not data:
            return mx.zeros((self.dim,), dtype=mx.float32)
        acc = self.byte_vectors[data[0]]
        for byte in data[1:]:
            acc = bind(acc, self.byte_vectors[byte])
        return normalize(acc)

    # -- composition -------------------------------------------------------

    def word_vector(self, word: str) -> mx.array:
        """Holistic HRR vector for ``word`` (composes its morphemes by role).

        Each non-empty morpheme is bound to its slot's role vector and the
        results are bundled. If segmentation yields no morpheme at all, the
        byte-fold vector of the raw ``word`` is returned instead.
        """
        prefix, root, suffix = _segment(word)
        slots = (
            (self.prefix_role, prefix),
            (self.root_role, root),
            (self.suffix_role, suffix),
        )
        pieces: list[mx.array] = [
            bind(role, self.bytes_vector(morpheme))
            for role, morpheme in slots
            if morpheme
        ]
        if not pieces:
            return self.bytes_vector(word)
        return bundle(*pieces)

    # -- text -> vectors ---------------------------------------------------

    def iter_vectors(self, text: str):
        """Yield one ``float16`` word vector per token in ``text`` (word or punct)."""
        for word in _WORD_RE.findall(text):
            vector = self.word_vector(word).astype(mx.float16)
            # Force evaluation now so each yielded vector is already computed.
            mx.eval(vector)
            yield mx.stop_gradient(vector)

    def encode(self, text: str) -> mx.array:
        """Stacked word vectors, shape ``(n_words, dim)`` in ``float16``.

        Text with no tokens gives an empty ``(0, dim)`` array.
        """
        vectors = list(self.iter_vectors(text))
        if not vectors:
            return mx.zeros((0, self.dim), dtype=mx.float16)
        return mx.stack(vectors)


# Backwards-compatible alias.
HolographicMorphemeTokenizer = MorphemeTokenizer
@@ -0,0 +1,117 @@
1
+ Metadata-Version: 2.4
2
+ Name: morph-hrr
3
+ Version: 0.1.0
4
+ Summary: Compositional HRR morpheme tokenizer/embeddings: circular-convolution prefix (x) root (x) suffix, for Apple MLX.
5
+ Project-URL: Repository, https://huggingface.co/<user>/morph-hrr
6
+ Author: BB Claude
7
+ License-Expression: MIT
8
+ License-File: LICENSE
9
+ Keywords: circular-convolution,embeddings,hrr,hyperdimensional,mlx,morphology,tokenizer,vsa
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: MacOS
12
+ Classifier: Programming Language :: Python :: 3
13
+ Requires-Python: >=3.11
14
+ Requires-Dist: mlx
15
+ Provides-Extra: test
16
+ Requires-Dist: pytest; extra == 'test'
17
+ Description-Content-Type: text/markdown
18
+
19
+ # morph-hrr
20
+
21
+ A **compositional morpheme tokenizer built on Holographic Reduced Representations (HRR)**, for Apple MLX.
22
+
23
+ Each word becomes one fixed-width dense vector by *binding* its morphemes (prefix ⊗ root ⊗ suffix) into a single "holistic" superposition. Because the binding is circular convolution with **unitary role vectors**, you can algebraically pull a morpheme back out: `unbind(word_vec, prefix_role)` recovers a noisy but recognizable copy of the prefix's filler vector. Words that share morphology land near each other in vector space, and an out-of-vocabulary word built from its pieces still neighbors its root family.
24
+
25
+ ## What this is — and isn't
26
+
27
+ **It is:** an **input representation / embedding layer** for HRR-native or experimental models. It emits dense vectors, one per word, and exposes the HRR algebra (`bind`, `unbind`, `bundle`, `make_unitary`) behind them.
28
+
29
+ **It is not:** a HuggingFace text↔ID tokenizer. It does not produce token IDs, has no vocabulary, and will not drop into `transformers`. Think of it as an alternative to a learned embedding table — one whose vectors are *compositional by construction* rather than arbitrary.
30
+
31
+ **It is not:** a way to make a pretrained Qwen/Gemma smaller or faster. HRR input is a different representation than what pretrained weights expect; you cannot swap it into an existing model without retraining from scratch. (See *Scope & honesty* below.)
32
+
33
+ ## Install
34
+
35
+ ```bash
36
+ pip install morph-hrr numpy  # Apple Silicon; mlx is the only declared dependency, but the package also imports numpy
37
+ ```
38
+
39
+ Development:
40
+
41
+ ```bash
42
+ git clone <repo> && cd morph-hrr
43
+ pip install -e ".[test]"
44
+ pytest -q
45
+ ```
46
+
47
+ ## Quickstart
48
+
49
+ ```python
50
+ from morph_hrr import MorphemeTokenizer, unbind, cosine_similarity
51
+
52
+ tok = MorphemeTokenizer(dim=2048) # deterministic given seed
53
+
54
+ # A word is one fixed-width vector, composed from its morphemes by role.
55
+ v = tok.word_vector("unhappy") # mx.array, shape (2048,)
56
+ print(tok.segment("unhappy")) # ('un', 'happy', '')
57
+
58
+ # Composition is algebraic: recover the prefix filler by unbinding its role.
59
+ recovered = unbind(v, tok.prefix_role)
60
+ print(cosine_similarity(recovered, tok.bytes_vector("un"))) # ~0.6 (vs ~0 control)
61
+
62
+ # Shared morphology => shared neighborhood.
63
+ print(cosine_similarity(tok.word_vector("unhappy"),
64
+ tok.word_vector("happy"))) # > 0.2 (vs ~0 unrelated)
65
+
66
+ # Encode a sentence: one float16 vector per word.
67
+ mat = tok.encode("the quick brown fox") # mx.array, shape (4, 2048), float16
68
+ ```
69
+
70
+ ## The math (one paragraph)
71
+
72
+ `bind(a, b)` is circular convolution, computed in the Fourier domain as `real(IFFT(FFT(a) * FFT(b)))`. `unbind(bound, b)` is circular correlation, `real(IFFT(FFT(bound) * conj(FFT(b))))`; it inverts binding with `b` approximately in general and exactly when `b` is unitary. A **unitary** vector has all FFT magnitudes equal to 1, so binding with it is a perfect, lossless rotation — unbinding recovers the original to >0.99 cosine. A word vector bundles up to three role⊗filler pairs — `prefix_role ⊗ bytes(prefix)`, `root_role ⊗ bytes(root)`, `suffix_role ⊗ bytes(suffix)`, one per non-empty morpheme — into one normalized superposition; unbinding a role pulls its filler back out of the superposition (the other pairs act as noise, which is why larger dims recover more cleanly).
73
+
74
+ ## Compositionality demo (regression-tested, dim=2048)
75
+
76
+ | Property | HRR cosine | Random control | Test |
77
+ |---|---|---|---|
78
+ | Prefix recovery (`unbind(unhappy, prefix_role)` ≈ `bytes("un")`) | > 0.50 | ≈ 0 | `test_prefix_recovery_beats_control` |
79
+ | Shared-root clustering (`unhappy` ~ `happy`) | > 0.20 | ≈ 0 | `test_shared_root_clusters` |
80
+ | Shared-suffix clustering (`running` ~ `walking`) | > 0.15 | ≈ 0 | `test_shared_suffix_clusters` |
81
+ | OOV root family (`unbind(unkind, root_role)` ≈ `bytes("kind")`) | > 0.50 | ≈ 0 | `test_oov_root_family` |
82
+ | Role vectors near-orthogonal | < 0.10 | — | `test_roles_are_near_orthogonal` |
83
+
84
+ Each HRR measurement is paired against a random-vector control of the same dimension, asserting the structured signal is meaningfully stronger than noise.
85
+
86
+ ## Public API
87
+
88
+ | Name | What |
89
+ |---|---|
90
+ | `MorphemeTokenizer(dim=2048, seed=0)` | Map text → HRR morpheme vectors |
91
+ | `.segment(word)` | `(prefix, root, suffix)` |
92
+ | `.word_vector(word)` / `.bytes_vector(text)` | Holistic / byte-fold word vector |
93
+ | `.encode(text)` / `.iter_vectors(text)` | Stacked `(n, dim)` float16 / lazy yield |
94
+ | `.prefix_role` / `.root_role` / `.suffix_role` | Exposed unitary role vectors (for `unbind`) |
95
+ | `bind`, `unbind`, `bundle`, `normalize`, `make_unitary`, `cosine_similarity`, `update_context` | HRR primitives |
96
+ | `segment`, `PREFIXES`, `SUFFIXES` | Standalone morphological segmentation |
97
+
98
+ `HolographicMorphemeTokenizer` is kept as a backwards-compatible alias for `MorphemeTokenizer`.
99
+
100
+ ## Segmentation: deliberately simple
101
+
102
+ `morphemes.segment` is dependency-free, longest-match affix stripping over curated prefix/suffix lists, with a minimum-root guard. It is **imperfect by design**: it does not undo consonant doubling (`running → runn + ing`, not `run + ing`) and cannot tell a real root from a suffixable tail (`preorder → pre + ord + er`). No dictionary, no learned model, no `nltk`. This keeps the package lightweight; better segmentation is future work and is orthogonal to the HRR representation itself.
103
+
104
+ ## Scope & honesty
105
+
106
+ - **mlx-only.** Apple Silicon target audience; a numpy/JAX backend is a documented future option, not this release.
107
+ - **Representation, not a drop-in tokenizer.** No token IDs, no HF integration.
108
+ - **Can't retrofit pretrained models.** Swapping HRR input into a real model throws away its pretrained weights — you must train from scratch. On consumer hardware (e.g. 16 GB) that means small research-scale models, not deployable ones. The value of this package is the *compositional input representation* and the HRR algebra, demonstrated on small models.
109
+ - v0.1 ships the tokenizer + primitives + tests. A demo Space, a trained reference model, and a portable backend are future work.
110
+
111
+ ## License
112
+
113
+ MIT. See `LICENSE`.
114
+
115
+ ## Cite / priority
116
+
117
+ If you build on this representation in published work, please cite this repository. The compositional HRR morpheme representation (role⊗filler binding of prefix/root/suffix via circular convolution, with unitary roles enabling algebraic morpheme manipulation) is, to our knowledge, novel as a tokenization scheme; we'd appreciate attribution.
@@ -0,0 +1,8 @@
1
+ morph_hrr/__init__.py,sha256=oTvvhqBRWT-r97txJxzZCU-jL18WFaT2fSrCwXViUG8,855
2
+ morph_hrr/hrr.py,sha256=rgQGhcXKzuX6IWGsA78KMSE8wL3Yp6qfKtodZnGTe_k,2523
3
+ morph_hrr/morphemes.py,sha256=F00Kc-oMnCB1Ift9ERtdEXJ7VyyqR2_YTxP8oKPri_Q,2475
4
+ morph_hrr/tokenizer.py,sha256=vePQUnNmqJEfcMb_Yi-lC5MHBJGoDJBKOf4AkhSkC-g,3775
5
+ morph_hrr-0.1.0.dist-info/METADATA,sha256=JWEyvYAlCBL27IhE784S4FHnbnRF6sV0VXV72q32h7g,7013
6
+ morph_hrr-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
7
+ morph_hrr-0.1.0.dist-info/licenses/LICENSE,sha256=r8hvcnlc5fQIW8wMWifOEa6vCAbLFbadC3lnuLxhSfQ,1066
8
+ morph_hrr-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 BB Claude
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.