primoji-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- primoji-0.1.0/PKG-INFO +256 -0
- primoji-0.1.0/README.md +212 -0
- primoji-0.1.0/primoji/__init__.py +22 -0
- primoji-0.1.0/primoji/alias_map.py +104 -0
- primoji-0.1.0/primoji/byte_fallback.py +80 -0
- primoji-0.1.0/primoji/composer.py +209 -0
- primoji-0.1.0/primoji/data/common_words.json +7738 -0
- primoji-0.1.0/primoji/data/dictionary_seed.json +323308 -0
- primoji-0.1.0/primoji/data/emoji_catalog.json +22504 -0
- primoji-0.1.0/primoji/data/primitives.json +1165 -0
- primoji-0.1.0/primoji/data/proper_noun_anchors.json +3007 -0
- primoji-0.1.0/primoji/decoder.py +208 -0
- primoji-0.1.0/primoji/dictionary.py +278 -0
- primoji-0.1.0/primoji/fuzzy.py +137 -0
- primoji-0.1.0/primoji/math_handler.py +184 -0
- primoji-0.1.0/primoji/preprocessor.py +169 -0
- primoji-0.1.0/primoji/primitives.py +99 -0
- primoji-0.1.0/primoji/tokenizer.py +268 -0
- primoji-0.1.0/primoji/utils.py +234 -0
- primoji-0.1.0/primoji/vocabulary.py +321 -0
- primoji-0.1.0/primoji.egg-info/PKG-INFO +256 -0
- primoji-0.1.0/primoji.egg-info/SOURCES.txt +37 -0
- primoji-0.1.0/primoji.egg-info/dependency_links.txt +1 -0
- primoji-0.1.0/primoji.egg-info/requires.txt +22 -0
- primoji-0.1.0/primoji.egg-info/top_level.txt +1 -0
- primoji-0.1.0/pyproject.toml +90 -0
- primoji-0.1.0/setup.cfg +4 -0
- primoji-0.1.0/tests/test_alias_map.py +89 -0
- primoji-0.1.0/tests/test_bpb.py +97 -0
- primoji-0.1.0/tests/test_byte_fallback_coverage.py +117 -0
- primoji-0.1.0/tests/test_chat.py +108 -0
- primoji-0.1.0/tests/test_classify_token.py +98 -0
- primoji-0.1.0/tests/test_compositional_embedding.py +107 -0
- primoji-0.1.0/tests/test_coverage.py +83 -0
- primoji-0.1.0/tests/test_encode_decode.py +328 -0
- primoji-0.1.0/tests/test_id_ranges.py +154 -0
- primoji-0.1.0/tests/test_invariants.py +154 -0
- primoji-0.1.0/tests/test_primitives_and_vocab.py +129 -0
- primoji-0.1.0/tests/test_stress.py +173 -0
primoji-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,256 @@

Metadata-Version: 2.4
Name: primoji
Version: 0.1.0
Summary: A compositional semantic tokenizer for LLMs based on NSM primitives, emoji, and common word tokens
Author-email: Frane Bandov <frane@offbyte.com>
License: Apache-2.0
Project-URL: Homepage, https://github.com/frane/primoji
Project-URL: Documentation, https://github.com/frane/primoji#readme
Project-URL: Repository, https://github.com/frane/primoji
Project-URL: Issues, https://github.com/frane/primoji/issues
Keywords: tokenizer,tokenization,nlp,llm,language-model,semantic,compositional,emoji,nsm
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Science/Research
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Text Processing :: Linguistic
Classifier: Typing :: Typed
Requires-Python: >=3.10
Description-Content-Type: text/markdown
Requires-Dist: emoji>=2.14
Provides-Extra: dev
Requires-Dist: pytest>=8.3; extra == "dev"
Requires-Dist: black>=24.10; extra == "dev"
Requires-Dist: ruff>=0.8; extra == "dev"
Provides-Extra: build-dict
Requires-Dist: spacy>=3.8; extra == "build-dict"
Requires-Dist: wordfreq>=3.0; extra == "build-dict"
Provides-Extra: train
Requires-Dist: torch>=2.5; extra == "train"
Requires-Dist: tokenizers>=0.21; extra == "train"
Requires-Dist: datasets>=3.0; extra == "train"
Requires-Dist: numpy>=1.26; extra == "train"
Provides-Extra: spelling
Requires-Dist: symspellpy>=6.7; extra == "spelling"
Provides-Extra: all
Requires-Dist: primoji[build-dict,dev,spelling,train]; extra == "all"

# Primoji

A compositional semantic tokenizer for language models, built from Natural
Semantic Metalanguage (NSM) primitives, Unicode emoji, common word tokens,
and a UTF-8 byte fallback.

```python
from primoji import Tokenizer

tok = Tokenizer()
ids = tok.encode("The teacher explained photosynthesis")
tok.decode(ids)
# 'teacher say photosynthesis'

tok.vocab_size
# 10195
```

## Install

```bash
pip install primoji
```

Requires Python 3.10+. The only runtime dependency is `emoji`. Optional extras
cover the dictionary build pipeline (`spacy`, `wordfreq`), the training scripts
(`torch`, `tokenizers`, `datasets`), and fuzzy matching (`symspellpy`):

```bash
pip install "primoji[build-dict]"  # to rebuild the dictionary from sources
pip install "primoji[train]"       # to run the training scripts
pip install "primoji[spelling]"    # to enable conservative SymSpell fuzzy matching
pip install "primoji[all]"         # everything (also includes dev tools)
```

## What it is

Primoji is a tokenizer that constructs token meanings compositionally. Instead
of discovering subword units from corpus statistics like BPE, it ships with a
fixed compositional vocabulary of about 10,000 tokens: 1,175 Unicode emoji,
140 NSM semantic primitives, ~7,700 common word tokens, ~830 structural and
geographic tokens (flags, anchors, punctuation, math), and 258 byte-fallback
tokens. Rare and technical words decompose into short sequences of primitives
(`[PLANT, HAVE, LIGHT]` for photosynthesis, `[WATER, CAUSE, AIR]` for
evaporation). The encode pipeline never produces an UNK token.
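
As a concrete illustration, a composed word can be inspected primitive by
primitive with the `encode` and `describe` calls shown in the quick start
below. The exact IDs and description strings depend on the shipped
dictionary, so the commented output here is only indicative:

```python
from primoji import Tokenizer

tok = Tokenizer()
for tid in tok.encode("photosynthesis"):
    print(tok.describe(tid))
# One line per primitive in the composition; per the mapping above this
# should surface the PLANT, HAVE and LIGHT primitives (IDs may differ).
```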

When you might want it: research on tokenization, controlled experiments
that need a vocabulary 3x smaller than BPE at near-parity compression,
multilingual experiments where the primitive set transfers across languages
by design, small-vocab architectures whose cost scales with vocabulary size,
or educational and scientific text where compositional technical vocabulary
is dense. It is not a drop-in BPE replacement for production English chatbots:
generation is lossy at the word level (decoded text reconstructs concepts but
not always exact surface forms), and creative or stylistic writing loses
distinctions like "melancholy" vs "sadness" that BPE preserves as distinct
tokens.

## Quick start

```python
from primoji import Tokenizer

tok = Tokenizer()

# Encode and decode
ids = tok.encode("Water evaporates when heated")
print(ids)
# [1266, ..., ...]
print(tok.decode(ids))

# Inspect a token
print(tok.describe(ids[0]))
# '💧 (ID 1266) — Tier 2 primitive: WATER — Liquid, fluid'

# Classify a word by which tier handles it
print(tok.classify_word("photosynthesis"))  # 'dict_composed'
print(tok.classify_word("water"))           # 'tier2_primitive'
print(tok.classify_word("the"))             # 'dict_dropped'
print(tok.classify_word("Mediterranean"))   # 'tier3_anchor' or 'byte_fallback'

# Vocabulary size
print(tok.vocab_size)  # 10195
```

## Vocabulary structure

Four tiers with strict encoding precedence. The model only ever sees integer
IDs; the tier labels exist for diagnostics and the dictionary build.

| Tier | Count | Examples |
|------|------:|----------|
| 1a Direct emoji | ~1,175 | 🐕 (dog), 🏠 (house), ☀️ (sun) |
| 1b Common word tokens | ~7,700 | government, important, however |
| 2 Compositional primitives | 140 | PLANT, HAVE, LIGHT, MOVE, KNOW |
| 3 Structural and geographic | ~830 | 🇩🇪, 🇫🇷, punctuation, digits, math operators, NER anchors |
| 4 UTF-8 byte fallback | 258 | 256 byte tokens + 2 boundary markers |

The 140 primitives are 65 canonical NSM primes from Wierzbicka's Natural
Semantic Metalanguage plus 75 domain expansions (perceptual, grammatical,
scientific, social). Country flags act as geographic modifiers in
compositions. Anchors are the top ~500 proper nouns extracted from
FineWeb-Edu via spaCy NER.

A note on emoji: the model never sees glyphs, only integer IDs. Tier 1a is
just a convenient source of ~1,200 visually distinct token slots for concrete
nouns. In generated output, emoji tokens decode to their English words
("dog", "house", "sun"), not to Unicode emoji.
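
A minimal check of that behaviour, assuming "dog" resolves to its tier-1a
token as the table suggests (illustrative, not a verbatim transcript):

```python
from primoji import Tokenizer

tok = Tokenizer()
print(tok.decode(tok.encode("dog")))
# Expected: 'dog' (the word behind the 🐕 slot), never the emoji glyph itself.
```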

## Configuration

The dictionary, primitive set, and word list are all swappable. Five tunable
dimensions:

1. **Vocabulary size.** From ~5,300 tokens (~6% composed text) to ~18,000
   tokens (~2% composed). Controls the trade-off between compression and
   semantic-structure signal.
2. **Composition depth.** Maximum number of primitives per concept. Default 5.
3. **Primitive count.** Add domain-specific primitives (e.g. HEALTH for
   medical text) without touching the rest of the system.
4. **Composition rate.** A consequence of the above three.
5. **Domain specialization.** A medical Primoji can have 3,000 medical
   compositions on top of 10,000 general words. The dictionary build pipeline
   accepts a target corpus and produces a corresponding word list.

Rebuild the dictionary from sources:

```bash
pip install "primoji[build-dict]"
python -m scripts.build_dictionary
```

This regenerates `data/dictionary_seed.json` from layered sources: Unicode
CLDR annotations, NSM primitive synonyms, NER anchors, and WordNet
auto-compositions. Layer precedence is fixed: primitive synonyms override
emoji catalog entries, single-primitive word tokens override compositions to
preserve reverse lookup, and ELCo and emoji2vec mappings supplement CLDR base
entries.

## Training your own model

Reference training scripts for 125M and 1B GPT-style models live in
`scripts/`:

```bash
# Tokenize FineWeb-Edu shards (or any HuggingFace dataset)
python -m scripts.prepare_training_data --n-docs 500000

# Train 125M
python -m scripts.train --tokenizer primoji --v2 --byte-weight 0.7

# Train 1B with gradient accumulation
python -m scripts.train --tokenizer primoji --model-size 1b \
    --batch-size 4 --grad-accum 8 --v2 --byte-weight 0.7

# Train BPE baseline on the same data
python -m scripts.train --tokenizer mistral
```

Bits-per-byte (BPB) is computed against the raw UTF-8 byte count of the
evaluation text, so models trained with different vocabulary sizes can be
compared fairly across tokenizers. See `scripts/train.py` for the full
hyperparameter list.
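
For reference, a minimal sketch of the standard BPB definition. The
bookkeeping in `scripts/train.py` may differ in details, but the
normalization is the same idea:

```python
import math

def bits_per_byte(total_nll_nats: float, total_utf8_bytes: int) -> float:
    """Summed negative log-likelihood (in nats) over the evaluation text,
    converted to bits and divided by the raw UTF-8 byte count of that text.
    Independent of tokenizer vocabulary size, hence fair across tokenizers."""
    return total_nll_nats / (math.log(2) * total_utf8_bytes)

# e.g. 1.5e6 nats of total loss on a 1,000,000-byte text:
# 1.5e6 / (ln 2 * 1e6) ≈ 2.16 bits per byte
```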

## Limitations

- English only. The NSM primes are verified across 30+ language families,
  but the dictionary, word list, and evaluation are all English. Multilingual
  evaluation is future work.
- Lossy at the word level. Known compositions decode exactly; novel
  compositions decode to their primitive names, which is sufficient for
  training-efficiency studies (BPB) but not for verbatim text generation
  without an auxiliary surface-form mechanism.
- Designed for educational and scientific text. Compression and learning
  benefits are strongest where vocabulary is compositional and
  concept-dense. Conversational, creative, or code-heavy text will not see
  the same gains.
- Not yet tested at scale beyond 1B parameters.

## Tests

```bash
pip install "primoji[dev]"
pytest
```

616 tests covering encode/decode round-trips, byte-fallback coverage,
compositional embedding correctness, ID-range invariants, and frozen
vocabulary boundaries.

## Paper

The full paper is included in this repository as
[`paper.pdf`](paper.pdf). It is not yet on arXiv. If you reference this work,
please cite as:

```bibtex
@unpublished{bandov2026primoji,
  title  = {Primoji: Compositional Semantic Tokenization for Language Model Training},
  author = {Bandov, Frane},
  year   = {2026},
  note   = {Manuscript. \url{https://github.com/frane/primoji/blob/master/paper.pdf}},
}
```

## License

Apache 2.0. See `LICENSE`.

## Acknowledgments

Anna Wierzbicka and the wider Natural Semantic Metalanguage research community
for the linguistic foundation. The FineWeb-Edu team at HuggingFace for the
training corpus. The Unicode CLDR project for emoji annotations.
primoji-0.1.0/README.md
ADDED
@@ -0,0 +1,212 @@

(The README body is identical to the Markdown long description embedded in PKG-INFO above.)

primoji-0.1.0/primoji/__init__.py
ADDED

@@ -0,0 +1,22 @@

"""Primoji: a compositional semantic tokenizer for LLMs.

Encodes English text into a vocabulary of roughly 10,000 tokens built from
Unicode emoji, Natural Semantic Metalanguage (NSM) primitives, common word
tokens, and a UTF-8 byte fallback. The encode pipeline never produces an
UNK token: any input is representable.

Example:
    >>> from primoji import Tokenizer
    >>> tok = Tokenizer()
    >>> ids = tok.encode("The teacher explained photosynthesis")
    >>> tok.decode(ids)
    'teacher say photosynthesis'
    >>> tok.vocab_size
    10195
"""

from primoji.byte_fallback import decode_bytes, encode_bytes
from primoji.tokenizer import Tokenizer

__all__ = ["Tokenizer", "encode_bytes", "decode_bytes"]
__version__ = "0.1.0"

primoji-0.1.0/primoji/alias_map.py
ADDED

@@ -0,0 +1,104 @@

"""Grammar word alias map for compositional embeddings.

Maps grammar word token IDs to their primitive component IDs.
    "is" (word token) -> [BE primitive ID, NOW primitive ID]
    "was" (word token) -> [BE primitive ID, BEFORE primitive ID]

The model uses these to compute grammar word embeddings as the mean
of their primitive component embeddings. This gives grammar words
semantic structure without consuming them as primitives in running text.
"""

from __future__ import annotations

from primoji.primitives import get_primitive_by_name

# Grammar word -> primitive decomposition (by name)
GRAMMAR_ALIASES: dict[str, list[str]] = {
    # Copula/be verbs
    "is": ["BE", "NOW"], "are": ["BE", "NOW"], "am": ["BE", "NOW"],
    "was": ["BE", "BEFORE"], "were": ["BE", "BEFORE"],
    "be": ["BE"], "been": ["BE", "BEFORE"], "being": ["BE"],

    # Have verbs
    "has": ["HAVE", "NOW"], "have": ["HAVE"], "had": ["HAVE", "BEFORE"],

    # Do verbs
    "do": ["DO"], "does": ["DO", "NOW"], "did": ["DO", "BEFORE"],

    # Modals
    "can": ["CAN"], "could": ["CAN", "BEFORE"],
    "will": ["AFTER"], "would": ["WANT", "BEFORE"],
    "should": ["GOOD", "DO"], "may": ["MAYBE"],
    "might": ["MAYBE", "BEFORE"], "must": ["WANT", "VERY"],
    "shall": ["AFTER"],

    # Negation
    "not": ["NOT"], "no": ["NOT"], "never": ["NOT", "TIME"],

    # Pronouns
    "i": ["SOMEONE", "THIS"], "me": ["SOMEONE", "THIS"],
    "my": ["SOMEONE", "THIS"],
    "you": ["SOMEONE", "OTHER"], "your": ["SOMEONE", "OTHER"],
    "he": ["SOMEONE"], "him": ["SOMEONE"], "his": ["SOMEONE"],
    "she": ["SOMEONE"], "her": ["SOMEONE"],
    "it": ["SOMETHING"], "its": ["SOMETHING"],
    "we": ["SOMEONE", "THIS", "MANY"], "us": ["SOMEONE", "THIS", "MANY"],
    "our": ["SOMEONE", "THIS", "MANY"],
    "they": ["SOMEONE", "OTHER", "MANY"], "them": ["SOMEONE", "OTHER", "MANY"],
    "their": ["SOMEONE", "OTHER", "MANY"],

    # Determiners
    "this": ["THIS"], "that": ["OTHER"],
    "these": ["THIS", "MANY"], "those": ["OTHER", "MANY"],
    "all": ["ALL"], "every": ["ALL"],
    "some": ["SOME"], "each": ["ALL", "ONE"],
    "any": ["SOME"], "many": ["MANY"], "few": ["FEW"],
    "much": ["BIG"], "more": ["MORE"], "most": ["MANY", "VERY"],

    # Prepositions
    "with": ["WITH"], "for": ["FOR"], "about": ["ABOUT"],
    "above": ["ABOVE"], "below": ["BELOW"],
    "near": ["NEAR"], "far": ["FAR"],
    "before": ["BEFORE"], "after": ["AFTER"],
    "here": ["HERE"], "there": ["THERE_IS"], "where": ["WHERE"],

    # Conjunctions/logic
    "if": ["IF"], "because": ["BECAUSE"],
    "like": ["LIKE_AS"], "as": ["LIKE_AS"],

    # Adverbs
    "very": ["VERY"], "now": ["NOW"],
    "also": ["ADD"],
    "always": ["ALL", "TIME"], "sometimes": ["SOME", "TIME"],
    "often": ["MANY", "TIME"], "usually": ["MANY", "TIME"],
}


def build_alias_map(encode_fn: callable) -> dict[int, list[int]]:
    """Convert GRAMMAR_ALIASES to token ID -> primitive ID list.

    Args:
        encode_fn: function that takes a word and returns token IDs.

    Returns:
        Dict mapping word token ID -> list of primitive token IDs.
    """
    alias_map: dict[int, list[int]] = {}

    for word, prim_names in GRAMMAR_ALIASES.items():
        word_ids = encode_fn(word)
        if len(word_ids) != 1:
            continue  # skip if word doesn't encode to single token

        tok_id = word_ids[0]
        prim_ids = []
        for pname in prim_names:
            p = get_primitive_by_name(pname)
            if p is not None:
                prim_ids.append(p.id)

        if prim_ids:
            alias_map[tok_id] = prim_ids

    return alias_map
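
A sketch of how the alias map is meant to be consumed, following the module
docstring (mean of the primitive component embeddings). The embedding table
and the use of `Tokenizer.encode` as the per-word `encode_fn` are
illustrative assumptions here, not code shipped in this package:

```python
import torch

from primoji import Tokenizer
from primoji.alias_map import build_alias_map

tok = Tokenizer()
# Assumes Tokenizer.encode serves as the per-word encode_fn expected above.
alias_map = build_alias_map(tok.encode)  # word token ID -> primitive token IDs

emb = torch.nn.Embedding(tok.vocab_size, 256)  # hypothetical embedding table

def token_embedding(token_id: int) -> torch.Tensor:
    """Grammar words: mean of their primitive embeddings; otherwise plain lookup."""
    prim_ids = alias_map.get(token_id)
    if prim_ids:
        return emb(torch.tensor(prim_ids)).mean(dim=0)
    return emb(torch.tensor(token_id))
```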

primoji-0.1.0/primoji/byte_fallback.py
ADDED

@@ -0,0 +1,80 @@

"""Byte fallback encoder for unknown words.

When a word isn't found in the emoji dictionary and SymSpell can't correct it,
encode it as raw UTF-8 bytes wrapped in boundary markers. This guarantees
zero UNK tokens and zero information loss.

Follows the same pattern as SentencePiece's byte_fallback (used by Llama 1/2/3).
"""

from __future__ import annotations

from primoji.utils import _IDS

# Token IDs for byte fallback (dynamically computed from data sizes)
BYTES_START_ID: int = _IDS["BYTES_START"]
BYTES_END_ID: int = _IDS["BYTES_END"]
BYTE_TOKEN_OFFSET: int = _IDS["BYTE_OFFSET"]


def encode_bytes(word: str) -> list[int]:
    """Encode a word as UTF-8 byte token IDs with boundary markers.

    Args:
        word: Unknown word to encode as bytes.

    Returns:
        List of token IDs: [BYTES_START] + byte_ids + [BYTES_END]
    """
    byte_ids = [BYTE_TOKEN_OFFSET + b for b in word.encode("utf-8")]
    return [BYTES_START_ID] + byte_ids + [BYTES_END_ID]


def decode_bytes(ids: list[int]) -> str:
    """Decode byte token IDs back to a string.

    Args:
        ids: Token ID sequence starting with BYTES_START, ending with BYTES_END.

    Returns:
        Decoded string.

    Raises:
        ValueError: If markers are missing or bytes are invalid UTF-8.
    """
    if not ids:
        raise ValueError("Empty token ID sequence")
    if ids[0] != BYTES_START_ID:
        raise ValueError(f"Expected BYTES_START ({BYTES_START_ID}), got {ids[0]}")

    end_idx = -1
    for i in range(1, len(ids)):
        if ids[i] == BYTES_END_ID:
            end_idx = i
            break
    if end_idx == -1:
        raise ValueError(f"Missing BYTES_END ({BYTES_END_ID}) marker")

    byte_values = []
    for tid in ids[1:end_idx]:
        bval = tid - BYTE_TOKEN_OFFSET
        if not (0 <= bval <= 255):
            raise ValueError(f"Token ID {tid} is not a valid byte token")
        byte_values.append(bval)

    return bytes(byte_values).decode("utf-8")


def is_byte_token(token_id: int) -> bool:
    """Check if a token ID is a byte fallback token (0x00-0xFF range)."""
    return BYTE_TOKEN_OFFSET <= token_id <= BYTE_TOKEN_OFFSET + 255


def is_byte_boundary(token_id: int) -> bool:
    """Check if a token ID is a byte boundary marker (START or END)."""
    return token_id in (BYTES_START_ID, BYTES_END_ID)


def is_byte_region_token(token_id: int) -> bool:
    """Check if a token ID is any byte-related token (boundary or data)."""
    return is_byte_boundary(token_id) or is_byte_token(token_id)
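
A quick round trip through the byte fallback. The concrete token IDs depend
on the `_IDS` offsets computed at import time, but the round trip itself
follows directly from the code above:

```python
from primoji import decode_bytes, encode_bytes
from primoji.byte_fallback import is_byte_region_token

ids = encode_bytes("naïve")           # BYTES_START, six byte tokens, BYTES_END
assert decode_bytes(ids) == "naïve"   # lossless UTF-8 round trip
assert all(is_byte_region_token(t) for t in ids)
```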