primoji 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. primoji-0.1.0/PKG-INFO +256 -0
  2. primoji-0.1.0/README.md +212 -0
  3. primoji-0.1.0/primoji/__init__.py +22 -0
  4. primoji-0.1.0/primoji/alias_map.py +104 -0
  5. primoji-0.1.0/primoji/byte_fallback.py +80 -0
  6. primoji-0.1.0/primoji/composer.py +209 -0
  7. primoji-0.1.0/primoji/data/common_words.json +7738 -0
  8. primoji-0.1.0/primoji/data/dictionary_seed.json +323308 -0
  9. primoji-0.1.0/primoji/data/emoji_catalog.json +22504 -0
  10. primoji-0.1.0/primoji/data/primitives.json +1165 -0
  11. primoji-0.1.0/primoji/data/proper_noun_anchors.json +3007 -0
  12. primoji-0.1.0/primoji/decoder.py +208 -0
  13. primoji-0.1.0/primoji/dictionary.py +278 -0
  14. primoji-0.1.0/primoji/fuzzy.py +137 -0
  15. primoji-0.1.0/primoji/math_handler.py +184 -0
  16. primoji-0.1.0/primoji/preprocessor.py +169 -0
  17. primoji-0.1.0/primoji/primitives.py +99 -0
  18. primoji-0.1.0/primoji/tokenizer.py +268 -0
  19. primoji-0.1.0/primoji/utils.py +234 -0
  20. primoji-0.1.0/primoji/vocabulary.py +321 -0
  21. primoji-0.1.0/primoji.egg-info/PKG-INFO +256 -0
  22. primoji-0.1.0/primoji.egg-info/SOURCES.txt +37 -0
  23. primoji-0.1.0/primoji.egg-info/dependency_links.txt +1 -0
  24. primoji-0.1.0/primoji.egg-info/requires.txt +22 -0
  25. primoji-0.1.0/primoji.egg-info/top_level.txt +1 -0
  26. primoji-0.1.0/pyproject.toml +90 -0
  27. primoji-0.1.0/setup.cfg +4 -0
  28. primoji-0.1.0/tests/test_alias_map.py +89 -0
  29. primoji-0.1.0/tests/test_bpb.py +97 -0
  30. primoji-0.1.0/tests/test_byte_fallback_coverage.py +117 -0
  31. primoji-0.1.0/tests/test_chat.py +108 -0
  32. primoji-0.1.0/tests/test_classify_token.py +98 -0
  33. primoji-0.1.0/tests/test_compositional_embedding.py +107 -0
  34. primoji-0.1.0/tests/test_coverage.py +83 -0
  35. primoji-0.1.0/tests/test_encode_decode.py +328 -0
  36. primoji-0.1.0/tests/test_id_ranges.py +154 -0
  37. primoji-0.1.0/tests/test_invariants.py +154 -0
  38. primoji-0.1.0/tests/test_primitives_and_vocab.py +129 -0
  39. primoji-0.1.0/tests/test_stress.py +173 -0
primoji-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,256 @@
1
+ Metadata-Version: 2.4
2
+ Name: primoji
3
+ Version: 0.1.0
4
+ Summary: A compositional semantic tokenizer for LLMs based on NSM primitives, emoji, and common word tokens
5
+ Author-email: Frane Bandov <frane@offbyte.com>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/frane/primoji
8
+ Project-URL: Documentation, https://github.com/frane/primoji#readme
9
+ Project-URL: Repository, https://github.com/frane/primoji
10
+ Project-URL: Issues, https://github.com/frane/primoji/issues
11
+ Keywords: tokenizer,tokenization,nlp,llm,language-model,semantic,compositional,emoji,nsm
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: Apache Software License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: Text Processing :: Linguistic
24
+ Classifier: Typing :: Typed
25
+ Requires-Python: >=3.10
26
+ Description-Content-Type: text/markdown
27
+ Requires-Dist: emoji>=2.14
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=8.3; extra == "dev"
30
+ Requires-Dist: black>=24.10; extra == "dev"
31
+ Requires-Dist: ruff>=0.8; extra == "dev"
32
+ Provides-Extra: build-dict
33
+ Requires-Dist: spacy>=3.8; extra == "build-dict"
34
+ Requires-Dist: wordfreq>=3.0; extra == "build-dict"
35
+ Provides-Extra: train
36
+ Requires-Dist: torch>=2.5; extra == "train"
37
+ Requires-Dist: tokenizers>=0.21; extra == "train"
38
+ Requires-Dist: datasets>=3.0; extra == "train"
39
+ Requires-Dist: numpy>=1.26; extra == "train"
40
+ Provides-Extra: spelling
41
+ Requires-Dist: symspellpy>=6.7; extra == "spelling"
42
+ Provides-Extra: all
43
+ Requires-Dist: primoji[build-dict,dev,spelling,train]; extra == "all"
44
+
45
+ # Primoji
46
+
47
+ A compositional semantic tokenizer for language models, built from Natural
48
+ Semantic Metalanguage (NSM) primitives, Unicode emoji, common word tokens,
49
+ and a UTF-8 byte fallback.
50
+
51
+ ```python
52
+ from primoji import Tokenizer
53
+
54
+ tok = Tokenizer()
55
+ ids = tok.encode("The teacher explained photosynthesis")
56
+ tok.decode(ids)
57
+ # 'teacher say photosynthesis'
58
+
59
+ tok.vocab_size
60
+ # 10195
61
+ ```
62
+
63
+ ## Install
64
+
65
+ ```bash
66
+ pip install primoji
67
+ ```
68
+
69
+ Requires Python 3.10+. The only runtime dependency is `emoji`. Optional extras
70
+ for the dictionary build pipeline (`spacy`, `wordfreq`), training scripts
71
+ (`torch`, `tokenizers`, `datasets`), or fuzzy matching (`symspellpy`):
72
+
73
+ ```bash
74
+ pip install "primoji[build-dict]" # to rebuild the dictionary from sources
75
+ pip install "primoji[train]" # to run the training scripts
76
+ pip install "primoji[spelling]" # to enable conservative SymSpell fuzzy matching
77
+ pip install "primoji[all]" # everything (also includes dev tools)
78
+ ```
79
+
80
+ ## What it is
81
+
82
+ Primoji is a tokenizer that constructs token meanings compositionally. Instead
83
+ of discovering subword units from corpus statistics, as BPE does, it ships with a
84
+ fixed compositional vocabulary of about 10,000 tokens: 1,175 Unicode emoji,
85
+ 140 NSM semantic primitives, ~7,700 common word tokens, ~830 structural and
86
+ geographic tokens (flags, anchors, punctuation, math), and 258 byte-fallback
87
+ tokens. Rare and technical words decompose into short sequences of primitives
88
+ (`[PLANT, HAVE, LIGHT]` for photosynthesis, `[WATER, CAUSE, AIR]` for
89
+ evaporation). The encode pipeline never produces an UNK token.
90
+
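+ A minimal sketch of inspecting such a decomposition, using only the API shown
+ in the Quick start below (the exact token IDs and description strings are
+ illustrative):
+
+ ```python
+ from primoji import Tokenizer
+
+ tok = Tokenizer()
+ for tid in tok.encode("photosynthesis"):
+     print(tok.describe(tid))
+ # Expect primitive-tier descriptions along the lines of PLANT, HAVE, LIGHT.
+ ```
+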
91
+ When you might want it: research on tokenization, controlled experiments
92
+ that need a vocabulary 3x smaller than BPE at near-parity compression,
93
+ multilingual experiments where the primitive set transfers across languages
94
+ by design, small-vocab architectures whose cost scales with vocabulary size,
95
+ or educational and scientific text where compositional technical vocabulary
96
+ is dense. It is not a drop-in BPE replacement for production English chatbots:
97
+ generation is lossy at the word level (decoded text reconstructs concepts but
98
+ not always exact surface forms), and creative or stylistic writing loses
99
+ distinctions like "melancholy" vs "sadness" that BPE preserves as distinct
100
+ tokens.
101
+
102
+ ## Quick start
103
+
104
+ ```python
105
+ from primoji import Tokenizer
106
+
107
+ tok = Tokenizer()
108
+
109
+ # Encode and decode
110
+ ids = tok.encode("Water evaporates when heated")
111
+ print(ids)
112
+ # [1266, ..., ...]
113
+ print(tok.decode(ids))
114
+
115
+ # Inspect a token
116
+ print(tok.describe(ids[0]))
117
+ # '💧 (ID 1266) — Tier 2 primitive: WATER — Liquid, fluid'
118
+
119
+ # Classify a word by which tier handles it
120
+ print(tok.classify_word("photosynthesis")) # 'dict_composed'
121
+ print(tok.classify_word("water")) # 'tier2_primitive'
122
+ print(tok.classify_word("the")) # 'dict_dropped'
123
+ print(tok.classify_word("Mediterranean")) # 'tier3_anchor' or 'byte_fallback'
124
+
125
+ # Vocabulary size
126
+ print(tok.vocab_size) # 10195
127
+ ```
128
+
129
+ ## Vocabulary structure
130
+
131
+ Four tiers with strict encoding precedence. The model only ever sees integer
132
+ IDs; the tier labels exist for diagnostics and the dictionary build.
133
+
134
+ | Tier | Count | Examples |
135
+ |------|------:|----------|
136
+ | 1a Direct emoji | ~1,175 | 🐕 (dog), 🏠 (house), ☀️ (sun) |
137
+ | 1b Common word tokens | ~7,700 | government, important, however |
138
+ | 2 Compositional primitives | 140 | PLANT, HAVE, LIGHT, MOVE, KNOW |
139
+ | 3 Structural and geographic | ~830 | 🇩🇪, 🇫🇷, punctuation, digits, math operators, NER anchors |
140
+ | 4 UTF-8 byte fallback | 258 | 256 byte tokens + 2 boundary markers |
141
+
142
+ The 140 primitives are 65 canonical NSM primes from
143
+ Wierzbicka's Natural Semantic Metalanguage plus 75 domain expansions
144
+ (perceptual, grammatical, scientific, social). Country flags act as geographic
145
+ modifiers in compositions. Anchors are the top ~500 proper nouns extracted
146
+ from FineWeb-Edu via spaCy NER.
147
+
148
+ A note on emoji: the model never sees glyphs, only integer IDs. Tier 1a is
149
+ just a convenient source of ~1,200 visually distinct token slots for concrete
150
+ nouns. In generated output, emoji tokens decode to their English words
151
+ ("dog", "house", "sun"), not to Unicode emoji.
152
+
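+ A quick check of that behavior, as a sketch (the assumption that "dog" maps to
+ a single Tier 1a token, and the exact output strings, are illustrative):
+
+ ```python
+ from primoji import Tokenizer
+
+ tok = Tokenizer()
+ ids = tok.encode("dog")
+ print(tok.decode(ids))       # 'dog', the English word, not the glyph
+ print(tok.describe(ids[0]))  # tier and gloss of the underlying token
+ ```
+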
153
+ ## Configuration
154
+
155
+ The dictionary, primitive set, and word list are all swappable. Five tunable
156
+ dimensions:
157
+
158
+ 1. **Vocabulary size.** From ~5,300 tokens (~6% composed text) to ~18,000
159
+ tokens (~2% composed). Controls the trade-off between compression and
160
+ semantic-structure signal.
161
+ 2. **Composition depth.** Maximum number of primitives per concept. Default 5.
162
+ 3. **Primitive count.** Add domain-specific primitives (e.g. HEALTH for
163
+ medical text) without touching the rest of the system.
164
+ 4. **Composition rate.** A consequence of the above three.
165
+ 5. **Domain specialization.** A medical Primoji can have 3,000 medical
166
+ compositions on top of 10,000 general words. The dictionary build pipeline
167
+ accepts a target corpus and produces a corresponding word list.
168
+
169
+ Rebuild the dictionary from sources:
170
+
171
+ ```bash
172
+ pip install "primoji[build-dict]"
173
+ python -m scripts.build_dictionary
174
+ ```
175
+
176
+ This regenerates `data/dictionary_seed.json` from layered sources: Unicode
177
+ CLDR annotations, NSM primitive synonyms, NER anchors, WordNet
178
+ auto-compositions. Layer precedence is fixed: primitive synonyms override
179
+ emoji catalog entries, single-primitive word tokens override compositions to
180
+ preserve reverse lookup, and ELCo and emoji2vec mappings supplement CLDR base
181
+ entries.
182
+
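+ A minimal sketch of that precedence order (the function name and dictionary
+ shapes are assumptions for illustration, not the actual `scripts.build_dictionary`
+ code):
+
+ ```python
+ def merge_layers(cldr: dict, elco: dict, emoji2vec: dict,
+                  primitive_synonyms: dict) -> dict:
+     merged = dict(cldr)                   # CLDR annotations form the base layer
+     for supplement in (elco, emoji2vec):  # supplements only fill gaps
+         for word, composition in supplement.items():
+             merged.setdefault(word, composition)
+     merged.update(primitive_synonyms)     # primitive synonyms always win
+     return merged
+ ```
+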
183
+ ## Training your own model
184
+
185
+ Reference training scripts for 125M and 1B GPT-style models live in
186
+ `scripts/`:
187
+
188
+ ```bash
189
+ # Tokenize FineWeb-Edu shards (or any HuggingFace dataset)
190
+ python -m scripts.prepare_training_data --n-docs 500000
191
+
192
+ # Train 125M
193
+ python -m scripts.train --tokenizer primoji --v2 --byte-weight 0.7
194
+
195
+ # Train 1B with gradient accumulation
196
+ python -m scripts.train --tokenizer primoji --model-size 1b \
197
+ --batch-size 4 --grad-accum 8 --v2 --byte-weight 0.7
198
+
199
+ # Train BPE baseline on the same data
200
+ python -m scripts.train --tokenizer mistral
201
+ ```
202
+
203
+ Bits-per-byte (BPB) is normalized by input bytes rather than tokens, so results
204
+ are comparable across tokenizers and vocabulary sizes. See `scripts/train.py` for the full
205
+ hyperparameter list.
206
+
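+ Concretely, the standard definition (a sketch, assuming the evaluation loss is
+ summed in nats over the whole text):
+
+ ```python
+ import math
+
+ def bits_per_byte(total_nll_nats: float, total_utf8_bytes: int) -> float:
+     # Normalizing by bytes instead of tokens removes the effect of how many
+     # tokens each tokenizer needs to cover the same text.
+     return total_nll_nats / (math.log(2) * total_utf8_bytes)
+ ```
+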
207
+ ## Limitations
208
+
209
+ - English only. The NSM primes are verified across 30+ language families,
210
+ but the dictionary, word list, and evaluation are all English. Multilingual
211
+ evaluation is future work.
212
+ - Lossy at the word level. Known compositions decode exactly; novel
213
+ compositions decode to their primitive names, which is sufficient for
214
+ training-efficiency studies (BPB) but not for verbatim text generation
215
+ without an auxiliary surface-form mechanism.
216
+ - Designed for educational and scientific text. Compression and learning
217
+ benefits are strongest where vocabulary is compositional and
218
+ concept-dense. Conversational, creative, or code-heavy text will not see
219
+ the same gains.
220
+ - Not yet tested at scale beyond 1B parameters.
221
+
222
+ ## Tests
223
+
224
+ ```bash
225
+ pip install "primoji[dev]"
226
+ pytest
227
+ ```
228
+
229
+ 616 tests covering encode/decode round-trips, byte-fallback coverage,
230
+ compositional embedding correctness, ID-range invariants, and frozen
231
+ vocabulary boundaries.
232
+
233
+ ## Paper
234
+
235
+ The full paper is included in this repository as
236
+ [`paper.pdf`](paper.pdf). It is not yet on arXiv. If you reference this work,
237
+ please cite as:
238
+
239
+ ```bibtex
240
+ @unpublished{bandov2026primoji,
241
+ title = {Primoji: Compositional Semantic Tokenization for Language Model Training},
242
+ author = {Bandov, Frane},
243
+ year = {2026},
244
+ note = {Manuscript. \url{https://github.com/frane/primoji/blob/master/paper.pdf}},
245
+ }
246
+ ```
247
+
248
+ ## License
249
+
250
+ Apache 2.0. See `LICENSE`.
251
+
252
+ ## Acknowledgments
253
+
254
+ Anna Wierzbicka and the wider Natural Semantic Metalanguage research community
255
+ for the linguistic foundation. The FineWeb-Edu team at HuggingFace for the
256
+ training corpus. The Unicode CLDR project for emoji annotations.
primoji-0.1.0/README.md ADDED
@@ -0,0 +1,212 @@
1
+ # Primoji
2
+
3
+ A compositional semantic tokenizer for language models, built from Natural
4
+ Semantic Metalanguage (NSM) primitives, Unicode emoji, common word tokens,
5
+ and a UTF-8 byte fallback.
6
+
7
+ ```python
8
+ from primoji import Tokenizer
9
+
10
+ tok = Tokenizer()
11
+ ids = tok.encode("The teacher explained photosynthesis")
12
+ tok.decode(ids)
13
+ # 'teacher say photosynthesis'
14
+
15
+ tok.vocab_size
16
+ # 10195
17
+ ```
18
+
19
+ ## Install
20
+
21
+ ```bash
22
+ pip install primoji
23
+ ```
24
+
25
+ Requires Python 3.10+. Only runtime dependency is `emoji`. Optional extras
26
+ for the dictionary build pipeline (`spacy`, `wordfreq`), training scripts
27
+ (`torch`, `tokenizers`, `datasets`), or fuzzy matching (`symspellpy`):
28
+
29
+ ```bash
30
+ pip install "primoji[build-dict]" # to rebuild the dictionary from sources
31
+ pip install "primoji[train]" # to run the training scripts
32
+ pip install "primoji[spelling]" # to enable conservative SymSpell fuzzy matching
33
+ pip install "primoji[all]" # everything (also includes dev tools)
34
+ ```
35
+
36
+ ## What it is
37
+
38
+ Primoji is a tokenizer that constructs token meanings compositionally. Instead
39
+ of discovering subword units from corpus statistics like BPE, it ships with a
40
+ fixed compositional vocabulary of about 10,000 tokens: 1,175 Unicode emoji,
41
+ 140 NSM semantic primitives, ~7,700 common word tokens, ~830 structural and
42
+ geographic tokens (flags, anchors, punctuation, math), and 258 byte-fallback
43
+ tokens. Rare and technical words decompose into short sequences of primitives
44
+ (`[PLANT, HAVE, LIGHT]` for photosynthesis, `[WATER, CAUSE, AIR]` for
45
+ evaporation). The encode pipeline never produces an UNK token.
46
+
47
+ When you might want it: research on tokenization, controlled experiments
48
+ that need a vocabulary 3x smaller than BPE at near-parity compression,
49
+ multilingual experiments where the primitive set transfers across languages
50
+ by design, small-vocab architectures whose cost scales with vocabulary size,
51
+ or educational and scientific text where compositional technical vocabulary
52
+ is dense. It is not a drop-in BPE replacement for production English chatbots:
53
+ generation is lossy at the word level (decoded text reconstructs concepts but
54
+ not always exact surface forms), and creative or stylistic writing loses
55
+ distinctions like "melancholy" vs "sadness" that BPE preserves as distinct
56
+ tokens.
57
+
58
+ ## Quick start
59
+
60
+ ```python
61
+ from primoji import Tokenizer
62
+
63
+ tok = Tokenizer()
64
+
65
+ # Encode and decode
66
+ ids = tok.encode("Water evaporates when heated")
67
+ print(ids)
68
+ # [1266, ..., ...]
69
+ print(tok.decode(ids))
70
+
71
+ # Inspect a token
72
+ print(tok.describe(ids[0]))
73
+ # '💧 (ID 1266) — Tier 2 primitive: WATER — Liquid, fluid'
74
+
75
+ # Classify a word by which tier handles it
76
+ print(tok.classify_word("photosynthesis")) # 'dict_composed'
77
+ print(tok.classify_word("water")) # 'tier2_primitive'
78
+ print(tok.classify_word("the")) # 'dict_dropped'
79
+ print(tok.classify_word("Mediterranean")) # 'tier3_anchor' or 'byte_fallback'
80
+
81
+ # Vocabulary size
82
+ print(tok.vocab_size) # 10195
83
+ ```
84
+
85
+ ## Vocabulary structure
86
+
87
+ Four tiers with strict encoding precedence. The model only ever sees integer
88
+ IDs; the tier labels exist for diagnostics and the dictionary build.
89
+
90
+ | Tier | Count | Examples |
91
+ |------|------:|----------|
92
+ | 1a Direct emoji | ~1,175 | 🐕 (dog), 🏠 (house), ☀️ (sun) |
93
+ | 1b Common word tokens | ~7,700 | government, important, however |
94
+ | 2 Compositional primitives | 140 | PLANT, HAVE, LIGHT, MOVE, KNOW |
95
+ | 3 Structural and geographic | ~830 | 🇩🇪, 🇫🇷, punctuation, digits, math operators, NER anchors |
96
+ | 4 UTF-8 byte fallback | 258 | 256 byte tokens + 2 boundary markers |
97
+
98
+ The 140 primitives are 65 canonical NSM primes from
99
+ Wierzbicka's Natural Semantic Metalanguage plus 75 domain expansions
100
+ (perceptual, grammatical, scientific, social). Country flags act as geographic
101
+ modifiers in compositions. Anchors are the top ~500 proper nouns extracted
102
+ from FineWeb-Edu via spaCy NER.
103
+
104
+ A note on emoji: the model never sees glyphs, only integer IDs. Tier 1a is
105
+ just a convenient source of ~1,200 visually distinct token slots for concrete
106
+ nouns. In generated output, emoji tokens decode to their English words
107
+ ("dog", "house", "sun"), not to Unicode emoji.
108
+
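+ A quick check of that behavior, as a sketch (the assumption that "dog" maps to
+ a single Tier 1a token, and the exact output strings, are illustrative):
+
+ ```python
+ from primoji import Tokenizer
+
+ tok = Tokenizer()
+ ids = tok.encode("dog")
+ print(tok.decode(ids))       # 'dog', the English word, not the glyph
+ print(tok.describe(ids[0]))  # tier and gloss of the underlying token
+ ```
+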
109
+ ## Configuration
110
+
111
+ The dictionary, primitive set, and word list are all swappable. Five tunable
112
+ dimensions:
113
+
114
+ 1. **Vocabulary size.** From ~5,300 tokens (~6% composed text) to ~18,000
115
+ tokens (~2% composed). Controls the trade-off between compression and
116
+ semantic-structure signal.
117
+ 2. **Composition depth.** Maximum number of primitives per concept. Default 5.
118
+ 3. **Primitive count.** Add domain-specific primitives (e.g.\ HEALTH for
119
+ medical text) without touching the rest of the system.
120
+ 4. **Composition rate.** A consequence of the above three.
121
+ 5. **Domain specialization.** A medical Primoji can have 3,000 medical
122
+ compositions on top of 10,000 general words. The dictionary build pipeline
123
+ accepts a target corpus and produces a corresponding word list.
124
+
125
+ Rebuild the dictionary from sources:
126
+
127
+ ```bash
128
+ pip install "primoji[build-dict]"
129
+ python -m scripts.build_dictionary
130
+ ```
131
+
132
+ This regenerates `data/dictionary_seed.json` from layered sources: Unicode
133
+ CLDR annotations, NSM primitive synonyms, NER anchors, WordNet
134
+ auto-compositions. Layer precedence is fixed: primitive synonyms override
135
+ emoji catalog entries, single-primitive word tokens override compositions to
136
+ preserve reverse lookup, and ELCo and emoji2vec mappings supplement CLDR base
137
+ entries.
138
+
139
+ ## Training your own model
140
+
141
+ Reference training scripts for 125M and 1B GPT-style models live in
142
+ `scripts/`:
143
+
144
+ ```bash
145
+ # Tokenize FineWeb-Edu shards (or any HuggingFace dataset)
146
+ python -m scripts.prepare_training_data --n-docs 500000
147
+
148
+ # Train 125M
149
+ python -m scripts.train --tokenizer primoji --v2 --byte-weight 0.7
150
+
151
+ # Train 1B with gradient accumulation
152
+ python -m scripts.train --tokenizer primoji --model-size 1b \
153
+ --batch-size 4 --grad-accum 8 --v2 --byte-weight 0.7
154
+
155
+ # Train BPE baseline on the same data
156
+ python -m scripts.train --tokenizer mistral
157
+ ```
158
+
159
+ Bits-per-byte (BPB) is computed correctly across vocabulary sizes for fair
160
+ cross-tokenizer comparison. See `scripts/train.py` for the full
161
+ hyperparameter list.
162
+
163
+ ## Limitations
164
+
165
+ - English only. The NSM primes are verified across 30+ language families,
166
+ but the dictionary, word list, and evaluation are all English. Multilingual
167
+ evaluation is future work.
168
+ - Lossy at the word level. Known compositions decode exactly; novel
169
+ compositions decode to their primitive names, which is sufficient for
170
+ training-efficiency studies (BPB) but not for verbatim text generation
171
+ without an auxiliary surface-form mechanism.
172
+ - Designed for educational and scientific text. Compression and learning
173
+ benefits are strongest where vocabulary is compositional and
174
+ concept-dense. Conversational, creative, or code-heavy text will not see
175
+ the same gains.
176
+ - Not yet tested at scale beyond 1B parameters.
177
+
178
+ ## Tests
179
+
180
+ ```bash
181
+ pip install "primoji[dev]"
182
+ pytest
183
+ ```
184
+
185
+ 616 tests covering encode/decode round-trips, byte-fallback coverage,
186
+ compositional embedding correctness, ID-range invariants, and frozen
187
+ vocabulary boundaries.
188
+
189
+ ## Paper
190
+
191
+ The full paper is included in this repository as
192
+ [`paper.pdf`](paper.pdf). It is not yet on arXiv. If you reference this work,
193
+ please cite as:
194
+
195
+ ```bibtex
196
+ @unpublished{bandov2026primoji,
197
+ title = {Primoji: Compositional Semantic Tokenization for Language Model Training},
198
+ author = {Bandov, Frane},
199
+ year = {2026},
200
+ note = {Manuscript. \url{https://github.com/frane/primoji/blob/master/paper.pdf}},
201
+ }
202
+ ```
203
+
204
+ ## License
205
+
206
+ Apache 2.0. See `LICENSE`.
207
+
208
+ ## Acknowledgments
209
+
210
+ Anna Wierzbicka and the wider Natural Semantic Metalanguage research community
211
+ for the linguistic foundation. The FineWeb-Edu team at HuggingFace for the
212
+ training corpus. The Unicode CLDR project for emoji annotations.
primoji-0.1.0/primoji/__init__.py ADDED
@@ -0,0 +1,22 @@
1
+ """Primoji: a compositional semantic tokenizer for LLMs.
2
+
3
+ Encodes English text into a vocabulary of roughly 10,000 tokens built from
4
+ Unicode emoji, Natural Semantic Metalanguage (NSM) primitives, common word
5
+ tokens, and a UTF-8 byte fallback. The encode pipeline never produces an
6
+ UNK token: any input is representable.
7
+
8
+ Example:
9
+ >>> from primoji import Tokenizer
10
+ >>> tok = Tokenizer()
11
+ >>> ids = tok.encode("The teacher explained photosynthesis")
12
+ >>> tok.decode(ids)
13
+ 'teacher say photosynthesis'
14
+ >>> tok.vocab_size
15
+ 10195
16
+ """
17
+
18
+ from primoji.byte_fallback import decode_bytes, encode_bytes
19
+ from primoji.tokenizer import Tokenizer
20
+
21
+ __all__ = ["Tokenizer", "encode_bytes", "decode_bytes"]
22
+ __version__ = "0.1.0"
primoji-0.1.0/primoji/alias_map.py ADDED
@@ -0,0 +1,104 @@
1
+ """Grammar word alias map for compositional embeddings.
2
+
3
+ Maps grammar word token IDs to their primitive component IDs.
4
+ "is" (word token) -> [BE primitive ID, NOW primitive ID]
5
+ "was" (word token) -> [BE primitive ID, BEFORE primitive ID]
6
+
7
+ The model uses these to compute grammar word embeddings as the mean
8
+ of their primitive component embeddings. This gives grammar words
9
+ semantic structure without consuming them as primitives in running text.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from collections.abc import Callable
+
+ from primoji.primitives import get_primitive_by_name
15
+
16
+ # Grammar word -> primitive decomposition (by name)
17
+ GRAMMAR_ALIASES: dict[str, list[str]] = {
18
+ # Copula/be verbs
19
+ "is": ["BE", "NOW"], "are": ["BE", "NOW"], "am": ["BE", "NOW"],
20
+ "was": ["BE", "BEFORE"], "were": ["BE", "BEFORE"],
21
+ "be": ["BE"], "been": ["BE", "BEFORE"], "being": ["BE"],
22
+
23
+ # Have verbs
24
+ "has": ["HAVE", "NOW"], "have": ["HAVE"], "had": ["HAVE", "BEFORE"],
25
+
26
+ # Do verbs
27
+ "do": ["DO"], "does": ["DO", "NOW"], "did": ["DO", "BEFORE"],
28
+
29
+ # Modals
30
+ "can": ["CAN"], "could": ["CAN", "BEFORE"],
31
+ "will": ["AFTER"], "would": ["WANT", "BEFORE"],
32
+ "should": ["GOOD", "DO"], "may": ["MAYBE"],
33
+ "might": ["MAYBE", "BEFORE"], "must": ["WANT", "VERY"],
34
+ "shall": ["AFTER"],
35
+
36
+ # Negation
37
+ "not": ["NOT"], "no": ["NOT"], "never": ["NOT", "TIME"],
38
+
39
+ # Pronouns
40
+ "i": ["SOMEONE", "THIS"], "me": ["SOMEONE", "THIS"],
41
+ "my": ["SOMEONE", "THIS"],
42
+ "you": ["SOMEONE", "OTHER"], "your": ["SOMEONE", "OTHER"],
43
+ "he": ["SOMEONE"], "him": ["SOMEONE"], "his": ["SOMEONE"],
44
+ "she": ["SOMEONE"], "her": ["SOMEONE"],
45
+ "it": ["SOMETHING"], "its": ["SOMETHING"],
46
+ "we": ["SOMEONE", "THIS", "MANY"], "us": ["SOMEONE", "THIS", "MANY"],
47
+ "our": ["SOMEONE", "THIS", "MANY"],
48
+ "they": ["SOMEONE", "OTHER", "MANY"], "them": ["SOMEONE", "OTHER", "MANY"],
49
+ "their": ["SOMEONE", "OTHER", "MANY"],
50
+
51
+ # Determiners
52
+ "this": ["THIS"], "that": ["OTHER"],
53
+ "these": ["THIS", "MANY"], "those": ["OTHER", "MANY"],
54
+ "all": ["ALL"], "every": ["ALL"],
55
+ "some": ["SOME"], "each": ["ALL", "ONE"],
56
+ "any": ["SOME"], "many": ["MANY"], "few": ["FEW"],
57
+ "much": ["BIG"], "more": ["MORE"], "most": ["MANY", "VERY"],
58
+
59
+ # Prepositions
60
+ "with": ["WITH"], "for": ["FOR"], "about": ["ABOUT"],
61
+ "above": ["ABOVE"], "below": ["BELOW"],
62
+ "near": ["NEAR"], "far": ["FAR"],
63
+ "before": ["BEFORE"], "after": ["AFTER"],
64
+ "here": ["HERE"], "there": ["THERE_IS"], "where": ["WHERE"],
65
+
66
+ # Conjunctions/logic
67
+ "if": ["IF"], "because": ["BECAUSE"],
68
+ "like": ["LIKE_AS"], "as": ["LIKE_AS"],
69
+
70
+ # Adverbs
71
+ "very": ["VERY"], "now": ["NOW"],
72
+ "also": ["ADD"],
73
+ "always": ["ALL", "TIME"], "sometimes": ["SOME", "TIME"],
74
+ "often": ["MANY", "TIME"], "usually": ["MANY", "TIME"],
75
+ }
76
+
77
+
78
+ def build_alias_map(encode_fn: Callable[[str], list[int]]) -> dict[int, list[int]]:
79
+ """Convert GRAMMAR_ALIASES to token ID -> primitive ID list.
80
+
81
+ Args:
82
+ encode_fn: function that takes a word and returns token IDs.
83
+
84
+ Returns:
85
+ Dict mapping word token ID -> list of primitive token IDs.
86
+ """
87
+ alias_map: dict[int, list[int]] = {}
88
+
89
+ for word, prim_names in GRAMMAR_ALIASES.items():
90
+ word_ids = encode_fn(word)
91
+ if len(word_ids) != 1:
92
+ continue # skip if word doesn't encode to single token
93
+
94
+ tok_id = word_ids[0]
95
+ prim_ids = []
96
+ for pname in prim_names:
97
+ p = get_primitive_by_name(pname)
98
+ if p is not None:
99
+ prim_ids.append(p.id)
100
+
101
+ if prim_ids:
102
+ alias_map[tok_id] = prim_ids
103
+
104
+ return alias_map
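+
+
+ # Illustrative usage (editorial sketch, not part of the released module):
+ # derive a grammar word's embedding as the mean of its primitive component
+ # embeddings, using plain Python lists as a stand-in embedding table.
+ if __name__ == "__main__":
+     from primoji import Tokenizer
+
+     tok = Tokenizer()
+     alias_map = build_alias_map(tok.encode)
+
+     dim = 4
+     table = {tid: [0.0] * dim for tid in range(tok.vocab_size)}
+     for word_id, prim_ids in alias_map.items():
+         columns = zip(*(table[pid] for pid in prim_ids))
+         table[word_id] = [sum(col) / len(prim_ids) for col in columns]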
primoji-0.1.0/primoji/byte_fallback.py ADDED
@@ -0,0 +1,80 @@
1
+ """Byte fallback encoder for unknown words.
2
+
3
+ When a word isn't found in the emoji dictionary and SymSpell can't correct it,
4
+ encode it as raw UTF-8 bytes wrapped in boundary markers. This guarantees
5
+ zero UNK tokens and zero information loss.
6
+
7
+ Follows the same pattern as SentencePiece's byte_fallback (used by Llama 1/2/3).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from primoji.utils import _IDS
13
+
14
+ # Token IDs for byte fallback (dynamically computed from data sizes)
15
+ BYTES_START_ID: int = _IDS["BYTES_START"]
16
+ BYTES_END_ID: int = _IDS["BYTES_END"]
17
+ BYTE_TOKEN_OFFSET: int = _IDS["BYTE_OFFSET"]
18
+
19
+
20
+ def encode_bytes(word: str) -> list[int]:
21
+ """Encode a word as UTF-8 byte token IDs with boundary markers.
22
+
23
+ Args:
24
+ word: Unknown word to encode as bytes.
25
+
26
+ Returns:
27
+ List of token IDs: [BYTES_START] + byte_ids + [BYTES_END]
28
+ """
29
+ byte_ids = [BYTE_TOKEN_OFFSET + b for b in word.encode("utf-8")]
30
+ return [BYTES_START_ID] + byte_ids + [BYTES_END_ID]
31
+
32
+
33
+ def decode_bytes(ids: list[int]) -> str:
34
+ """Decode byte token IDs back to a string.
35
+
36
+ Args:
37
+ ids: Token ID sequence starting with BYTES_START, ending with BYTES_END.
38
+
39
+ Returns:
40
+ Decoded string.
41
+
42
+ Raises:
43
+ ValueError: If markers are missing or bytes are invalid UTF-8.
44
+ """
45
+ if not ids:
46
+ raise ValueError("Empty token ID sequence")
47
+ if ids[0] != BYTES_START_ID:
48
+ raise ValueError(f"Expected BYTES_START ({BYTES_START_ID}), got {ids[0]}")
49
+
50
+ end_idx = -1
51
+ for i in range(1, len(ids)):
52
+ if ids[i] == BYTES_END_ID:
53
+ end_idx = i
54
+ break
55
+ if end_idx == -1:
56
+ raise ValueError(f"Missing BYTES_END ({BYTES_END_ID}) marker")
57
+
58
+ byte_values = []
59
+ for tid in ids[1:end_idx]:
60
+ bval = tid - BYTE_TOKEN_OFFSET
61
+ if not (0 <= bval <= 255):
62
+ raise ValueError(f"Token ID {tid} is not a valid byte token")
63
+ byte_values.append(bval)
64
+
65
+ return bytes(byte_values).decode("utf-8")
66
+
67
+
68
+ def is_byte_token(token_id: int) -> bool:
69
+ """Check if a token ID is a byte fallback token (0x00-0xFF range)."""
70
+ return BYTE_TOKEN_OFFSET <= token_id <= BYTE_TOKEN_OFFSET + 255
71
+
72
+
73
+ def is_byte_boundary(token_id: int) -> bool:
74
+ """Check if a token ID is a byte boundary marker (START or END)."""
75
+ return token_id in (BYTES_START_ID, BYTES_END_ID)
76
+
77
+
78
+ def is_byte_region_token(token_id: int) -> bool:
79
+ """Check if a token ID is any byte-related token (boundary or data)."""
80
+ return is_byte_boundary(token_id) or is_byte_token(token_id)
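+
+
+ # Illustrative round trip (editorial sketch, not part of the released module):
+ # any word outside the dictionary survives encode/decode byte-exactly.
+ if __name__ == "__main__":
+     ids = encode_bytes("Schrödinger")
+     assert ids[0] == BYTES_START_ID and ids[-1] == BYTES_END_ID
+     assert all(is_byte_token(t) for t in ids[1:-1])
+     assert decode_bytes(ids) == "Schrödinger"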