ommlds-rs 0.0.0.dev473__tar.gz → 0.0.0.dev495__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ommlds-rs might be problematic. Click here for more details.
- {ommlds_rs-0.0.0.dev473 → ommlds_rs-0.0.0.dev495}/PKG-INFO +2 -2
- {ommlds_rs-0.0.0.dev473 → ommlds_rs-0.0.0.dev495}/ommlds/nanochat/rustbpe/Cargo.lock +61 -60
- {ommlds_rs-0.0.0.dev473 → ommlds_rs-0.0.0.dev495}/ommlds/nanochat/rustbpe/Cargo.toml +5 -5
- {ommlds_rs-0.0.0.dev473 → ommlds_rs-0.0.0.dev495}/ommlds/nanochat/rustbpe/src/lib.rs +15 -6
- {ommlds_rs-0.0.0.dev473 → ommlds_rs-0.0.0.dev495}/ommlds_rs.egg-info/PKG-INFO +2 -2
- ommlds_rs-0.0.0.dev495/ommlds_rs.egg-info/requires.txt +1 -0
- {ommlds_rs-0.0.0.dev473 → ommlds_rs-0.0.0.dev495}/pyproject.toml +2 -2
- ommlds_rs-0.0.0.dev473/ommlds_rs.egg-info/requires.txt +0 -1
- {ommlds_rs-0.0.0.dev473 → ommlds_rs-0.0.0.dev495}/LICENSE +0 -0
- {ommlds_rs-0.0.0.dev473 → ommlds_rs-0.0.0.dev495}/README.md +0 -0
- {ommlds_rs-0.0.0.dev473 → ommlds_rs-0.0.0.dev495}/ommlds/nanochat/rustbpe/LICENSE +0 -0
- {ommlds_rs-0.0.0.dev473 → ommlds_rs-0.0.0.dev495}/ommlds/nanochat/rustbpe/README.md +0 -0
- {ommlds_rs-0.0.0.dev473 → ommlds_rs-0.0.0.dev495}/ommlds_rs.egg-info/SOURCES.txt +0 -0
- {ommlds_rs-0.0.0.dev473 → ommlds_rs-0.0.0.dev495}/ommlds_rs.egg-info/dependency_links.txt +0 -0
- {ommlds_rs-0.0.0.dev473 → ommlds_rs-0.0.0.dev495}/ommlds_rs.egg-info/top_level.txt +0 -0
- {ommlds_rs-0.0.0.dev473 → ommlds_rs-0.0.0.dev495}/setup.cfg +0 -0
- {ommlds_rs-0.0.0.dev473 → ommlds_rs-0.0.0.dev495}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ommlds-rs
|
|
3
|
-
Version: 0.0.0.
|
|
3
|
+
Version: 0.0.0.dev495
|
|
4
4
|
Summary: ommlds
|
|
5
5
|
Author: wrmsr
|
|
6
6
|
License-Expression: BSD-3-Clause
|
|
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
14
14
|
Requires-Python: >=3.13
|
|
15
15
|
Description-Content-Type: text/markdown
|
|
16
16
|
License-File: LICENSE
|
|
17
|
-
Requires-Dist: ommlds==0.0.0.
|
|
17
|
+
Requires-Dist: ommlds==0.0.0.dev495
|
|
18
18
|
Dynamic: license-file
|
|
19
19
|
|
|
20
20
|
# Overview
|
|
@@ -17,9 +17,9 @@ dependencies = [
|
|
|
17
17
|
|
|
18
18
|
[[package]]
|
|
19
19
|
name = "aho-corasick"
|
|
20
|
-
version = "1.1.
|
|
20
|
+
version = "1.1.4"
|
|
21
21
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
22
|
-
checksum = "
|
|
22
|
+
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
|
|
23
23
|
dependencies = [
|
|
24
24
|
"memchr",
|
|
25
25
|
]
|
|
@@ -62,9 +62,9 @@ dependencies = [
|
|
|
62
62
|
|
|
63
63
|
[[package]]
|
|
64
64
|
name = "cfg-if"
|
|
65
|
-
version = "1.0.
|
|
65
|
+
version = "1.0.4"
|
|
66
66
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
67
|
-
checksum = "
|
|
67
|
+
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
|
68
68
|
|
|
69
69
|
[[package]]
|
|
70
70
|
name = "compact_str"
|
|
@@ -107,9 +107,9 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
|
|
107
107
|
|
|
108
108
|
[[package]]
|
|
109
109
|
name = "dary_heap"
|
|
110
|
-
version = "0.3.
|
|
110
|
+
version = "0.3.8"
|
|
111
111
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
112
|
-
checksum = "
|
|
112
|
+
checksum = "06d2e3287df1c007e74221c49ca10a95d557349e54b3a75dc2fb14712c751f04"
|
|
113
113
|
|
|
114
114
|
[[package]]
|
|
115
115
|
name = "either"
|
|
@@ -125,9 +125,9 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
|
|
|
125
125
|
|
|
126
126
|
[[package]]
|
|
127
127
|
name = "fancy-regex"
|
|
128
|
-
version = "0.16.
|
|
128
|
+
version = "0.16.2"
|
|
129
129
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
130
|
-
checksum = "
|
|
130
|
+
checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f"
|
|
131
131
|
dependencies = [
|
|
132
132
|
"bit-set",
|
|
133
133
|
"regex-automata",
|
|
@@ -136,21 +136,21 @@ dependencies = [
|
|
|
136
136
|
|
|
137
137
|
[[package]]
|
|
138
138
|
name = "getrandom"
|
|
139
|
-
version = "0.3.
|
|
139
|
+
version = "0.3.4"
|
|
140
140
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
141
|
-
checksum = "
|
|
141
|
+
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
|
|
142
142
|
dependencies = [
|
|
143
143
|
"cfg-if",
|
|
144
144
|
"libc",
|
|
145
145
|
"r-efi",
|
|
146
|
-
"
|
|
146
|
+
"wasip2",
|
|
147
147
|
]
|
|
148
148
|
|
|
149
149
|
[[package]]
|
|
150
150
|
name = "hashbrown"
|
|
151
|
-
version = "0.
|
|
151
|
+
version = "0.16.0"
|
|
152
152
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
153
|
-
checksum = "
|
|
153
|
+
checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d"
|
|
154
154
|
|
|
155
155
|
[[package]]
|
|
156
156
|
name = "heck"
|
|
@@ -160,9 +160,9 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
|
|
160
160
|
|
|
161
161
|
[[package]]
|
|
162
162
|
name = "indexmap"
|
|
163
|
-
version = "2.
|
|
163
|
+
version = "2.12.0"
|
|
164
164
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
165
|
-
checksum = "
|
|
165
|
+
checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f"
|
|
166
166
|
dependencies = [
|
|
167
167
|
"equivalent",
|
|
168
168
|
"hashbrown",
|
|
@@ -170,9 +170,12 @@ dependencies = [
|
|
|
170
170
|
|
|
171
171
|
[[package]]
|
|
172
172
|
name = "indoc"
|
|
173
|
-
version = "2.0.
|
|
173
|
+
version = "2.0.7"
|
|
174
174
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
175
|
-
checksum = "
|
|
175
|
+
checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706"
|
|
176
|
+
dependencies = [
|
|
177
|
+
"rustversion",
|
|
178
|
+
]
|
|
176
179
|
|
|
177
180
|
[[package]]
|
|
178
181
|
name = "itoa"
|
|
@@ -182,21 +185,21 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
|
|
182
185
|
|
|
183
186
|
[[package]]
|
|
184
187
|
name = "libc"
|
|
185
|
-
version = "0.2.
|
|
188
|
+
version = "0.2.177"
|
|
186
189
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
187
|
-
checksum = "
|
|
190
|
+
checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976"
|
|
188
191
|
|
|
189
192
|
[[package]]
|
|
190
193
|
name = "log"
|
|
191
|
-
version = "0.4.
|
|
194
|
+
version = "0.4.29"
|
|
192
195
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
193
|
-
checksum = "
|
|
196
|
+
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
|
|
194
197
|
|
|
195
198
|
[[package]]
|
|
196
199
|
name = "memchr"
|
|
197
|
-
version = "2.7.
|
|
200
|
+
version = "2.7.6"
|
|
198
201
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
199
|
-
checksum = "
|
|
202
|
+
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
|
|
200
203
|
|
|
201
204
|
[[package]]
|
|
202
205
|
name = "memoffset"
|
|
@@ -221,20 +224,19 @@ checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483"
|
|
|
221
224
|
|
|
222
225
|
[[package]]
|
|
223
226
|
name = "proc-macro2"
|
|
224
|
-
version = "1.0.
|
|
227
|
+
version = "1.0.103"
|
|
225
228
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
226
|
-
checksum = "
|
|
229
|
+
checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
|
|
227
230
|
dependencies = [
|
|
228
231
|
"unicode-ident",
|
|
229
232
|
]
|
|
230
233
|
|
|
231
234
|
[[package]]
|
|
232
235
|
name = "pyo3"
|
|
233
|
-
version = "0.
|
|
236
|
+
version = "0.27.2"
|
|
234
237
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
235
|
-
checksum = "
|
|
238
|
+
checksum = "ab53c047fcd1a1d2a8820fe84f05d6be69e9526be40cb03b73f86b6b03e6d87d"
|
|
236
239
|
dependencies = [
|
|
237
|
-
"cfg-if",
|
|
238
240
|
"indoc",
|
|
239
241
|
"libc",
|
|
240
242
|
"memoffset",
|
|
@@ -248,19 +250,18 @@ dependencies = [
|
|
|
248
250
|
|
|
249
251
|
[[package]]
|
|
250
252
|
name = "pyo3-build-config"
|
|
251
|
-
version = "0.
|
|
253
|
+
version = "0.27.2"
|
|
252
254
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
253
|
-
checksum = "
|
|
255
|
+
checksum = "b455933107de8642b4487ed26d912c2d899dec6114884214a0b3bb3be9261ea6"
|
|
254
256
|
dependencies = [
|
|
255
|
-
"once_cell",
|
|
256
257
|
"target-lexicon",
|
|
257
258
|
]
|
|
258
259
|
|
|
259
260
|
[[package]]
|
|
260
261
|
name = "pyo3-ffi"
|
|
261
|
-
version = "0.
|
|
262
|
+
version = "0.27.2"
|
|
262
263
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
263
|
-
checksum = "
|
|
264
|
+
checksum = "1c85c9cbfaddf651b1221594209aed57e9e5cff63c4d11d1feead529b872a089"
|
|
264
265
|
dependencies = [
|
|
265
266
|
"libc",
|
|
266
267
|
"pyo3-build-config",
|
|
@@ -268,9 +269,9 @@ dependencies = [
|
|
|
268
269
|
|
|
269
270
|
[[package]]
|
|
270
271
|
name = "pyo3-log"
|
|
271
|
-
version = "0.
|
|
272
|
+
version = "0.13.2"
|
|
272
273
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
273
|
-
checksum = "
|
|
274
|
+
checksum = "2f8bae9ad5ba08b0b0ed2bb9c2bdbaeccc69cafca96d78cf0fbcea0d45d122bb"
|
|
274
275
|
dependencies = [
|
|
275
276
|
"arc-swap",
|
|
276
277
|
"log",
|
|
@@ -279,9 +280,9 @@ dependencies = [
|
|
|
279
280
|
|
|
280
281
|
[[package]]
|
|
281
282
|
name = "pyo3-macros"
|
|
282
|
-
version = "0.
|
|
283
|
+
version = "0.27.2"
|
|
283
284
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
284
|
-
checksum = "
|
|
285
|
+
checksum = "0a5b10c9bf9888125d917fb4d2ca2d25c8df94c7ab5a52e13313a07e050a3b02"
|
|
285
286
|
dependencies = [
|
|
286
287
|
"proc-macro2",
|
|
287
288
|
"pyo3-macros-backend",
|
|
@@ -291,9 +292,9 @@ dependencies = [
|
|
|
291
292
|
|
|
292
293
|
[[package]]
|
|
293
294
|
name = "pyo3-macros-backend"
|
|
294
|
-
version = "0.
|
|
295
|
+
version = "0.27.2"
|
|
295
296
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
296
|
-
checksum = "
|
|
297
|
+
checksum = "03b51720d314836e53327f5871d4c0cfb4fb37cc2c4a11cc71907a86342c40f9"
|
|
297
298
|
dependencies = [
|
|
298
299
|
"heck",
|
|
299
300
|
"proc-macro2",
|
|
@@ -304,9 +305,9 @@ dependencies = [
|
|
|
304
305
|
|
|
305
306
|
[[package]]
|
|
306
307
|
name = "quote"
|
|
307
|
-
version = "1.0.
|
|
308
|
+
version = "1.0.42"
|
|
308
309
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
309
|
-
checksum = "
|
|
310
|
+
checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f"
|
|
310
311
|
dependencies = [
|
|
311
312
|
"proc-macro2",
|
|
312
313
|
]
|
|
@@ -339,9 +340,9 @@ dependencies = [
|
|
|
339
340
|
|
|
340
341
|
[[package]]
|
|
341
342
|
name = "regex-automata"
|
|
342
|
-
version = "0.4.
|
|
343
|
+
version = "0.4.13"
|
|
343
344
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
344
|
-
checksum = "
|
|
345
|
+
checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
|
|
345
346
|
dependencies = [
|
|
346
347
|
"aho-corasick",
|
|
347
348
|
"memchr",
|
|
@@ -350,9 +351,9 @@ dependencies = [
|
|
|
350
351
|
|
|
351
352
|
[[package]]
|
|
352
353
|
name = "regex-syntax"
|
|
353
|
-
version = "0.8.
|
|
354
|
+
version = "0.8.8"
|
|
354
355
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
355
|
-
checksum = "
|
|
356
|
+
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
|
|
356
357
|
|
|
357
358
|
[[package]]
|
|
358
359
|
name = "rustbpe"
|
|
@@ -389,9 +390,9 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
|
|
|
389
390
|
|
|
390
391
|
[[package]]
|
|
391
392
|
name = "syn"
|
|
392
|
-
version = "2.0.
|
|
393
|
+
version = "2.0.110"
|
|
393
394
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
394
|
-
checksum = "
|
|
395
|
+
checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea"
|
|
395
396
|
dependencies = [
|
|
396
397
|
"proc-macro2",
|
|
397
398
|
"quote",
|
|
@@ -400,15 +401,15 @@ dependencies = [
|
|
|
400
401
|
|
|
401
402
|
[[package]]
|
|
402
403
|
name = "target-lexicon"
|
|
403
|
-
version = "0.
|
|
404
|
+
version = "0.13.3"
|
|
404
405
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
405
|
-
checksum = "
|
|
406
|
+
checksum = "df7f62577c25e07834649fc3b39fafdc597c0a3527dc1c60129201ccfcbaa50c"
|
|
406
407
|
|
|
407
408
|
[[package]]
|
|
408
409
|
name = "unicode-ident"
|
|
409
|
-
version = "1.0.
|
|
410
|
+
version = "1.0.22"
|
|
410
411
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
411
|
-
checksum = "
|
|
412
|
+
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
|
|
412
413
|
|
|
413
414
|
[[package]]
|
|
414
415
|
name = "unindent"
|
|
@@ -423,34 +424,34 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
423
424
|
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
|
424
425
|
|
|
425
426
|
[[package]]
|
|
426
|
-
name = "
|
|
427
|
-
version = "0.
|
|
427
|
+
name = "wasip2"
|
|
428
|
+
version = "1.0.1+wasi-0.2.4"
|
|
428
429
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
429
|
-
checksum = "
|
|
430
|
+
checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7"
|
|
430
431
|
dependencies = [
|
|
431
432
|
"wit-bindgen",
|
|
432
433
|
]
|
|
433
434
|
|
|
434
435
|
[[package]]
|
|
435
436
|
name = "wit-bindgen"
|
|
436
|
-
version = "0.
|
|
437
|
+
version = "0.46.0"
|
|
437
438
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
438
|
-
checksum = "
|
|
439
|
+
checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
|
|
439
440
|
|
|
440
441
|
[[package]]
|
|
441
442
|
name = "zerocopy"
|
|
442
|
-
version = "0.8.
|
|
443
|
+
version = "0.8.27"
|
|
443
444
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
444
|
-
checksum = "
|
|
445
|
+
checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c"
|
|
445
446
|
dependencies = [
|
|
446
447
|
"zerocopy-derive",
|
|
447
448
|
]
|
|
448
449
|
|
|
449
450
|
[[package]]
|
|
450
451
|
name = "zerocopy-derive"
|
|
451
|
-
version = "0.8.
|
|
452
|
+
version = "0.8.27"
|
|
452
453
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
453
|
-
checksum = "
|
|
454
|
+
checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831"
|
|
454
455
|
dependencies = [
|
|
455
456
|
"proc-macro2",
|
|
456
457
|
"quote",
|
|
@@ -5,11 +5,11 @@ edition = "2024"
|
|
|
5
5
|
|
|
6
6
|
[dependencies]
|
|
7
7
|
dary_heap = "0.3"
|
|
8
|
-
indexmap = "2.
|
|
9
|
-
fancy-regex = "0.16.
|
|
10
|
-
log = "0.4.
|
|
11
|
-
pyo3 = { version = "0.
|
|
12
|
-
pyo3-log = "0.
|
|
8
|
+
indexmap = "2.12"
|
|
9
|
+
fancy-regex = "0.16.2"
|
|
10
|
+
log = "0.4.29"
|
|
11
|
+
pyo3 = { version = "0.27.2", features = ["extension-module"] }
|
|
12
|
+
pyo3-log = "0.13.2"
|
|
13
13
|
ahash = "0.8.12"
|
|
14
14
|
rayon = "1.11.0"
|
|
15
15
|
compact_str = "0.9.0"
|
|
@@ -19,13 +19,15 @@ type Pair = (u32, u32);
|
|
|
19
19
|
pub struct Tokenizer {
|
|
20
20
|
/// Maps pairs of token IDs to their merged token ID
|
|
21
21
|
pub merges: StdHashMap<Pair, u32>,
|
|
22
|
+
|
|
22
23
|
/// The regex pattern used for text splitting
|
|
23
24
|
pub pattern: String,
|
|
25
|
+
|
|
24
26
|
/// Compiled regex for efficiency
|
|
25
27
|
compiled_pattern: Regex,
|
|
26
28
|
}
|
|
27
29
|
|
|
28
|
-
//
|
|
30
|
+
// internal helpers
|
|
29
31
|
|
|
30
32
|
#[derive(Clone, Debug)]
|
|
31
33
|
struct Word {
|
|
@@ -78,9 +80,11 @@ impl Word {
|
|
|
78
80
|
// write merged token
|
|
79
81
|
out.push(new_id);
|
|
80
82
|
i += 2; // skip 'a' and 'b'
|
|
83
|
+
|
|
81
84
|
} else {
|
|
82
85
|
out.push(self.ids[i]);
|
|
83
86
|
i += 1;
|
|
87
|
+
|
|
84
88
|
}
|
|
85
89
|
}
|
|
86
90
|
|
|
@@ -93,6 +97,7 @@ impl Word {
|
|
|
93
97
|
struct MergeJob {
|
|
94
98
|
pair: Pair,
|
|
95
99
|
count: u64,
|
|
100
|
+
|
|
96
101
|
/// set of word indices where this pair may occur and needs processing
|
|
97
102
|
pos: AHashSet<usize>,
|
|
98
103
|
}
|
|
@@ -154,10 +159,9 @@ fn count_pairs_parallel(
|
|
|
154
159
|
)
|
|
155
160
|
}
|
|
156
161
|
|
|
157
|
-
//
|
|
162
|
+
//
|
|
158
163
|
|
|
159
164
|
impl Tokenizer {
|
|
160
|
-
|
|
161
165
|
/// Core incremental BPE training given unique words and their counts.
|
|
162
166
|
/// `words`: one entry per unique chunk (Vec<u32> of token-ids/bytes).
|
|
163
167
|
/// `counts`: same length as `words`, count per chunk.
|
|
@@ -167,11 +171,11 @@ impl Tokenizer {
|
|
|
167
171
|
log::info!("Starting BPE training: {} merges to compute", num_merges);
|
|
168
172
|
self.merges.clear();
|
|
169
173
|
|
|
170
|
-
//
|
|
174
|
+
// Initial pair_counts and where_to_update (parallel)
|
|
171
175
|
log::info!("Computing initial pair counts from {} unique sequences", words.len());
|
|
172
176
|
let (mut pair_counts, mut where_to_update) = count_pairs_parallel(&words, &counts);
|
|
173
177
|
|
|
174
|
-
//
|
|
178
|
+
// Build heap
|
|
175
179
|
log::info!("Building heap with {} unique pairs", pair_counts.len());
|
|
176
180
|
let mut heap = OctonaryHeap::with_capacity(pair_counts.len());
|
|
177
181
|
for (pair, pos) in where_to_update.drain() {
|
|
@@ -185,7 +189,7 @@ impl Tokenizer {
|
|
|
185
189
|
}
|
|
186
190
|
}
|
|
187
191
|
|
|
188
|
-
//
|
|
192
|
+
// Merge loop
|
|
189
193
|
log::info!("Starting merge loop");
|
|
190
194
|
let mut merges_done = 0u32;
|
|
191
195
|
let mut last_log_percent = 0u32;
|
|
@@ -215,6 +219,7 @@ impl Tokenizer {
|
|
|
215
219
|
for &word_idx in &top.pos {
|
|
216
220
|
// Apply merge to this word and collect pair-count deltas
|
|
217
221
|
let changes = words[word_idx].merge_pair(top.pair, new_id);
|
|
222
|
+
|
|
218
223
|
// Update global pair counts based on this word's count
|
|
219
224
|
for (pair, delta) in changes {
|
|
220
225
|
let delta_total = delta * counts[word_idx];
|
|
@@ -310,14 +315,17 @@ impl Tokenizer {
|
|
|
310
315
|
pyo3::Python::with_gil(|py| {
|
|
311
316
|
buf.clear();
|
|
312
317
|
let it = py_iter.bind(py);
|
|
318
|
+
|
|
313
319
|
loop {
|
|
314
320
|
if buf.len() >= buffer_size {
|
|
315
321
|
return Ok(false);
|
|
316
322
|
}
|
|
323
|
+
|
|
317
324
|
// next(it)
|
|
318
325
|
let next_obj = unsafe {
|
|
319
326
|
pyo3::Bound::from_owned_ptr_or_opt(py, pyo3::ffi::PyIter_Next(it.as_ptr()))
|
|
320
327
|
};
|
|
328
|
+
|
|
321
329
|
match next_obj {
|
|
322
330
|
Some(obj) => {
|
|
323
331
|
let s: String = obj.extract()?;
|
|
@@ -411,6 +419,7 @@ impl Tokenizer {
|
|
|
411
419
|
|
|
412
420
|
for (&pair, &merged_id) in sorted_merges {
|
|
413
421
|
let (left, right) = pair;
|
|
422
|
+
|
|
414
423
|
let mut merged_bytes = token_bytes[left as usize].clone();
|
|
415
424
|
merged_bytes.extend(&token_bytes[right as usize]);
|
|
416
425
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ommlds-rs
|
|
3
|
-
Version: 0.0.0.
|
|
3
|
+
Version: 0.0.0.dev495
|
|
4
4
|
Summary: ommlds
|
|
5
5
|
Author: wrmsr
|
|
6
6
|
License-Expression: BSD-3-Clause
|
|
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
14
14
|
Requires-Python: >=3.13
|
|
15
15
|
Description-Content-Type: text/markdown
|
|
16
16
|
License-File: LICENSE
|
|
17
|
-
Requires-Dist: ommlds==0.0.0.
|
|
17
|
+
Requires-Dist: ommlds==0.0.0.dev495
|
|
18
18
|
Dynamic: license-file
|
|
19
19
|
|
|
20
20
|
# Overview
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ommlds==0.0.0.dev495
|
|
@@ -14,7 +14,7 @@ urls = {source = 'https://github.com/wrmsr/omlish'}
|
|
|
14
14
|
license = 'BSD-3-Clause'
|
|
15
15
|
readme = 'README.md'
|
|
16
16
|
requires-python = '>=3.13'
|
|
17
|
-
version = '0.0.0.
|
|
17
|
+
version = '0.0.0.dev495'
|
|
18
18
|
classifiers = [
|
|
19
19
|
'Development Status :: 2 - Pre-Alpha',
|
|
20
20
|
'Intended Audience :: Developers',
|
|
@@ -25,7 +25,7 @@ classifiers = [
|
|
|
25
25
|
]
|
|
26
26
|
description = 'ommlds'
|
|
27
27
|
dependencies = [
|
|
28
|
-
'ommlds == 0.0.0.
|
|
28
|
+
'ommlds == 0.0.0.dev495',
|
|
29
29
|
]
|
|
30
30
|
|
|
31
31
|
[tool.setuptools]
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
ommlds==0.0.0.dev473
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|