ommlds-rs 0.0.0.dev481__tar.gz → 0.0.0.dev495__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ommlds-rs might be problematic. Click here for more details.
- {ommlds_rs-0.0.0.dev481 → ommlds_rs-0.0.0.dev495}/PKG-INFO +2 -2
- {ommlds_rs-0.0.0.dev481 → ommlds_rs-0.0.0.dev495}/ommlds/nanochat/rustbpe/src/lib.rs +15 -6
- {ommlds_rs-0.0.0.dev481 → ommlds_rs-0.0.0.dev495}/ommlds_rs.egg-info/PKG-INFO +2 -2
- ommlds_rs-0.0.0.dev495/ommlds_rs.egg-info/requires.txt +1 -0
- {ommlds_rs-0.0.0.dev481 → ommlds_rs-0.0.0.dev495}/pyproject.toml +2 -2
- ommlds_rs-0.0.0.dev481/ommlds_rs.egg-info/requires.txt +0 -1
- {ommlds_rs-0.0.0.dev481 → ommlds_rs-0.0.0.dev495}/LICENSE +0 -0
- {ommlds_rs-0.0.0.dev481 → ommlds_rs-0.0.0.dev495}/README.md +0 -0
- {ommlds_rs-0.0.0.dev481 → ommlds_rs-0.0.0.dev495}/ommlds/nanochat/rustbpe/Cargo.lock +0 -0
- {ommlds_rs-0.0.0.dev481 → ommlds_rs-0.0.0.dev495}/ommlds/nanochat/rustbpe/Cargo.toml +0 -0
- {ommlds_rs-0.0.0.dev481 → ommlds_rs-0.0.0.dev495}/ommlds/nanochat/rustbpe/LICENSE +0 -0
- {ommlds_rs-0.0.0.dev481 → ommlds_rs-0.0.0.dev495}/ommlds/nanochat/rustbpe/README.md +0 -0
- {ommlds_rs-0.0.0.dev481 → ommlds_rs-0.0.0.dev495}/ommlds_rs.egg-info/SOURCES.txt +0 -0
- {ommlds_rs-0.0.0.dev481 → ommlds_rs-0.0.0.dev495}/ommlds_rs.egg-info/dependency_links.txt +0 -0
- {ommlds_rs-0.0.0.dev481 → ommlds_rs-0.0.0.dev495}/ommlds_rs.egg-info/top_level.txt +0 -0
- {ommlds_rs-0.0.0.dev481 → ommlds_rs-0.0.0.dev495}/setup.cfg +0 -0
- {ommlds_rs-0.0.0.dev481 → ommlds_rs-0.0.0.dev495}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ommlds-rs
|
|
3
|
-
Version: 0.0.0.dev481
|
|
3
|
+
Version: 0.0.0.dev495
|
|
4
4
|
Summary: ommlds
|
|
5
5
|
Author: wrmsr
|
|
6
6
|
License-Expression: BSD-3-Clause
|
|
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
14
14
|
Requires-Python: >=3.13
|
|
15
15
|
Description-Content-Type: text/markdown
|
|
16
16
|
License-File: LICENSE
|
|
17
|
-
Requires-Dist: ommlds==0.0.0.dev481
|
|
17
|
+
Requires-Dist: ommlds==0.0.0.dev495
|
|
18
18
|
Dynamic: license-file
|
|
19
19
|
|
|
20
20
|
# Overview
|
|
@@ -19,13 +19,15 @@ type Pair = (u32, u32);
|
|
|
19
19
|
pub struct Tokenizer {
|
|
20
20
|
/// Maps pairs of token IDs to their merged token ID
|
|
21
21
|
pub merges: StdHashMap<Pair, u32>,
|
|
22
|
+
|
|
22
23
|
/// The regex pattern used for text splitting
|
|
23
24
|
pub pattern: String,
|
|
25
|
+
|
|
24
26
|
/// Compiled regex for efficiency
|
|
25
27
|
compiled_pattern: Regex,
|
|
26
28
|
}
|
|
27
29
|
|
|
28
|
-
//
|
|
30
|
+
// internal helpers
|
|
29
31
|
|
|
30
32
|
#[derive(Clone, Debug)]
|
|
31
33
|
struct Word {
|
|
@@ -78,9 +80,11 @@ impl Word {
|
|
|
78
80
|
// write merged token
|
|
79
81
|
out.push(new_id);
|
|
80
82
|
i += 2; // skip 'a' and 'b'
|
|
83
|
+
|
|
81
84
|
} else {
|
|
82
85
|
out.push(self.ids[i]);
|
|
83
86
|
i += 1;
|
|
87
|
+
|
|
84
88
|
}
|
|
85
89
|
}
|
|
86
90
|
|
|
@@ -93,6 +97,7 @@ impl Word {
|
|
|
93
97
|
struct MergeJob {
|
|
94
98
|
pair: Pair,
|
|
95
99
|
count: u64,
|
|
100
|
+
|
|
96
101
|
/// set of word indices where this pair may occur and needs processing
|
|
97
102
|
pos: AHashSet<usize>,
|
|
98
103
|
}
|
|
@@ -154,10 +159,9 @@ fn count_pairs_parallel(
|
|
|
154
159
|
)
|
|
155
160
|
}
|
|
156
161
|
|
|
157
|
-
//
|
|
162
|
+
//
|
|
158
163
|
|
|
159
164
|
impl Tokenizer {
|
|
160
|
-
|
|
161
165
|
/// Core incremental BPE training given unique words and their counts.
|
|
162
166
|
/// `words`: one entry per unique chunk (Vec<u32> of token-ids/bytes).
|
|
163
167
|
/// `counts`: same length as `words`, count per chunk.
|
|
@@ -167,11 +171,11 @@ impl Tokenizer {
|
|
|
167
171
|
log::info!("Starting BPE training: {} merges to compute", num_merges);
|
|
168
172
|
self.merges.clear();
|
|
169
173
|
|
|
170
|
-
//
|
|
174
|
+
// Initial pair_counts and where_to_update (parallel)
|
|
171
175
|
log::info!("Computing initial pair counts from {} unique sequences", words.len());
|
|
172
176
|
let (mut pair_counts, mut where_to_update) = count_pairs_parallel(&words, &counts);
|
|
173
177
|
|
|
174
|
-
//
|
|
178
|
+
// Build heap
|
|
175
179
|
log::info!("Building heap with {} unique pairs", pair_counts.len());
|
|
176
180
|
let mut heap = OctonaryHeap::with_capacity(pair_counts.len());
|
|
177
181
|
for (pair, pos) in where_to_update.drain() {
|
|
@@ -185,7 +189,7 @@ impl Tokenizer {
|
|
|
185
189
|
}
|
|
186
190
|
}
|
|
187
191
|
|
|
188
|
-
//
|
|
192
|
+
// Merge loop
|
|
189
193
|
log::info!("Starting merge loop");
|
|
190
194
|
let mut merges_done = 0u32;
|
|
191
195
|
let mut last_log_percent = 0u32;
|
|
@@ -215,6 +219,7 @@ impl Tokenizer {
|
|
|
215
219
|
for &word_idx in &top.pos {
|
|
216
220
|
// Apply merge to this word and collect pair-count deltas
|
|
217
221
|
let changes = words[word_idx].merge_pair(top.pair, new_id);
|
|
222
|
+
|
|
218
223
|
// Update global pair counts based on this word's count
|
|
219
224
|
for (pair, delta) in changes {
|
|
220
225
|
let delta_total = delta * counts[word_idx];
|
|
@@ -310,14 +315,17 @@ impl Tokenizer {
|
|
|
310
315
|
pyo3::Python::with_gil(|py| {
|
|
311
316
|
buf.clear();
|
|
312
317
|
let it = py_iter.bind(py);
|
|
318
|
+
|
|
313
319
|
loop {
|
|
314
320
|
if buf.len() >= buffer_size {
|
|
315
321
|
return Ok(false);
|
|
316
322
|
}
|
|
323
|
+
|
|
317
324
|
// next(it)
|
|
318
325
|
let next_obj = unsafe {
|
|
319
326
|
pyo3::Bound::from_owned_ptr_or_opt(py, pyo3::ffi::PyIter_Next(it.as_ptr()))
|
|
320
327
|
};
|
|
328
|
+
|
|
321
329
|
match next_obj {
|
|
322
330
|
Some(obj) => {
|
|
323
331
|
let s: String = obj.extract()?;
|
|
@@ -411,6 +419,7 @@ impl Tokenizer {
|
|
|
411
419
|
|
|
412
420
|
for (&pair, &merged_id) in sorted_merges {
|
|
413
421
|
let (left, right) = pair;
|
|
422
|
+
|
|
414
423
|
let mut merged_bytes = token_bytes[left as usize].clone();
|
|
415
424
|
merged_bytes.extend(&token_bytes[right as usize]);
|
|
416
425
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ommlds-rs
|
|
3
|
-
Version: 0.0.0.dev481
|
|
3
|
+
Version: 0.0.0.dev495
|
|
4
4
|
Summary: ommlds
|
|
5
5
|
Author: wrmsr
|
|
6
6
|
License-Expression: BSD-3-Clause
|
|
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
14
14
|
Requires-Python: >=3.13
|
|
15
15
|
Description-Content-Type: text/markdown
|
|
16
16
|
License-File: LICENSE
|
|
17
|
-
Requires-Dist: ommlds==0.0.0.dev481
|
|
17
|
+
Requires-Dist: ommlds==0.0.0.dev495
|
|
18
18
|
Dynamic: license-file
|
|
19
19
|
|
|
20
20
|
# Overview
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ommlds==0.0.0.dev495
|
|
@@ -14,7 +14,7 @@ urls = {source = 'https://github.com/wrmsr/omlish'}
|
|
|
14
14
|
license = 'BSD-3-Clause'
|
|
15
15
|
readme = 'README.md'
|
|
16
16
|
requires-python = '>=3.13'
|
|
17
|
-
version = '0.0.0.dev481'
|
|
17
|
+
version = '0.0.0.dev495'
|
|
18
18
|
classifiers = [
|
|
19
19
|
'Development Status :: 2 - Pre-Alpha',
|
|
20
20
|
'Intended Audience :: Developers',
|
|
@@ -25,7 +25,7 @@ classifiers = [
|
|
|
25
25
|
]
|
|
26
26
|
description = 'ommlds'
|
|
27
27
|
dependencies = [
|
|
28
|
-
'ommlds == 0.0.0.dev481',
|
|
28
|
+
'ommlds == 0.0.0.dev495',
|
|
29
29
|
]
|
|
30
30
|
|
|
31
31
|
[tool.setuptools]
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
ommlds==0.0.0.dev481
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|