ommlds-rs 0.0.0.dev481.tar.gz → 0.0.0.dev495.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ommlds-rs might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ommlds-rs
-Version: 0.0.0.dev481
+Version: 0.0.0.dev495
 Summary: ommlds
 Author: wrmsr
 License-Expression: BSD-3-Clause
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Python: >=3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: ommlds==0.0.0.dev481
+Requires-Dist: ommlds==0.0.0.dev495
 Dynamic: license-file

 # Overview
@@ -19,13 +19,15 @@ type Pair = (u32, u32);
 pub struct Tokenizer {
     /// Maps pairs of token IDs to their merged token ID
     pub merges: StdHashMap<Pair, u32>,
+
     /// The regex pattern used for text splitting
     pub pattern: String,
+
     /// Compiled regex for efficiency
     compiled_pattern: Regex,
 }

-// ------------------------ internal helpers ------------------------
+// internal helpers

 #[derive(Clone, Debug)]
 struct Word {
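For context on what the `merges` field is for: once training has filled it in, encoding a sequence amounts to repeatedly applying learned merges to adjacent ids. A minimal sketch of that idea, with std's `HashMap` standing in for `StdHashMap`, made-up token ids, and a lowest-merged-id-first priority rule as an assumption:

```rust
use std::collections::HashMap;

type Pair = (u32, u32);

/// Repeatedly apply the earliest-learned applicable merge until none remain.
/// (Assumed priority rule: lower merged id = learned earlier = applied first.)
fn encode(ids: &mut Vec<u32>, merges: &HashMap<Pair, u32>) {
    loop {
        // Find the adjacent pair whose merged id is lowest.
        let mut best: Option<(usize, u32)> = None;
        for i in 0..ids.len().saturating_sub(1) {
            if let Some(&new_id) = merges.get(&(ids[i], ids[i + 1])) {
                if best.map_or(true, |(_, b)| new_id < b) {
                    best = Some((i, new_id));
                }
            }
        }
        match best {
            Some((i, new_id)) => {
                ids[i] = new_id;   // write the merged token in place
                ids.remove(i + 1); // drop the second half of the pair
            }
            None => break, // no learned pair occurs in the sequence
        }
    }
}

fn main() {
    // Suppose training learned (104, 105) -> 256, i.e. b"hi" as one token.
    let merges = HashMap::from([((104, 105), 256)]);
    let mut ids = vec![104, 105, 104, 105];
    encode(&mut ids, &merges);
    assert_eq!(ids, vec![256, 256]);
}
```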
@@ -78,9 +80,11 @@ impl Word {
             // write merged token
             out.push(new_id);
             i += 2; // skip 'a' and 'b'
+
         } else {
             out.push(self.ids[i]);
             i += 1;
+
         }
     }

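The hunk above is the heart of `Word::merge_pair`. A simplified, self-contained version of the same step shows where the pair-count deltas come from: the merged pair itself disappears, and the pairs formed with the left and right neighbours change identity. (This is a sketch; the real method carries more state than the diff shows.)

```rust
type Pair = (u32, u32);

/// Replace every non-overlapping occurrence of `pair` with `new_id`, returning
/// the rewritten word plus the pair-count deltas the merge caused.
fn merge_pair(ids: &[u32], pair: Pair, new_id: u32) -> (Vec<u32>, Vec<(Pair, i64)>) {
    let mut out = Vec::with_capacity(ids.len());
    let mut deltas: Vec<(Pair, i64)> = Vec::new();
    let mut i = 0;
    while i < ids.len() {
        if i + 1 < ids.len() && (ids[i], ids[i + 1]) == pair {
            deltas.push((pair, -1)); // the merged pair itself disappears
            if let Some(&prev) = out.last() {
                deltas.push(((prev, ids[i]), -1)); // old left-neighbour pair gone
                deltas.push(((prev, new_id), 1));  // new left-neighbour pair appears
            }
            if i + 2 < ids.len() {
                deltas.push(((ids[i + 1], ids[i + 2]), -1)); // old right-neighbour pair gone
                deltas.push(((new_id, ids[i + 2]), 1));      // new right-neighbour pair appears
            }
            // write merged token
            out.push(new_id);
            i += 2; // skip 'a' and 'b'
        } else {
            out.push(ids[i]);
            i += 1;
        }
    }
    (out, deltas)
}

fn main() {
    let (out, deltas) = merge_pair(&[1, 2, 3], (1, 2), 9);
    assert_eq!(out, vec![9, 3]);
    assert_eq!(deltas, vec![((1, 2), -1), ((2, 3), -1), ((9, 3), 1)]);
}
```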
@@ -93,6 +97,7 @@ impl Word {
 struct MergeJob {
     pair: Pair,
     count: u64,
+
     /// set of word indices where this pair may occur and needs processing
     pos: AHashSet<usize>,
 }
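The diff doesn't show how `MergeJob` is ordered inside `OctonaryHeap`, but for the merge loop to pop the most frequent pair first it must behave like a max-heap keyed on `count`. An assumed sketch with std's `BinaryHeap` and a `Vec` standing in for `AHashSet<usize>`:

```rust
use std::collections::BinaryHeap;

type Pair = (u32, u32);

#[derive(PartialEq, Eq)]
struct MergeJob {
    pair: Pair,
    count: u64,
    pos: Vec<usize>, // stand-in for AHashSet<usize>
}

impl Ord for MergeJob {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // Max-heap by count; tie-break on the pair so the order stays total
        // and deterministic. (The tie-break rule is an assumption.)
        self.count
            .cmp(&other.count)
            .then_with(|| other.pair.cmp(&self.pair))
    }
}

impl PartialOrd for MergeJob {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut heap = BinaryHeap::new();
    heap.push(MergeJob { pair: (1, 2), count: 5, pos: vec![0] });
    heap.push(MergeJob { pair: (3, 4), count: 9, pos: vec![1, 2] });
    let top = heap.pop().unwrap();
    assert_eq!(top.pair, (3, 4)); // most frequent pair comes out first
    assert_eq!(top.pos, vec![1, 2]);
}
```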
@@ -154,10 +159,9 @@ fn count_pairs_parallel(
     )
 }

-// ------------------------ END helpers ------------------------
+//

 impl Tokenizer {
-
     /// Core incremental BPE training given unique words and their counts.
     /// `words`: one entry per unique chunk (Vec<u32> of token-ids/bytes).
     /// `counts`: same length as `words`, count per chunk.
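To make the documented input shape concrete, here is a small sketch of building `words` and `counts` from raw text, with whitespace splitting standing in for the tokenizer's regex-based chunking:

```rust
use std::collections::HashMap;

fn main() {
    // Whitespace split stands in for the tokenizer's regex-based chunking.
    let text = "the cat the hat";
    let mut seen: HashMap<&str, usize> = HashMap::new();
    let mut words: Vec<Vec<u32>> = Vec::new(); // one ids-vector per unique chunk
    let mut counts: Vec<u64> = Vec::new();     // same length as `words`

    for chunk in text.split_whitespace() {
        match seen.get(chunk) {
            Some(&idx) => counts[idx] += 1,
            None => {
                seen.insert(chunk, words.len());
                words.push(chunk.bytes().map(u32::from).collect());
                counts.push(1);
            }
        }
    }

    assert_eq!(words.len(), 3);        // "the", "cat", "hat"
    assert_eq!(counts, vec![2, 1, 1]); // "the" occurs twice
}
```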
@@ -167,11 +171,11 @@ impl Tokenizer {
         log::info!("Starting BPE training: {} merges to compute", num_merges);
         self.merges.clear();

-        // ---- Initial pair_counts and where_to_update (parallel) ----
+        // Initial pair_counts and where_to_update (parallel)
         log::info!("Computing initial pair counts from {} unique sequences", words.len());
         let (mut pair_counts, mut where_to_update) = count_pairs_parallel(&words, &counts);

-        // ---- Build heap ----
+        // Build heap
         log::info!("Building heap with {} unique pairs", pair_counts.len());
         let mut heap = OctonaryHeap::with_capacity(pair_counts.len());
         for (pair, pos) in where_to_update.drain() {
@@ -185,7 +189,7 @@ impl Tokenizer {
             }
         }

-        // ---- Merge loop ----
+        // Merge loop
         log::info!("Starting merge loop");
         let mut merges_done = 0u32;
         let mut last_log_percent = 0u32;
@@ -215,6 +219,7 @@ impl Tokenizer {
             for &word_idx in &top.pos {
                 // Apply merge to this word and collect pair-count deltas
                 let changes = words[word_idx].merge_pair(top.pair, new_id);
+
                 // Update global pair counts based on this word's count
                 for (pair, delta) in changes {
                     let delta_total = delta * counts[word_idx];
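The `delta * counts[word_idx]` scaling is worth spelling out: deltas are computed once per unique chunk, so each must be multiplied by how many times that chunk occurs in the corpus. A tiny worked example:

```rust
fn main() {
    // One occurrence of a pair vanished inside a chunk seen 40 times,
    // so the global count for that pair drops by 40.
    let counts: Vec<u64> = vec![40, 3];
    let word_idx = 0;
    let delta: i64 = -1;
    let delta_total = delta * counts[word_idx] as i64;
    assert_eq!(delta_total, -40);
}
```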
@@ -310,14 +315,17 @@ impl Tokenizer {
         pyo3::Python::with_gil(|py| {
             buf.clear();
             let it = py_iter.bind(py);
+
             loop {
                 if buf.len() >= buffer_size {
                     return Ok(false);
                 }
+
                 // next(it)
                 let next_obj = unsafe {
                     pyo3::Bound::from_owned_ptr_or_opt(py, pyo3::ffi::PyIter_Next(it.as_ptr()))
                 };
+
                 match next_obj {
                     Some(obj) => {
                         let s: String = obj.extract()?;
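This hunk fills a buffer from a Python iterator, returning `Ok(false)` when the buffer reaches `buffer_size` before the source runs dry. The same buffered-pull pattern in pure Rust, with a plain `bool` in place of the `PyResult` wrapper and no FFI:

```rust
/// Fill `buf` with up to `buffer_size` items from `it`.
/// Returns true once the iterator is exhausted, false if the buffer
/// filled up first (mirroring the hunk's Ok(false) early return).
fn fill_buffer<I: Iterator<Item = String>>(
    it: &mut I,
    buf: &mut Vec<String>,
    buffer_size: usize,
) -> bool {
    buf.clear();
    loop {
        if buf.len() >= buffer_size {
            return false; // buffer full; more items may remain
        }
        match it.next() {
            Some(s) => buf.push(s),
            None => return true, // source exhausted
        }
    }
}

fn main() {
    let mut it = (0..5).map(|i| i.to_string());
    let mut buf = Vec::new();
    assert!(!fill_buffer(&mut it, &mut buf, 2)); // ["0", "1"]
    assert!(!fill_buffer(&mut it, &mut buf, 2)); // ["2", "3"]
    assert!(fill_buffer(&mut it, &mut buf, 2));  // ["4"], done
}
```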
@@ -411,6 +419,7 @@ impl Tokenizer {

         for (&pair, &merged_id) in sorted_merges {
             let (left, right) = pair;
+
             let mut merged_bytes = token_bytes[left as usize].clone();
             merged_bytes.extend(&token_bytes[right as usize]);

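Finally, this hunk rebuilds each merged token's byte string by concatenating the bytes of its two parts, which works as long as merges are visited in increasing merged-id order. A self-contained sketch of that reconstruction (a `BTreeMap` keyed by merged id is an assumption standing in for `sorted_merges`, which the diff shows keyed by pair):

```rust
use std::collections::BTreeMap;

fn main() {
    // Base vocabulary: one token per byte value.
    let mut token_bytes: Vec<Vec<u8>> = (0u8..=255).map(|b| vec![b]).collect();

    // Merges keyed by merged id; BTreeMap iteration visits ids in increasing
    // order, so both parts always exist before they are needed.
    let sorted_merges = BTreeMap::from([
        (256u32, (104u32, 105u32)), // b"h" + b"i" -> b"hi"
        (257, (256, 33)),           // b"hi" + b"!" -> b"hi!"
    ]);

    for (&merged_id, &(left, right)) in &sorted_merges {
        let mut merged_bytes = token_bytes[left as usize].clone();
        merged_bytes.extend(&token_bytes[right as usize]);
        assert_eq!(merged_id as usize, token_bytes.len());
        token_bytes.push(merged_bytes);
    }

    assert_eq!(token_bytes[256], b"hi".to_vec());
    assert_eq!(token_bytes[257], b"hi!".to_vec());
}
```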
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ommlds-rs
-Version: 0.0.0.dev481
+Version: 0.0.0.dev495
 Summary: ommlds
 Author: wrmsr
 License-Expression: BSD-3-Clause
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Python: >=3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: ommlds==0.0.0.dev481
+Requires-Dist: ommlds==0.0.0.dev495
 Dynamic: license-file

 # Overview
@@ -0,0 +1 @@
+ommlds==0.0.0.dev495
@@ -14,7 +14,7 @@ urls = {source = 'https://github.com/wrmsr/omlish'}
 license = 'BSD-3-Clause'
 readme = 'README.md'
 requires-python = '>=3.13'
-version = '0.0.0.dev481'
+version = '0.0.0.dev495'
 classifiers = [
     'Development Status :: 2 - Pre-Alpha',
     'Intended Audience :: Developers',
@@ -25,7 +25,7 @@ classifiers = [
 ]
 description = 'ommlds'
 dependencies = [
-    'ommlds == 0.0.0.dev481',
+    'ommlds == 0.0.0.dev495',
 ]

 [tool.setuptools]
@@ -1 +0,0 @@
-ommlds==0.0.0.dev481