ommlds-rs 0.0.0.dev481.tar.gz → 0.0.0.dev495.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ommlds-rs might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ommlds-rs
-Version: 0.0.0.dev481
+Version: 0.0.0.dev495
 Summary: ommlds
 Author: wrmsr
 License-Expression: BSD-3-Clause
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Python: >=3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: ommlds==0.0.0.dev481
+Requires-Dist: ommlds==0.0.0.dev495
 Dynamic: license-file

 # Overview
@@ -19,13 +19,15 @@ type Pair = (u32, u32);
 pub struct Tokenizer {
     /// Maps pairs of token IDs to their merged token ID
     pub merges: StdHashMap<Pair, u32>,
+
     /// The regex pattern used for text splitting
     pub pattern: String,
+
     /// Compiled regex for efficiency
     compiled_pattern: Regex,
 }

-// ------------------------ internal helpers ------------------------
+// internal helpers

 #[derive(Clone, Debug)]
 struct Word {
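For context on what the `merges` field is for: once training has filled it in, encoding a sequence amounts to repeatedly applying learned merges to adjacent ids. A minimal sketch of that idea, with std's `HashMap` standing in for `StdHashMap`, made-up token ids, and a lowest-merged-id-first priority rule as an assumption:

```rust
use std::collections::HashMap;

type Pair = (u32, u32);

/// Repeatedly apply the earliest-learned applicable merge until none remain.
/// (Assumed priority rule: lower merged id = learned earlier = applied first.)
fn encode(ids: &mut Vec<u32>, merges: &HashMap<Pair, u32>) {
    loop {
        // Find the adjacent pair whose merged id is lowest.
        let mut best: Option<(usize, u32)> = None;
        for i in 0..ids.len().saturating_sub(1) {
            if let Some(&new_id) = merges.get(&(ids[i], ids[i + 1])) {
                if best.map_or(true, |(_, b)| new_id < b) {
                    best = Some((i, new_id));
                }
            }
        }
        match best {
            Some((i, new_id)) => {
                ids[i] = new_id;   // write the merged token in place
                ids.remove(i + 1); // drop the second half of the pair
            }
            None => break, // no learned pair occurs in the sequence
        }
    }
}

fn main() {
    // Suppose training learned (104, 105) -> 256, i.e. b"hi" as one token.
    let merges = HashMap::from([((104, 105), 256)]);
    let mut ids = vec![104, 105, 104, 105];
    encode(&mut ids, &merges);
    assert_eq!(ids, vec![256, 256]);
}
```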
@@ -78,9 +80,11 @@ impl Word {
             // write merged token
             out.push(new_id);
             i += 2; // skip 'a' and 'b'
+
         } else {
             out.push(self.ids[i]);
             i += 1;
+
         }
     }

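The hunk above is the heart of `Word::merge_pair`. A simplified, self-contained version of the same step shows where the pair-count deltas come from: the merged pair itself disappears, and the pairs formed with the left and right neighbours change identity. (This is a sketch; the real method carries more state than the diff shows.)

```rust
type Pair = (u32, u32);

/// Replace every non-overlapping occurrence of `pair` with `new_id`, returning
/// the rewritten word plus the pair-count deltas the merge caused.
fn merge_pair(ids: &[u32], pair: Pair, new_id: u32) -> (Vec<u32>, Vec<(Pair, i64)>) {
    let mut out = Vec::with_capacity(ids.len());
    let mut deltas: Vec<(Pair, i64)> = Vec::new();
    let mut i = 0;
    while i < ids.len() {
        if i + 1 < ids.len() && (ids[i], ids[i + 1]) == pair {
            deltas.push((pair, -1)); // the merged pair itself disappears
            if let Some(&prev) = out.last() {
                deltas.push(((prev, ids[i]), -1)); // old left-neighbour pair gone
                deltas.push(((prev, new_id), 1));  // new left-neighbour pair appears
            }
            if i + 2 < ids.len() {
                deltas.push(((ids[i + 1], ids[i + 2]), -1)); // old right-neighbour pair gone
                deltas.push(((new_id, ids[i + 2]), 1));      // new right-neighbour pair appears
            }
            // write merged token
            out.push(new_id);
            i += 2; // skip 'a' and 'b'
        } else {
            out.push(ids[i]);
            i += 1;
        }
    }
    (out, deltas)
}

fn main() {
    let (out, deltas) = merge_pair(&[1, 2, 3], (1, 2), 9);
    assert_eq!(out, vec![9, 3]);
    assert_eq!(deltas, vec![((1, 2), -1), ((2, 3), -1), ((9, 3), 1)]);
}
```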
@@ -93,6 +97,7 @@ impl Word {
 struct MergeJob {
     pair: Pair,
     count: u64,
+
     /// set of word indices where this pair may occur and needs processing
     pos: AHashSet<usize>,
 }
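The diff doesn't show how `MergeJob` is ordered inside `OctonaryHeap`, but for the merge loop to pop the most frequent pair first it must behave like a max-heap keyed on `count`. An assumed sketch with std's `BinaryHeap` and a `Vec` standing in for `AHashSet<usize>`:

```rust
use std::collections::BinaryHeap;

type Pair = (u32, u32);

#[derive(PartialEq, Eq)]
struct MergeJob {
    pair: Pair,
    count: u64,
    pos: Vec<usize>, // stand-in for AHashSet<usize>
}

impl Ord for MergeJob {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // Max-heap by count; tie-break on the pair so the order stays total
        // and deterministic. (The tie-break rule is an assumption.)
        self.count
            .cmp(&other.count)
            .then_with(|| other.pair.cmp(&self.pair))
    }
}

impl PartialOrd for MergeJob {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut heap = BinaryHeap::new();
    heap.push(MergeJob { pair: (1, 2), count: 5, pos: vec![0] });
    heap.push(MergeJob { pair: (3, 4), count: 9, pos: vec![1, 2] });
    let top = heap.pop().unwrap();
    assert_eq!(top.pair, (3, 4)); // most frequent pair comes out first
    assert_eq!(top.pos, vec![1, 2]);
}
```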
@@ -154,10 +159,9 @@ fn count_pairs_parallel(
     )
 }

-// ------------------------ END helpers ------------------------
+//

 impl Tokenizer {
-
     /// Core incremental BPE training given unique words and their counts.
     /// `words`: one entry per unique chunk (Vec<u32> of token-ids/bytes).
     /// `counts`: same length as `words`, count per chunk.
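To make the documented input shape concrete, here is a small sketch of building `words` and `counts` from raw text, with whitespace splitting standing in for the tokenizer's regex-based chunking:

```rust
use std::collections::HashMap;

fn main() {
    // Whitespace split stands in for the tokenizer's regex-based chunking.
    let text = "the cat the hat";
    let mut seen: HashMap<&str, usize> = HashMap::new();
    let mut words: Vec<Vec<u32>> = Vec::new(); // one ids-vector per unique chunk
    let mut counts: Vec<u64> = Vec::new();     // same length as `words`

    for chunk in text.split_whitespace() {
        match seen.get(chunk) {
            Some(&idx) => counts[idx] += 1,
            None => {
                seen.insert(chunk, words.len());
                words.push(chunk.bytes().map(u32::from).collect());
                counts.push(1);
            }
        }
    }

    assert_eq!(words.len(), 3);        // "the", "cat", "hat"
    assert_eq!(counts, vec![2, 1, 1]); // "the" occurs twice
}
```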
@@ -167,11 +171,11 @@ impl Tokenizer {
         log::info!("Starting BPE training: {} merges to compute", num_merges);
         self.merges.clear();

-        // ---- Initial pair_counts and where_to_update (parallel) ----
+        // Initial pair_counts and where_to_update (parallel)
         log::info!("Computing initial pair counts from {} unique sequences", words.len());
         let (mut pair_counts, mut where_to_update) = count_pairs_parallel(&words, &counts);

-        // ---- Build heap ----
+        // Build heap
         log::info!("Building heap with {} unique pairs", pair_counts.len());
         let mut heap = OctonaryHeap::with_capacity(pair_counts.len());
         for (pair, pos) in where_to_update.drain() {
@@ -185,7 +189,7 @@ impl Tokenizer {
             }
         }

-        // ---- Merge loop ----
+        // Merge loop
         log::info!("Starting merge loop");
         let mut merges_done = 0u32;
         let mut last_log_percent = 0u32;
@@ -215,6 +219,7 @@ impl Tokenizer {
             for &word_idx in &top.pos {
                 // Apply merge to this word and collect pair-count deltas
                 let changes = words[word_idx].merge_pair(top.pair, new_id);
+
                 // Update global pair counts based on this word's count
                 for (pair, delta) in changes {
                     let delta_total = delta * counts[word_idx];
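The `delta * counts[word_idx]` scaling is worth spelling out: deltas are computed once per unique chunk, so each must be multiplied by how many times that chunk occurs in the corpus. A tiny worked example:

```rust
fn main() {
    // One occurrence of a pair vanished inside a chunk seen 40 times,
    // so the global count for that pair drops by 40.
    let counts: Vec<u64> = vec![40, 3];
    let word_idx = 0;
    let delta: i64 = -1;
    let delta_total = delta * counts[word_idx] as i64;
    assert_eq!(delta_total, -40);
}
```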
@@ -310,14 +315,17 @@ impl Tokenizer {
         pyo3::Python::with_gil(|py| {
             buf.clear();
             let it = py_iter.bind(py);
+
             loop {
                 if buf.len() >= buffer_size {
                     return Ok(false);
                 }
+
                 // next(it)
                 let next_obj = unsafe {
                     pyo3::Bound::from_owned_ptr_or_opt(py, pyo3::ffi::PyIter_Next(it.as_ptr()))
                 };
+
                 match next_obj {
                     Some(obj) => {
                         let s: String = obj.extract()?;
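This hunk fills a buffer from a Python iterator, returning `Ok(false)` when the buffer reaches `buffer_size` before the source runs dry. The same buffered-pull pattern in pure Rust, with a plain `bool` in place of the `PyResult` wrapper and no FFI:

```rust
/// Fill `buf` with up to `buffer_size` items from `it`.
/// Returns true once the iterator is exhausted, false if the buffer
/// filled up first (mirroring the hunk's Ok(false) early return).
fn fill_buffer<I: Iterator<Item = String>>(
    it: &mut I,
    buf: &mut Vec<String>,
    buffer_size: usize,
) -> bool {
    buf.clear();
    loop {
        if buf.len() >= buffer_size {
            return false; // buffer full; more items may remain
        }
        match it.next() {
            Some(s) => buf.push(s),
            None => return true, // source exhausted
        }
    }
}

fn main() {
    let mut it = (0..5).map(|i| i.to_string());
    let mut buf = Vec::new();
    assert!(!fill_buffer(&mut it, &mut buf, 2)); // ["0", "1"]
    assert!(!fill_buffer(&mut it, &mut buf, 2)); // ["2", "3"]
    assert!(fill_buffer(&mut it, &mut buf, 2));  // ["4"], done
}
```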
@@ -411,6 +419,7 @@ impl Tokenizer {

         for (&pair, &merged_id) in sorted_merges {
             let (left, right) = pair;
+
             let mut merged_bytes = token_bytes[left as usize].clone();
             merged_bytes.extend(&token_bytes[right as usize]);

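Finally, this hunk rebuilds each merged token's byte string by concatenating the bytes of its two parts, which works as long as merges are visited in increasing merged-id order. A self-contained sketch of that reconstruction (a `BTreeMap` keyed by merged id is an assumption standing in for `sorted_merges`, which the diff shows keyed by pair):

```rust
use std::collections::BTreeMap;

fn main() {
    // Base vocabulary: one token per byte value.
    let mut token_bytes: Vec<Vec<u8>> = (0u8..=255).map(|b| vec![b]).collect();

    // Merges keyed by merged id; BTreeMap iteration visits ids in increasing
    // order, so both parts always exist before they are needed.
    let sorted_merges = BTreeMap::from([
        (256u32, (104u32, 105u32)), // b"h" + b"i" -> b"hi"
        (257, (256, 33)),           // b"hi" + b"!" -> b"hi!"
    ]);

    for (&merged_id, &(left, right)) in &sorted_merges {
        let mut merged_bytes = token_bytes[left as usize].clone();
        merged_bytes.extend(&token_bytes[right as usize]);
        assert_eq!(merged_id as usize, token_bytes.len());
        token_bytes.push(merged_bytes);
    }

    assert_eq!(token_bytes[256], b"hi".to_vec());
    assert_eq!(token_bytes[257], b"hi!".to_vec());
}
```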
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ommlds-rs
-Version: 0.0.0.dev481
+Version: 0.0.0.dev495
 Summary: ommlds
 Author: wrmsr
 License-Expression: BSD-3-Clause
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Python: >=3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: ommlds==0.0.0.dev481
+Requires-Dist: ommlds==0.0.0.dev495
 Dynamic: license-file

 # Overview
@@ -0,0 +1 @@
+ommlds==0.0.0.dev495
@@ -14,7 +14,7 @@ urls = {source = 'https://github.com/wrmsr/omlish'}
 license = 'BSD-3-Clause'
 readme = 'README.md'
 requires-python = '>=3.13'
-version = '0.0.0.dev481'
+version = '0.0.0.dev495'
 classifiers = [
     'Development Status :: 2 - Pre-Alpha',
     'Intended Audience :: Developers',
@@ -25,7 +25,7 @@ classifiers = [
 ]
 description = 'ommlds'
 dependencies = [
-    'ommlds == 0.0.0.dev481',
+    'ommlds == 0.0.0.dev495',
 ]

 [tool.setuptools]
@@ -1 +0,0 @@
-ommlds==0.0.0.dev481