RubyGems - lancelot - Versions diffs - 0.2.0 → 0.3.2 - Mend

lancelot 0.2.0 → 0.3.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/README.md +42 -7
data/examples/idempotent_create.rb +66 -0
data/examples/optional_fields_demo.rb +83 -0
data/ext/lancelot/src/conversion.rs +30 -9
data/ext/lancelot/src/dataset.rs +25 -19
data/lib/lancelot/dataset.rb +8 -0
data/lib/lancelot/version.rb +1 -1
metadata +4 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 7d1c9e53cfc1948e847b195b8d8860c6dba8a85813714b60a7d7747966533de3
-  data.tar.gz: 7c130c04c01a8eebce16755a1da082f4db6ec1f85ac2857d84943272c695eb89
+  metadata.gz: cda5bd00de23ad9f4840b1cc7f6e96b7eb4ec124ba902d3901ef36b37594de97
+  data.tar.gz: 23e84c317e5bcd0f52870c673d0b96f9abf4f3ab97f20eb4e5b8a8047dd1cf1f
 SHA512:
-  metadata.gz: fb1a939d21c8935cec9fb9eb0ba1ddbef7ab9a8edd380149e0026db00ce67167609c99ed98597ccd692baa59f046dcd041fc76f653a5a635c152b82c343310d4
-  data.tar.gz: d35819fbfc3c2dec2b2cf65236b2bbf74eaef9f501d32f2a0a0ca7724ffc4bc1bdef56222cb50a3390d59d91e6e492a1170114b2762cebac38bcf4529bb27925
+  metadata.gz: 32cb3e852ed77b8ec2831b08f93252b5ccb176942c217a366079e1cb2a2a97c4be74247b5f8410a2892852456278a5ec8e5b556bd9ca3379d740b292cc32c15b
+  data.tar.gz: 718a99dbfcd51dce872feedb87265be86417faba152d6f2512a9391f28e5fc5e115deb0154be52dc029c2771611b71c685f1f143a4d2da5157aaafc28997ffa7

data/README.md CHANGED Viewed

@@ -2,11 +2,52 @@
 Ruby bindings for [Lance](https://github.com/lancedb/lance), a modern columnar data format for ML. Lancelot provides a Ruby-native interface to Lance, enabling efficient storage and search of multimodal data including text, vectors, and more.
+## Quickstart
+```ruby
+require 'lancelot'
+require 'red-candle'
+strings = [
+  "apple",
+  "orange",
+  "google"
+]
+model = Candle::EmbeddingModel.from_pretrained
+dataset = Lancelot::Dataset.open_or_create("words", schema: {
+  text: :string,
+  embedding: { type: "vector", dimension: 768 }
+})
+records = strings.collect do |string|
+  embedding = model.embedding(string).first.to_a
+  { text: string, embedding: embedding }
+end
+dataset.add_documents(records)
+dataset.create_vector_index("embedding")
+dataset.create_text_index("text")
+query = "fruit"
+query_embedding = model.embedding(query).first.to_a
+dataset.vector_search(query_embedding, column: "embedding", limit: 5).each { |r| puts r[:text] }; nil
+dataset.text_search("apple", column: "text", limit: 5).each { |r| puts r[:text] }; nil
+query = "tech company"
+query_embedding = model.embedding(query).first.to_a
+dataset.vector_search(query_embedding, column: "embedding", limit: 5).each { |r| puts r[:text] }; nil
+```
 ## Features
 ### Implemented
 - **Dataset Creation**: Create Lance datasets with schemas
-- **Data Storage**: Add documents to datasets
+- **Data Storage**: Add documents to datasets
 - **Document Retrieval**: Read documents from datasets with enumerable support
 - **Vector Search**: Create vector indices and perform similarity search
 - **Full-Text Search**: Built-in full-text search with inverted indices
@@ -14,12 +55,6 @@ Ruby bindings for [Lance](https://github.com/lancedb/lance), a modern columnar d
 - **Schema Support**: Define schemas with string, float32, and vector types
 - **Row Counting**: Get the number of rows in a dataset
-### Planned
-- **Multimodal Support**: Store and search across different data types beyond text and vectors
-- **Schema Evolution**: Add new columns to existing datasets without rewriting data
-- **Additional Fusion Methods**: Support for other fusion algorithms beyond RRF
 ## Installation
 Install the gem and add to the application's Gemfile by executing:

data/examples/idempotent_create.rb ADDED Viewed

@@ -0,0 +1,66 @@
+#!/usr/bin/env ruby
+# Demonstrates idempotent dataset creation with open_or_create
+require 'bundler/setup'
+require 'lancelot'
+require 'fileutils'
+dataset_path = "words"
+puts "="*60
+puts "Idempotent Dataset Creation Demo"
+puts "="*60
+schema = {
+  text: :string,
+  embedding: { type: "vector", dimension: 768 }
+}
+# First call - will CREATE the dataset
+puts "\n1. First call to open_or_create (should create)..."
+dataset = Lancelot::Dataset.open_or_create(dataset_path, schema: schema)
+puts "   Dataset opened/created. Current count: #{dataset.count}"
+# Add some data
+dataset.add_documents([
+  { text: "hello", embedding: Array.new(768) { rand } },
+  { text: "world", embedding: Array.new(768) { rand } }
+])
+puts "   Added 2 documents. New count: #{dataset.count}"
+# Second call - will OPEN the existing dataset
+puts "\n2. Second call to open_or_create (should open existing)..."
+dataset2 = Lancelot::Dataset.open_or_create(dataset_path, schema: schema)
+puts "   Dataset opened. Current count: #{dataset2.count}"
+puts "   ✓ Data persisted from previous session!"
+# Third call - still idempotent
+puts "\n3. Third call - still works..."
+dataset3 = Lancelot::Dataset.open_or_create(dataset_path, schema: schema)
+dataset3.add_documents([
+  { text: "more", embedding: Array.new(768) { rand } }
+])
+puts "   Added 1 more document. New count: #{dataset3.count}"
+# Demonstrate the OLD way that would fail
+puts "\n4. Compare with non-idempotent create (would fail)..."
+begin
+  # This will fail because dataset already exists
+  failing_dataset = Lancelot::Dataset.create(dataset_path, schema: schema)
+  puts "   ✗ This shouldn't happen!"
+rescue => e
+  puts "   ✓ Dataset.create correctly failed: #{e.class}"
+  puts "   Message: #{e.message[0..50]}..."
+end
+# Clean up
+FileUtils.rm_rf(dataset_path)
+puts "\n" + "="*60
+puts "Summary: Use open_or_create for idempotent operations!"
+puts "="*60
+puts "\nInstead of:"
+puts '  dataset = Lancelot::Dataset.create("words", schema: {...})'
+puts "\nUse:"
+puts '  dataset = Lancelot::Dataset.open_or_create("words", schema: {...})'
+puts "\nThis way your code works whether the dataset exists or not!"

data/examples/optional_fields_demo.rb ADDED Viewed

@@ -0,0 +1,83 @@
+#!/usr/bin/env ruby
+# This example demonstrates optional field support in lancelot
+# After the fix in conversion.rs, documents can have missing fields
+require 'bundler/setup'
+require 'lancelot'
+require 'fileutils'
+dataset_path = "example_optional_fields"
+FileUtils.rm_rf(dataset_path)
+puts "="*60
+puts "Lancelot Optional Fields Demo"
+puts "="*60
+# Step 1: Create dataset with initial schema
+puts "\n1. Creating dataset with 3 fields (id, text, score)..."
+schema = {
+  id: :string,
+  text: :string,
+  score: :float32
+}
+dataset = Lancelot::Dataset.create(dataset_path, schema: schema)
+# Add initial documents
+initial_docs = [
+  { id: "1", text: "First document", score: 0.9 },
+  { id: "2", text: "Second document", score: 0.8 }
+]
+dataset.add_documents(initial_docs)
+puts "   Added #{dataset.count} documents"
+# Step 2: Simulate schema evolution (adding a new field)
+puts "\n2. Simulating schema evolution (adding 'category' field)..."
+# Get existing data
+all_docs = dataset.to_a
+# Recreate with expanded schema
+FileUtils.rm_rf(dataset_path)
+expanded_schema = {
+  id: :string,
+  text: :string,
+  score: :float32,
+  category: :string  # NEW FIELD
+}
+dataset = Lancelot::Dataset.create(dataset_path, schema: expanded_schema)
+# Re-add existing docs with the new field
+docs_with_category = all_docs.map { |doc| doc.merge(category: "original") }
+dataset.add_documents(docs_with_category)
+puts "   Recreated dataset with expanded schema"
+# Step 3: Add new documents WITHOUT the new field
+puts "\n3. Adding new documents WITHOUT the 'category' field..."
+new_docs = [
+  { id: "3", text: "Third document", score: 0.7 },  # No category!
+  { id: "4", text: "Fourth document", score: 0.6 }  # No category!
+]
+begin
+  dataset.add_documents(new_docs)
+  puts "   ✅ SUCCESS! Added #{new_docs.size} documents with missing fields"
+rescue => e
+  puts "   ❌ FAILED: #{e.message}"
+  puts "   (This would have failed before the fix in conversion.rs)"
+end
+# Step 4: Verify the data
+puts "\n4. Verifying all documents..."
+dataset.to_a.each do |doc|
+  category = doc[:category] || "nil"
+  puts "   Doc #{doc[:id]}: category=#{category}"
+end
+puts "\nTotal documents: #{dataset.count}"
+# Cleanup
+FileUtils.rm_rf(dataset_path)
+puts "\n" + "="*60
+puts "Demo complete! Optional fields work correctly."
+puts "="*60

data/ext/lancelot/src/conversion.rs CHANGED Viewed

@@ -39,11 +39,13 @@ pub fn build_record_batch(
         let item = RHash::try_convert(item)?;
         for field in schema.fields() {
             let key = Symbol::new(field.name());
-            let value: Value = item.fetch(key)
-                .or_else(|_| {
+            // Make fields optional - use get instead of fetch
+            let value: Value = item.get(key)
+                .or_else(|| {
                     // Try with string key
-                    item.fetch(field.name().as_str())
-                })?;
+                    item.get(field.name().as_str())
+                })
+                .unwrap_or_else(|| Ruby::get().unwrap().qnil().as_value());
             match field.data_type() {
                 DataType::Utf8 => {
@@ -157,9 +159,9 @@ pub fn build_record_batch(
         .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))
 }
-pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
+pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<RArray, Error> {
     let ruby = Ruby::get().unwrap();
-    let mut documents = Vec::new();
+    let documents = ruby.ary_new();
     let num_rows = batch.num_rows();
     let schema = batch.schema();
@@ -171,6 +173,15 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
             let column = batch.column(col_idx);
             let key = Symbol::new(field.name());
+            // CRITICAL: Add bounds checking for all array access
+            if row_idx >= column.len() {
+                return Err(Error::new(
+                    magnus::exception::runtime_error(),
+                    format!("Row index {} out of bounds for column '{}' with length {}",
+                            row_idx, field.name(), column.len())
+                ));
+            }
             match field.data_type() {
                 DataType::Utf8 => {
                     let array = column.as_any().downcast_ref::<StringArray>()
@@ -223,9 +234,19 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
                         let float_array = values.as_any().downcast_ref::<Float32Array>()
                             .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "Failed to cast vector values to Float32Array"))?;
+                        // CRITICAL: Verify the float_array has the expected size
+                        let expected_size = *list_size as usize;
+                        if float_array.len() != expected_size {
+                            return Err(Error::new(
+                                magnus::exception::runtime_error(),
+                                format!("Vector data corruption: expected {} elements but found {} for field '{}'",
+                                        expected_size, float_array.len(), field.name())
+                            ));
+                        }
                         let ruby_array = ruby.ary_new();
-                        for i in 0..*list_size {
-                            ruby_array.push(float_array.value(i as usize))?;
+                        for i in 0..expected_size {
+                            ruby_array.push(float_array.value(i))?;
                         }
                         doc.aset(key, ruby_array)?;
                     }
@@ -236,7 +257,7 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
             }
         }
-        documents.push(doc);
+        documents.push(doc)?;
     }
     Ok(documents)

data/ext/lancelot/src/dataset.rs CHANGED Viewed

@@ -1,4 +1,4 @@
-use magnus::{Error, Ruby, RHash, RArray, Symbol, TryConvert, function, method, RClass, Module, Object};
+use magnus::{Error, Ruby, RHash, RArray, Symbol, TryConvert, Value, function, method, RClass, Module, Object};
 use std::cell::RefCell;
 use std::sync::Arc;
 use tokio::runtime::Runtime;
@@ -157,9 +157,10 @@ impl LancelotDataset {
         let result_array = ruby.ary_new();
         for batch in batches {
-            let documents = convert_batch_to_ruby(&batch)?;
-            for doc in documents {
-                result_array.push(doc)?;
+            let batch_docs = convert_batch_to_ruby(&batch)?;
+            // Merge arrays by pushing each element
+            for i in 0..batch_docs.len() {
+                result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
             }
         }
@@ -191,9 +192,10 @@ impl LancelotDataset {
         let result_array = ruby.ary_new();
         for batch in batches {
-            let documents = convert_batch_to_ruby(&batch)?;
-            for doc in documents {
-                result_array.push(doc)?;
+            let batch_docs = convert_batch_to_ruby(&batch)?;
+            // Merge arrays by pushing each element
+            for i in 0..batch_docs.len() {
+                result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
             }
         }
@@ -265,9 +267,10 @@ impl LancelotDataset {
         let result_array = ruby.ary_new();
         for batch in batches {
-            let documents = convert_batch_to_ruby(&batch)?;
-            for doc in documents {
-                result_array.push(doc)?;
+            let batch_docs = convert_batch_to_ruby(&batch)?;
+            // Merge arrays by pushing each element
+            for i in 0..batch_docs.len() {
+                result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
             }
         }
@@ -330,9 +333,10 @@ impl LancelotDataset {
         let result_array = ruby.ary_new();
         for batch in batches {
-            let documents = convert_batch_to_ruby(&batch)?;
-            for doc in documents {
-                result_array.push(doc)?;
+            let batch_docs = convert_batch_to_ruby(&batch)?;
+            // Merge arrays by pushing each element
+            for i in 0..batch_docs.len() {
+                result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
             }
         }
@@ -380,9 +384,10 @@ impl LancelotDataset {
         let result_array = ruby.ary_new();
         for batch in batches {
-            let documents = convert_batch_to_ruby(&batch)?;
-            for doc in documents {
-                result_array.push(doc)?;
+            let batch_docs = convert_batch_to_ruby(&batch)?;
+            // Merge arrays by pushing each element
+            for i in 0..batch_docs.len() {
+                result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
             }
         }
@@ -422,9 +427,10 @@ impl LancelotDataset {
         let result_array = ruby.ary_new();
         for batch in batches {
-            let documents = convert_batch_to_ruby(&batch)?;
-            for doc in documents {
-                result_array.push(doc)?;
+            let batch_docs = convert_batch_to_ruby(&batch)?;
+            // Merge arrays by pushing each element
+            for i in 0..batch_docs.len() {
+                result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
             }
         }

data/lib/lancelot/dataset.rb CHANGED Viewed

@@ -15,6 +15,14 @@ module Lancelot
         dataset
       end
+      def open_or_create(path, schema:)
+        if File.exist?(path)
+          open(path)
+        else
+          create(path, schema: schema)
+        end
+      end
       private
       def normalize_schema(schema)

data/lib/lancelot/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Lancelot
-  VERSION = "0.2.0"
+  VERSION = "0.3.2"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: lancelot
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.2
 platform: ruby
 authors:
 - Chris Petersen
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-07-27 00:00:00.000000000 Z
+date: 2025-08-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -113,6 +113,8 @@ files:
 - Rakefile
 - examples/basic_usage.rb
 - examples/full_text_search.rb
+- examples/idempotent_create.rb
+- examples/optional_fields_demo.rb
 - examples/red_candle_integration.rb
 - examples/vector_search.rb
 - ext/lancelot/.gitignore