lancelot 0.2.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 7d1c9e53cfc1948e847b195b8d8860c6dba8a85813714b60a7d7747966533de3
- data.tar.gz: 7c130c04c01a8eebce16755a1da082f4db6ec1f85ac2857d84943272c695eb89
+ metadata.gz: cda5bd00de23ad9f4840b1cc7f6e96b7eb4ec124ba902d3901ef36b37594de97
+ data.tar.gz: 23e84c317e5bcd0f52870c673d0b96f9abf4f3ab97f20eb4e5b8a8047dd1cf1f
  SHA512:
- metadata.gz: fb1a939d21c8935cec9fb9eb0ba1ddbef7ab9a8edd380149e0026db00ce67167609c99ed98597ccd692baa59f046dcd041fc76f653a5a635c152b82c343310d4
- data.tar.gz: d35819fbfc3c2dec2b2cf65236b2bbf74eaef9f501d32f2a0a0ca7724ffc4bc1bdef56222cb50a3390d59d91e6e492a1170114b2762cebac38bcf4529bb27925
+ metadata.gz: 32cb3e852ed77b8ec2831b08f93252b5ccb176942c217a366079e1cb2a2a97c4be74247b5f8410a2892852456278a5ec8e5b556bd9ca3379d740b292cc32c15b
+ data.tar.gz: 718a99dbfcd51dce872feedb87265be86417faba152d6f2512a9391f28e5fc5e115deb0154be52dc029c2771611b71c685f1f143a4d2da5157aaafc28997ffa7
data/README.md CHANGED
@@ -2,11 +2,52 @@
 
  Ruby bindings for [Lance](https://github.com/lancedb/lance), a modern columnar data format for ML. Lancelot provides a Ruby-native interface to Lance, enabling efficient storage and search of multimodal data including text, vectors, and more.
 
+ ## Quickstart
+
+ ```ruby
+ require 'lancelot'
+ require 'red-candle'
+
+ strings = [
+ "apple",
+ "orange",
+ "google"
+ ]
+
+ model = Candle::EmbeddingModel.from_pretrained
+
+ dataset = Lancelot::Dataset.open_or_create("words", schema: {
+ text: :string,
+ embedding: { type: "vector", dimension: 768 }
+ })
+
+ records = strings.collect do |string|
+ embedding = model.embedding(string).first.to_a
+ { text: string, embedding: embedding }
+ end
+
+ dataset.add_documents(records)
+
+ dataset.create_vector_index("embedding")
+ dataset.create_text_index("text")
+
+
+ query = "fruit"
+ query_embedding = model.embedding(query).first.to_a
+ dataset.vector_search(query_embedding, column: "embedding", limit: 5).each { |r| puts r[:text] }; nil
+
+ dataset.text_search("apple", column: "text", limit: 5).each { |r| puts r[:text] }; nil
+
+ query = "tech company"
+ query_embedding = model.embedding(query).first.to_a
+ dataset.vector_search(query_embedding, column: "embedding", limit: 5).each { |r| puts r[:text] }; nil
+ ```
+
  ## Features
 
  ### Implemented
  - **Dataset Creation**: Create Lance datasets with schemas
- - **Data Storage**: Add documents to datasets
+ - **Data Storage**: Add documents to datasets
  - **Document Retrieval**: Read documents from datasets with enumerable support
  - **Vector Search**: Create vector indices and perform similarity search
  - **Full-Text Search**: Built-in full-text search with inverted indices
@@ -14,12 +55,6 @@ Ruby bindings for [Lance](https://github.com/lancedb/lance), a modern columnar d
  - **Schema Support**: Define schemas with string, float32, and vector types
  - **Row Counting**: Get the number of rows in a dataset
 
- ### Planned
-
- - **Multimodal Support**: Store and search across different data types beyond text and vectors
- - **Schema Evolution**: Add new columns to existing datasets without rewriting data
- - **Additional Fusion Methods**: Support for other fusion algorithms beyond RRF
-
  ## Installation
 
  Install the gem and add to the application's Gemfile by executing:
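Taken together, the README and example changes in this release revolve around two behaviors: idempotent dataset creation via `Dataset.open_or_create` and tolerance for documents that omit schema fields. A minimal editorial sketch combining both, assuming only the API shown in this diff (the "notes" dataset path and field names are illustrative):

```ruby
require 'lancelot'

# Idempotent: opens "notes" if the path exists, otherwise creates it with this schema.
dataset = Lancelot::Dataset.open_or_create("notes", schema: {
  id: :string,
  text: :string,
  category: :string
})

# Fields may now be omitted per document; the missing value comes back as nil.
dataset.add_documents([
  { id: "1", text: "first note", category: "general" },
  { id: "2", text: "second note" } # no :category
])

dataset.to_a.each { |doc| puts "#{doc[:id]}: category=#{doc[:category].inspect}" }
```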
examples/idempotent_create.rb ADDED
@@ -0,0 +1,66 @@
+ #!/usr/bin/env ruby
+ # Demonstrates idempotent dataset creation with open_or_create
+
+ require 'bundler/setup'
+ require 'lancelot'
+ require 'fileutils'
+
+ dataset_path = "words"
+
+ puts "="*60
+ puts "Idempotent Dataset Creation Demo"
+ puts "="*60
+
+ schema = {
+ text: :string,
+ embedding: { type: "vector", dimension: 768 }
+ }
+
+ # First call - will CREATE the dataset
+ puts "\n1. First call to open_or_create (should create)..."
+ dataset = Lancelot::Dataset.open_or_create(dataset_path, schema: schema)
+ puts " Dataset opened/created. Current count: #{dataset.count}"
+
+ # Add some data
+ dataset.add_documents([
+ { text: "hello", embedding: Array.new(768) { rand } },
+ { text: "world", embedding: Array.new(768) { rand } }
+ ])
+ puts " Added 2 documents. New count: #{dataset.count}"
+
+ # Second call - will OPEN the existing dataset
+ puts "\n2. Second call to open_or_create (should open existing)..."
+ dataset2 = Lancelot::Dataset.open_or_create(dataset_path, schema: schema)
+ puts " Dataset opened. Current count: #{dataset2.count}"
+ puts " ✓ Data persisted from previous session!"
+
+ # Third call - still idempotent
+ puts "\n3. Third call - still works..."
+ dataset3 = Lancelot::Dataset.open_or_create(dataset_path, schema: schema)
+ dataset3.add_documents([
+ { text: "more", embedding: Array.new(768) { rand } }
+ ])
+ puts " Added 1 more document. New count: #{dataset3.count}"
+
+ # Demonstrate the OLD way that would fail
+ puts "\n4. Compare with non-idempotent create (would fail)..."
+ begin
+ # This will fail because dataset already exists
+ failing_dataset = Lancelot::Dataset.create(dataset_path, schema: schema)
+ puts " ✗ This shouldn't happen!"
+ rescue => e
+ puts " ✓ Dataset.create correctly failed: #{e.class}"
+ puts " Message: #{e.message[0..50]}..."
+ end
+
+ # Clean up
+ FileUtils.rm_rf(dataset_path)
+
+ puts "\n" + "="*60
+ puts "Summary: Use open_or_create for idempotent operations!"
+ puts "="*60
+ puts "\nInstead of:"
+ puts ' dataset = Lancelot::Dataset.create("words", schema: {...})'
+ puts "\nUse:"
+ puts ' dataset = Lancelot::Dataset.open_or_create("words", schema: {...})'
+ puts "\nThis way your code works whether the dataset exists or not!"
examples/optional_fields_demo.rb ADDED
@@ -0,0 +1,83 @@
+ #!/usr/bin/env ruby
+ # This example demonstrates optional field support in lancelot
+ # After the fix in conversion.rs, documents can have missing fields
+
+ require 'bundler/setup'
+ require 'lancelot'
+ require 'fileutils'
+
+ dataset_path = "example_optional_fields"
+ FileUtils.rm_rf(dataset_path)
+
+ puts "="*60
+ puts "Lancelot Optional Fields Demo"
+ puts "="*60
+
+ # Step 1: Create dataset with initial schema
+ puts "\n1. Creating dataset with 3 fields (id, text, score)..."
+ schema = {
+ id: :string,
+ text: :string,
+ score: :float32
+ }
+ dataset = Lancelot::Dataset.create(dataset_path, schema: schema)
+
+ # Add initial documents
+ initial_docs = [
+ { id: "1", text: "First document", score: 0.9 },
+ { id: "2", text: "Second document", score: 0.8 }
+ ]
+ dataset.add_documents(initial_docs)
+ puts " Added #{dataset.count} documents"
+
+ # Step 2: Simulate schema evolution (adding a new field)
+ puts "\n2. Simulating schema evolution (adding 'category' field)..."
+
+ # Get existing data
+ all_docs = dataset.to_a
+
+ # Recreate with expanded schema
+ FileUtils.rm_rf(dataset_path)
+ expanded_schema = {
+ id: :string,
+ text: :string,
+ score: :float32,
+ category: :string # NEW FIELD
+ }
+ dataset = Lancelot::Dataset.create(dataset_path, schema: expanded_schema)
+
+ # Re-add existing docs with the new field
+ docs_with_category = all_docs.map { |doc| doc.merge(category: "original") }
+ dataset.add_documents(docs_with_category)
+ puts " Recreated dataset with expanded schema"
+
+ # Step 3: Add new documents WITHOUT the new field
+ puts "\n3. Adding new documents WITHOUT the 'category' field..."
+ new_docs = [
+ { id: "3", text: "Third document", score: 0.7 }, # No category!
+ { id: "4", text: "Fourth document", score: 0.6 } # No category!
+ ]
+
+ begin
+ dataset.add_documents(new_docs)
+ puts " ✅ SUCCESS! Added #{new_docs.size} documents with missing fields"
+ rescue => e
+ puts " ❌ FAILED: #{e.message}"
+ puts " (This would have failed before the fix in conversion.rs)"
+ end
+
+ # Step 4: Verify the data
+ puts "\n4. Verifying all documents..."
+ dataset.to_a.each do |doc|
+ category = doc[:category] || "nil"
+ puts " Doc #{doc[:id]}: category=#{category}"
+ end
+
+ puts "\nTotal documents: #{dataset.count}"
+
+ # Cleanup
+ FileUtils.rm_rf(dataset_path)
+
+ puts "\n" + "="*60
+ puts "Demo complete! Optional fields work correctly."
+ puts "="*60
@@ -39,11 +39,13 @@ pub fn build_record_batch(
  let item = RHash::try_convert(item)?;
  for field in schema.fields() {
  let key = Symbol::new(field.name());
- let value: Value = item.fetch(key)
- .or_else(|_| {
+ // Make fields optional - use get instead of fetch
+ let value: Value = item.get(key)
+ .or_else(|| {
  // Try with string key
- item.fetch(field.name().as_str())
- })?;
+ item.get(field.name().as_str())
+ })
+ .unwrap_or_else(|| Ruby::get().unwrap().qnil().as_value());
 
  match field.data_type() {
  DataType::Utf8 => {
@@ -157,9 +159,9 @@ pub fn build_record_batch(
  .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))
  }
 
- pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
+ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<RArray, Error> {
  let ruby = Ruby::get().unwrap();
- let mut documents = Vec::new();
+ let documents = ruby.ary_new();
 
  let num_rows = batch.num_rows();
  let schema = batch.schema();
@@ -171,6 +173,15 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
  let column = batch.column(col_idx);
  let key = Symbol::new(field.name());
 
+ // CRITICAL: Add bounds checking for all array access
+ if row_idx >= column.len() {
+ return Err(Error::new(
+ magnus::exception::runtime_error(),
+ format!("Row index {} out of bounds for column '{}' with length {}",
+ row_idx, field.name(), column.len())
+ ));
+ }
+
  match field.data_type() {
  DataType::Utf8 => {
  let array = column.as_any().downcast_ref::<StringArray>()
@@ -223,9 +234,19 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
  let float_array = values.as_any().downcast_ref::<Float32Array>()
  .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "Failed to cast vector values to Float32Array"))?;
 
+ // CRITICAL: Verify the float_array has the expected size
+ let expected_size = *list_size as usize;
+ if float_array.len() != expected_size {
+ return Err(Error::new(
+ magnus::exception::runtime_error(),
+ format!("Vector data corruption: expected {} elements but found {} for field '{}'",
+ expected_size, float_array.len(), field.name())
+ ));
+ }
+
  let ruby_array = ruby.ary_new();
- for i in 0..*list_size {
- ruby_array.push(float_array.value(i as usize))?;
+ for i in 0..expected_size {
+ ruby_array.push(float_array.value(i))?;
  }
  doc.aset(key, ruby_array)?;
  }
@@ -236,7 +257,7 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
  }
  }
 
- documents.push(doc);
+ documents.push(doc)?;
  }
 
  Ok(documents)
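For Ruby callers, the net effect of `convert_batch_to_ruby` now building an `RArray` of hashes (symbol keys, with fixed-size vector columns expanded into plain arrays of floats) is the same result shape as before, just with stricter internal checks. A small editorial sketch of what a returned document looks like, assuming the "words" dataset from the Quickstart above:

```ruby
doc = dataset.to_a.first
doc.class              # => Hash (one symbol key per schema field)
doc[:text]             # => "apple"
doc[:embedding].class  # => Array
doc[:embedding].size   # => 768, the declared vector dimension
```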
@@ -1,4 +1,4 @@
- use magnus::{Error, Ruby, RHash, RArray, Symbol, TryConvert, function, method, RClass, Module, Object};
+ use magnus::{Error, Ruby, RHash, RArray, Symbol, TryConvert, Value, function, method, RClass, Module, Object};
  use std::cell::RefCell;
  use std::sync::Arc;
  use tokio::runtime::Runtime;
@@ -157,9 +157,10 @@ impl LancelotDataset {
  let result_array = ruby.ary_new();
 
  for batch in batches {
- let documents = convert_batch_to_ruby(&batch)?;
- for doc in documents {
- result_array.push(doc)?;
+ let batch_docs = convert_batch_to_ruby(&batch)?;
+ // Merge arrays by pushing each element
+ for i in 0..batch_docs.len() {
+ result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
  }
  }
 
@@ -191,9 +192,10 @@ impl LancelotDataset {
  let result_array = ruby.ary_new();
 
  for batch in batches {
- let documents = convert_batch_to_ruby(&batch)?;
- for doc in documents {
- result_array.push(doc)?;
+ let batch_docs = convert_batch_to_ruby(&batch)?;
+ // Merge arrays by pushing each element
+ for i in 0..batch_docs.len() {
+ result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
  }
  }
 
@@ -265,9 +267,10 @@ impl LancelotDataset {
  let result_array = ruby.ary_new();
 
  for batch in batches {
- let documents = convert_batch_to_ruby(&batch)?;
- for doc in documents {
- result_array.push(doc)?;
+ let batch_docs = convert_batch_to_ruby(&batch)?;
+ // Merge arrays by pushing each element
+ for i in 0..batch_docs.len() {
+ result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
  }
  }
 
@@ -330,9 +333,10 @@ impl LancelotDataset {
  let result_array = ruby.ary_new();
 
  for batch in batches {
- let documents = convert_batch_to_ruby(&batch)?;
- for doc in documents {
- result_array.push(doc)?;
+ let batch_docs = convert_batch_to_ruby(&batch)?;
+ // Merge arrays by pushing each element
+ for i in 0..batch_docs.len() {
+ result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
  }
  }
 
@@ -380,9 +384,10 @@ impl LancelotDataset {
  let result_array = ruby.ary_new();
 
  for batch in batches {
- let documents = convert_batch_to_ruby(&batch)?;
- for doc in documents {
- result_array.push(doc)?;
+ let batch_docs = convert_batch_to_ruby(&batch)?;
+ // Merge arrays by pushing each element
+ for i in 0..batch_docs.len() {
+ result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
  }
  }
 
@@ -422,9 +427,10 @@ impl LancelotDataset {
  let result_array = ruby.ary_new();
 
  for batch in batches {
- let documents = convert_batch_to_ruby(&batch)?;
- for doc in documents {
- result_array.push(doc)?;
+ let batch_docs = convert_batch_to_ruby(&batch)?;
+ // Merge arrays by pushing each element
+ for i in 0..batch_docs.len() {
+ result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
  }
  }
 
@@ -15,6 +15,14 @@ module Lancelot
  dataset
  end
 
+ def open_or_create(path, schema:)
+ if File.exist?(path)
+ open(path)
+ else
+ create(path, schema: schema)
+ end
+ end
+
  private
 
  def normalize_schema(schema)
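As the method above shows, when the path already exists `open_or_create` simply delegates to `open` and the `schema:` argument is not validated against the stored schema, so callers should pass the same schema on every invocation. A minimal editorial sketch of the intended call pattern (the "words" path is illustrative):

```ruby
SCHEMA = { text: :string, embedding: { type: "vector", dimension: 768 } }.freeze

# Safe to call on every boot: creates the dataset on first run, reopens it afterwards.
# Note: if "words" already exists, SCHEMA is ignored rather than checked for a match.
dataset = Lancelot::Dataset.open_or_create("words", schema: SCHEMA)
```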
@@ -1,5 +1,5 @@
  # frozen_string_literal: true
 
  module Lancelot
- VERSION = "0.2.0"
+ VERSION = "0.3.2"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: lancelot
  version: !ruby/object:Gem::Version
- version: 0.2.0
+ version: 0.3.2
  platform: ruby
  authors:
  - Chris Petersen
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2025-07-27 00:00:00.000000000 Z
+ date: 2025-08-21 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rb_sys
@@ -113,6 +113,8 @@ files:
  - Rakefile
  - examples/basic_usage.rb
  - examples/full_text_search.rb
+ - examples/idempotent_create.rb
+ - examples/optional_fields_demo.rb
  - examples/red_candle_integration.rb
  - examples/vector_search.rb
  - ext/lancelot/.gitignore