lancelot 0.2.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +42 -7
- data/examples/idempotent_create.rb +66 -0
- data/examples/optional_fields_demo.rb +83 -0
- data/ext/lancelot/src/conversion.rs +30 -9
- data/ext/lancelot/src/dataset.rs +25 -19
- data/lib/lancelot/dataset.rb +8 -0
- data/lib/lancelot/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cda5bd00de23ad9f4840b1cc7f6e96b7eb4ec124ba902d3901ef36b37594de97
|
4
|
+
data.tar.gz: 23e84c317e5bcd0f52870c673d0b96f9abf4f3ab97f20eb4e5b8a8047dd1cf1f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 32cb3e852ed77b8ec2831b08f93252b5ccb176942c217a366079e1cb2a2a97c4be74247b5f8410a2892852456278a5ec8e5b556bd9ca3379d740b292cc32c15b
|
7
|
+
data.tar.gz: 718a99dbfcd51dce872feedb87265be86417faba152d6f2512a9391f28e5fc5e115deb0154be52dc029c2771611b71c685f1f143a4d2da5157aaafc28997ffa7
|
data/README.md
CHANGED
@@ -2,11 +2,52 @@
|
|
2
2
|
|
3
3
|
Ruby bindings for [Lance](https://github.com/lancedb/lance), a modern columnar data format for ML. Lancelot provides a Ruby-native interface to Lance, enabling efficient storage and search of multimodal data including text, vectors, and more.
|
4
4
|
|
5
|
+
## Quickstart
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
require 'lancelot'
|
9
|
+
require 'red-candle'
|
10
|
+
|
11
|
+
strings = [
|
12
|
+
"apple",
|
13
|
+
"orange",
|
14
|
+
"google"
|
15
|
+
]
|
16
|
+
|
17
|
+
model = Candle::EmbeddingModel.from_pretrained
|
18
|
+
|
19
|
+
dataset = Lancelot::Dataset.open_or_create("words", schema: {
|
20
|
+
text: :string,
|
21
|
+
embedding: { type: "vector", dimension: 768 }
|
22
|
+
})
|
23
|
+
|
24
|
+
records = strings.collect do |string|
|
25
|
+
embedding = model.embedding(string).first.to_a
|
26
|
+
{ text: string, embedding: embedding }
|
27
|
+
end
|
28
|
+
|
29
|
+
dataset.add_documents(records)
|
30
|
+
|
31
|
+
dataset.create_vector_index("embedding")
|
32
|
+
dataset.create_text_index("text")
|
33
|
+
|
34
|
+
|
35
|
+
query = "fruit"
|
36
|
+
query_embedding = model.embedding(query).first.to_a
|
37
|
+
dataset.vector_search(query_embedding, column: "embedding", limit: 5).each { |r| puts r[:text] }; nil
|
38
|
+
|
39
|
+
dataset.text_search("apple", column: "text", limit: 5).each { |r| puts r[:text] }; nil
|
40
|
+
|
41
|
+
query = "tech company"
|
42
|
+
query_embedding = model.embedding(query).first.to_a
|
43
|
+
dataset.vector_search(query_embedding, column: "embedding", limit: 5).each { |r| puts r[:text] }; nil
|
44
|
+
```
|
45
|
+
|
5
46
|
## Features
|
6
47
|
|
7
48
|
### Implemented
|
8
49
|
- **Dataset Creation**: Create Lance datasets with schemas
|
9
|
-
- **Data Storage**: Add documents to datasets
|
50
|
+
- **Data Storage**: Add documents to datasets
|
10
51
|
- **Document Retrieval**: Read documents from datasets with enumerable support
|
11
52
|
- **Vector Search**: Create vector indices and perform similarity search
|
12
53
|
- **Full-Text Search**: Built-in full-text search with inverted indices
|
@@ -14,12 +55,6 @@ Ruby bindings for [Lance](https://github.com/lancedb/lance), a modern columnar d
|
|
14
55
|
- **Schema Support**: Define schemas with string, float32, and vector types
|
15
56
|
- **Row Counting**: Get the number of rows in a dataset
|
16
57
|
|
17
|
-
### Planned
|
18
|
-
|
19
|
-
- **Multimodal Support**: Store and search across different data types beyond text and vectors
|
20
|
-
- **Schema Evolution**: Add new columns to existing datasets without rewriting data
|
21
|
-
- **Additional Fusion Methods**: Support for other fusion algorithms beyond RRF
|
22
|
-
|
23
58
|
## Installation
|
24
59
|
|
25
60
|
Install the gem and add to the application's Gemfile by executing:
|
@@ -0,0 +1,66 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Demonstrates idempotent dataset creation with open_or_create
|
3
|
+
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'lancelot'
|
6
|
+
require 'fileutils'
|
7
|
+
|
8
|
+
dataset_path = "words"
|
9
|
+
|
10
|
+
puts "="*60
|
11
|
+
puts "Idempotent Dataset Creation Demo"
|
12
|
+
puts "="*60
|
13
|
+
|
14
|
+
schema = {
|
15
|
+
text: :string,
|
16
|
+
embedding: { type: "vector", dimension: 768 }
|
17
|
+
}
|
18
|
+
|
19
|
+
# First call - will CREATE the dataset
|
20
|
+
puts "\n1. First call to open_or_create (should create)..."
|
21
|
+
dataset = Lancelot::Dataset.open_or_create(dataset_path, schema: schema)
|
22
|
+
puts " Dataset opened/created. Current count: #{dataset.count}"
|
23
|
+
|
24
|
+
# Add some data
|
25
|
+
dataset.add_documents([
|
26
|
+
{ text: "hello", embedding: Array.new(768) { rand } },
|
27
|
+
{ text: "world", embedding: Array.new(768) { rand } }
|
28
|
+
])
|
29
|
+
puts " Added 2 documents. New count: #{dataset.count}"
|
30
|
+
|
31
|
+
# Second call - will OPEN the existing dataset
|
32
|
+
puts "\n2. Second call to open_or_create (should open existing)..."
|
33
|
+
dataset2 = Lancelot::Dataset.open_or_create(dataset_path, schema: schema)
|
34
|
+
puts " Dataset opened. Current count: #{dataset2.count}"
|
35
|
+
puts " ✓ Data persisted from previous session!"
|
36
|
+
|
37
|
+
# Third call - still idempotent
|
38
|
+
puts "\n3. Third call - still works..."
|
39
|
+
dataset3 = Lancelot::Dataset.open_or_create(dataset_path, schema: schema)
|
40
|
+
dataset3.add_documents([
|
41
|
+
{ text: "more", embedding: Array.new(768) { rand } }
|
42
|
+
])
|
43
|
+
puts " Added 1 more document. New count: #{dataset3.count}"
|
44
|
+
|
45
|
+
# Demonstrate the OLD way that would fail
|
46
|
+
puts "\n4. Compare with non-idempotent create (would fail)..."
|
47
|
+
begin
|
48
|
+
# This will fail because dataset already exists
|
49
|
+
failing_dataset = Lancelot::Dataset.create(dataset_path, schema: schema)
|
50
|
+
puts " ✗ This shouldn't happen!"
|
51
|
+
rescue => e
|
52
|
+
puts " ✓ Dataset.create correctly failed: #{e.class}"
|
53
|
+
puts " Message: #{e.message[0..50]}..."
|
54
|
+
end
|
55
|
+
|
56
|
+
# Clean up
|
57
|
+
FileUtils.rm_rf(dataset_path)
|
58
|
+
|
59
|
+
puts "\n" + "="*60
|
60
|
+
puts "Summary: Use open_or_create for idempotent operations!"
|
61
|
+
puts "="*60
|
62
|
+
puts "\nInstead of:"
|
63
|
+
puts ' dataset = Lancelot::Dataset.create("words", schema: {...})'
|
64
|
+
puts "\nUse:"
|
65
|
+
puts ' dataset = Lancelot::Dataset.open_or_create("words", schema: {...})'
|
66
|
+
puts "\nThis way your code works whether the dataset exists or not!"
|
@@ -0,0 +1,83 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# This example demonstrates optional field support in lancelot
|
3
|
+
# After the fix in conversion.rs, documents can have missing fields
|
4
|
+
|
5
|
+
require 'bundler/setup'
|
6
|
+
require 'lancelot'
|
7
|
+
require 'fileutils'
|
8
|
+
|
9
|
+
dataset_path = "example_optional_fields"
|
10
|
+
FileUtils.rm_rf(dataset_path)
|
11
|
+
|
12
|
+
puts "="*60
|
13
|
+
puts "Lancelot Optional Fields Demo"
|
14
|
+
puts "="*60
|
15
|
+
|
16
|
+
# Step 1: Create dataset with initial schema
|
17
|
+
puts "\n1. Creating dataset with 3 fields (id, text, score)..."
|
18
|
+
schema = {
|
19
|
+
id: :string,
|
20
|
+
text: :string,
|
21
|
+
score: :float32
|
22
|
+
}
|
23
|
+
dataset = Lancelot::Dataset.create(dataset_path, schema: schema)
|
24
|
+
|
25
|
+
# Add initial documents
|
26
|
+
initial_docs = [
|
27
|
+
{ id: "1", text: "First document", score: 0.9 },
|
28
|
+
{ id: "2", text: "Second document", score: 0.8 }
|
29
|
+
]
|
30
|
+
dataset.add_documents(initial_docs)
|
31
|
+
puts " Added #{dataset.count} documents"
|
32
|
+
|
33
|
+
# Step 2: Simulate schema evolution (adding a new field)
|
34
|
+
puts "\n2. Simulating schema evolution (adding 'category' field)..."
|
35
|
+
|
36
|
+
# Get existing data
|
37
|
+
all_docs = dataset.to_a
|
38
|
+
|
39
|
+
# Recreate with expanded schema
|
40
|
+
FileUtils.rm_rf(dataset_path)
|
41
|
+
expanded_schema = {
|
42
|
+
id: :string,
|
43
|
+
text: :string,
|
44
|
+
score: :float32,
|
45
|
+
category: :string # NEW FIELD
|
46
|
+
}
|
47
|
+
dataset = Lancelot::Dataset.create(dataset_path, schema: expanded_schema)
|
48
|
+
|
49
|
+
# Re-add existing docs with the new field
|
50
|
+
docs_with_category = all_docs.map { |doc| doc.merge(category: "original") }
|
51
|
+
dataset.add_documents(docs_with_category)
|
52
|
+
puts " Recreated dataset with expanded schema"
|
53
|
+
|
54
|
+
# Step 3: Add new documents WITHOUT the new field
|
55
|
+
puts "\n3. Adding new documents WITHOUT the 'category' field..."
|
56
|
+
new_docs = [
|
57
|
+
{ id: "3", text: "Third document", score: 0.7 }, # No category!
|
58
|
+
{ id: "4", text: "Fourth document", score: 0.6 } # No category!
|
59
|
+
]
|
60
|
+
|
61
|
+
begin
|
62
|
+
dataset.add_documents(new_docs)
|
63
|
+
puts " ✅ SUCCESS! Added #{new_docs.size} documents with missing fields"
|
64
|
+
rescue => e
|
65
|
+
puts " ❌ FAILED: #{e.message}"
|
66
|
+
puts " (This would have failed before the fix in conversion.rs)"
|
67
|
+
end
|
68
|
+
|
69
|
+
# Step 4: Verify the data
|
70
|
+
puts "\n4. Verifying all documents..."
|
71
|
+
dataset.to_a.each do |doc|
|
72
|
+
category = doc[:category] || "nil"
|
73
|
+
puts " Doc #{doc[:id]}: category=#{category}"
|
74
|
+
end
|
75
|
+
|
76
|
+
puts "\nTotal documents: #{dataset.count}"
|
77
|
+
|
78
|
+
# Cleanup
|
79
|
+
FileUtils.rm_rf(dataset_path)
|
80
|
+
|
81
|
+
puts "\n" + "="*60
|
82
|
+
puts "Demo complete! Optional fields work correctly."
|
83
|
+
puts "="*60
|
@@ -39,11 +39,13 @@ pub fn build_record_batch(
|
|
39
39
|
let item = RHash::try_convert(item)?;
|
40
40
|
for field in schema.fields() {
|
41
41
|
let key = Symbol::new(field.name());
|
42
|
-
|
43
|
-
|
42
|
+
// Make fields optional - use get instead of fetch
|
43
|
+
let value: Value = item.get(key)
|
44
|
+
.or_else(|| {
|
44
45
|
// Try with string key
|
45
|
-
item.
|
46
|
-
})
|
46
|
+
item.get(field.name().as_str())
|
47
|
+
})
|
48
|
+
.unwrap_or_else(|| Ruby::get().unwrap().qnil().as_value());
|
47
49
|
|
48
50
|
match field.data_type() {
|
49
51
|
DataType::Utf8 => {
|
@@ -157,9 +159,9 @@ pub fn build_record_batch(
|
|
157
159
|
.map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))
|
158
160
|
}
|
159
161
|
|
160
|
-
pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
|
162
|
+
pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<RArray, Error> {
|
161
163
|
let ruby = Ruby::get().unwrap();
|
162
|
-
let
|
164
|
+
let documents = ruby.ary_new();
|
163
165
|
|
164
166
|
let num_rows = batch.num_rows();
|
165
167
|
let schema = batch.schema();
|
@@ -171,6 +173,15 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
|
|
171
173
|
let column = batch.column(col_idx);
|
172
174
|
let key = Symbol::new(field.name());
|
173
175
|
|
176
|
+
// CRITICAL: Add bounds checking for all array access
|
177
|
+
if row_idx >= column.len() {
|
178
|
+
return Err(Error::new(
|
179
|
+
magnus::exception::runtime_error(),
|
180
|
+
format!("Row index {} out of bounds for column '{}' with length {}",
|
181
|
+
row_idx, field.name(), column.len())
|
182
|
+
));
|
183
|
+
}
|
184
|
+
|
174
185
|
match field.data_type() {
|
175
186
|
DataType::Utf8 => {
|
176
187
|
let array = column.as_any().downcast_ref::<StringArray>()
|
@@ -223,9 +234,19 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
|
|
223
234
|
let float_array = values.as_any().downcast_ref::<Float32Array>()
|
224
235
|
.ok_or_else(|| Error::new(magnus::exception::runtime_error(), "Failed to cast vector values to Float32Array"))?;
|
225
236
|
|
237
|
+
// CRITICAL: Verify the float_array has the expected size
|
238
|
+
let expected_size = *list_size as usize;
|
239
|
+
if float_array.len() != expected_size {
|
240
|
+
return Err(Error::new(
|
241
|
+
magnus::exception::runtime_error(),
|
242
|
+
format!("Vector data corruption: expected {} elements but found {} for field '{}'",
|
243
|
+
expected_size, float_array.len(), field.name())
|
244
|
+
));
|
245
|
+
}
|
246
|
+
|
226
247
|
let ruby_array = ruby.ary_new();
|
227
|
-
for i in 0
|
228
|
-
ruby_array.push(float_array.value(i
|
248
|
+
for i in 0..expected_size {
|
249
|
+
ruby_array.push(float_array.value(i))?;
|
229
250
|
}
|
230
251
|
doc.aset(key, ruby_array)?;
|
231
252
|
}
|
@@ -236,7 +257,7 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
|
|
236
257
|
}
|
237
258
|
}
|
238
259
|
|
239
|
-
documents.push(doc);
|
260
|
+
documents.push(doc)?;
|
240
261
|
}
|
241
262
|
|
242
263
|
Ok(documents)
|
data/ext/lancelot/src/dataset.rs
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
use magnus::{Error, Ruby, RHash, RArray, Symbol, TryConvert, function, method, RClass, Module, Object};
|
1
|
+
use magnus::{Error, Ruby, RHash, RArray, Symbol, TryConvert, Value, function, method, RClass, Module, Object};
|
2
2
|
use std::cell::RefCell;
|
3
3
|
use std::sync::Arc;
|
4
4
|
use tokio::runtime::Runtime;
|
@@ -157,9 +157,10 @@ impl LancelotDataset {
|
|
157
157
|
let result_array = ruby.ary_new();
|
158
158
|
|
159
159
|
for batch in batches {
|
160
|
-
let
|
161
|
-
|
162
|
-
|
160
|
+
let batch_docs = convert_batch_to_ruby(&batch)?;
|
161
|
+
// Merge arrays by pushing each element
|
162
|
+
for i in 0..batch_docs.len() {
|
163
|
+
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
163
164
|
}
|
164
165
|
}
|
165
166
|
|
@@ -191,9 +192,10 @@ impl LancelotDataset {
|
|
191
192
|
let result_array = ruby.ary_new();
|
192
193
|
|
193
194
|
for batch in batches {
|
194
|
-
let
|
195
|
-
|
196
|
-
|
195
|
+
let batch_docs = convert_batch_to_ruby(&batch)?;
|
196
|
+
// Merge arrays by pushing each element
|
197
|
+
for i in 0..batch_docs.len() {
|
198
|
+
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
197
199
|
}
|
198
200
|
}
|
199
201
|
|
@@ -265,9 +267,10 @@ impl LancelotDataset {
|
|
265
267
|
let result_array = ruby.ary_new();
|
266
268
|
|
267
269
|
for batch in batches {
|
268
|
-
let
|
269
|
-
|
270
|
-
|
270
|
+
let batch_docs = convert_batch_to_ruby(&batch)?;
|
271
|
+
// Merge arrays by pushing each element
|
272
|
+
for i in 0..batch_docs.len() {
|
273
|
+
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
271
274
|
}
|
272
275
|
}
|
273
276
|
|
@@ -330,9 +333,10 @@ impl LancelotDataset {
|
|
330
333
|
let result_array = ruby.ary_new();
|
331
334
|
|
332
335
|
for batch in batches {
|
333
|
-
let
|
334
|
-
|
335
|
-
|
336
|
+
let batch_docs = convert_batch_to_ruby(&batch)?;
|
337
|
+
// Merge arrays by pushing each element
|
338
|
+
for i in 0..batch_docs.len() {
|
339
|
+
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
336
340
|
}
|
337
341
|
}
|
338
342
|
|
@@ -380,9 +384,10 @@ impl LancelotDataset {
|
|
380
384
|
let result_array = ruby.ary_new();
|
381
385
|
|
382
386
|
for batch in batches {
|
383
|
-
let
|
384
|
-
|
385
|
-
|
387
|
+
let batch_docs = convert_batch_to_ruby(&batch)?;
|
388
|
+
// Merge arrays by pushing each element
|
389
|
+
for i in 0..batch_docs.len() {
|
390
|
+
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
386
391
|
}
|
387
392
|
}
|
388
393
|
|
@@ -422,9 +427,10 @@ impl LancelotDataset {
|
|
422
427
|
let result_array = ruby.ary_new();
|
423
428
|
|
424
429
|
for batch in batches {
|
425
|
-
let
|
426
|
-
|
427
|
-
|
430
|
+
let batch_docs = convert_batch_to_ruby(&batch)?;
|
431
|
+
// Merge arrays by pushing each element
|
432
|
+
for i in 0..batch_docs.len() {
|
433
|
+
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
428
434
|
}
|
429
435
|
}
|
430
436
|
|
data/lib/lancelot/dataset.rb
CHANGED
data/lib/lancelot/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lancelot
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Petersen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-08-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -113,6 +113,8 @@ files:
|
|
113
113
|
- Rakefile
|
114
114
|
- examples/basic_usage.rb
|
115
115
|
- examples/full_text_search.rb
|
116
|
+
- examples/idempotent_create.rb
|
117
|
+
- examples/optional_fields_demo.rb
|
116
118
|
- examples/red_candle_integration.rb
|
117
119
|
- examples/vector_search.rb
|
118
120
|
- ext/lancelot/.gitignore
|