lancelot 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +42 -7
- data/examples/idempotent_create.rb +66 -0
- data/examples/optional_fields_demo.rb +83 -0
- data/ext/lancelot/src/conversion.rs +6 -4
- data/lib/lancelot/dataset.rb +8 -0
- data/lib/lancelot/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 59a61f845bead9178dc6b7ca831b0cd1f3577969c189d723b085a706527bb9a9
|
4
|
+
data.tar.gz: f6941d534cc770393803c152f1718dad15639086b4ae78e84da93f8f8ab6f43f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 04f63038fe699b2441c22618daac20ef710df82a5d95c6c8258317a2600b23ea2844ab5c2f1bc01245fb4ff90d531f3ec09f7ddcb28876e758b983a0bcb1fdc2
|
7
|
+
data.tar.gz: 20b8c06914dc993b6868b7264cfcfba0eeb0a0a0c70b0473ac6e3b9e163c8ed3855c2eeae1ad02ab3f19125f80c844a0aebd2c26a91aafc005af0dfebd8d37d6
|
data/README.md
CHANGED
@@ -2,11 +2,52 @@
|
|
2
2
|
|
3
3
|
Ruby bindings for [Lance](https://github.com/lancedb/lance), a modern columnar data format for ML. Lancelot provides a Ruby-native interface to Lance, enabling efficient storage and search of multimodal data including text, vectors, and more.
|
4
4
|
|
5
|
+
## Quickstart
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
require 'lancelot'
|
9
|
+
require 'red-candle'
|
10
|
+
|
11
|
+
strings = [
|
12
|
+
"apple",
|
13
|
+
"orange",
|
14
|
+
"google"
|
15
|
+
]
|
16
|
+
|
17
|
+
model = Candle::EmbeddingModel.from_pretrained
|
18
|
+
|
19
|
+
dataset = Lancelot::Dataset.open_or_create("words", schema: {
|
20
|
+
text: :string,
|
21
|
+
embedding: { type: "vector", dimension: 768 }
|
22
|
+
})
|
23
|
+
|
24
|
+
records = strings.collect do |string|
|
25
|
+
embedding = model.embedding(string).first.to_a
|
26
|
+
{ text: string, embedding: embedding }
|
27
|
+
end
|
28
|
+
|
29
|
+
dataset.add_documents(records)
|
30
|
+
|
31
|
+
dataset.create_vector_index("embedding")
|
32
|
+
dataset.create_text_index("text")
|
33
|
+
|
34
|
+
|
35
|
+
query = "fruit"
|
36
|
+
query_embedding = model.embedding(query).first.to_a
|
37
|
+
dataset.vector_search(query_embedding, column: "embedding", limit: 5).each { |r| puts r[:text] }; nil
|
38
|
+
|
39
|
+
dataset.text_search("apple", column: "text", limit: 5).each { |r| puts r[:text] }; nil
|
40
|
+
|
41
|
+
query = "tech company"
|
42
|
+
query_embedding = model.embedding(query).first.to_a
|
43
|
+
dataset.vector_search(query_embedding, column: "embedding", limit: 5).each { |r| puts r[:text] }; nil
|
44
|
+
```
|
45
|
+
|
5
46
|
## Features
|
6
47
|
|
7
48
|
### Implemented
|
8
49
|
- **Dataset Creation**: Create Lance datasets with schemas
|
9
|
-
- **Data Storage**: Add documents to datasets
|
50
|
+
- **Data Storage**: Add documents to datasets
|
10
51
|
- **Document Retrieval**: Read documents from datasets with enumerable support
|
11
52
|
- **Vector Search**: Create vector indices and perform similarity search
|
12
53
|
- **Full-Text Search**: Built-in full-text search with inverted indices
|
@@ -14,12 +55,6 @@ Ruby bindings for [Lance](https://github.com/lancedb/lance), a modern columnar d
|
|
14
55
|
- **Schema Support**: Define schemas with string, float32, and vector types
|
15
56
|
- **Row Counting**: Get the number of rows in a dataset
|
16
57
|
|
17
|
-
### Planned
|
18
|
-
|
19
|
-
- **Multimodal Support**: Store and search across different data types beyond text and vectors
|
20
|
-
- **Schema Evolution**: Add new columns to existing datasets without rewriting data
|
21
|
-
- **Additional Fusion Methods**: Support for other fusion algorithms beyond RRF
|
22
|
-
|
23
58
|
## Installation
|
24
59
|
|
25
60
|
Install the gem and add to the application's Gemfile by executing:
|
data/examples/idempotent_create.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Demonstrates idempotent dataset creation with open_or_create
|
3
|
+
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'lancelot'
|
6
|
+
require 'fileutils'
|
7
|
+
|
8
|
+
dataset_path = "words"
|
9
|
+
|
10
|
+
puts "="*60
|
11
|
+
puts "Idempotent Dataset Creation Demo"
|
12
|
+
puts "="*60
|
13
|
+
|
14
|
+
schema = {
|
15
|
+
text: :string,
|
16
|
+
embedding: { type: "vector", dimension: 768 }
|
17
|
+
}
|
18
|
+
|
19
|
+
# First call - will CREATE the dataset
|
20
|
+
puts "\n1. First call to open_or_create (should create)..."
|
21
|
+
dataset = Lancelot::Dataset.open_or_create(dataset_path, schema: schema)
|
22
|
+
puts " Dataset opened/created. Current count: #{dataset.count}"
|
23
|
+
|
24
|
+
# Add some data
|
25
|
+
dataset.add_documents([
|
26
|
+
{ text: "hello", embedding: Array.new(768) { rand } },
|
27
|
+
{ text: "world", embedding: Array.new(768) { rand } }
|
28
|
+
])
|
29
|
+
puts " Added 2 documents. New count: #{dataset.count}"
|
30
|
+
|
31
|
+
# Second call - will OPEN the existing dataset
|
32
|
+
puts "\n2. Second call to open_or_create (should open existing)..."
|
33
|
+
dataset2 = Lancelot::Dataset.open_or_create(dataset_path, schema: schema)
|
34
|
+
puts " Dataset opened. Current count: #{dataset2.count}"
|
35
|
+
puts " ✓ Data persisted from previous session!"
|
36
|
+
|
37
|
+
# Third call - still idempotent
|
38
|
+
puts "\n3. Third call - still works..."
|
39
|
+
dataset3 = Lancelot::Dataset.open_or_create(dataset_path, schema: schema)
|
40
|
+
dataset3.add_documents([
|
41
|
+
{ text: "more", embedding: Array.new(768) { rand } }
|
42
|
+
])
|
43
|
+
puts " Added 1 more document. New count: #{dataset3.count}"
|
44
|
+
|
45
|
+
# Demonstrate the OLD way that would fail
|
46
|
+
puts "\n4. Compare with non-idempotent create (would fail)..."
|
47
|
+
begin
|
48
|
+
# This will fail because dataset already exists
|
49
|
+
failing_dataset = Lancelot::Dataset.create(dataset_path, schema: schema)
|
50
|
+
puts " ✗ This shouldn't happen!"
|
51
|
+
rescue => e
|
52
|
+
puts " ✓ Dataset.create correctly failed: #{e.class}"
|
53
|
+
puts " Message: #{e.message[0..50]}..."
|
54
|
+
end
|
55
|
+
|
56
|
+
# Clean up
|
57
|
+
FileUtils.rm_rf(dataset_path)
|
58
|
+
|
59
|
+
puts "\n" + "="*60
|
60
|
+
puts "Summary: Use open_or_create for idempotent operations!"
|
61
|
+
puts "="*60
|
62
|
+
puts "\nInstead of:"
|
63
|
+
puts ' dataset = Lancelot::Dataset.create("words", schema: {...})'
|
64
|
+
puts "\nUse:"
|
65
|
+
puts ' dataset = Lancelot::Dataset.open_or_create("words", schema: {...})'
|
66
|
+
puts "\nThis way your code works whether the dataset exists or not!"
|
data/examples/optional_fields_demo.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# This example demonstrates optional field support in lancelot
|
3
|
+
# After the fix in conversion.rs, documents can have missing fields
|
4
|
+
|
5
|
+
require 'bundler/setup'
|
6
|
+
require 'lancelot'
|
7
|
+
require 'fileutils'
|
8
|
+
|
9
|
+
dataset_path = "example_optional_fields"
|
10
|
+
FileUtils.rm_rf(dataset_path)
|
11
|
+
|
12
|
+
puts "="*60
|
13
|
+
puts "Lancelot Optional Fields Demo"
|
14
|
+
puts "="*60
|
15
|
+
|
16
|
+
# Step 1: Create dataset with initial schema
|
17
|
+
puts "\n1. Creating dataset with 3 fields (id, text, score)..."
|
18
|
+
schema = {
|
19
|
+
id: :string,
|
20
|
+
text: :string,
|
21
|
+
score: :float32
|
22
|
+
}
|
23
|
+
dataset = Lancelot::Dataset.create(dataset_path, schema: schema)
|
24
|
+
|
25
|
+
# Add initial documents
|
26
|
+
initial_docs = [
|
27
|
+
{ id: "1", text: "First document", score: 0.9 },
|
28
|
+
{ id: "2", text: "Second document", score: 0.8 }
|
29
|
+
]
|
30
|
+
dataset.add_documents(initial_docs)
|
31
|
+
puts " Added #{dataset.count} documents"
|
32
|
+
|
33
|
+
# Step 2: Simulate schema evolution (adding a new field)
|
34
|
+
puts "\n2. Simulating schema evolution (adding 'category' field)..."
|
35
|
+
|
36
|
+
# Get existing data
|
37
|
+
all_docs = dataset.to_a
|
38
|
+
|
39
|
+
# Recreate with expanded schema
|
40
|
+
FileUtils.rm_rf(dataset_path)
|
41
|
+
expanded_schema = {
|
42
|
+
id: :string,
|
43
|
+
text: :string,
|
44
|
+
score: :float32,
|
45
|
+
category: :string # NEW FIELD
|
46
|
+
}
|
47
|
+
dataset = Lancelot::Dataset.create(dataset_path, schema: expanded_schema)
|
48
|
+
|
49
|
+
# Re-add existing docs with the new field
|
50
|
+
docs_with_category = all_docs.map { |doc| doc.merge(category: "original") }
|
51
|
+
dataset.add_documents(docs_with_category)
|
52
|
+
puts " Recreated dataset with expanded schema"
|
53
|
+
|
54
|
+
# Step 3: Add new documents WITHOUT the new field
|
55
|
+
puts "\n3. Adding new documents WITHOUT the 'category' field..."
|
56
|
+
new_docs = [
|
57
|
+
{ id: "3", text: "Third document", score: 0.7 }, # No category!
|
58
|
+
{ id: "4", text: "Fourth document", score: 0.6 } # No category!
|
59
|
+
]
|
60
|
+
|
61
|
+
begin
|
62
|
+
dataset.add_documents(new_docs)
|
63
|
+
puts " ✅ SUCCESS! Added #{new_docs.size} documents with missing fields"
|
64
|
+
rescue => e
|
65
|
+
puts " ❌ FAILED: #{e.message}"
|
66
|
+
puts " (This would have failed before the fix in conversion.rs)"
|
67
|
+
end
|
68
|
+
|
69
|
+
# Step 4: Verify the data
|
70
|
+
puts "\n4. Verifying all documents..."
|
71
|
+
dataset.to_a.each do |doc|
|
72
|
+
category = doc[:category] || "nil"
|
73
|
+
puts " Doc #{doc[:id]}: category=#{category}"
|
74
|
+
end
|
75
|
+
|
76
|
+
puts "\nTotal documents: #{dataset.count}"
|
77
|
+
|
78
|
+
# Cleanup
|
79
|
+
FileUtils.rm_rf(dataset_path)
|
80
|
+
|
81
|
+
puts "\n" + "="*60
|
82
|
+
puts "Demo complete! Optional fields work correctly."
|
83
|
+
puts "="*60
|
data/ext/lancelot/src/conversion.rs
CHANGED
@@ -39,11 +39,13 @@ pub fn build_record_batch(
|
|
39
39
|
let item = RHash::try_convert(item)?;
|
40
40
|
for field in schema.fields() {
|
41
41
|
let key = Symbol::new(field.name());
|
42
|
-
|
43
|
-
|
42
|
+
// Make fields optional - use get instead of fetch
|
43
|
+
let value: Value = item.get(key)
|
44
|
+
.or_else(|| {
|
44
45
|
// Try with string key
|
45
|
-
item.
|
46
|
-
})
|
46
|
+
item.get(field.name().as_str())
|
47
|
+
})
|
48
|
+
.unwrap_or_else(|| Ruby::get().unwrap().qnil().as_value());
|
47
49
|
|
48
50
|
match field.data_type() {
|
49
51
|
DataType::Utf8 => {
|
data/lib/lancelot/dataset.rb
CHANGED
data/lib/lancelot/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lancelot
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Petersen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-08-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -113,6 +113,8 @@ files:
|
|
113
113
|
- Rakefile
|
114
114
|
- examples/basic_usage.rb
|
115
115
|
- examples/full_text_search.rb
|
116
|
+
- examples/idempotent_create.rb
|
117
|
+
- examples/optional_fields_demo.rb
|
116
118
|
- examples/red_candle_integration.rb
|
117
119
|
- examples/vector_search.rb
|
118
120
|
- ext/lancelot/.gitignore
|