lancelot 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7d1c9e53cfc1948e847b195b8d8860c6dba8a85813714b60a7d7747966533de3
4
- data.tar.gz: 7c130c04c01a8eebce16755a1da082f4db6ec1f85ac2857d84943272c695eb89
3
+ metadata.gz: 59a61f845bead9178dc6b7ca831b0cd1f3577969c189d723b085a706527bb9a9
4
+ data.tar.gz: f6941d534cc770393803c152f1718dad15639086b4ae78e84da93f8f8ab6f43f
5
5
  SHA512:
6
- metadata.gz: fb1a939d21c8935cec9fb9eb0ba1ddbef7ab9a8edd380149e0026db00ce67167609c99ed98597ccd692baa59f046dcd041fc76f653a5a635c152b82c343310d4
7
- data.tar.gz: d35819fbfc3c2dec2b2cf65236b2bbf74eaef9f501d32f2a0a0ca7724ffc4bc1bdef56222cb50a3390d59d91e6e492a1170114b2762cebac38bcf4529bb27925
6
+ metadata.gz: 04f63038fe699b2441c22618daac20ef710df82a5d95c6c8258317a2600b23ea2844ab5c2f1bc01245fb4ff90d531f3ec09f7ddcb28876e758b983a0bcb1fdc2
7
+ data.tar.gz: 20b8c06914dc993b6868b7264cfcfba0eeb0a0a0c70b0473ac6e3b9e163c8ed3855c2eeae1ad02ab3f19125f80c844a0aebd2c26a91aafc005af0dfebd8d37d6
data/README.md CHANGED
@@ -2,11 +2,52 @@
2
2
 
3
3
  Ruby bindings for [Lance](https://github.com/lancedb/lance), a modern columnar data format for ML. Lancelot provides a Ruby-native interface to Lance, enabling efficient storage and search of multimodal data including text, vectors, and more.
4
4
 
5
+ ## Quickstart
6
+
7
+ ```ruby
8
+ require 'lancelot'
9
+ require 'red-candle'
10
+
11
+ strings = [
12
+ "apple",
13
+ "orange",
14
+ "google"
15
+ ]
16
+
17
+ model = Candle::EmbeddingModel.from_pretrained
18
+
19
+ dataset = Lancelot::Dataset.open_or_create("words", schema: {
20
+ text: :string,
21
+ embedding: { type: "vector", dimension: 768 }
22
+ })
23
+
24
+ records = strings.collect do |string|
25
+ embedding = model.embedding(string).first.to_a
26
+ { text: string, embedding: embedding }
27
+ end
28
+
29
+ dataset.add_documents(records)
30
+
31
+ dataset.create_vector_index("embedding")
32
+ dataset.create_text_index("text")
33
+
34
+
35
+ query = "fruit"
36
+ query_embedding = model.embedding(query).first.to_a
37
+ dataset.vector_search(query_embedding, column: "embedding", limit: 5).each { |r| puts r[:text] }; nil
38
+
39
+ dataset.text_search("apple", column: "text", limit: 5).each { |r| puts r[:text] }; nil
40
+
41
+ query = "tech company"
42
+ query_embedding = model.embedding(query).first.to_a
43
+ dataset.vector_search(query_embedding, column: "embedding", limit: 5).each { |r| puts r[:text] }; nil
44
+ ```
45
+
5
46
  ## Features
6
47
 
7
48
  ### Implemented
8
49
  - **Dataset Creation**: Create Lance datasets with schemas
9
- - **Data Storage**: Add documents to datasets
50
+ - **Data Storage**: Add documents to datasets
10
51
  - **Document Retrieval**: Read documents from datasets with enumerable support
11
52
  - **Vector Search**: Create vector indices and perform similarity search
12
53
  - **Full-Text Search**: Built-in full-text search with inverted indices
@@ -14,12 +55,6 @@ Ruby bindings for [Lance](https://github.com/lancedb/lance), a modern columnar d
14
55
  - **Schema Support**: Define schemas with string, float32, and vector types
15
56
  - **Row Counting**: Get the number of rows in a dataset
16
57
 
17
- ### Planned
18
-
19
- - **Multimodal Support**: Store and search across different data types beyond text and vectors
20
- - **Schema Evolution**: Add new columns to existing datasets without rewriting data
21
- - **Additional Fusion Methods**: Support for other fusion algorithms beyond RRF
22
-
23
58
  ## Installation
24
59
 
25
60
  Install the gem and add to the application's Gemfile by executing:
@@ -0,0 +1,66 @@
1
#!/usr/bin/env ruby
# Demonstrates idempotent dataset creation with open_or_create.
#
# The whole demo runs inside a throwaway temporary directory so that:
#   * the first open_or_create call really does create the dataset
#     (nothing can be left over from an earlier run), and
#   * cleanup can never delete a pre-existing "words" dataset that lives in
#     the current working directory.

require 'bundler/setup'
require 'lancelot'
require 'tmpdir'

puts "="*60
puts "Idempotent Dataset Creation Demo"
puts "="*60

schema = {
  text: :string,
  embedding: { type: "vector", dimension: 768 }
}

# The block form of Dir.mktmpdir removes the directory (and the dataset
# inside it) when the block exits, including when an exception is raised.
Dir.mktmpdir("lancelot_idempotent_demo") do |workdir|
  dataset_path = File.join(workdir, "words")

  # First call - will CREATE the dataset
  puts "\n1. First call to open_or_create (should create)..."
  dataset = Lancelot::Dataset.open_or_create(dataset_path, schema: schema)
  puts " Dataset opened/created. Current count: #{dataset.count}"

  # Add some data
  dataset.add_documents([
    { text: "hello", embedding: Array.new(768) { rand } },
    { text: "world", embedding: Array.new(768) { rand } }
  ])
  puts " Added 2 documents. New count: #{dataset.count}"

  # Second call - will OPEN the existing dataset
  puts "\n2. Second call to open_or_create (should open existing)..."
  dataset2 = Lancelot::Dataset.open_or_create(dataset_path, schema: schema)
  puts " Dataset opened. Current count: #{dataset2.count}"
  puts " ✓ Data persisted from previous session!"

  # Third call - still idempotent
  puts "\n3. Third call - still works..."
  dataset3 = Lancelot::Dataset.open_or_create(dataset_path, schema: schema)
  dataset3.add_documents([
    { text: "more", embedding: Array.new(768) { rand } }
  ])
  puts " Added 1 more document. New count: #{dataset3.count}"

  # Demonstrate the OLD way that would fail
  puts "\n4. Compare with non-idempotent create (would fail)..."
  begin
    # This is expected to raise because the dataset already exists; the
    # return value is irrelevant, so it is not assigned.
    Lancelot::Dataset.create(dataset_path, schema: schema)
    puts " ✗ This shouldn't happen!"
  rescue => e
    puts " ✓ Dataset.create correctly failed: #{e.class}"
    # Only the first part of the message is shown to keep the output short.
    puts " Message: #{e.message[0..50]}..."
  end
end

puts "\n" + "="*60
puts "Summary: Use open_or_create for idempotent operations!"
puts "="*60
puts "\nInstead of:"
puts ' dataset = Lancelot::Dataset.create("words", schema: {...})'
puts "\nUse:"
puts ' dataset = Lancelot::Dataset.open_or_create("words", schema: {...})'
puts "\nThis way your code works whether the dataset exists or not!"
@@ -0,0 +1,83 @@
1
#!/usr/bin/env ruby
# This example demonstrates optional field support in lancelot:
# after the fix in conversion.rs, documents passed to add_documents may omit
# fields that are declared in the dataset schema.
#
# The dataset directory is removed in an `ensure` block so nothing is left on
# disk if a step raises, and the closing banner reports what actually
# happened in step 3 instead of always claiming success.

require 'bundler/setup'
require 'lancelot'
require 'fileutils'

dataset_path = "example_optional_fields"
FileUtils.rm_rf(dataset_path)

# Flipped to true only when step 3 (adding documents with a missing field)
# succeeds; drives the final status line.
optional_fields_ok = false

begin
  puts "="*60
  puts "Lancelot Optional Fields Demo"
  puts "="*60

  # Step 1: Create dataset with initial schema
  puts "\n1. Creating dataset with 3 fields (id, text, score)..."
  schema = {
    id: :string,
    text: :string,
    score: :float32
  }
  dataset = Lancelot::Dataset.create(dataset_path, schema: schema)

  # Add initial documents
  initial_docs = [
    { id: "1", text: "First document", score: 0.9 },
    { id: "2", text: "Second document", score: 0.8 }
  ]
  dataset.add_documents(initial_docs)
  puts " Added #{dataset.count} documents"

  # Step 2: Simulate schema evolution (adding a new field)
  puts "\n2. Simulating schema evolution (adding 'category' field)..."

  # Read everything into memory first: the on-disk dataset is deleted next.
  all_docs = dataset.to_a

  # Recreate with expanded schema
  FileUtils.rm_rf(dataset_path)
  expanded_schema = {
    id: :string,
    text: :string,
    score: :float32,
    category: :string # NEW FIELD
  }
  dataset = Lancelot::Dataset.create(dataset_path, schema: expanded_schema)

  # Re-add existing docs with the new field
  docs_with_category = all_docs.map { |doc| doc.merge(category: "original") }
  dataset.add_documents(docs_with_category)
  puts " Recreated dataset with expanded schema"

  # Step 3: Add new documents WITHOUT the new field
  puts "\n3. Adding new documents WITHOUT the 'category' field..."
  new_docs = [
    { id: "3", text: "Third document", score: 0.7 }, # No category!
    { id: "4", text: "Fourth document", score: 0.6 } # No category!
  ]

  begin
    dataset.add_documents(new_docs)
    optional_fields_ok = true
    puts " ✅ SUCCESS! Added #{new_docs.size} documents with missing fields"
  rescue => e
    puts " ❌ FAILED: #{e.message}"
    puts " (This would have failed before the fix in conversion.rs)"
  end

  # Step 4: Verify the data
  puts "\n4. Verifying all documents..."
  dataset.to_a.each do |doc|
    # A missing/nil category is rendered as the literal text "nil".
    category = doc[:category] || "nil"
    puts " Doc #{doc[:id]}: category=#{category}"
  end

  puts "\nTotal documents: #{dataset.count}"
ensure
  # Cleanup runs even when one of the steps above raised.
  FileUtils.rm_rf(dataset_path)
end

puts "\n" + "="*60
if optional_fields_ok
  puts "Demo complete! Optional fields work correctly."
else
  puts "Demo complete, but adding documents with missing fields FAILED (see step 3)."
end
puts "="*60
@@ -39,11 +39,13 @@ pub fn build_record_batch(
39
39
  let item = RHash::try_convert(item)?;
40
40
  for field in schema.fields() {
41
41
  let key = Symbol::new(field.name());
42
- let value: Value = item.fetch(key)
43
- .or_else(|_| {
42
+ // Make fields optional - use get instead of fetch
43
+ let value: Value = item.get(key)
44
+ .or_else(|| {
44
45
  // Try with string key
45
- item.fetch(field.name().as_str())
46
- })?;
46
+ item.get(field.name().as_str())
47
+ })
48
+ .unwrap_or_else(|| Ruby::get().unwrap().qnil().as_value());
47
49
 
48
50
  match field.data_type() {
49
51
  DataType::Utf8 => {
@@ -15,6 +15,14 @@ module Lancelot
15
15
  dataset
16
16
  end
17
17
 
18
# Returns a dataset for +path+, creating it first when nothing exists there.
#
# When +path+ already exists on the local filesystem it is handed to +open+
# and +schema+ is not consulted; otherwise +path+ and +schema+ are forwarded
# to +create+.
#
# @param path [String] location checked with File.exist?
# @param schema [Hash] schema definition, used only on the create path
# @return the result of +open+ or +create+
def open_or_create(path, schema:)
  return create(path, schema: schema) unless File.exist?(path)

  open(path)
end
25
+
18
26
  private
19
27
 
20
28
  def normalize_schema(schema)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Lancelot
4
- VERSION = "0.2.0"
4
+ VERSION = "0.3.1"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lancelot
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Petersen
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-07-27 00:00:00.000000000 Z
11
+ date: 2025-08-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -113,6 +113,8 @@ files:
113
113
  - Rakefile
114
114
  - examples/basic_usage.rb
115
115
  - examples/full_text_search.rb
116
+ - examples/idempotent_create.rb
117
+ - examples/optional_fields_demo.rb
116
118
  - examples/red_candle_integration.rb
117
119
  - examples/vector_search.rb
118
120
  - ext/lancelot/.gitignore