lancelot 0.3.1 → 0.3.3

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 59a61f845bead9178dc6b7ca831b0cd1f3577969c189d723b085a706527bb9a9
4
- data.tar.gz: f6941d534cc770393803c152f1718dad15639086b4ae78e84da93f8f8ab6f43f
3
+ metadata.gz: b6da55f7ccdf6c0b08d1e317e63a627573e9599af9fe2c2f7c93ab8cdefc755d
4
+ data.tar.gz: fd75f76de200412842b7ad963a54dff99e691cf6a4a9c4e1df37ea65587c23a1
5
5
  SHA512:
6
- metadata.gz: 04f63038fe699b2441c22618daac20ef710df82a5d95c6c8258317a2600b23ea2844ab5c2f1bc01245fb4ff90d531f3ec09f7ddcb28876e758b983a0bcb1fdc2
7
- data.tar.gz: 20b8c06914dc993b6868b7264cfcfba0eeb0a0a0c70b0473ac6e3b9e163c8ed3855c2eeae1ad02ab3f19125f80c844a0aebd2c26a91aafc005af0dfebd8d37d6
6
+ metadata.gz: 44762f80ae8374f6ecf930360ca30b7e6987bbd978ae40d1fdb0c69093cc22d68877f5af2d18e25d088a13d386a2cfaba663c48d118c310e061ad130d5d86f1f
7
+ data.tar.gz: 976fee93527713f6404d9a3057e2feaa2d9b2f38601280a961c30653e9706a762aab2d1bf729dcee0b3954d4c07103f2db02e788736853ae70cd026bda0f8021
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Lancelot
1
+ <img src="/docs/assets/lancelot-wide.png" alt="lancelot" height="80px">
2
2
 
3
3
  Ruby bindings for [Lance](https://github.com/lancedb/lance), a modern columnar data format for ML. Lancelot provides a Ruby-native interface to Lance, enabling efficient storage and search of multimodal data including text, vectors, and more.
4
4
 
@@ -57,6 +57,25 @@ dataset.vector_search(query_embedding, column: "embedding", limit: 5).each { |r|
57
57
 
58
58
  ## Installation
59
59
 
60
+ ### System Requirements
61
+
62
+ Lancelot requires the Protocol Buffers compiler (`protoc`) to build from source:
63
+
64
+ ```bash
65
+ # macOS (via Homebrew)
66
+ brew install protobuf
67
+
68
+ # Ubuntu/Debian
69
+ sudo apt-get install protobuf-compiler
70
+
71
+ # Other systems
72
+ # Download from https://github.com/protocolbuffers/protobuf/releases
73
+ ```
74
+
75
+ **Note**: The `protoc` compiler is only needed when building the gem from source. Pre-built gems distributed through RubyGems.org do not require `protoc` to be installed.
76
+
77
+ ### Install the Gem
78
+
60
79
  Install the gem and add to the application's Gemfile by executing:
61
80
 
62
81
  ```bash
@@ -276,7 +295,7 @@ bundle exec rake compile
276
295
 
277
296
  ## Contributing
278
297
 
279
- Bug reports and pull requests are welcome on GitHub at https://github.com/cpetersen/lancelot. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/cpetersen/lancelot/blob/main/CODE_OF_CONDUCT.md).
298
+ Bug reports and pull requests are welcome on GitHub at https://github.com/scientist-labs/lancelot. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/scientist-labs/lancelot/blob/main/CODE_OF_CONDUCT.md).
280
299
 
281
300
  ## License
282
301
 
@@ -284,4 +303,4 @@ The gem is available as open source under the terms of the [MIT License](https:/
284
303
 
285
304
  ## Code of Conduct
286
305
 
287
- Everyone interacting in the Lancelot project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/cpetersen/lancelot/blob/main/CODE_OF_CONDUCT.md).
306
+ Everyone interacting in the Lancelot project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/scientist-labs/lancelot/blob/main/CODE_OF_CONDUCT.md).
Binary file
Binary file
@@ -159,9 +159,9 @@ pub fn build_record_batch(
159
159
  .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))
160
160
  }
161
161
 
162
- pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
162
+ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<RArray, Error> {
163
163
  let ruby = Ruby::get().unwrap();
164
- let mut documents = Vec::new();
164
+ let documents = ruby.ary_new();
165
165
 
166
166
  let num_rows = batch.num_rows();
167
167
  let schema = batch.schema();
@@ -173,6 +173,15 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
173
173
  let column = batch.column(col_idx);
174
174
  let key = Symbol::new(field.name());
175
175
 
176
+ // CRITICAL: Add bounds checking for all array access
177
+ if row_idx >= column.len() {
178
+ return Err(Error::new(
179
+ magnus::exception::runtime_error(),
180
+ format!("Row index {} out of bounds for column '{}' with length {}",
181
+ row_idx, field.name(), column.len())
182
+ ));
183
+ }
184
+
176
185
  match field.data_type() {
177
186
  DataType::Utf8 => {
178
187
  let array = column.as_any().downcast_ref::<StringArray>()
@@ -225,9 +234,19 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
225
234
  let float_array = values.as_any().downcast_ref::<Float32Array>()
226
235
  .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "Failed to cast vector values to Float32Array"))?;
227
236
 
237
+ // CRITICAL: Verify the float_array has the expected size
238
+ let expected_size = *list_size as usize;
239
+ if float_array.len() != expected_size {
240
+ return Err(Error::new(
241
+ magnus::exception::runtime_error(),
242
+ format!("Vector data corruption: expected {} elements but found {} for field '{}'",
243
+ expected_size, float_array.len(), field.name())
244
+ ));
245
+ }
246
+
228
247
  let ruby_array = ruby.ary_new();
229
- for i in 0..*list_size {
230
- ruby_array.push(float_array.value(i as usize))?;
248
+ for i in 0..expected_size {
249
+ ruby_array.push(float_array.value(i))?;
231
250
  }
232
251
  doc.aset(key, ruby_array)?;
233
252
  }
@@ -238,7 +257,7 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
238
257
  }
239
258
  }
240
259
 
241
- documents.push(doc);
260
+ documents.push(doc)?;
242
261
  }
243
262
 
244
263
  Ok(documents)
@@ -1,4 +1,4 @@
1
- use magnus::{Error, Ruby, RHash, RArray, Symbol, TryConvert, function, method, RClass, Module, Object};
1
+ use magnus::{Error, Ruby, RHash, RArray, Symbol, TryConvert, Value, function, method, RClass, Module, Object};
2
2
  use std::cell::RefCell;
3
3
  use std::sync::Arc;
4
4
  use tokio::runtime::Runtime;
@@ -11,6 +11,26 @@ use futures::stream::TryStreamExt;
11
11
 
12
12
  use crate::schema::build_arrow_schema;
13
13
  use crate::conversion::{build_record_batch, convert_batch_to_ruby};
14
+ use arrow_schema::DataType;
15
+
16
+ /// Convert Arrow DataType to Ruby-friendly string representation
17
+ fn datatype_to_ruby_string(dtype: &DataType) -> &'static str {
18
+ match dtype {
19
+ DataType::Utf8 | DataType::LargeUtf8 => "string",
20
+ DataType::Boolean => "boolean",
21
+ DataType::Int8 | DataType::Int16 | DataType::Int32 => "int32",
22
+ DataType::Int64 => "int64",
23
+ DataType::UInt8 | DataType::UInt16 | DataType::UInt32 => "int32",
24
+ DataType::UInt64 => "int64",
25
+ DataType::Float32 => "float32",
26
+ DataType::Float64 => "float64",
27
+ DataType::Date32 => "date",
28
+ DataType::Date64 => "datetime",
29
+ DataType::Timestamp(_, _) => "datetime",
30
+ DataType::FixedSizeList(_, _) => "vector", // Will be handled specially
31
+ _ => "unknown"
32
+ }
33
+ }
14
34
 
15
35
  #[magnus::wrap(class = "Lancelot::Dataset", free_immediately, size)]
16
36
  pub struct LancelotDataset {
@@ -121,16 +141,40 @@ impl LancelotDataset {
121
141
 
122
142
  pub fn schema(&self) -> Result<RHash, Error> {
123
143
  let dataset = self.dataset.borrow();
124
- let _dataset = dataset.as_ref()
144
+ let dataset = dataset.as_ref()
125
145
  .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "Dataset not opened"))?;
126
146
 
147
+ // Get the actual schema from the Lance dataset
148
+ let schema = self.runtime.borrow_mut().block_on(async {
149
+ dataset.schema()
150
+ });
151
+
152
+ // Convert Lance schema to Arrow schema
153
+ let arrow_schema: arrow_schema::Schema = schema.into();
154
+ let arrow_schema = Arc::new(arrow_schema);
155
+
127
156
  let ruby = Ruby::get().unwrap();
128
157
  let hash = ruby.hash_new();
129
158
 
130
- // TODO: Read actual schema from Lance dataset once we figure out the 0.31 API
131
- // For now, return a hardcoded schema that matches what we support
132
- hash.aset(Symbol::new("text"), "string")?;
133
- hash.aset(Symbol::new("score"), "float32")?;
159
+ // Iterate over Arrow schema fields
160
+ for field in arrow_schema.fields() {
161
+ let field_name = Symbol::new(&field.name());
162
+
163
+ // Handle vector columns specially
164
+ if let DataType::FixedSizeList(inner_field, dimension) = field.data_type() {
165
+ // Check if it's a vector (float list)
166
+ if matches!(inner_field.data_type(), DataType::Float32 | DataType::Float16) {
167
+ let vector_info = ruby.hash_new();
168
+ vector_info.aset(Symbol::new("type"), "vector")?;
169
+ vector_info.aset(Symbol::new("dimension"), *dimension)?;
170
+ hash.aset(field_name, vector_info)?;
171
+ continue;
172
+ }
173
+ }
174
+
175
+ let field_type = datatype_to_ruby_string(field.data_type());
176
+ hash.aset(field_name, field_type)?;
177
+ }
134
178
 
135
179
  Ok(hash)
136
180
  }
@@ -157,9 +201,10 @@ impl LancelotDataset {
157
201
  let result_array = ruby.ary_new();
158
202
 
159
203
  for batch in batches {
160
- let documents = convert_batch_to_ruby(&batch)?;
161
- for doc in documents {
162
- result_array.push(doc)?;
204
+ let batch_docs = convert_batch_to_ruby(&batch)?;
205
+ // Merge arrays by pushing each element
206
+ for i in 0..batch_docs.len() {
207
+ result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
163
208
  }
164
209
  }
165
210
 
@@ -191,9 +236,10 @@ impl LancelotDataset {
191
236
  let result_array = ruby.ary_new();
192
237
 
193
238
  for batch in batches {
194
- let documents = convert_batch_to_ruby(&batch)?;
195
- for doc in documents {
196
- result_array.push(doc)?;
239
+ let batch_docs = convert_batch_to_ruby(&batch)?;
240
+ // Merge arrays by pushing each element
241
+ for i in 0..batch_docs.len() {
242
+ result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
197
243
  }
198
244
  }
199
245
 
@@ -265,9 +311,10 @@ impl LancelotDataset {
265
311
  let result_array = ruby.ary_new();
266
312
 
267
313
  for batch in batches {
268
- let documents = convert_batch_to_ruby(&batch)?;
269
- for doc in documents {
270
- result_array.push(doc)?;
314
+ let batch_docs = convert_batch_to_ruby(&batch)?;
315
+ // Merge arrays by pushing each element
316
+ for i in 0..batch_docs.len() {
317
+ result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
271
318
  }
272
319
  }
273
320
 
@@ -330,9 +377,10 @@ impl LancelotDataset {
330
377
  let result_array = ruby.ary_new();
331
378
 
332
379
  for batch in batches {
333
- let documents = convert_batch_to_ruby(&batch)?;
334
- for doc in documents {
335
- result_array.push(doc)?;
380
+ let batch_docs = convert_batch_to_ruby(&batch)?;
381
+ // Merge arrays by pushing each element
382
+ for i in 0..batch_docs.len() {
383
+ result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
336
384
  }
337
385
  }
338
386
 
@@ -380,9 +428,10 @@ impl LancelotDataset {
380
428
  let result_array = ruby.ary_new();
381
429
 
382
430
  for batch in batches {
383
- let documents = convert_batch_to_ruby(&batch)?;
384
- for doc in documents {
385
- result_array.push(doc)?;
431
+ let batch_docs = convert_batch_to_ruby(&batch)?;
432
+ // Merge arrays by pushing each element
433
+ for i in 0..batch_docs.len() {
434
+ result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
386
435
  }
387
436
  }
388
437
 
@@ -422,9 +471,10 @@ impl LancelotDataset {
422
471
  let result_array = ruby.ary_new();
423
472
 
424
473
  for batch in batches {
425
- let documents = convert_batch_to_ruby(&batch)?;
426
- for doc in documents {
427
- result_array.push(doc)?;
474
+ let batch_docs = convert_batch_to_ruby(&batch)?;
475
+ // Merge arrays by pushing each element
476
+ for i in 0..batch_docs.len() {
477
+ result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
428
478
  }
429
479
  }
430
480
 
@@ -62,8 +62,16 @@ module Lancelot
62
62
  count_rows
63
63
  end
64
64
 
65
- alias_method :count, :size
66
65
  alias_method :length, :size
66
+
67
+ # Override Enumerable's count to use our efficient count_rows when no block given
68
+ def count(&block)
69
+ if block_given?
70
+ super(&block) # Use Enumerable's count with block
71
+ else
72
+ count_rows # Use our efficient count without block
73
+ end
74
+ end
67
75
 
68
76
  def all
69
77
  scan_all
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Lancelot
4
- VERSION = "0.3.1"
4
+ VERSION = "0.3.3"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lancelot
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Petersen
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-08-10 00:00:00.000000000 Z
11
+ date: 2025-09-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -111,6 +111,8 @@ files:
111
111
  - LICENSE.txt
112
112
  - README.md
113
113
  - Rakefile
114
+ - docs/assets/lancelot-wide.png
115
+ - docs/assets/lancelot.png
114
116
  - examples/basic_usage.rb
115
117
  - examples/full_text_search.rb
116
118
  - examples/idempotent_create.rb
@@ -129,13 +131,13 @@ files:
129
131
  - lib/lancelot/rank_fusion.rb
130
132
  - lib/lancelot/version.rb
131
133
  - sig/lancelot.rbs
132
- homepage: https://github.com/cpetersen/lancelot
134
+ homepage: https://github.com/scientist-labs/lancelot
133
135
  licenses:
134
136
  - MIT
135
137
  metadata:
136
- homepage_uri: https://github.com/cpetersen/lancelot
137
- source_code_uri: https://github.com/cpetersen/lancelot
138
- changelog_uri: https://github.com/cpetersen/lancelot/blob/main/CHANGELOG.md
138
+ homepage_uri: https://github.com/scientist-labs/lancelot
139
+ source_code_uri: https://github.com/scientist-labs/lancelot
140
+ changelog_uri: https://github.com/scientist-labs/lancelot/blob/main/CHANGELOG.md
139
141
  post_install_message:
140
142
  rdoc_options: []
141
143
  require_paths: