lancelot 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +22 -3
- data/docs/assets/lancelot-wide.png +0 -0
- data/docs/assets/lancelot.png +0 -0
- data/ext/lancelot/src/conversion.rs +24 -5
- data/ext/lancelot/src/dataset.rs +74 -24
- data/lib/lancelot/dataset.rb +9 -1
- data/lib/lancelot/version.rb +1 -1
- metadata +8 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b6da55f7ccdf6c0b08d1e317e63a627573e9599af9fe2c2f7c93ab8cdefc755d
|
4
|
+
data.tar.gz: fd75f76de200412842b7ad963a54dff99e691cf6a4a9c4e1df37ea65587c23a1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 44762f80ae8374f6ecf930360ca30b7e6987bbd978ae40d1fdb0c69093cc22d68877f5af2d18e25d088a13d386a2cfaba663c48d118c310e061ad130d5d86f1f
|
7
|
+
data.tar.gz: 976fee93527713f6404d9a3057e2feaa2d9b2f38601280a961c30653e9706a762aab2d1bf729dcee0b3954d4c07103f2db02e788736853ae70cd026bda0f8021
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
<img src="/docs/assets/lancelot-wide.png" alt="lancelot" height="80px">
|
2
2
|
|
3
3
|
Ruby bindings for [Lance](https://github.com/lancedb/lance), a modern columnar data format for ML. Lancelot provides a Ruby-native interface to Lance, enabling efficient storage and search of multimodal data including text, vectors, and more.
|
4
4
|
|
@@ -57,6 +57,25 @@ dataset.vector_search(query_embedding, column: "embedding", limit: 5).each { |r|
|
|
57
57
|
|
58
58
|
## Installation
|
59
59
|
|
60
|
+
### System Requirements
|
61
|
+
|
62
|
+
Lancelot requires the Protocol Buffers compiler (`protoc`) to build from source:
|
63
|
+
|
64
|
+
```bash
|
65
|
+
# macOS (via Homebrew)
|
66
|
+
brew install protobuf
|
67
|
+
|
68
|
+
# Ubuntu/Debian
|
69
|
+
sudo apt-get install protobuf-compiler
|
70
|
+
|
71
|
+
# Other systems
|
72
|
+
# Download from https://github.com/protocolbuffers/protobuf/releases
|
73
|
+
```
|
74
|
+
|
75
|
+
**Note**: The `protoc` compiler is only needed when building the gem from source. Pre-built gems distributed through RubyGems.org do not require `protoc` to be installed.
|
76
|
+
|
77
|
+
### Install the Gem
|
78
|
+
|
60
79
|
Install the gem and add to the application's Gemfile by executing:
|
61
80
|
|
62
81
|
```bash
|
@@ -276,7 +295,7 @@ bundle exec rake compile
|
|
276
295
|
|
277
296
|
## Contributing
|
278
297
|
|
279
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
298
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/scientist-labs/lancelot. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/scientist-labs/lancelot/blob/main/CODE_OF_CONDUCT.md).
|
280
299
|
|
281
300
|
## License
|
282
301
|
|
@@ -284,4 +303,4 @@ The gem is available as open source under the terms of the [MIT License](https:/
|
|
284
303
|
|
285
304
|
## Code of Conduct
|
286
305
|
|
287
|
-
Everyone interacting in the Lancelot project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/
|
306
|
+
Everyone interacting in the Lancelot project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/scientist-labs/lancelot/blob/main/CODE_OF_CONDUCT.md).
|
Binary file
|
Binary file
|
@@ -159,9 +159,9 @@ pub fn build_record_batch(
|
|
159
159
|
.map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))
|
160
160
|
}
|
161
161
|
|
162
|
-
pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<
|
162
|
+
pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<RArray, Error> {
|
163
163
|
let ruby = Ruby::get().unwrap();
|
164
|
-
let
|
164
|
+
let documents = ruby.ary_new();
|
165
165
|
|
166
166
|
let num_rows = batch.num_rows();
|
167
167
|
let schema = batch.schema();
|
@@ -173,6 +173,15 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
|
|
173
173
|
let column = batch.column(col_idx);
|
174
174
|
let key = Symbol::new(field.name());
|
175
175
|
|
176
|
+
// CRITICAL: Add bounds checking for all array access
|
177
|
+
if row_idx >= column.len() {
|
178
|
+
return Err(Error::new(
|
179
|
+
magnus::exception::runtime_error(),
|
180
|
+
format!("Row index {} out of bounds for column '{}' with length {}",
|
181
|
+
row_idx, field.name(), column.len())
|
182
|
+
));
|
183
|
+
}
|
184
|
+
|
176
185
|
match field.data_type() {
|
177
186
|
DataType::Utf8 => {
|
178
187
|
let array = column.as_any().downcast_ref::<StringArray>()
|
@@ -225,9 +234,19 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
|
|
225
234
|
let float_array = values.as_any().downcast_ref::<Float32Array>()
|
226
235
|
.ok_or_else(|| Error::new(magnus::exception::runtime_error(), "Failed to cast vector values to Float32Array"))?;
|
227
236
|
|
237
|
+
// CRITICAL: Verify the float_array has the expected size
|
238
|
+
let expected_size = *list_size as usize;
|
239
|
+
if float_array.len() != expected_size {
|
240
|
+
return Err(Error::new(
|
241
|
+
magnus::exception::runtime_error(),
|
242
|
+
format!("Vector data corruption: expected {} elements but found {} for field '{}'",
|
243
|
+
expected_size, float_array.len(), field.name())
|
244
|
+
));
|
245
|
+
}
|
246
|
+
|
228
247
|
let ruby_array = ruby.ary_new();
|
229
|
-
for i in 0
|
230
|
-
ruby_array.push(float_array.value(i
|
248
|
+
for i in 0..expected_size {
|
249
|
+
ruby_array.push(float_array.value(i))?;
|
231
250
|
}
|
232
251
|
doc.aset(key, ruby_array)?;
|
233
252
|
}
|
@@ -238,7 +257,7 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
|
|
238
257
|
}
|
239
258
|
}
|
240
259
|
|
241
|
-
documents.push(doc)
|
260
|
+
documents.push(doc)?;
|
242
261
|
}
|
243
262
|
|
244
263
|
Ok(documents)
|
data/ext/lancelot/src/dataset.rs
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
use magnus::{Error, Ruby, RHash, RArray, Symbol, TryConvert, function, method, RClass, Module, Object};
|
1
|
+
use magnus::{Error, Ruby, RHash, RArray, Symbol, TryConvert, Value, function, method, RClass, Module, Object};
|
2
2
|
use std::cell::RefCell;
|
3
3
|
use std::sync::Arc;
|
4
4
|
use tokio::runtime::Runtime;
|
@@ -11,6 +11,26 @@ use futures::stream::TryStreamExt;
|
|
11
11
|
|
12
12
|
use crate::schema::build_arrow_schema;
|
13
13
|
use crate::conversion::{build_record_batch, convert_batch_to_ruby};
|
14
|
+
use arrow_schema::DataType;
|
15
|
+
|
16
|
+
/// Convert Arrow DataType to Ruby-friendly string representation
|
17
|
+
fn datatype_to_ruby_string(dtype: &DataType) -> &'static str {
|
18
|
+
match dtype {
|
19
|
+
DataType::Utf8 | DataType::LargeUtf8 => "string",
|
20
|
+
DataType::Boolean => "boolean",
|
21
|
+
DataType::Int8 | DataType::Int16 | DataType::Int32 => "int32",
|
22
|
+
DataType::Int64 => "int64",
|
23
|
+
DataType::UInt8 | DataType::UInt16 | DataType::UInt32 => "int32",
|
24
|
+
DataType::UInt64 => "int64",
|
25
|
+
DataType::Float32 => "float32",
|
26
|
+
DataType::Float64 => "float64",
|
27
|
+
DataType::Date32 => "date",
|
28
|
+
DataType::Date64 => "datetime",
|
29
|
+
DataType::Timestamp(_, _) => "datetime",
|
30
|
+
DataType::FixedSizeList(_, _) => "vector", // Will be handled specially
|
31
|
+
_ => "unknown"
|
32
|
+
}
|
33
|
+
}
|
14
34
|
|
15
35
|
#[magnus::wrap(class = "Lancelot::Dataset", free_immediately, size)]
|
16
36
|
pub struct LancelotDataset {
|
@@ -121,16 +141,40 @@ impl LancelotDataset {
|
|
121
141
|
|
122
142
|
pub fn schema(&self) -> Result<RHash, Error> {
|
123
143
|
let dataset = self.dataset.borrow();
|
124
|
-
let
|
144
|
+
let dataset = dataset.as_ref()
|
125
145
|
.ok_or_else(|| Error::new(magnus::exception::runtime_error(), "Dataset not opened"))?;
|
126
146
|
|
147
|
+
// Get the actual schema from the Lance dataset
|
148
|
+
let schema = self.runtime.borrow_mut().block_on(async {
|
149
|
+
dataset.schema()
|
150
|
+
});
|
151
|
+
|
152
|
+
// Convert Lance schema to Arrow schema
|
153
|
+
let arrow_schema: arrow_schema::Schema = schema.into();
|
154
|
+
let arrow_schema = Arc::new(arrow_schema);
|
155
|
+
|
127
156
|
let ruby = Ruby::get().unwrap();
|
128
157
|
let hash = ruby.hash_new();
|
129
158
|
|
130
|
-
//
|
131
|
-
|
132
|
-
|
133
|
-
|
159
|
+
// Iterate over Arrow schema fields
|
160
|
+
for field in arrow_schema.fields() {
|
161
|
+
let field_name = Symbol::new(&field.name());
|
162
|
+
|
163
|
+
// Handle vector columns specially
|
164
|
+
if let DataType::FixedSizeList(inner_field, dimension) = field.data_type() {
|
165
|
+
// Check if it's a vector (float list)
|
166
|
+
if matches!(inner_field.data_type(), DataType::Float32 | DataType::Float16) {
|
167
|
+
let vector_info = ruby.hash_new();
|
168
|
+
vector_info.aset(Symbol::new("type"), "vector")?;
|
169
|
+
vector_info.aset(Symbol::new("dimension"), *dimension)?;
|
170
|
+
hash.aset(field_name, vector_info)?;
|
171
|
+
continue;
|
172
|
+
}
|
173
|
+
}
|
174
|
+
|
175
|
+
let field_type = datatype_to_ruby_string(field.data_type());
|
176
|
+
hash.aset(field_name, field_type)?;
|
177
|
+
}
|
134
178
|
|
135
179
|
Ok(hash)
|
136
180
|
}
|
@@ -157,9 +201,10 @@ impl LancelotDataset {
|
|
157
201
|
let result_array = ruby.ary_new();
|
158
202
|
|
159
203
|
for batch in batches {
|
160
|
-
let
|
161
|
-
|
162
|
-
|
204
|
+
let batch_docs = convert_batch_to_ruby(&batch)?;
|
205
|
+
// Merge arrays by pushing each element
|
206
|
+
for i in 0..batch_docs.len() {
|
207
|
+
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
163
208
|
}
|
164
209
|
}
|
165
210
|
|
@@ -191,9 +236,10 @@ impl LancelotDataset {
|
|
191
236
|
let result_array = ruby.ary_new();
|
192
237
|
|
193
238
|
for batch in batches {
|
194
|
-
let
|
195
|
-
|
196
|
-
|
239
|
+
let batch_docs = convert_batch_to_ruby(&batch)?;
|
240
|
+
// Merge arrays by pushing each element
|
241
|
+
for i in 0..batch_docs.len() {
|
242
|
+
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
197
243
|
}
|
198
244
|
}
|
199
245
|
|
@@ -265,9 +311,10 @@ impl LancelotDataset {
|
|
265
311
|
let result_array = ruby.ary_new();
|
266
312
|
|
267
313
|
for batch in batches {
|
268
|
-
let
|
269
|
-
|
270
|
-
|
314
|
+
let batch_docs = convert_batch_to_ruby(&batch)?;
|
315
|
+
// Merge arrays by pushing each element
|
316
|
+
for i in 0..batch_docs.len() {
|
317
|
+
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
271
318
|
}
|
272
319
|
}
|
273
320
|
|
@@ -330,9 +377,10 @@ impl LancelotDataset {
|
|
330
377
|
let result_array = ruby.ary_new();
|
331
378
|
|
332
379
|
for batch in batches {
|
333
|
-
let
|
334
|
-
|
335
|
-
|
380
|
+
let batch_docs = convert_batch_to_ruby(&batch)?;
|
381
|
+
// Merge arrays by pushing each element
|
382
|
+
for i in 0..batch_docs.len() {
|
383
|
+
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
336
384
|
}
|
337
385
|
}
|
338
386
|
|
@@ -380,9 +428,10 @@ impl LancelotDataset {
|
|
380
428
|
let result_array = ruby.ary_new();
|
381
429
|
|
382
430
|
for batch in batches {
|
383
|
-
let
|
384
|
-
|
385
|
-
|
431
|
+
let batch_docs = convert_batch_to_ruby(&batch)?;
|
432
|
+
// Merge arrays by pushing each element
|
433
|
+
for i in 0..batch_docs.len() {
|
434
|
+
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
386
435
|
}
|
387
436
|
}
|
388
437
|
|
@@ -422,9 +471,10 @@ impl LancelotDataset {
|
|
422
471
|
let result_array = ruby.ary_new();
|
423
472
|
|
424
473
|
for batch in batches {
|
425
|
-
let
|
426
|
-
|
427
|
-
|
474
|
+
let batch_docs = convert_batch_to_ruby(&batch)?;
|
475
|
+
// Merge arrays by pushing each element
|
476
|
+
for i in 0..batch_docs.len() {
|
477
|
+
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
428
478
|
}
|
429
479
|
}
|
430
480
|
|
data/lib/lancelot/dataset.rb
CHANGED
@@ -62,8 +62,16 @@ module Lancelot
|
|
62
62
|
count_rows
|
63
63
|
end
|
64
64
|
|
65
|
-
alias_method :count, :size
|
66
65
|
alias_method :length, :size
|
66
|
+
|
67
|
+
# Counts rows, overriding Enumerable#count.
#
# With no argument and no block this returns +count_rows+, avoiding a full
# enumeration. With an item argument or a block it defers to
# Enumerable#count, so +count(item)+ and +count { |row| ... }+ keep their
# standard meaning.
def count(*args, &block)
  return count_rows if args.empty? && !block_given?

  # Bare `super` forwards the original arguments and block unchanged.
  super
end
|
67
75
|
|
68
76
|
def all
|
69
77
|
scan_all
|
data/lib/lancelot/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lancelot
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Petersen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-09-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -111,6 +111,8 @@ files:
|
|
111
111
|
- LICENSE.txt
|
112
112
|
- README.md
|
113
113
|
- Rakefile
|
114
|
+
- docs/assets/lancelot-wide.png
|
115
|
+
- docs/assets/lancelot.png
|
114
116
|
- examples/basic_usage.rb
|
115
117
|
- examples/full_text_search.rb
|
116
118
|
- examples/idempotent_create.rb
|
@@ -129,13 +131,13 @@ files:
|
|
129
131
|
- lib/lancelot/rank_fusion.rb
|
130
132
|
- lib/lancelot/version.rb
|
131
133
|
- sig/lancelot.rbs
|
132
|
-
homepage: https://github.com/
|
134
|
+
homepage: https://github.com/scientist-labs/lancelot
|
133
135
|
licenses:
|
134
136
|
- MIT
|
135
137
|
metadata:
|
136
|
-
homepage_uri: https://github.com/
|
137
|
-
source_code_uri: https://github.com/
|
138
|
-
changelog_uri: https://github.com/
|
138
|
+
homepage_uri: https://github.com/scientist-labs/lancelot
|
139
|
+
source_code_uri: https://github.com/scientist-labs/lancelot
|
140
|
+
changelog_uri: https://github.com/scientist-labs/lancelot/blob/main/CHANGELOG.md
|
139
141
|
post_install_message:
|
140
142
|
rdoc_options: []
|
141
143
|
require_paths:
|