lancelot 0.3.4 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +39 -6
- data/ext/lancelot/Cargo.lock +5625 -0
- data/ext/lancelot/Cargo.toml +8 -8
- data/ext/lancelot/src/conversion.rs +49 -44
- data/ext/lancelot/src/dataset.rs +120 -107
- data/ext/lancelot/src/lib.rs +5 -5
- data/ext/lancelot/src/schema.rs +10 -12
- data/lib/lancelot/version.rb +1 -1
- data/lib/lancelot.rb +14 -1
- metadata +4 -6
data/ext/lancelot/Cargo.toml
CHANGED
|
@@ -7,15 +7,15 @@ edition = "2021"
|
|
|
7
7
|
crate-type = ["cdylib"]
|
|
8
8
|
|
|
9
9
|
[dependencies]
|
|
10
|
-
magnus = { version = "0.
|
|
11
|
-
lance = { version = "0
|
|
12
|
-
lance-index = "0
|
|
13
|
-
lance-linalg = "0
|
|
10
|
+
magnus = { version = "0.8", features = ["embed"] }
|
|
11
|
+
lance = { version = "3.0", default-features = false }
|
|
12
|
+
lance-index = "3.0"
|
|
13
|
+
lance-linalg = "3.0"
|
|
14
14
|
tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
|
|
15
|
-
arrow = "
|
|
16
|
-
arrow-array = "
|
|
17
|
-
arrow-schema = "
|
|
18
|
-
arrow-data = "
|
|
15
|
+
arrow = "57"
|
|
16
|
+
arrow-array = "57"
|
|
17
|
+
arrow-schema = "57"
|
|
18
|
+
arrow-data = "57"
|
|
19
19
|
futures = "0.3"
|
|
20
20
|
thiserror = "2"
|
|
21
21
|
serde = { version = "1", features = ["derive"] }
|
|
data/ext/lancelot/src/conversion.rs
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
|
-
use magnus::{Error, Ruby, RHash, RArray,
|
|
1
|
+
use magnus::{Error, Ruby, RHash, RArray, Value, TryConvert, value::ReprValue};
|
|
2
2
|
use arrow_schema::{DataType, Schema as ArrowSchema};
|
|
3
3
|
use arrow_array::{RecordBatch, StringArray, Float32Array, ArrayRef, Array, FixedSizeListArray};
|
|
4
4
|
use std::collections::HashMap;
|
|
5
5
|
use std::sync::Arc;
|
|
6
6
|
|
|
7
7
|
pub fn build_record_batch(
|
|
8
|
+
ruby: &Ruby,
|
|
8
9
|
data: RArray,
|
|
9
10
|
schema: &ArrowSchema,
|
|
10
11
|
) -> Result<RecordBatch, Error> {
|
|
@@ -13,7 +14,7 @@ pub fn build_record_batch(
|
|
|
13
14
|
let mut int_columns: HashMap<String, Vec<Option<i64>>> = HashMap::new();
|
|
14
15
|
let mut bool_columns: HashMap<String, Vec<Option<bool>>> = HashMap::new();
|
|
15
16
|
let mut vector_columns: HashMap<String, Vec<Option<Vec<f32>>>> = HashMap::new();
|
|
16
|
-
|
|
17
|
+
|
|
17
18
|
for field in schema.fields() {
|
|
18
19
|
match field.data_type() {
|
|
19
20
|
DataType::Utf8 => {
|
|
@@ -35,18 +36,20 @@ pub fn build_record_batch(
|
|
|
35
36
|
}
|
|
36
37
|
}
|
|
37
38
|
|
|
38
|
-
|
|
39
|
-
|
|
39
|
+
// Index-based iteration over data RArray
|
|
40
|
+
for idx in 0..data.len() {
|
|
41
|
+
let item_value: Value = data.entry(idx as isize)?;
|
|
42
|
+
let item = RHash::try_convert(item_value)?;
|
|
40
43
|
for field in schema.fields() {
|
|
41
|
-
let key =
|
|
44
|
+
let key = ruby.to_symbol(field.name());
|
|
42
45
|
// Make fields optional - use get instead of fetch
|
|
43
46
|
let value: Value = item.get(key)
|
|
44
47
|
.or_else(|| {
|
|
45
|
-
// Try with string key
|
|
48
|
+
// Try with string key
|
|
46
49
|
item.get(field.name().as_str())
|
|
47
50
|
})
|
|
48
|
-
.unwrap_or_else(||
|
|
49
|
-
|
|
51
|
+
.unwrap_or_else(|| ruby.qnil().as_value());
|
|
52
|
+
|
|
50
53
|
match field.data_type() {
|
|
51
54
|
DataType::Utf8 => {
|
|
52
55
|
if value.is_nil() {
|
|
@@ -85,9 +88,12 @@ pub fn build_record_batch(
|
|
|
85
88
|
vector_columns.get_mut(field.name()).unwrap().push(None);
|
|
86
89
|
} else {
|
|
87
90
|
let arr = RArray::try_convert(value)?;
|
|
88
|
-
let
|
|
89
|
-
|
|
90
|
-
|
|
91
|
+
let len = arr.len();
|
|
92
|
+
let mut vec: Vec<f32> = Vec::with_capacity(len);
|
|
93
|
+
for j in 0..len {
|
|
94
|
+
let v: f64 = arr.entry(j as isize)?;
|
|
95
|
+
vec.push(v as f32);
|
|
96
|
+
}
|
|
91
97
|
vector_columns.get_mut(field.name()).unwrap().push(Some(vec));
|
|
92
98
|
}
|
|
93
99
|
}
|
|
@@ -97,7 +103,7 @@ pub fn build_record_batch(
|
|
|
97
103
|
}
|
|
98
104
|
|
|
99
105
|
let mut arrays: Vec<ArrayRef> = Vec::new();
|
|
100
|
-
|
|
106
|
+
|
|
101
107
|
for field in schema.fields() {
|
|
102
108
|
let array: ArrayRef = match field.data_type() {
|
|
103
109
|
DataType::Utf8 => {
|
|
@@ -125,7 +131,7 @@ pub fn build_record_batch(
|
|
|
125
131
|
Some(vec) => {
|
|
126
132
|
if vec.len() != *list_size as usize {
|
|
127
133
|
return Err(Error::new(
|
|
128
|
-
|
|
134
|
+
ruby.exception_arg_error(),
|
|
129
135
|
format!("Vector dimension mismatch. Expected {}, got {}", list_size, vec.len())
|
|
130
136
|
));
|
|
131
137
|
}
|
|
@@ -137,7 +143,7 @@ pub fn build_record_batch(
|
|
|
137
143
|
}
|
|
138
144
|
}
|
|
139
145
|
}
|
|
140
|
-
|
|
146
|
+
|
|
141
147
|
let flat_array = Float32Array::from(flat_values);
|
|
142
148
|
Arc::new(FixedSizeListArray::new(
|
|
143
149
|
inner_field.clone(),
|
|
@@ -147,46 +153,45 @@ pub fn build_record_batch(
|
|
|
147
153
|
))
|
|
148
154
|
}
|
|
149
155
|
_ => return Err(Error::new(
|
|
150
|
-
|
|
156
|
+
ruby.exception_runtime_error(),
|
|
151
157
|
format!("Unsupported data type: {:?}", field.data_type())
|
|
152
158
|
))
|
|
153
159
|
};
|
|
154
|
-
|
|
160
|
+
|
|
155
161
|
arrays.push(array);
|
|
156
162
|
}
|
|
157
163
|
|
|
158
164
|
RecordBatch::try_new(Arc::new(schema.clone()), arrays)
|
|
159
|
-
.map_err(|e| Error::new(
|
|
165
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))
|
|
160
166
|
}
|
|
161
167
|
|
|
162
|
-
pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<RArray, Error> {
|
|
163
|
-
let ruby = Ruby::get().unwrap();
|
|
168
|
+
pub fn convert_batch_to_ruby(ruby: &Ruby, batch: &RecordBatch) -> Result<RArray, Error> {
|
|
164
169
|
let documents = ruby.ary_new();
|
|
165
|
-
|
|
170
|
+
|
|
166
171
|
let num_rows = batch.num_rows();
|
|
167
172
|
let schema = batch.schema();
|
|
168
|
-
|
|
173
|
+
|
|
169
174
|
for row_idx in 0..num_rows {
|
|
170
175
|
let doc = ruby.hash_new();
|
|
171
|
-
|
|
176
|
+
|
|
172
177
|
for (col_idx, field) in schema.fields().iter().enumerate() {
|
|
173
178
|
let column = batch.column(col_idx);
|
|
174
|
-
let key =
|
|
175
|
-
|
|
179
|
+
let key = ruby.to_symbol(field.name());
|
|
180
|
+
|
|
176
181
|
// CRITICAL: Add bounds checking for all array access
|
|
177
182
|
if row_idx >= column.len() {
|
|
178
183
|
return Err(Error::new(
|
|
179
|
-
|
|
180
|
-
format!("Row index {} out of bounds for column '{}' with length {}",
|
|
184
|
+
ruby.exception_runtime_error(),
|
|
185
|
+
format!("Row index {} out of bounds for column '{}' with length {}",
|
|
181
186
|
row_idx, field.name(), column.len())
|
|
182
187
|
));
|
|
183
188
|
}
|
|
184
|
-
|
|
189
|
+
|
|
185
190
|
match field.data_type() {
|
|
186
191
|
DataType::Utf8 => {
|
|
187
192
|
let array = column.as_any().downcast_ref::<StringArray>()
|
|
188
|
-
.ok_or_else(|| Error::new(
|
|
189
|
-
|
|
193
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Failed to cast to StringArray"))?;
|
|
194
|
+
|
|
190
195
|
if array.is_null(row_idx) {
|
|
191
196
|
doc.aset(key, ruby.qnil())?;
|
|
192
197
|
} else {
|
|
@@ -195,8 +200,8 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<RArray, Error> {
|
|
|
195
200
|
}
|
|
196
201
|
DataType::Float32 => {
|
|
197
202
|
let array = column.as_any().downcast_ref::<Float32Array>()
|
|
198
|
-
.ok_or_else(|| Error::new(
|
|
199
|
-
|
|
203
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Failed to cast to Float32Array"))?;
|
|
204
|
+
|
|
200
205
|
if array.is_null(row_idx) {
|
|
201
206
|
doc.aset(key, ruby.qnil())?;
|
|
202
207
|
} else {
|
|
@@ -205,8 +210,8 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<RArray, Error> {
|
|
|
205
210
|
}
|
|
206
211
|
DataType::Int64 => {
|
|
207
212
|
let array = column.as_any().downcast_ref::<arrow_array::Int64Array>()
|
|
208
|
-
.ok_or_else(|| Error::new(
|
|
209
|
-
|
|
213
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Failed to cast to Int64Array"))?;
|
|
214
|
+
|
|
210
215
|
if array.is_null(row_idx) {
|
|
211
216
|
doc.aset(key, ruby.qnil())?;
|
|
212
217
|
} else {
|
|
@@ -215,8 +220,8 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<RArray, Error> {
|
|
|
215
220
|
}
|
|
216
221
|
DataType::Boolean => {
|
|
217
222
|
let array = column.as_any().downcast_ref::<arrow_array::BooleanArray>()
|
|
218
|
-
.ok_or_else(|| Error::new(
|
|
219
|
-
|
|
223
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Failed to cast to BooleanArray"))?;
|
|
224
|
+
|
|
220
225
|
if array.is_null(row_idx) {
|
|
221
226
|
doc.aset(key, ruby.qnil())?;
|
|
222
227
|
} else {
|
|
@@ -225,25 +230,25 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<RArray, Error> {
|
|
|
225
230
|
}
|
|
226
231
|
DataType::FixedSizeList(_, list_size) => {
|
|
227
232
|
let array = column.as_any().downcast_ref::<FixedSizeListArray>()
|
|
228
|
-
.ok_or_else(|| Error::new(
|
|
229
|
-
|
|
233
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Failed to cast to FixedSizeListArray"))?;
|
|
234
|
+
|
|
230
235
|
if array.is_null(row_idx) {
|
|
231
236
|
doc.aset(key, ruby.qnil())?;
|
|
232
237
|
} else {
|
|
233
238
|
let values = array.value(row_idx);
|
|
234
239
|
let float_array = values.as_any().downcast_ref::<Float32Array>()
|
|
235
|
-
.ok_or_else(|| Error::new(
|
|
236
|
-
|
|
240
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Failed to cast vector values to Float32Array"))?;
|
|
241
|
+
|
|
237
242
|
// CRITICAL: Verify the float_array has the expected size
|
|
238
243
|
let expected_size = *list_size as usize;
|
|
239
244
|
if float_array.len() != expected_size {
|
|
240
245
|
return Err(Error::new(
|
|
241
|
-
|
|
246
|
+
ruby.exception_runtime_error(),
|
|
242
247
|
format!("Vector data corruption: expected {} elements but found {} for field '{}'",
|
|
243
248
|
expected_size, float_array.len(), field.name())
|
|
244
249
|
));
|
|
245
250
|
}
|
|
246
|
-
|
|
251
|
+
|
|
247
252
|
let ruby_array = ruby.ary_new();
|
|
248
253
|
for i in 0..expected_size {
|
|
249
254
|
ruby_array.push(float_array.value(i))?;
|
|
@@ -256,9 +261,9 @@ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<RArray, Error> {
|
|
|
256
261
|
}
|
|
257
262
|
}
|
|
258
263
|
}
|
|
259
|
-
|
|
264
|
+
|
|
260
265
|
documents.push(doc)?;
|
|
261
266
|
}
|
|
262
|
-
|
|
267
|
+
|
|
263
268
|
Ok(documents)
|
|
264
|
-
}
|
|
269
|
+
}
|