lancelot 0.3.4 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +39 -6
- data/ext/lancelot/Cargo.lock +5625 -0
- data/ext/lancelot/Cargo.toml +8 -8
- data/ext/lancelot/src/conversion.rs +49 -44
- data/ext/lancelot/src/dataset.rs +120 -107
- data/ext/lancelot/src/lib.rs +5 -5
- data/ext/lancelot/src/schema.rs +10 -12
- data/lib/lancelot/version.rb +1 -1
- data/lib/lancelot.rb +14 -1
- metadata +4 -6
data/ext/lancelot/src/dataset.rs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
use magnus::{Error, Ruby, RHash, RArray,
|
|
1
|
+
use magnus::{Error, Ruby, RHash, RArray, Value, function, method, RClass, Module, Object};
|
|
2
2
|
use std::cell::RefCell;
|
|
3
3
|
use std::sync::Arc;
|
|
4
4
|
use tokio::runtime::Runtime;
|
|
@@ -41,9 +41,10 @@ pub struct LancelotDataset {
|
|
|
41
41
|
|
|
42
42
|
impl LancelotDataset {
|
|
43
43
|
pub fn new(path: String) -> Result<Self, Error> {
|
|
44
|
+
let ruby = Ruby::get().unwrap();
|
|
44
45
|
let runtime = Runtime::new()
|
|
45
|
-
.map_err(|e| Error::new(
|
|
46
|
-
|
|
46
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
47
|
+
|
|
47
48
|
Ok(Self {
|
|
48
49
|
dataset: RefCell::new(None),
|
|
49
50
|
runtime: RefCell::new(runtime),
|
|
@@ -56,15 +57,16 @@ impl LancelotDataset {
|
|
|
56
57
|
}
|
|
57
58
|
|
|
58
59
|
pub fn create(&self, schema_hash: RHash) -> Result<(), Error> {
|
|
59
|
-
let
|
|
60
|
-
|
|
60
|
+
let ruby = Ruby::get().unwrap();
|
|
61
|
+
let schema = build_arrow_schema(&ruby, schema_hash)?;
|
|
62
|
+
|
|
61
63
|
let empty_batch = RecordBatch::new_empty(Arc::new(schema.clone()));
|
|
62
64
|
let batches = vec![empty_batch];
|
|
63
65
|
let reader = RecordBatchIterator::new(
|
|
64
66
|
batches.into_iter().map(Ok),
|
|
65
67
|
Arc::new(schema)
|
|
66
68
|
);
|
|
67
|
-
|
|
69
|
+
|
|
68
70
|
let dataset = self.runtime.borrow_mut().block_on(async {
|
|
69
71
|
Dataset::write(
|
|
70
72
|
reader,
|
|
@@ -72,7 +74,7 @@ impl LancelotDataset {
|
|
|
72
74
|
None,
|
|
73
75
|
)
|
|
74
76
|
.await
|
|
75
|
-
.map_err(|e| Error::new(
|
|
77
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))
|
|
76
78
|
})?;
|
|
77
79
|
|
|
78
80
|
self.dataset.replace(Some(dataset));
|
|
@@ -80,10 +82,11 @@ impl LancelotDataset {
|
|
|
80
82
|
}
|
|
81
83
|
|
|
82
84
|
pub fn open(&self) -> Result<(), Error> {
|
|
85
|
+
let ruby = Ruby::get().unwrap();
|
|
83
86
|
let dataset = self.runtime.borrow_mut().block_on(async {
|
|
84
87
|
Dataset::open(&self.path)
|
|
85
88
|
.await
|
|
86
|
-
.map_err(|e| Error::new(
|
|
89
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))
|
|
87
90
|
})?;
|
|
88
91
|
|
|
89
92
|
self.dataset.replace(Some(dataset));
|
|
@@ -91,9 +94,10 @@ impl LancelotDataset {
|
|
|
91
94
|
}
|
|
92
95
|
|
|
93
96
|
pub fn add_data(&self, data: RArray) -> Result<(), Error> {
|
|
97
|
+
let ruby = Ruby::get().unwrap();
|
|
94
98
|
let mut dataset = self.dataset.borrow_mut();
|
|
95
99
|
let dataset = dataset.as_mut()
|
|
96
|
-
.ok_or_else(|| Error::new(
|
|
100
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Dataset not opened"))?;
|
|
97
101
|
|
|
98
102
|
// Check if data is empty
|
|
99
103
|
if data.len() == 0 {
|
|
@@ -104,74 +108,75 @@ impl LancelotDataset {
|
|
|
104
108
|
let schema = self.runtime.borrow_mut().block_on(async {
|
|
105
109
|
dataset.schema()
|
|
106
110
|
});
|
|
107
|
-
|
|
111
|
+
|
|
108
112
|
// Convert Lance schema to Arrow schema
|
|
109
113
|
let arrow_schema = schema.into();
|
|
110
114
|
|
|
111
|
-
let batch = build_record_batch(data, &arrow_schema)?;
|
|
115
|
+
let batch = build_record_batch(&ruby, data, &arrow_schema)?;
|
|
112
116
|
|
|
113
117
|
let batches = vec![batch];
|
|
114
118
|
let reader = RecordBatchIterator::new(
|
|
115
119
|
batches.into_iter().map(Ok),
|
|
116
120
|
Arc::new(arrow_schema)
|
|
117
121
|
);
|
|
118
|
-
|
|
122
|
+
|
|
119
123
|
self.runtime.borrow_mut().block_on(async move {
|
|
120
124
|
dataset.append(reader, None)
|
|
121
125
|
.await
|
|
122
|
-
.map_err(|e| Error::new(
|
|
126
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))
|
|
123
127
|
})?;
|
|
124
128
|
|
|
125
129
|
Ok(())
|
|
126
130
|
}
|
|
127
131
|
|
|
128
132
|
pub fn count_rows(&self) -> Result<i64, Error> {
|
|
133
|
+
let ruby = Ruby::get().unwrap();
|
|
129
134
|
let dataset = self.dataset.borrow();
|
|
130
135
|
let dataset = dataset.as_ref()
|
|
131
|
-
.ok_or_else(|| Error::new(
|
|
136
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Dataset not opened"))?;
|
|
132
137
|
|
|
133
138
|
let count = self.runtime.borrow_mut().block_on(async {
|
|
134
139
|
dataset.count_rows(None)
|
|
135
140
|
.await
|
|
136
|
-
.map_err(|e| Error::new(
|
|
141
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))
|
|
137
142
|
})?;
|
|
138
143
|
|
|
139
144
|
Ok(count as i64)
|
|
140
145
|
}
|
|
141
146
|
|
|
142
147
|
pub fn schema(&self) -> Result<RHash, Error> {
|
|
148
|
+
let ruby = Ruby::get().unwrap();
|
|
143
149
|
let dataset = self.dataset.borrow();
|
|
144
150
|
let dataset = dataset.as_ref()
|
|
145
|
-
.ok_or_else(|| Error::new(
|
|
151
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Dataset not opened"))?;
|
|
146
152
|
|
|
147
153
|
// Get the actual schema from the Lance dataset
|
|
148
154
|
let schema = self.runtime.borrow_mut().block_on(async {
|
|
149
155
|
dataset.schema()
|
|
150
156
|
});
|
|
151
|
-
|
|
157
|
+
|
|
152
158
|
// Convert Lance schema to Arrow schema
|
|
153
159
|
let arrow_schema: arrow_schema::Schema = schema.into();
|
|
154
160
|
let arrow_schema = Arc::new(arrow_schema);
|
|
155
161
|
|
|
156
|
-
let ruby = Ruby::get().unwrap();
|
|
157
162
|
let hash = ruby.hash_new();
|
|
158
|
-
|
|
163
|
+
|
|
159
164
|
// Iterate over Arrow schema fields
|
|
160
165
|
for field in arrow_schema.fields() {
|
|
161
|
-
let field_name =
|
|
162
|
-
|
|
166
|
+
let field_name = ruby.to_symbol(field.name());
|
|
167
|
+
|
|
163
168
|
// Handle vector columns specially
|
|
164
169
|
if let DataType::FixedSizeList(inner_field, dimension) = field.data_type() {
|
|
165
170
|
// Check if it's a vector (float list)
|
|
166
171
|
if matches!(inner_field.data_type(), DataType::Float32 | DataType::Float16) {
|
|
167
172
|
let vector_info = ruby.hash_new();
|
|
168
|
-
vector_info.aset(
|
|
169
|
-
vector_info.aset(
|
|
173
|
+
vector_info.aset(ruby.to_symbol("type"), "vector")?;
|
|
174
|
+
vector_info.aset(ruby.to_symbol("dimension"), *dimension)?;
|
|
170
175
|
hash.aset(field_name, vector_info)?;
|
|
171
176
|
continue;
|
|
172
177
|
}
|
|
173
178
|
}
|
|
174
|
-
|
|
179
|
+
|
|
175
180
|
let field_type = datatype_to_ruby_string(field.data_type());
|
|
176
181
|
hash.aset(field_name, field_type)?;
|
|
177
182
|
}
|
|
@@ -180,28 +185,28 @@ impl LancelotDataset {
|
|
|
180
185
|
}
|
|
181
186
|
|
|
182
187
|
pub fn scan_all(&self) -> Result<RArray, Error> {
|
|
188
|
+
let ruby = Ruby::get().unwrap();
|
|
183
189
|
let dataset = self.dataset.borrow();
|
|
184
190
|
let dataset = dataset.as_ref()
|
|
185
|
-
.ok_or_else(|| Error::new(
|
|
191
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Dataset not opened"))?;
|
|
186
192
|
|
|
187
193
|
let batches: Vec<RecordBatch> = self.runtime.borrow_mut().block_on(async {
|
|
188
194
|
let scanner = dataset.scan();
|
|
189
195
|
let stream = scanner
|
|
190
196
|
.try_into_stream()
|
|
191
197
|
.await
|
|
192
|
-
.map_err(|e| Error::new(
|
|
193
|
-
|
|
198
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
199
|
+
|
|
194
200
|
stream
|
|
195
201
|
.try_collect::<Vec<_>>()
|
|
196
202
|
.await
|
|
197
|
-
.map_err(|e| Error::new(
|
|
203
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))
|
|
198
204
|
})?;
|
|
199
205
|
|
|
200
|
-
let ruby = Ruby::get().unwrap();
|
|
201
206
|
let result_array = ruby.ary_new();
|
|
202
207
|
|
|
203
208
|
for batch in batches {
|
|
204
|
-
let batch_docs = convert_batch_to_ruby(&batch)?;
|
|
209
|
+
let batch_docs = convert_batch_to_ruby(&ruby, &batch)?;
|
|
205
210
|
// Merge arrays by pushing each element
|
|
206
211
|
for i in 0..batch_docs.len() {
|
|
207
212
|
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
|
@@ -212,31 +217,31 @@ impl LancelotDataset {
|
|
|
212
217
|
}
|
|
213
218
|
|
|
214
219
|
pub fn scan_limit(&self, limit: i64) -> Result<RArray, Error> {
|
|
220
|
+
let ruby = Ruby::get().unwrap();
|
|
215
221
|
let dataset = self.dataset.borrow();
|
|
216
222
|
let dataset = dataset.as_ref()
|
|
217
|
-
.ok_or_else(|| Error::new(
|
|
223
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Dataset not opened"))?;
|
|
218
224
|
|
|
219
225
|
let batches: Vec<RecordBatch> = self.runtime.borrow_mut().block_on(async {
|
|
220
226
|
let mut scanner = dataset.scan();
|
|
221
227
|
scanner.limit(Some(limit), None)
|
|
222
|
-
.map_err(|e| Error::new(
|
|
223
|
-
|
|
228
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
229
|
+
|
|
224
230
|
let stream = scanner
|
|
225
231
|
.try_into_stream()
|
|
226
232
|
.await
|
|
227
|
-
.map_err(|e| Error::new(
|
|
228
|
-
|
|
233
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
234
|
+
|
|
229
235
|
stream
|
|
230
236
|
.try_collect::<Vec<_>>()
|
|
231
237
|
.await
|
|
232
|
-
.map_err(|e| Error::new(
|
|
238
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))
|
|
233
239
|
})?;
|
|
234
240
|
|
|
235
|
-
let ruby = Ruby::get().unwrap();
|
|
236
241
|
let result_array = ruby.ary_new();
|
|
237
242
|
|
|
238
243
|
for batch in batches {
|
|
239
|
-
let batch_docs = convert_batch_to_ruby(&batch)?;
|
|
244
|
+
let batch_docs = convert_batch_to_ruby(&ruby, &batch)?;
|
|
240
245
|
// Merge arrays by pushing each element
|
|
241
246
|
for i in 0..batch_docs.len() {
|
|
242
247
|
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
|
@@ -247,25 +252,26 @@ impl LancelotDataset {
|
|
|
247
252
|
}
|
|
248
253
|
|
|
249
254
|
pub fn create_vector_index(&self, column: String) -> Result<(), Error> {
|
|
255
|
+
let ruby = Ruby::get().unwrap();
|
|
250
256
|
let mut dataset = self.dataset.borrow_mut();
|
|
251
257
|
let dataset = dataset.as_mut()
|
|
252
|
-
.ok_or_else(|| Error::new(
|
|
258
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Dataset not opened"))?;
|
|
253
259
|
|
|
254
260
|
self.runtime.borrow_mut().block_on(async move {
|
|
255
261
|
// Get row count to determine optimal number of partitions
|
|
256
262
|
let num_rows = dataset.count_rows(None).await
|
|
257
|
-
.map_err(|e| Error::new(
|
|
258
|
-
|
|
263
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
264
|
+
|
|
259
265
|
// Use fewer partitions for small datasets
|
|
260
266
|
let num_partitions = if num_rows < 256 {
|
|
261
267
|
std::cmp::max(1, (num_rows / 4) as usize)
|
|
262
268
|
} else {
|
|
263
269
|
256
|
|
264
270
|
};
|
|
265
|
-
|
|
271
|
+
|
|
266
272
|
// Create IVF_FLAT vector index parameters
|
|
267
273
|
let params = VectorIndexParams::ivf_flat(num_partitions, lance_linalg::distance::MetricType::L2);
|
|
268
|
-
|
|
274
|
+
|
|
269
275
|
dataset.create_index(
|
|
270
276
|
&[&column],
|
|
271
277
|
IndexType::Vector,
|
|
@@ -274,44 +280,47 @@ impl LancelotDataset {
|
|
|
274
280
|
true
|
|
275
281
|
)
|
|
276
282
|
.await
|
|
277
|
-
.
|
|
283
|
+
.map(|_| ())
|
|
284
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))
|
|
278
285
|
})
|
|
279
286
|
}
|
|
280
287
|
|
|
281
288
|
pub fn vector_search(&self, column: String, query_vector: RArray, limit: i64) -> Result<RArray, Error> {
|
|
289
|
+
let ruby = Ruby::get().unwrap();
|
|
282
290
|
let dataset = self.dataset.borrow();
|
|
283
291
|
let dataset = dataset.as_ref()
|
|
284
|
-
.ok_or_else(|| Error::new(
|
|
285
|
-
|
|
286
|
-
// Convert Ruby array to Vec<f32>
|
|
287
|
-
let
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
.
|
|
292
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Dataset not opened"))?;
|
|
293
|
+
|
|
294
|
+
// Convert Ruby array to Vec<f32> using index-based iteration
|
|
295
|
+
let len = query_vector.len();
|
|
296
|
+
let mut vector: Vec<f32> = Vec::with_capacity(len);
|
|
297
|
+
for i in 0..len {
|
|
298
|
+
let v: f64 = query_vector.entry(i as isize)?;
|
|
299
|
+
vector.push(v as f32);
|
|
300
|
+
}
|
|
291
301
|
|
|
292
302
|
let batches: Vec<RecordBatch> = self.runtime.borrow_mut().block_on(async {
|
|
293
303
|
let mut scanner = dataset.scan();
|
|
294
|
-
|
|
304
|
+
|
|
295
305
|
// Use nearest for vector search
|
|
296
306
|
scanner.nearest(&column, &Float32Array::from(vector), limit as usize)
|
|
297
|
-
.map_err(|e| Error::new(
|
|
298
|
-
|
|
307
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
308
|
+
|
|
299
309
|
let stream = scanner
|
|
300
310
|
.try_into_stream()
|
|
301
311
|
.await
|
|
302
|
-
.map_err(|e| Error::new(
|
|
303
|
-
|
|
312
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
313
|
+
|
|
304
314
|
stream
|
|
305
315
|
.try_collect::<Vec<_>>()
|
|
306
316
|
.await
|
|
307
|
-
.map_err(|e| Error::new(
|
|
317
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))
|
|
308
318
|
})?;
|
|
309
319
|
|
|
310
|
-
let ruby = Ruby::get().unwrap();
|
|
311
320
|
let result_array = ruby.ary_new();
|
|
312
321
|
|
|
313
322
|
for batch in batches {
|
|
314
|
-
let batch_docs = convert_batch_to_ruby(&batch)?;
|
|
323
|
+
let batch_docs = convert_batch_to_ruby(&ruby, &batch)?;
|
|
315
324
|
// Merge arrays by pushing each element
|
|
316
325
|
for i in 0..batch_docs.len() {
|
|
317
326
|
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
|
@@ -322,14 +331,15 @@ impl LancelotDataset {
|
|
|
322
331
|
}
|
|
323
332
|
|
|
324
333
|
pub fn create_text_index(&self, column: String) -> Result<(), Error> {
|
|
334
|
+
let ruby = Ruby::get().unwrap();
|
|
325
335
|
let mut dataset = self.dataset.borrow_mut();
|
|
326
336
|
let dataset = dataset.as_mut()
|
|
327
|
-
.ok_or_else(|| Error::new(
|
|
337
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Dataset not opened"))?;
|
|
328
338
|
|
|
329
339
|
self.runtime.borrow_mut().block_on(async move {
|
|
330
340
|
// Create inverted index for full-text search
|
|
331
341
|
let params = InvertedIndexParams::default();
|
|
332
|
-
|
|
342
|
+
|
|
333
343
|
dataset.create_index(
|
|
334
344
|
&[&column],
|
|
335
345
|
IndexType::Inverted,
|
|
@@ -338,46 +348,47 @@ impl LancelotDataset {
|
|
|
338
348
|
true
|
|
339
349
|
)
|
|
340
350
|
.await
|
|
341
|
-
.
|
|
351
|
+
.map(|_| ())
|
|
352
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))
|
|
342
353
|
})
|
|
343
354
|
}
|
|
344
355
|
|
|
345
356
|
pub fn text_search(&self, column: String, query: String, limit: i64) -> Result<RArray, Error> {
|
|
357
|
+
let ruby = Ruby::get().unwrap();
|
|
346
358
|
let dataset = self.dataset.borrow();
|
|
347
359
|
let dataset = dataset.as_ref()
|
|
348
|
-
.ok_or_else(|| Error::new(
|
|
360
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Dataset not opened"))?;
|
|
349
361
|
|
|
350
362
|
let batches: Vec<RecordBatch> = self.runtime.borrow_mut().block_on(async {
|
|
351
363
|
let mut scanner = dataset.scan();
|
|
352
|
-
|
|
364
|
+
|
|
353
365
|
// Use full-text search with inverted index
|
|
354
366
|
let fts_query = FullTextSearchQuery::new(query)
|
|
355
367
|
.with_column(column)
|
|
356
|
-
.map_err(|e| Error::new(
|
|
357
|
-
|
|
368
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
369
|
+
|
|
358
370
|
scanner.full_text_search(fts_query)
|
|
359
|
-
.map_err(|e| Error::new(
|
|
360
|
-
|
|
371
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
372
|
+
|
|
361
373
|
// Apply limit
|
|
362
374
|
scanner.limit(Some(limit), None)
|
|
363
|
-
.map_err(|e| Error::new(
|
|
364
|
-
|
|
375
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
376
|
+
|
|
365
377
|
let stream = scanner
|
|
366
378
|
.try_into_stream()
|
|
367
379
|
.await
|
|
368
|
-
.map_err(|e| Error::new(
|
|
369
|
-
|
|
380
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
381
|
+
|
|
370
382
|
stream
|
|
371
383
|
.try_collect::<Vec<_>>()
|
|
372
384
|
.await
|
|
373
|
-
.map_err(|e| Error::new(
|
|
385
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))
|
|
374
386
|
})?;
|
|
375
387
|
|
|
376
|
-
let ruby = Ruby::get().unwrap();
|
|
377
388
|
let result_array = ruby.ary_new();
|
|
378
389
|
|
|
379
390
|
for batch in batches {
|
|
380
|
-
let batch_docs = convert_batch_to_ruby(&batch)?;
|
|
391
|
+
let batch_docs = convert_batch_to_ruby(&ruby, &batch)?;
|
|
381
392
|
// Merge arrays by pushing each element
|
|
382
393
|
for i in 0..batch_docs.len() {
|
|
383
394
|
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
|
@@ -388,47 +399,49 @@ impl LancelotDataset {
|
|
|
388
399
|
}
|
|
389
400
|
|
|
390
401
|
pub fn multi_column_text_search(&self, columns: RArray, query: String, limit: i64) -> Result<RArray, Error> {
|
|
402
|
+
let ruby = Ruby::get().unwrap();
|
|
391
403
|
let dataset = self.dataset.borrow();
|
|
392
404
|
let dataset = dataset.as_ref()
|
|
393
|
-
.ok_or_else(|| Error::new(
|
|
394
|
-
|
|
395
|
-
// Convert Ruby array of columns to Vec<String>
|
|
396
|
-
let
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
.
|
|
405
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Dataset not opened"))?;
|
|
406
|
+
|
|
407
|
+
// Convert Ruby array of columns to Vec<String> using index-based iteration
|
|
408
|
+
let len = columns.len();
|
|
409
|
+
let mut cols: Vec<String> = Vec::with_capacity(len);
|
|
410
|
+
for i in 0..len {
|
|
411
|
+
let v: String = columns.entry(i as isize)?;
|
|
412
|
+
cols.push(v);
|
|
413
|
+
}
|
|
400
414
|
|
|
401
415
|
let batches: Vec<RecordBatch> = self.runtime.borrow_mut().block_on(async {
|
|
402
416
|
let mut scanner = dataset.scan();
|
|
403
|
-
|
|
417
|
+
|
|
404
418
|
// Create a full-text search query for multiple columns
|
|
405
419
|
let fts_query = FullTextSearchQuery::new(query)
|
|
406
|
-
.with_columns(&
|
|
407
|
-
.map_err(|e| Error::new(
|
|
408
|
-
|
|
420
|
+
.with_columns(&cols)
|
|
421
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
422
|
+
|
|
409
423
|
scanner.full_text_search(fts_query)
|
|
410
|
-
.map_err(|e| Error::new(
|
|
411
|
-
|
|
424
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
425
|
+
|
|
412
426
|
// Apply limit
|
|
413
427
|
scanner.limit(Some(limit), None)
|
|
414
|
-
.map_err(|e| Error::new(
|
|
415
|
-
|
|
428
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
429
|
+
|
|
416
430
|
let stream = scanner
|
|
417
431
|
.try_into_stream()
|
|
418
432
|
.await
|
|
419
|
-
.map_err(|e| Error::new(
|
|
420
|
-
|
|
433
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
434
|
+
|
|
421
435
|
stream
|
|
422
436
|
.try_collect::<Vec<_>>()
|
|
423
437
|
.await
|
|
424
|
-
.map_err(|e| Error::new(
|
|
438
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))
|
|
425
439
|
})?;
|
|
426
440
|
|
|
427
|
-
let ruby = Ruby::get().unwrap();
|
|
428
441
|
let result_array = ruby.ary_new();
|
|
429
442
|
|
|
430
443
|
for batch in batches {
|
|
431
|
-
let batch_docs = convert_batch_to_ruby(&batch)?;
|
|
444
|
+
let batch_docs = convert_batch_to_ruby(&ruby, &batch)?;
|
|
432
445
|
// Merge arrays by pushing each element
|
|
433
446
|
for i in 0..batch_docs.len() {
|
|
434
447
|
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
|
@@ -439,39 +452,39 @@ impl LancelotDataset {
|
|
|
439
452
|
}
|
|
440
453
|
|
|
441
454
|
pub fn filter_scan(&self, filter_expr: String, limit: Option<i64>) -> Result<RArray, Error> {
|
|
455
|
+
let ruby = Ruby::get().unwrap();
|
|
442
456
|
let dataset = self.dataset.borrow();
|
|
443
457
|
let dataset = dataset.as_ref()
|
|
444
|
-
.ok_or_else(|| Error::new(
|
|
458
|
+
.ok_or_else(|| Error::new(ruby.exception_runtime_error(), "Dataset not opened"))?;
|
|
445
459
|
|
|
446
460
|
let batches: Vec<RecordBatch> = self.runtime.borrow_mut().block_on(async {
|
|
447
461
|
let mut scanner = dataset.scan();
|
|
448
|
-
|
|
462
|
+
|
|
449
463
|
// Apply SQL-like filter
|
|
450
464
|
scanner.filter(&filter_expr)
|
|
451
|
-
.map_err(|e| Error::new(
|
|
452
|
-
|
|
465
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
466
|
+
|
|
453
467
|
// Apply limit if provided
|
|
454
468
|
if let Some(lim) = limit {
|
|
455
469
|
scanner.limit(Some(lim), None)
|
|
456
|
-
.map_err(|e| Error::new(
|
|
470
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
457
471
|
}
|
|
458
|
-
|
|
472
|
+
|
|
459
473
|
let stream = scanner
|
|
460
474
|
.try_into_stream()
|
|
461
475
|
.await
|
|
462
|
-
.map_err(|e| Error::new(
|
|
463
|
-
|
|
476
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
477
|
+
|
|
464
478
|
stream
|
|
465
479
|
.try_collect::<Vec<_>>()
|
|
466
480
|
.await
|
|
467
|
-
.map_err(|e| Error::new(
|
|
481
|
+
.map_err(|e| Error::new(ruby.exception_runtime_error(), e.to_string()))
|
|
468
482
|
})?;
|
|
469
483
|
|
|
470
|
-
let ruby = Ruby::get().unwrap();
|
|
471
484
|
let result_array = ruby.ary_new();
|
|
472
485
|
|
|
473
486
|
for batch in batches {
|
|
474
|
-
let batch_docs = convert_batch_to_ruby(&batch)?;
|
|
487
|
+
let batch_docs = convert_batch_to_ruby(&ruby, &batch)?;
|
|
475
488
|
// Merge arrays by pushing each element
|
|
476
489
|
for i in 0..batch_docs.len() {
|
|
477
490
|
result_array.push(batch_docs.entry::<Value>(i as isize)?)?;
|
|
@@ -501,4 +514,4 @@ impl LancelotDataset {
|
|
|
501
514
|
class.define_method("filter_scan", method!(LancelotDataset::filter_scan, 2))?;
|
|
502
515
|
Ok(())
|
|
503
516
|
}
|
|
504
|
-
}
|
|
517
|
+
}
|
data/ext/lancelot/src/lib.rs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
use magnus::{
|
|
1
|
+
use magnus::{Error, Ruby, Module};
|
|
2
2
|
|
|
3
3
|
mod dataset;
|
|
4
4
|
mod schema;
|
|
@@ -8,10 +8,10 @@ use dataset::LancelotDataset;
|
|
|
8
8
|
|
|
9
9
|
#[magnus::init]
|
|
10
10
|
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
11
|
-
let module = define_module("Lancelot")?;
|
|
12
|
-
|
|
11
|
+
let module = ruby.define_module("Lancelot")?;
|
|
12
|
+
|
|
13
13
|
let dataset_class = module.define_class("Dataset", ruby.class_object())?;
|
|
14
14
|
LancelotDataset::bind(&dataset_class)?;
|
|
15
|
-
|
|
15
|
+
|
|
16
16
|
Ok(())
|
|
17
|
-
}
|
|
17
|
+
}
|
data/ext/lancelot/src/schema.rs
CHANGED
|
@@ -1,28 +1,26 @@
|
|
|
1
|
-
use magnus::{Error, RHash, Symbol, Value, TryConvert, r_hash::ForEach
|
|
1
|
+
use magnus::{Error, Ruby, RHash, Symbol, Value, TryConvert, r_hash::ForEach};
|
|
2
2
|
use arrow_schema::{DataType, Field, Schema as ArrowSchema};
|
|
3
3
|
use std::sync::Arc;
|
|
4
4
|
|
|
5
|
-
pub fn build_arrow_schema(schema_hash: RHash) -> Result<ArrowSchema, Error> {
|
|
5
|
+
pub fn build_arrow_schema(ruby: &Ruby, schema_hash: RHash) -> Result<ArrowSchema, Error> {
|
|
6
6
|
let mut fields = Vec::new();
|
|
7
7
|
|
|
8
8
|
schema_hash.foreach(|key: Symbol, value: Value| {
|
|
9
9
|
let field_name = key.name()?.to_string();
|
|
10
|
-
|
|
11
|
-
let data_type = if
|
|
12
|
-
let
|
|
13
|
-
|
|
14
|
-
let type_str: String = hash.fetch(Symbol::new("type"))?;
|
|
15
|
-
|
|
10
|
+
|
|
11
|
+
let data_type = if let Some(hash) = RHash::from_value(value) {
|
|
12
|
+
let type_str: String = hash.fetch(ruby.to_symbol("type"))?;
|
|
13
|
+
|
|
16
14
|
match type_str.as_str() {
|
|
17
15
|
"vector" => {
|
|
18
|
-
let dimension: i32 = hash.fetch(
|
|
16
|
+
let dimension: i32 = hash.fetch(ruby.to_symbol("dimension"))?;
|
|
19
17
|
DataType::FixedSizeList(
|
|
20
18
|
Arc::new(Field::new("item", DataType::Float32, true)),
|
|
21
19
|
dimension,
|
|
22
20
|
)
|
|
23
21
|
}
|
|
24
22
|
_ => return Err(Error::new(
|
|
25
|
-
|
|
23
|
+
ruby.exception_arg_error(),
|
|
26
24
|
format!("Unknown field type: {}", type_str)
|
|
27
25
|
))
|
|
28
26
|
}
|
|
@@ -36,7 +34,7 @@ pub fn build_arrow_schema(schema_hash: RHash) -> Result<ArrowSchema, Error> {
|
|
|
36
34
|
"int64" => DataType::Int64,
|
|
37
35
|
"boolean" => DataType::Boolean,
|
|
38
36
|
_ => return Err(Error::new(
|
|
39
|
-
|
|
37
|
+
ruby.exception_arg_error(),
|
|
40
38
|
format!("Unknown field type: {}", type_str)
|
|
41
39
|
))
|
|
42
40
|
}
|
|
@@ -47,4 +45,4 @@ pub fn build_arrow_schema(schema_hash: RHash) -> Result<ArrowSchema, Error> {
|
|
|
47
45
|
})?;
|
|
48
46
|
|
|
49
47
|
Ok(ArrowSchema::new(fields))
|
|
50
|
-
}
|
|
48
|
+
}
|
data/lib/lancelot/version.rb
CHANGED
data/lib/lancelot.rb
CHANGED
|
@@ -1,7 +1,20 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "lancelot/version"
|
|
4
|
-
|
|
4
|
+
|
|
5
|
+
# Load the compiled Rust extension. Precompiled (platform) gems install it into a
|
|
6
|
+
# Ruby-ABI-versioned subdir (lib/lancelot/<major.minor>/lancelot.{so,bundle}) so a
|
|
7
|
+
# single fat gem can carry a binary per Ruby version; source/dev builds place it flat
|
|
8
|
+
# at lib/lancelot/lancelot.{so,bundle}. Try the versioned path first, fall back to the
|
|
9
|
+
# flat one. Resolution goes through $LOAD_PATH (`require`, never `require_relative`)
|
|
10
|
+
# because RubyGems installs native extensions outside the gem's lib/ dir.
|
|
11
|
+
begin
|
|
12
|
+
RUBY_VERSION =~ /(\d+\.\d+)/
|
|
13
|
+
require "lancelot/#{Regexp.last_match(1)}/lancelot"
|
|
14
|
+
rescue LoadError
|
|
15
|
+
require "lancelot/lancelot"
|
|
16
|
+
end
|
|
17
|
+
|
|
5
18
|
require_relative "lancelot/dataset"
|
|
6
19
|
require_relative "lancelot/rank_fusion"
|
|
7
20
|
|