lancelot 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,102 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "lancelot"
5
+ require "tmpdir"
6
+
7
# Builds a deterministic, fixed-length toy embedding for +text+.
#
# This is NOT a real embedding model and is only meant for demonstration:
# each of the first +dimension+ bytes of +text+ is scaled by 1/255.0, and
# shorter inputs are right-padded with 0.0 so the result always has exactly
# +dimension+ elements.
#
# text      - String to embed.
# dimension - Integer length of the returned vector (default: 128, the
#             dimension declared for the "vector" column in this example).
#
# Returns an Array of Float with +dimension+ elements.
def generate_embedding(text, dimension = 128)
  # Slice the bytes once; the original recomputed this for the padding length.
  scaled = text.bytes.first(dimension).map { |byte| byte / 255.0 }
  scaled + Array.new(dimension - scaled.length, 0.0)
end
13
+
14
# Run the whole demo inside a temporary directory so nothing is left behind:
# Dir.mktmpdir deletes the directory when the block returns.
Dir.mktmpdir do |tmp_root|
  dataset_path = File.join(tmp_root, "vector_dataset")

  puts "Creating dataset with vector support at: #{dataset_path}"

  # Schema: free text, a per-document score and a 128-dimensional vector column.
  schema = {
    text: :string,
    score: :float32,
    vector: { type: "vector", dimension: 128 }
  }
  dataset = Lancelot::Dataset.create(dataset_path, schema: schema)

  # Sample corpus about programming languages, kept as [text, score] pairs.
  corpus = [
    ["Ruby is a dynamic, object-oriented programming language", 0.95],
    ["Python is great for data science and machine learning", 0.92],
    ["JavaScript runs in browsers and on servers with Node.js", 0.88],
    ["Rust provides memory safety without garbage collection", 0.91],
    ["Go makes concurrent programming easy with goroutines", 0.89],
    ["Java is widely used in enterprise applications", 0.85],
    ["C++ offers high performance and low-level control", 0.90],
    ["TypeScript adds static typing to JavaScript", 0.87]
  ]

  # Expand each pair into a full row, attaching the embedding of its text.
  rows = corpus.map do |text, score|
    { text: text, score: score, vector: generate_embedding(text) }
  end

  puts "Adding documents to dataset..."
  dataset.add_documents(rows)
  puts "Added #{dataset.count} documents"

  puts "\nCreating vector index..."
  dataset.create_vector_index("vector")

  # Shared printer: numbered text plus the stored score, blank line after each hit.
  print_ranked = lambda do |docs|
    docs.each_with_index do |doc, i|
      puts "#{i + 1}. #{doc[:text]} (score: #{doc[:score]})"
      puts
    end
  end

  # First query goes through vector_search.
  puts "\nSearching for documents similar to 'dynamic programming languages'..."
  results = dataset.vector_search(
    generate_embedding("dynamic programming languages"),
    column: "vector",
    limit: 3
  )

  puts "\nTop 3 most similar documents:"
  print_ranked.call(results)

  # Second query goes through the nearest_neighbors alias.
  puts "Searching for documents similar to 'memory safety and performance'..."
  similar = dataset.nearest_neighbors(
    generate_embedding("memory safety and performance"),
    k: 3,
    column: "vector"
  )

  puts "\nTop 3 nearest neighbors:"
  print_ranked.call(similar)
end

puts "\nDone!"
@@ -0,0 +1,10 @@
1
+ # Rust
2
+ target/
3
+ Cargo.lock
4
+
5
+ # Ruby extension build artifacts
6
+ *.bundle
7
+ *.so
8
+ *.o
9
+ *.a
10
+ mkmf.log
@@ -0,0 +1,28 @@
1
+ [package]
2
+ name = "lancelot"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+
6
+ [lib]
7
+ crate-type = ["cdylib"]
8
+
9
+ [dependencies]
10
+ magnus = { version = "0.7", features = ["rb-sys"] }
11
+ lance = { version = "0.31", default-features = false }
12
+ lance-index = "0.31"
13
+ lance-linalg = "0.31"
14
+ tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
15
+ arrow = "55"
16
+ arrow-array = "55"
17
+ arrow-schema = "55"
18
+ arrow-data = "55"
19
+ futures = "0.3"
20
+ thiserror = "2"
21
+ serde = { version = "1", features = ["derive"] }
22
+ serde_json = "1"
23
+
24
+ [package.metadata]
25
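+ # Minimum supported Rust version. NOTE: Cargo ignores everything under [package.metadata], so this key is informational only; declare `rust-version` under [package] if Cargo should enforce it.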
26
+ rust-version = "1.86.0"
27
+
28
+ [dev-dependencies]
@@ -0,0 +1,4 @@
1
+ require "mkmf"
2
+ require "rb_sys/mkmf"
3
+
4
+ create_rust_makefile("lancelot/lancelot")
@@ -0,0 +1,243 @@
1
+ use magnus::{Error, Ruby, RHash, RArray, Symbol, Value, TryConvert, value::ReprValue};
2
+ use arrow_schema::{DataType, Schema as ArrowSchema};
3
+ use arrow_array::{RecordBatch, StringArray, Float32Array, ArrayRef, Array, FixedSizeListArray};
4
+ use std::collections::HashMap;
5
+ use std::sync::Arc;
6
+
7
+ pub fn build_record_batch(
8
+ data: RArray,
9
+ schema: &ArrowSchema,
10
+ ) -> Result<RecordBatch, Error> {
11
+ let mut columns: HashMap<String, Vec<Option<String>>> = HashMap::new();
12
+ let mut float_columns: HashMap<String, Vec<Option<f32>>> = HashMap::new();
13
+ let mut int_columns: HashMap<String, Vec<Option<i64>>> = HashMap::new();
14
+ let mut bool_columns: HashMap<String, Vec<Option<bool>>> = HashMap::new();
15
+ let mut vector_columns: HashMap<String, Vec<Option<Vec<f32>>>> = HashMap::new();
16
+
17
+ for field in schema.fields() {
18
+ match field.data_type() {
19
+ DataType::Utf8 => {
20
+ columns.insert(field.name().to_string(), Vec::new());
21
+ }
22
+ DataType::Float32 => {
23
+ float_columns.insert(field.name().to_string(), Vec::new());
24
+ }
25
+ DataType::Int64 => {
26
+ int_columns.insert(field.name().to_string(), Vec::new());
27
+ }
28
+ DataType::Boolean => {
29
+ bool_columns.insert(field.name().to_string(), Vec::new());
30
+ }
31
+ DataType::FixedSizeList(_, _) => {
32
+ vector_columns.insert(field.name().to_string(), Vec::new());
33
+ }
34
+ _ => {}
35
+ }
36
+ }
37
+
38
+ for item in data.into_iter() {
39
+ let item = RHash::try_convert(item)?;
40
+ for field in schema.fields() {
41
+ let key = Symbol::new(field.name());
42
+ let value: Value = item.fetch(key)
43
+ .or_else(|_| {
44
+ // Try with string key
45
+ item.fetch(field.name().as_str())
46
+ })?;
47
+
48
+ match field.data_type() {
49
+ DataType::Utf8 => {
50
+ if value.is_nil() {
51
+ columns.get_mut(field.name()).unwrap().push(None);
52
+ } else {
53
+ let s = String::try_convert(value)?;
54
+ columns.get_mut(field.name()).unwrap().push(Some(s));
55
+ }
56
+ }
57
+ DataType::Float32 => {
58
+ if value.is_nil() {
59
+ float_columns.get_mut(field.name()).unwrap().push(None);
60
+ } else {
61
+ let f = f64::try_convert(value)?;
62
+ float_columns.get_mut(field.name()).unwrap().push(Some(f as f32));
63
+ }
64
+ }
65
+ DataType::Int64 => {
66
+ if value.is_nil() {
67
+ int_columns.get_mut(field.name()).unwrap().push(None);
68
+ } else {
69
+ let i = i64::try_convert(value)?;
70
+ int_columns.get_mut(field.name()).unwrap().push(Some(i));
71
+ }
72
+ }
73
+ DataType::Boolean => {
74
+ if value.is_nil() {
75
+ bool_columns.get_mut(field.name()).unwrap().push(None);
76
+ } else {
77
+ let b = bool::try_convert(value)?;
78
+ bool_columns.get_mut(field.name()).unwrap().push(Some(b));
79
+ }
80
+ }
81
+ DataType::FixedSizeList(_, _) => {
82
+ if value.is_nil() {
83
+ vector_columns.get_mut(field.name()).unwrap().push(None);
84
+ } else {
85
+ let arr = RArray::try_convert(value)?;
86
+ let vec: Vec<f32> = arr.into_iter()
87
+ .map(|v| f64::try_convert(v).map(|f| f as f32))
88
+ .collect::<Result<Vec<_>, _>>()?;
89
+ vector_columns.get_mut(field.name()).unwrap().push(Some(vec));
90
+ }
91
+ }
92
+ _ => {}
93
+ }
94
+ }
95
+ }
96
+
97
+ let mut arrays: Vec<ArrayRef> = Vec::new();
98
+
99
+ for field in schema.fields() {
100
+ let array: ArrayRef = match field.data_type() {
101
+ DataType::Utf8 => {
102
+ let values = columns.get(field.name()).unwrap();
103
+ Arc::new(StringArray::from(values.clone()))
104
+ }
105
+ DataType::Float32 => {
106
+ let values = float_columns.get(field.name()).unwrap();
107
+ Arc::new(Float32Array::from(values.clone()))
108
+ }
109
+ DataType::Int64 => {
110
+ let values = int_columns.get(field.name()).unwrap();
111
+ Arc::new(arrow_array::Int64Array::from(values.clone()))
112
+ }
113
+ DataType::Boolean => {
114
+ let values = bool_columns.get(field.name()).unwrap();
115
+ Arc::new(arrow_array::BooleanArray::from(values.clone()))
116
+ }
117
+ DataType::FixedSizeList(inner_field, list_size) => {
118
+ let values = vector_columns.get(field.name()).unwrap();
119
+ // Build flat array of all values
120
+ let mut flat_values = Vec::new();
121
+ for vec_opt in values {
122
+ match vec_opt {
123
+ Some(vec) => {
124
+ if vec.len() != *list_size as usize {
125
+ return Err(Error::new(
126
+ magnus::exception::arg_error(),
127
+ format!("Vector dimension mismatch. Expected {}, got {}", list_size, vec.len())
128
+ ));
129
+ }
130
+ flat_values.extend(vec);
131
+ }
132
+ None => {
133
+ // Add nulls for the entire vector
134
+ flat_values.extend(vec![0.0f32; *list_size as usize]);
135
+ }
136
+ }
137
+ }
138
+
139
+ let flat_array = Float32Array::from(flat_values);
140
+ Arc::new(FixedSizeListArray::new(
141
+ inner_field.clone(),
142
+ *list_size,
143
+ Arc::new(flat_array),
144
+ None
145
+ ))
146
+ }
147
+ _ => return Err(Error::new(
148
+ magnus::exception::runtime_error(),
149
+ format!("Unsupported data type: {:?}", field.data_type())
150
+ ))
151
+ };
152
+
153
+ arrays.push(array);
154
+ }
155
+
156
+ RecordBatch::try_new(Arc::new(schema.clone()), arrays)
157
+ .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))
158
+ }
159
+
160
+ pub fn convert_batch_to_ruby(batch: &RecordBatch) -> Result<Vec<RHash>, Error> {
161
+ let ruby = Ruby::get().unwrap();
162
+ let mut documents = Vec::new();
163
+
164
+ let num_rows = batch.num_rows();
165
+ let schema = batch.schema();
166
+
167
+ for row_idx in 0..num_rows {
168
+ let doc = ruby.hash_new();
169
+
170
+ for (col_idx, field) in schema.fields().iter().enumerate() {
171
+ let column = batch.column(col_idx);
172
+ let key = Symbol::new(field.name());
173
+
174
+ match field.data_type() {
175
+ DataType::Utf8 => {
176
+ let array = column.as_any().downcast_ref::<StringArray>()
177
+ .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "Failed to cast to StringArray"))?;
178
+
179
+ if array.is_null(row_idx) {
180
+ doc.aset(key, ruby.qnil())?;
181
+ } else {
182
+ doc.aset(key, array.value(row_idx))?;
183
+ }
184
+ }
185
+ DataType::Float32 => {
186
+ let array = column.as_any().downcast_ref::<Float32Array>()
187
+ .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "Failed to cast to Float32Array"))?;
188
+
189
+ if array.is_null(row_idx) {
190
+ doc.aset(key, ruby.qnil())?;
191
+ } else {
192
+ doc.aset(key, array.value(row_idx))?;
193
+ }
194
+ }
195
+ DataType::Int64 => {
196
+ let array = column.as_any().downcast_ref::<arrow_array::Int64Array>()
197
+ .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "Failed to cast to Int64Array"))?;
198
+
199
+ if array.is_null(row_idx) {
200
+ doc.aset(key, ruby.qnil())?;
201
+ } else {
202
+ doc.aset(key, array.value(row_idx))?;
203
+ }
204
+ }
205
+ DataType::Boolean => {
206
+ let array = column.as_any().downcast_ref::<arrow_array::BooleanArray>()
207
+ .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "Failed to cast to BooleanArray"))?;
208
+
209
+ if array.is_null(row_idx) {
210
+ doc.aset(key, ruby.qnil())?;
211
+ } else {
212
+ doc.aset(key, array.value(row_idx))?;
213
+ }
214
+ }
215
+ DataType::FixedSizeList(_, list_size) => {
216
+ let array = column.as_any().downcast_ref::<FixedSizeListArray>()
217
+ .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "Failed to cast to FixedSizeListArray"))?;
218
+
219
+ if array.is_null(row_idx) {
220
+ doc.aset(key, ruby.qnil())?;
221
+ } else {
222
+ let values = array.value(row_idx);
223
+ let float_array = values.as_any().downcast_ref::<Float32Array>()
224
+ .ok_or_else(|| Error::new(magnus::exception::runtime_error(), "Failed to cast vector values to Float32Array"))?;
225
+
226
+ let ruby_array = ruby.ary_new();
227
+ for i in 0..*list_size {
228
+ ruby_array.push(float_array.value(i as usize))?;
229
+ }
230
+ doc.aset(key, ruby_array)?;
231
+ }
232
+ }
233
+ _ => {
234
+ // Skip unsupported types for now
235
+ }
236
+ }
237
+ }
238
+
239
+ documents.push(doc);
240
+ }
241
+
242
+ Ok(documents)
243
+ }