slatedb 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,242 @@
1
+ use std::sync::Arc;
2
+ use std::thread;
3
+
4
+ use bytes::Bytes;
5
+ use log::error;
6
+ use magnus::rb_sys::{AsRawValue, FromRawValue};
7
+ use magnus::value::ReprValue;
8
+ use magnus::{Error, RHash, Ruby, Value};
9
+ use slatedb::{MergeOperator, MergeOperatorError};
10
+
11
+ use crate::errors::invalid_argument_error;
12
+ use crate::utils::get_optional;
13
+
14
+ struct StringConcatMergeOperator;
15
+
16
+ impl MergeOperator for StringConcatMergeOperator {
17
+ fn merge(
18
+ &self,
19
+ _key: &Bytes,
20
+ existing_value: Option<Bytes>,
21
+ value: Bytes,
22
+ ) -> Result<Bytes, MergeOperatorError> {
23
+ let mut result = existing_value.unwrap_or_default().to_vec();
24
+ result.extend_from_slice(&value);
25
+ Ok(Bytes::from(result))
26
+ }
27
+
28
+ fn merge_batch(
29
+ &self,
30
+ _key: &Bytes,
31
+ existing_value: Option<Bytes>,
32
+ operands: &[Bytes],
33
+ ) -> Result<Bytes, MergeOperatorError> {
34
+ let mut result = existing_value.unwrap_or_default().to_vec();
35
+ for operand in operands {
36
+ result.extend_from_slice(operand);
37
+ }
38
+ Ok(Bytes::from(result))
39
+ }
40
+ }
41
+
42
+ /// A merge operator that calls a Ruby block/proc.
43
+ ///
44
+ /// This stores the raw Ruby VALUE and calls it via `with_gvl` when merge
45
+ /// operations are needed. The proc is called with (key, existing_value, new_value)
46
+ /// and should return the merged value as a String.
47
+ ///
48
+ /// # Thread Safety
49
+ ///
50
+ /// The Ruby proc can only be called from the Ruby thread that created this operator.
51
+ /// If the merge is called from a different thread (e.g., a Tokio worker thread during
52
+ /// background compaction), the merge will use a fallback string concatenation behavior.
53
+ ///
54
+ /// # Safety
55
+ ///
56
+ /// The Ruby proc must be kept alive (not garbage collected) for the lifetime
57
+ /// of this operator. This is typically handled by storing a reference to the
58
+ /// proc in the Ruby Database object.
59
+ pub struct RubyProcMergeOperator {
60
+ /// The raw Ruby VALUE of the proc. We store this as a raw value because
61
+ /// magnus::Value is not Send+Sync, but we need to be thread-safe.
62
+ /// We re-acquire the GVL before using it, which makes this safe.
63
+ proc_value: usize,
64
+ /// The thread ID of the Ruby thread that created this operator.
65
+ /// We can only safely call Ruby from this thread.
66
+ ruby_thread_id: thread::ThreadId,
67
+ }
68
+
69
+ // SAFETY: We only access the proc_value when we hold the GVL via the Ruby thread,
70
+ // which ensures thread-safe access to Ruby objects.
71
+ unsafe impl Send for RubyProcMergeOperator {}
72
+ unsafe impl Sync for RubyProcMergeOperator {}
73
+
74
+ impl RubyProcMergeOperator {
75
+ /// Create a new RubyProcMergeOperator from a Ruby proc/block.
76
+ ///
77
+ /// # Safety
78
+ ///
79
+ /// The caller must ensure the proc remains alive (not GC'd) for the
80
+ /// lifetime of this operator.
81
+ pub fn new(proc: Value) -> Self {
82
+ Self {
83
+ proc_value: proc.as_raw() as usize,
84
+ ruby_thread_id: thread::current().id(),
85
+ }
86
+ }
87
+
88
+ /// Check if we're on the Ruby thread that created this operator.
89
+ fn is_ruby_thread(&self) -> bool {
90
+ thread::current().id() == self.ruby_thread_id
91
+ }
92
+
93
+ /// Call the Ruby proc with the given arguments.
94
+ /// This must only be called from the Ruby thread (after checking is_ruby_thread).
95
+ fn call_proc_on_ruby_thread(
96
+ &self,
97
+ key: &str,
98
+ existing_value: Option<&str>,
99
+ new_value: &str,
100
+ ) -> Result<Bytes, MergeOperatorError> {
101
+ // We're on the Ruby thread, so we can use with_gvl
102
+ // Import here to avoid the circular dependency at module level
103
+ use crate::runtime::with_gvl;
104
+
105
+ let key_owned = key.to_string();
106
+ let existing_owned = existing_value.map(|s| s.to_string());
107
+ let new_owned = new_value.to_string();
108
+
109
+ with_gvl(|| {
110
+ let ruby = Ruby::get().expect("Ruby runtime not available");
111
+
112
+ // Reconstruct the proc Value from the raw pointer
113
+ let proc = unsafe { Value::from_raw(self.proc_value as _) };
114
+
115
+ // Build arguments: (key, existing_value, new_value)
116
+ let existing_arg: Value = match &existing_owned {
117
+ Some(s) => ruby.str_new(s).as_value(),
118
+ None => ruby.qnil().as_value(),
119
+ };
120
+
121
+ // Call the proc
122
+ let result: Result<String, magnus::Error> = proc.funcall(
123
+ "call",
124
+ (
125
+ ruby.str_new(&key_owned),
126
+ existing_arg,
127
+ ruby.str_new(&new_owned),
128
+ ),
129
+ );
130
+
131
+ match result {
132
+ Ok(merged) => Ok(Bytes::from(merged)),
133
+ Err(e) => {
134
+ error!("Ruby merge operator error: {}", e);
135
+ Err(MergeOperatorError::EmptyBatch)
136
+ }
137
+ }
138
+ })
139
+ }
140
+
141
+ /// Fallback merge when we're not on the Ruby thread.
142
+ /// Uses simple concatenation as a safe default.
143
+ fn fallback_merge(
144
+ &self,
145
+ existing_value: Option<&Bytes>,
146
+ new_value: &Bytes,
147
+ ) -> Result<Bytes, MergeOperatorError> {
148
+ error!(
149
+ "Ruby merge operator called from non-Ruby thread, using fallback concatenation. \
150
+ This can happen during background compaction."
151
+ );
152
+ let mut result = existing_value
153
+ .map(|v| v.to_vec())
154
+ .unwrap_or_default();
155
+ result.extend_from_slice(new_value);
156
+ Ok(Bytes::from(result))
157
+ }
158
+
159
+ /// Call the Ruby proc with the given arguments, handling thread safety.
160
+ fn call_proc(
161
+ &self,
162
+ key: &Bytes,
163
+ existing_value: Option<&Bytes>,
164
+ new_value: &Bytes,
165
+ ) -> Result<Bytes, MergeOperatorError> {
166
+ let key_str = String::from_utf8_lossy(key);
167
+ let existing_str = existing_value.map(|v| String::from_utf8_lossy(v));
168
+ let new_str = String::from_utf8_lossy(new_value);
169
+
170
+ if self.is_ruby_thread() {
171
+ self.call_proc_on_ruby_thread(&key_str, existing_str.as_deref(), &new_str)
172
+ } else {
173
+ // We're on a worker thread, use fallback
174
+ self.fallback_merge(existing_value, new_value)
175
+ }
176
+ }
177
+ }
178
+
179
+ impl MergeOperator for RubyProcMergeOperator {
180
+ fn merge(
181
+ &self,
182
+ key: &Bytes,
183
+ existing_value: Option<Bytes>,
184
+ value: Bytes,
185
+ ) -> Result<Bytes, MergeOperatorError> {
186
+ self.call_proc(key, existing_value.as_ref(), &value)
187
+ }
188
+
189
+ fn merge_batch(
190
+ &self,
191
+ key: &Bytes,
192
+ existing_value: Option<Bytes>,
193
+ operands: &[Bytes],
194
+ ) -> Result<Bytes, MergeOperatorError> {
195
+ // Apply operands one at a time through the Ruby proc
196
+ let mut current = existing_value;
197
+ for operand in operands {
198
+ current = Some(self.call_proc(key, current.as_ref(), operand)?);
199
+ }
200
+ Ok(current.unwrap_or_default())
201
+ }
202
+ }
203
+
204
+ pub fn parse_merge_operator(
205
+ kwargs: &RHash,
206
+ ) -> Result<Option<Arc<dyn MergeOperator + Send + Sync>>, Error> {
207
+ let merge_operator = get_optional::<String>(kwargs, "merge_operator")?;
208
+ let Some(merge_operator) = merge_operator else {
209
+ return Ok(None);
210
+ };
211
+
212
+ let operator: Arc<dyn MergeOperator + Send + Sync> = match merge_operator.as_str() {
213
+ "string_concat" | "concat" => Arc::new(StringConcatMergeOperator),
214
+ _ => {
215
+ return Err(invalid_argument_error(&format!(
216
+ "invalid merge_operator: {} (expected 'string_concat', 'concat', or use merge_operator_proc for a custom block)",
217
+ merge_operator
218
+ )))
219
+ }
220
+ };
221
+
222
+ Ok(Some(operator))
223
+ }
224
+
225
+ /// Parse a Ruby proc as a merge operator.
226
+ pub fn parse_merge_operator_proc(
227
+ kwargs: &RHash,
228
+ ) -> Result<Option<Arc<dyn MergeOperator + Send + Sync>>, Error> {
229
+ let proc_value = get_optional::<Value>(kwargs, "merge_operator_proc")?;
230
+ let Some(proc) = proc_value else {
231
+ return Ok(None);
232
+ };
233
+
234
+ // Verify it's callable
235
+ if !proc.respond_to("call", false).unwrap_or(false) {
236
+ return Err(invalid_argument_error(
237
+ "merge_operator_proc must respond to 'call'",
238
+ ));
239
+ }
240
+
241
+ Ok(Some(Arc::new(RubyProcMergeOperator::new(proc))))
242
+ }
@@ -7,6 +7,7 @@ use slatedb::DbReader;
7
7
 
8
8
  use crate::errors::invalid_argument_error;
9
9
  use crate::iterator::Iterator;
10
+ use crate::merge_ops::parse_merge_operator;
10
11
  use crate::runtime::block_on_result;
11
12
  use crate::utils::{get_optional, resolve_object_store};
12
13
 
@@ -26,7 +27,7 @@ impl Reader {
26
27
  /// * `path` - The path identifier for the database
27
28
  /// * `url` - Optional object store URL
28
29
  /// * `checkpoint_id` - Optional checkpoint UUID to read at
29
- /// * `kwargs` - Additional options (manifest_poll_interval, checkpoint_lifetime, max_memtable_bytes)
30
+ /// * `kwargs` - Additional options (manifest_poll_interval, checkpoint_lifetime, max_memtable_bytes, merge_operator)
30
31
  pub fn open(
31
32
  path: String,
32
33
  url: Option<String>,
@@ -39,6 +40,7 @@ impl Reader {
39
40
  let checkpoint_lifetime = get_optional::<u64>(&kwargs, "checkpoint_lifetime")?
40
41
  .map(std::time::Duration::from_millis);
41
42
  let max_memtable_bytes = get_optional::<u64>(&kwargs, "max_memtable_bytes")?;
43
+ let merge_operator = parse_merge_operator(&kwargs)?;
42
44
 
43
45
  // Parse checkpoint_id as UUID
44
46
  let checkpoint_uuid =
@@ -51,10 +53,10 @@ impl Reader {
51
53
  };
52
54
 
53
55
  let reader = block_on_result(async {
54
- let object_store: Arc<dyn object_store::ObjectStore> = if let Some(ref url) = url {
56
+ let object_store: Arc<dyn slatedb::object_store::ObjectStore> = if let Some(ref url) = url {
55
57
  resolve_object_store(url)?
56
58
  } else {
57
- Arc::new(object_store::memory::InMemory::new())
59
+ Arc::new(slatedb::object_store::memory::InMemory::new())
58
60
  };
59
61
 
60
62
  let mut options = DbReaderOptions::default();
@@ -67,6 +69,9 @@ impl Reader {
67
69
  if let Some(max_bytes) = max_memtable_bytes {
68
70
  options.max_memtable_bytes = max_bytes;
69
71
  }
72
+ if let Some(merge_operator) = merge_operator {
73
+ options.merge_operator = Some(merge_operator);
74
+ }
70
75
 
71
76
  DbReader::open(path, object_store, checkpoint_uuid, options).await
72
77
  })?;
@@ -200,6 +205,67 @@ impl Reader {
200
205
  Ok(Iterator::new(iter))
201
206
  }
202
207
 
208
+ /// Scan all keys with a given prefix.
209
+ pub fn scan_prefix(&self, prefix: String) -> Result<Iterator, Error> {
210
+ if prefix.is_empty() {
211
+ return Err(invalid_argument_error("prefix cannot be empty"));
212
+ }
213
+
214
+ let iter = block_on_result(async { self.inner.scan_prefix(prefix.as_bytes()).await })?;
215
+
216
+ Ok(Iterator::new(iter))
217
+ }
218
+
219
+ /// Scan all keys with a given prefix with options.
220
+ pub fn scan_prefix_with_options(
221
+ &self,
222
+ prefix: String,
223
+ kwargs: RHash,
224
+ ) -> Result<Iterator, Error> {
225
+ if prefix.is_empty() {
226
+ return Err(invalid_argument_error("prefix cannot be empty"));
227
+ }
228
+
229
+ let mut opts = ScanOptions::default();
230
+
231
+ if let Some(df) = get_optional::<String>(&kwargs, "durability_filter")? {
232
+ opts.durability_filter = match df.as_str() {
233
+ "remote" => DurabilityLevel::Remote,
234
+ "memory" => DurabilityLevel::Memory,
235
+ other => {
236
+ return Err(invalid_argument_error(&format!(
237
+ "invalid durability_filter: {} (expected 'remote' or 'memory')",
238
+ other
239
+ )))
240
+ }
241
+ };
242
+ }
243
+
244
+ if let Some(dirty) = get_optional::<bool>(&kwargs, "dirty")? {
245
+ opts.dirty = dirty;
246
+ }
247
+
248
+ if let Some(rab) = get_optional::<usize>(&kwargs, "read_ahead_bytes")? {
249
+ opts.read_ahead_bytes = rab;
250
+ }
251
+
252
+ if let Some(cb) = get_optional::<bool>(&kwargs, "cache_blocks")? {
253
+ opts.cache_blocks = cb;
254
+ }
255
+
256
+ if let Some(mft) = get_optional::<usize>(&kwargs, "max_fetch_tasks")? {
257
+ opts.max_fetch_tasks = mft;
258
+ }
259
+
260
+ let iter = block_on_result(async {
261
+ self.inner
262
+ .scan_prefix_with_options(prefix.as_bytes(), &opts)
263
+ .await
264
+ })?;
265
+
266
+ Ok(Iterator::new(iter))
267
+ }
268
+
203
269
  /// Close the reader.
204
270
  pub fn close(&self) -> Result<(), Error> {
205
271
  block_on_result(async { self.inner.close().await })?;
@@ -220,6 +286,11 @@ pub fn define_reader_class(ruby: &Ruby, module: &magnus::RModule) -> Result<(),
220
286
  class.define_method("get_bytes", method!(Reader::get_bytes, 1))?;
221
287
  class.define_method("_scan", method!(Reader::scan, 2))?;
222
288
  class.define_method("_scan_with_options", method!(Reader::scan_with_options, 3))?;
289
+ class.define_method("_scan_prefix", method!(Reader::scan_prefix, 1))?;
290
+ class.define_method(
291
+ "_scan_prefix_with_options",
292
+ method!(Reader::scan_prefix_with_options, 2),
293
+ )?;
223
294
  class.define_method("close", method!(Reader::close, 0))?;
224
295
 
225
296
  Ok(())
@@ -1,6 +1,6 @@
1
1
  use magnus::Error;
2
2
  use once_cell::sync::OnceCell;
3
- use rb_sys::rb_thread_call_without_gvl;
3
+ use rb_sys::{rb_thread_call_with_gvl, rb_thread_call_without_gvl};
4
4
  use slatedb::Error as SlateError;
5
5
  use std::ffi::c_void;
6
6
  use std::future::Future;
@@ -98,3 +98,54 @@ where
98
98
 
99
99
  closure.result.expect("closure did not run")
100
100
  }
101
+
102
+ /// Execute a closure while holding the Ruby GVL.
103
+ ///
104
+ /// This is the inverse of `without_gvl`. Use this when you need to call back
105
+ /// into Ruby from code that has previously released the GVL (e.g., inside
106
+ /// a future being executed by `block_on`).
107
+ ///
108
+ /// # Safety
109
+ ///
110
+ /// This function can ONLY be called from a Ruby thread that has previously
111
+ /// released the GVL via `without_gvl` or `rb_thread_call_without_gvl`.
112
+ /// Calling it from a non-Ruby thread (like a spawned Tokio task) will cause
113
+ /// a Ruby fatal error.
114
+ ///
115
+ /// # Panics
116
+ ///
117
+ /// Panics if called from a non-Ruby thread.
118
+ pub fn with_gvl<F, T>(f: F) -> T
119
+ where
120
+ F: FnOnce() -> T,
121
+ {
122
+ struct Closure<F, T> {
123
+ f: Option<F>,
124
+ result: Option<T>,
125
+ }
126
+
127
+ extern "C" fn call_closure<F, T>(data: *mut c_void) -> *mut c_void
128
+ where
129
+ F: FnOnce() -> T,
130
+ {
131
+ let closure = unsafe { &mut *(data as *mut Closure<F, T>) };
132
+ if let Some(f) = closure.f.take() {
133
+ closure.result = Some(f());
134
+ }
135
+ std::ptr::null_mut()
136
+ }
137
+
138
+ let mut closure = Closure {
139
+ f: Some(f),
140
+ result: None,
141
+ };
142
+
143
+ unsafe {
144
+ rb_thread_call_with_gvl(
145
+ Some(call_closure::<F, T>),
146
+ &mut closure as *mut _ as *mut c_void,
147
+ );
148
+ }
149
+
150
+ closure.result.expect("closure did not run")
151
+ }
@@ -162,6 +162,77 @@ impl Snapshot {
162
162
  Ok(Iterator::new(iter))
163
163
  }
164
164
 
165
+ /// Scan all keys with a given prefix from the snapshot.
166
+ pub fn scan_prefix(&self, prefix: String) -> Result<Iterator, Error> {
167
+ if prefix.is_empty() {
168
+ return Err(invalid_argument_error("prefix cannot be empty"));
169
+ }
170
+
171
+ let guard = self.inner.borrow();
172
+ let snapshot = guard
173
+ .as_ref()
174
+ .ok_or_else(|| closed_error("snapshot is closed"))?;
175
+
176
+ let iter = block_on_result(async { snapshot.scan_prefix(prefix.as_bytes()).await })?;
177
+
178
+ Ok(Iterator::new(iter))
179
+ }
180
+
181
+ /// Scan all keys with a given prefix with options from the snapshot.
182
+ pub fn scan_prefix_with_options(
183
+ &self,
184
+ prefix: String,
185
+ kwargs: RHash,
186
+ ) -> Result<Iterator, Error> {
187
+ if prefix.is_empty() {
188
+ return Err(invalid_argument_error("prefix cannot be empty"));
189
+ }
190
+
191
+ let mut opts = ScanOptions::default();
192
+
193
+ if let Some(df) = get_optional::<String>(&kwargs, "durability_filter")? {
194
+ opts.durability_filter = match df.as_str() {
195
+ "remote" => DurabilityLevel::Remote,
196
+ "memory" => DurabilityLevel::Memory,
197
+ other => {
198
+ return Err(invalid_argument_error(&format!(
199
+ "invalid durability_filter: {} (expected 'remote' or 'memory')",
200
+ other
201
+ )))
202
+ }
203
+ };
204
+ }
205
+
206
+ if let Some(dirty) = get_optional::<bool>(&kwargs, "dirty")? {
207
+ opts.dirty = dirty;
208
+ }
209
+
210
+ if let Some(rab) = get_optional::<usize>(&kwargs, "read_ahead_bytes")? {
211
+ opts.read_ahead_bytes = rab;
212
+ }
213
+
214
+ if let Some(cb) = get_optional::<bool>(&kwargs, "cache_blocks")? {
215
+ opts.cache_blocks = cb;
216
+ }
217
+
218
+ if let Some(mft) = get_optional::<usize>(&kwargs, "max_fetch_tasks")? {
219
+ opts.max_fetch_tasks = mft;
220
+ }
221
+
222
+ let guard = self.inner.borrow();
223
+ let snapshot = guard
224
+ .as_ref()
225
+ .ok_or_else(|| closed_error("snapshot is closed"))?;
226
+
227
+ let iter = block_on_result(async {
228
+ snapshot
229
+ .scan_prefix_with_options(prefix.as_bytes(), &opts)
230
+ .await
231
+ })?;
232
+
233
+ Ok(Iterator::new(iter))
234
+ }
235
+
165
236
  /// Close the snapshot and release resources.
166
237
  pub fn close(&self) -> Result<(), Error> {
167
238
  let _ = self.inner.borrow_mut().take();
@@ -186,6 +257,11 @@ pub fn define_snapshot_class(ruby: &Ruby, module: &magnus::RModule) -> Result<()
186
257
  "_scan_with_options",
187
258
  method!(Snapshot::scan_with_options, 3),
188
259
  )?;
260
+ class.define_method("_scan_prefix", method!(Snapshot::scan_prefix, 1))?;
261
+ class.define_method(
262
+ "_scan_prefix_with_options",
263
+ method!(Snapshot::scan_prefix_with_options, 2),
264
+ )?;
189
265
  class.define_method("close", method!(Snapshot::close, 0))?;
190
266
  class.define_method("closed?", method!(Snapshot::is_closed, 0))?;
191
267