RubyGems - slatedb - Versions diffs - 0.1.1 → 0.2.0 - Mend

slatedb 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

checksums.yaml +4 -4
data/README.md +144 -3
data/ext/slatedb/Cargo.toml +9 -10
data/ext/slatedb/src/admin.rs +3 -2
data/ext/slatedb/src/database.rs +206 -9
data/ext/slatedb/src/lib.rs +1 -0
data/ext/slatedb/src/merge_ops.rs +242 -0
data/ext/slatedb/src/reader.rs +74 -3
data/ext/slatedb/src/runtime.rs +52 -1
data/ext/slatedb/src/snapshot.rs +76 -0
data/ext/slatedb/src/transaction.rs +147 -5
data/ext/slatedb/src/utils.rs +5 -4
data/ext/slatedb/src/write_batch.rs +43 -1
data/lib/slatedb/database.rb +109 -3
data/lib/slatedb/reader.rb +35 -1
data/lib/slatedb/snapshot.rb +32 -0
data/lib/slatedb/transaction.rb +67 -0
data/lib/slatedb/version.rb +1 -1
data/lib/slatedb/write_batch.rb +20 -0
metadata +3 -2

data/ext/slatedb/src/merge_ops.rs ADDED Viewed

@@ -0,0 +1,242 @@
+use std::sync::Arc;
+use std::thread;
+use bytes::Bytes;
+use log::error;
+use magnus::rb_sys::{AsRawValue, FromRawValue};
+use magnus::value::ReprValue;
+use magnus::{Error, RHash, Ruby, Value};
+use slatedb::{MergeOperator, MergeOperatorError};
+use crate::errors::invalid_argument_error;
+use crate::utils::get_optional;
+/// Built-in merge operator that appends operands to the existing value.
+struct StringConcatMergeOperator;
+impl MergeOperator for StringConcatMergeOperator {
+    /// Returns the bytes of `existing_value` (empty when absent) followed by `value`.
+    fn merge(
+        &self,
+        _key: &Bytes,
+        existing_value: Option<Bytes>,
+        value: Bytes,
+    ) -> Result<Bytes, MergeOperatorError> {
+        let mut merged = match existing_value {
+            Some(existing) => existing.to_vec(),
+            None => Vec::new(),
+        };
+        merged.extend_from_slice(&value);
+        Ok(merged.into())
+    }
+    /// Returns the bytes of `existing_value` (empty when absent) followed by
+    /// every operand, in slice order.
+    fn merge_batch(
+        &self,
+        _key: &Bytes,
+        existing_value: Option<Bytes>,
+        operands: &[Bytes],
+    ) -> Result<Bytes, MergeOperatorError> {
+        let mut merged = match existing_value {
+            Some(existing) => existing.to_vec(),
+            None => Vec::new(),
+        };
+        operands
+            .iter()
+            .for_each(|operand| merged.extend_from_slice(operand));
+        Ok(merged.into())
+    }
+}
+/// A merge operator that delegates to a Ruby block/proc.
+///
+/// The operator keeps the raw Ruby `VALUE` of the proc and invokes it through
+/// `with_gvl` whenever a merge is requested. The proc is called as
+/// `call(key, existing_value, new_value)` (with `existing_value` being `nil`
+/// when there is none) and must return the merged value as a String.
+///
+/// # Thread Safety
+///
+/// The proc is only invoked when the merge runs on the same OS thread that
+/// constructed this operator. When a merge arrives on any other thread
+/// (e.g., a Tokio worker thread during background compaction), the operator
+/// does not call into Ruby and falls back to plain byte concatenation.
+///
+/// # GC requirements
+///
+/// This struct holds the proc only as an integer, so it does not by itself
+/// keep the proc alive. The proc must be kept reachable (not garbage
+/// collected) for the lifetime of this operator.
+/// NOTE(review): presumably the Ruby Database object stores a reference to the
+/// proc for this purpose; confirm against the caller.
+pub struct RubyProcMergeOperator {
+    /// The raw Ruby VALUE of the proc, stored as an integer because
+    /// magnus::Value is not Send+Sync while this operator has to be.
+    /// It is turned back into a `Value` only inside `with_gvl`.
+    proc_value: usize,
+    /// The id of the thread that created this operator. Ruby is only called
+    /// from this thread; every other thread takes the fallback path.
+    ruby_thread_id: thread::ThreadId,
+}
+// SAFETY: `proc_value` is only dereferenced as a Ruby object on the creating
+// thread while the GVL is held (see `call_proc`); other threads never touch it.
+unsafe impl Send for RubyProcMergeOperator {}
+// SAFETY: same reasoning as `Send` above; both fields are plain data and are
+// never mutated after construction.
+unsafe impl Sync for RubyProcMergeOperator {}
+impl RubyProcMergeOperator {
+    /// Create a new `RubyProcMergeOperator` from a Ruby proc/block.
+    ///
+    /// Records the raw `VALUE` of `proc` together with the id of the calling
+    /// thread; only that thread is later allowed to invoke the proc.
+    ///
+    /// # GC requirements
+    ///
+    /// This constructor does not protect `proc` from garbage collection. The
+    /// caller must ensure the proc remains alive (not GC'd) for the lifetime
+    /// of this operator.
+    pub fn new(proc: Value) -> Self {
+        Self {
+            // Kept as an integer so the struct can be `Send + Sync`.
+            proc_value: proc.as_raw() as usize,
+            ruby_thread_id: thread::current().id(),
+        }
+    }
+    /// Check if we're on the Ruby thread that created this operator.
+    fn is_ruby_thread(&self) -> bool {
+        thread::current().id() == self.ruby_thread_id
+    }
+    /// Call the Ruby proc with `(key, existing_value, new_value)` and return
+    /// the String it produces as bytes.
+    ///
+    /// This must only be called from the Ruby thread (after checking
+    /// `is_ruby_thread`), because it re-acquires the GVL via `with_gvl`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `MergeOperatorError::EmptyBatch` when the Ruby runtime cannot be
+    /// obtained or when the proc raises / returns a non-String. The underlying
+    /// cause is written to the error log.
+    /// NOTE(review): `EmptyBatch` is used here as a generic failure signal;
+    /// switch to a more specific variant if `MergeOperatorError` offers one.
+    fn call_proc_on_ruby_thread(
+        &self,
+        key: &str,
+        existing_value: Option<&str>,
+        new_value: &str,
+    ) -> Result<Bytes, MergeOperatorError> {
+        // Import here to avoid the circular dependency at module level
+        use crate::runtime::with_gvl;
+        // `with_gvl` runs the closure synchronously, so the arguments can be
+        // borrowed directly instead of being copied into owned Strings.
+        with_gvl(|| {
+            // Do not panic here: this closure runs inside an `extern "C"`
+            // callback, where an unwinding panic would take the process down.
+            let ruby = match Ruby::get() {
+                Ok(ruby) => ruby,
+                Err(e) => {
+                    error!("Ruby runtime not available for merge operator: {}", e);
+                    return Err(MergeOperatorError::EmptyBatch);
+                }
+            };
+            // SAFETY: `proc_value` was produced by `Value::as_raw` in `new`, we
+            // hold the GVL here, and the caller of `new` is required to keep
+            // the proc alive for the lifetime of this operator.
+            let proc = unsafe { Value::from_raw(self.proc_value as _) };
+            // Build arguments: (key, existing_value, new_value)
+            let existing_arg: Value = match existing_value {
+                Some(s) => ruby.str_new(s).as_value(),
+                None => ruby.qnil().as_value(),
+            };
+            // Call the proc
+            let result: Result<String, magnus::Error> = proc.funcall(
+                "call",
+                (ruby.str_new(key), existing_arg, ruby.str_new(new_value)),
+            );
+            match result {
+                Ok(merged) => Ok(Bytes::from(merged)),
+                Err(e) => {
+                    error!("Ruby merge operator error: {}", e);
+                    Err(MergeOperatorError::EmptyBatch)
+                }
+            }
+        })
+    }
+    /// Fallback merge when we're not on the Ruby thread.
+    ///
+    /// Logs an error and returns `existing_value` (empty when absent) followed
+    /// by `new_value`, without calling into Ruby.
+    fn fallback_merge(
+        &self,
+        existing_value: Option<&Bytes>,
+        new_value: &Bytes,
+    ) -> Result<Bytes, MergeOperatorError> {
+        error!(
+            "Ruby merge operator called from non-Ruby thread, using fallback concatenation. \
+             This can happen during background compaction."
+        );
+        let mut result = existing_value
+            .map(|v| v.to_vec())
+            .unwrap_or_default();
+        result.extend_from_slice(new_value);
+        Ok(Bytes::from(result))
+    }
+    /// Merge `new_value` into `existing_value` for `key`, handling thread safety.
+    ///
+    /// On the creating Ruby thread the proc is invoked; on any other thread
+    /// the byte-concatenation fallback is used instead.
+    fn call_proc(
+        &self,
+        key: &Bytes,
+        existing_value: Option<&Bytes>,
+        new_value: &Bytes,
+    ) -> Result<Bytes, MergeOperatorError> {
+        if !self.is_ruby_thread() {
+            // We're on a worker thread, use fallback. The fallback works on
+            // raw bytes, so no string conversion is needed on this path.
+            return self.fallback_merge(existing_value, new_value);
+        }
+        // NOTE(review): lossy conversion replaces invalid UTF-8 sequences, so
+        // binary keys/values reach the proc altered; confirm this is intended.
+        let key_str = String::from_utf8_lossy(key);
+        let existing_str = existing_value.map(|v| String::from_utf8_lossy(v));
+        let new_str = String::from_utf8_lossy(new_value);
+        self.call_proc_on_ruby_thread(&key_str, existing_str.as_deref(), &new_str)
+    }
+}
+impl MergeOperator for RubyProcMergeOperator {
+    /// Merge a single operand into the existing value via `call_proc`.
+    fn merge(
+        &self,
+        key: &Bytes,
+        existing_value: Option<Bytes>,
+        value: Bytes,
+    ) -> Result<Bytes, MergeOperatorError> {
+        let existing = existing_value.as_ref();
+        self.call_proc(key, existing, &value)
+    }
+    /// Merge a batch of operands by feeding them through `call_proc` one at a
+    /// time, threading each result into the next call. Stops at the first
+    /// error. With no operands, the existing value (or empty bytes) is returned.
+    fn merge_batch(
+        &self,
+        key: &Bytes,
+        existing_value: Option<Bytes>,
+        operands: &[Bytes],
+    ) -> Result<Bytes, MergeOperatorError> {
+        let folded = operands.iter().try_fold(existing_value, |acc, operand| {
+            self.call_proc(key, acc.as_ref(), operand).map(Some)
+        })?;
+        Ok(folded.unwrap_or_default())
+    }
+}
+/// Resolve the optional `merge_operator` keyword argument to a built-in operator.
+///
+/// Returns `Ok(None)` when the key is absent, the string-concatenation
+/// operator for `"string_concat"` / `"concat"`, and an invalid-argument error
+/// for any other name.
+pub fn parse_merge_operator(
+    kwargs: &RHash,
+) -> Result<Option<Arc<dyn MergeOperator + Send + Sync>>, Error> {
+    let name = match get_optional::<String>(kwargs, "merge_operator")? {
+        Some(name) => name,
+        None => return Ok(None),
+    };
+    if !matches!(name.as_str(), "string_concat" | "concat") {
+        return Err(invalid_argument_error(&format!(
+            "invalid merge_operator: {} (expected 'string_concat', 'concat', or use merge_operator_proc for a custom block)",
+            name
+        )));
+    }
+    let builtin: Arc<dyn MergeOperator + Send + Sync> = Arc::new(StringConcatMergeOperator);
+    Ok(Some(builtin))
+}
+/// Build a merge operator from the optional `merge_operator_proc` keyword argument.
+///
+/// Returns `Ok(None)` when the key is absent and an invalid-argument error
+/// when the supplied object does not respond to `call`.
+pub fn parse_merge_operator_proc(
+    kwargs: &RHash,
+) -> Result<Option<Arc<dyn MergeOperator + Send + Sync>>, Error> {
+    match get_optional::<Value>(kwargs, "merge_operator_proc")? {
+        None => Ok(None),
+        Some(candidate) => {
+            // A failing `respond_to` check is treated the same as "not callable".
+            let callable = candidate.respond_to("call", false).unwrap_or(false);
+            if !callable {
+                return Err(invalid_argument_error(
+                    "merge_operator_proc must respond to 'call'",
+                ));
+            }
+            let operator: Arc<dyn MergeOperator + Send + Sync> =
+                Arc::new(RubyProcMergeOperator::new(candidate));
+            Ok(Some(operator))
+        }
+    }
+}

data/ext/slatedb/src/reader.rs CHANGED Viewed

@@ -7,6 +7,7 @@ use slatedb::DbReader;
 use crate::errors::invalid_argument_error;
 use crate::iterator::Iterator;
+use crate::merge_ops::parse_merge_operator;
 use crate::runtime::block_on_result;
 use crate::utils::{get_optional, resolve_object_store};
@@ -26,7 +27,7 @@ impl Reader {
     /// * `path` - The path identifier for the database
     /// * `url` - Optional object store URL
     /// * `checkpoint_id` - Optional checkpoint UUID to read at
-    /// * `kwargs` - Additional options (manifest_poll_interval, checkpoint_lifetime, max_memtable_bytes)
+    /// * `kwargs` - Additional options (manifest_poll_interval, checkpoint_lifetime, max_memtable_bytes, merge_operator)
     pub fn open(
         path: String,
         url: Option<String>,
@@ -39,6 +40,7 @@ impl Reader {
         let checkpoint_lifetime = get_optional::<u64>(&kwargs, "checkpoint_lifetime")?
             .map(std::time::Duration::from_millis);
         let max_memtable_bytes = get_optional::<u64>(&kwargs, "max_memtable_bytes")?;
+        let merge_operator = parse_merge_operator(&kwargs)?;
         // Parse checkpoint_id as UUID
         let checkpoint_uuid =
@@ -51,10 +53,10 @@ impl Reader {
             };
         let reader = block_on_result(async {
-            let object_store: Arc<dyn object_store::ObjectStore> = if let Some(ref url) = url {
+            let object_store: Arc<dyn slatedb::object_store::ObjectStore> = if let Some(ref url) = url {
                 resolve_object_store(url)?
             } else {
-                Arc::new(object_store::memory::InMemory::new())
+                Arc::new(slatedb::object_store::memory::InMemory::new())
             };
             let mut options = DbReaderOptions::default();
@@ -67,6 +69,9 @@ impl Reader {
             if let Some(max_bytes) = max_memtable_bytes {
                 options.max_memtable_bytes = max_bytes;
             }
+            if let Some(merge_operator) = merge_operator {
+                options.merge_operator = Some(merge_operator);
+            }
             DbReader::open(path, object_store, checkpoint_uuid, options).await
         })?;
@@ -200,6 +205,67 @@ impl Reader {
         Ok(Iterator::new(iter))
     }
+    /// Return an iterator over every key that starts with `prefix`.
+    ///
+    /// An empty `prefix` is rejected with an invalid-argument error.
+    pub fn scan_prefix(&self, prefix: String) -> Result<Iterator, Error> {
+        match prefix.as_bytes() {
+            [] => Err(invalid_argument_error("prefix cannot be empty")),
+            prefix_bytes => {
+                let found =
+                    block_on_result(async { self.inner.scan_prefix(prefix_bytes).await })?;
+                Ok(Iterator::new(found))
+            }
+        }
+    }
+    /// Return an iterator over every key that starts with `prefix`, applying
+    /// scan options read from `kwargs`.
+    ///
+    /// Recognized keys: `durability_filter` (`"remote"` or `"memory"`),
+    /// `dirty`, `read_ahead_bytes`, `cache_blocks`, `max_fetch_tasks`. Keys
+    /// that are absent leave the `ScanOptions` default untouched. An empty
+    /// `prefix` or an unknown `durability_filter` is an invalid-argument error.
+    pub fn scan_prefix_with_options(
+        &self,
+        prefix: String,
+        kwargs: RHash,
+    ) -> Result<Iterator, Error> {
+        if prefix.is_empty() {
+            return Err(invalid_argument_error("prefix cannot be empty"));
+        }
+        let mut scan_opts = ScanOptions::default();
+        match get_optional::<String>(&kwargs, "durability_filter")?.as_deref() {
+            None => {}
+            Some("remote") => scan_opts.durability_filter = DurabilityLevel::Remote,
+            Some("memory") => scan_opts.durability_filter = DurabilityLevel::Memory,
+            Some(unknown) => {
+                return Err(invalid_argument_error(&format!(
+                    "invalid durability_filter: {} (expected 'remote' or 'memory')",
+                    unknown
+                )));
+            }
+        }
+        if let Some(value) = get_optional::<bool>(&kwargs, "dirty")? {
+            scan_opts.dirty = value;
+        }
+        if let Some(value) = get_optional::<usize>(&kwargs, "read_ahead_bytes")? {
+            scan_opts.read_ahead_bytes = value;
+        }
+        if let Some(value) = get_optional::<bool>(&kwargs, "cache_blocks")? {
+            scan_opts.cache_blocks = value;
+        }
+        if let Some(value) = get_optional::<usize>(&kwargs, "max_fetch_tasks")? {
+            scan_opts.max_fetch_tasks = value;
+        }
+        let prefix_bytes = prefix.as_bytes();
+        let found = block_on_result(async {
+            self.inner
+                .scan_prefix_with_options(prefix_bytes, &scan_opts)
+                .await
+        })?;
+        Ok(Iterator::new(found))
+    }
     /// Close the reader.
     pub fn close(&self) -> Result<(), Error> {
         block_on_result(async { self.inner.close().await })?;
@@ -220,6 +286,11 @@ pub fn define_reader_class(ruby: &Ruby, module: &magnus::RModule) -> Result<(),
     class.define_method("get_bytes", method!(Reader::get_bytes, 1))?;
     class.define_method("_scan", method!(Reader::scan, 2))?;
     class.define_method("_scan_with_options", method!(Reader::scan_with_options, 3))?;
+    class.define_method("_scan_prefix", method!(Reader::scan_prefix, 1))?;
+    class.define_method(
+        "_scan_prefix_with_options",
+        method!(Reader::scan_prefix_with_options, 2),
+    )?;
     class.define_method("close", method!(Reader::close, 0))?;
     Ok(())

data/ext/slatedb/src/runtime.rs CHANGED Viewed

@@ -1,6 +1,6 @@
 use magnus::Error;
 use once_cell::sync::OnceCell;
-use rb_sys::rb_thread_call_without_gvl;
+use rb_sys::{rb_thread_call_with_gvl, rb_thread_call_without_gvl};
 use slatedb::Error as SlateError;
 use std::ffi::c_void;
 use std::future::Future;
@@ -98,3 +98,54 @@ where
     closure.result.expect("closure did not run")
 }
+/// Execute a closure while holding the Ruby GVL and return its result.
+///
+/// This is the inverse of `without_gvl`. Use this when you need to call back
+/// into Ruby from code that has previously released the GVL (e.g., inside
+/// a future being executed by `block_on`).
+///
+/// The closure and a slot for its result are packed into a stack-local
+/// `Closure`, whose address is handed to `rb_thread_call_with_gvl` as the
+/// opaque data pointer; the `call_closure` trampoline unpacks it, runs `f`
+/// once, and stores the return value in the slot.
+///
+/// # Safety
+///
+/// Although this is a safe `fn`, it must ONLY be called from a Ruby thread
+/// that has previously released the GVL via `without_gvl` or
+/// `rb_thread_call_without_gvl`. Calling it from a non-Ruby thread (like a
+/// spawned Tokio task) is expected to cause a Ruby fatal error rather than a
+/// Rust panic.
+/// NOTE(review): per Ruby's C API notes the same presumably applies when the
+/// calling thread already holds the GVL; confirm against the Ruby version used.
+///
+/// # Panics
+///
+/// Panics with "closure did not run" if `rb_thread_call_with_gvl` returns
+/// without the trampoline having stored a result.
+/// NOTE(review): a panic raised inside `f` would have to unwind through the
+/// `extern "C"` trampoline; confirm how that case is meant to be handled.
+pub fn with_gvl<F, T>(f: F) -> T
+where
+    F: FnOnce() -> T,
+{
+    // Carrier for the closure and its eventual result across the C boundary.
+    struct Closure<F, T> {
+        f: Option<F>,
+        result: Option<T>,
+    }
+    // Trampoline with the C ABI expected by `rb_thread_call_with_gvl`.
+    extern "C" fn call_closure<F, T>(data: *mut c_void) -> *mut c_void
+    where
+        F: FnOnce() -> T,
+    {
+        // SAFETY: `data` is the pointer to the `Closure<F, T>` created below,
+        // which outlives the `rb_thread_call_with_gvl` call that invokes us.
+        let closure = unsafe { &mut *(data as *mut Closure<F, T>) };
+        // `take()` moves the FnOnce out so it can be called by value exactly once.
+        if let Some(f) = closure.f.take() {
+            closure.result = Some(f());
+        }
+        // The C-level return value is unused; the result travels via `closure`.
+        std::ptr::null_mut()
+    }
+    let mut closure = Closure {
+        f: Some(f),
+        result: None,
+    };
+    // SAFETY: the function pointer and data pointer agree on `Closure<F, T>`,
+    // and `closure` stays alive on this stack frame for the whole call.
+    unsafe {
+        rb_thread_call_with_gvl(
+            Some(call_closure::<F, T>),
+            &mut closure as *mut _ as *mut c_void,
+        );
+    }
+    closure.result.expect("closure did not run")
+}

data/ext/slatedb/src/snapshot.rs CHANGED Viewed

@@ -162,6 +162,77 @@ impl Snapshot {
         Ok(Iterator::new(iter))
     }
+    /// Return an iterator over every snapshot key that starts with `prefix`.
+    ///
+    /// Fails with an invalid-argument error for an empty `prefix` and with a
+    /// closed error when the snapshot has already been closed.
+    pub fn scan_prefix(&self, prefix: String) -> Result<Iterator, Error> {
+        let prefix_bytes = match prefix.as_bytes() {
+            [] => return Err(invalid_argument_error("prefix cannot be empty")),
+            bytes => bytes,
+        };
+        let inner = self.inner.borrow();
+        let open_snapshot = inner
+            .as_ref()
+            .ok_or_else(|| closed_error("snapshot is closed"))?;
+        let found = block_on_result(async { open_snapshot.scan_prefix(prefix_bytes).await })?;
+        Ok(Iterator::new(found))
+    }
+    /// Return an iterator over every snapshot key that starts with `prefix`,
+    /// applying scan options read from `kwargs`.
+    ///
+    /// Recognized keys: `durability_filter` (`"remote"` or `"memory"`),
+    /// `dirty`, `read_ahead_bytes`, `cache_blocks`, `max_fetch_tasks`. Keys
+    /// that are absent leave the `ScanOptions` default untouched. Options are
+    /// validated before the snapshot is checked for being closed.
+    pub fn scan_prefix_with_options(
+        &self,
+        prefix: String,
+        kwargs: RHash,
+    ) -> Result<Iterator, Error> {
+        if prefix.is_empty() {
+            return Err(invalid_argument_error("prefix cannot be empty"));
+        }
+        let mut scan_opts = ScanOptions::default();
+        match get_optional::<String>(&kwargs, "durability_filter")?.as_deref() {
+            None => {}
+            Some("remote") => scan_opts.durability_filter = DurabilityLevel::Remote,
+            Some("memory") => scan_opts.durability_filter = DurabilityLevel::Memory,
+            Some(unknown) => {
+                return Err(invalid_argument_error(&format!(
+                    "invalid durability_filter: {} (expected 'remote' or 'memory')",
+                    unknown
+                )));
+            }
+        }
+        if let Some(value) = get_optional::<bool>(&kwargs, "dirty")? {
+            scan_opts.dirty = value;
+        }
+        if let Some(value) = get_optional::<usize>(&kwargs, "read_ahead_bytes")? {
+            scan_opts.read_ahead_bytes = value;
+        }
+        if let Some(value) = get_optional::<bool>(&kwargs, "cache_blocks")? {
+            scan_opts.cache_blocks = value;
+        }
+        if let Some(value) = get_optional::<usize>(&kwargs, "max_fetch_tasks")? {
+            scan_opts.max_fetch_tasks = value;
+        }
+        let inner = self.inner.borrow();
+        let open_snapshot = inner
+            .as_ref()
+            .ok_or_else(|| closed_error("snapshot is closed"))?;
+        let prefix_bytes = prefix.as_bytes();
+        let found = block_on_result(async {
+            open_snapshot
+                .scan_prefix_with_options(prefix_bytes, &scan_opts)
+                .await
+        })?;
+        Ok(Iterator::new(found))
+    }
     /// Close the snapshot and release resources.
     pub fn close(&self) -> Result<(), Error> {
         let _ = self.inner.borrow_mut().take();
@@ -186,6 +257,11 @@ pub fn define_snapshot_class(ruby: &Ruby, module: &magnus::RModule) -> Result<()
         "_scan_with_options",
         method!(Snapshot::scan_with_options, 3),
     )?;
+    class.define_method("_scan_prefix", method!(Snapshot::scan_prefix, 1))?;
+    class.define_method(
+        "_scan_prefix_with_options",
+        method!(Snapshot::scan_prefix_with_options, 2),
+    )?;
     class.define_method("close", method!(Snapshot::close, 0))?;
     class.define_method("closed?", method!(Snapshot::is_closed, 0))?;