gte 0.0.3 → 0.0.5

This diff shows the changes between publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b7ce34f894403d3d2767d9c7f694aa712b42af251b0babf741e2dcd9dd6c7a27
4
- data.tar.gz: c91aa21b10b2a20358c5d56c511623927c6e4cd4e0667cc7f40cdca405a4d10f
3
+ metadata.gz: ae83f737b57f798d39cf1fdc895d67948de27d36b46ea02c211a440d3acaa8c9
4
+ data.tar.gz: 9eaf9651b2ccf1fdb93efe4666ed70537628453a8cf92e234b454560560a83e8
5
5
  SHA512:
6
- metadata.gz: 87e824d3fa79dc67a9584b902d17329aa85eb4f8fc4a358a6350c7f19e3d4e3c170a59b852abd16332caada49106bbba3356b6a5486bbb52c97b8bef22b1b9a0
7
- data.tar.gz: 0dfeb1f6b4223f7ee88609411b94548740b588d89a92b55ba7e093564417086f24a12ebbf98bfee6a9fbd4c74d0f55dc0d66c2a6095d0d7ad7d9b1adca1b2eb7
6
+ metadata.gz: a262194a53bf804e47b0ef9c5910c1e2b814a9824823a92a73867a631c7b26310b3163e61997d9c163dab402a40d49946b76a64cc0421741ae235f623180cb95
7
+ data.tar.gz: 6acf5b58140012df9fa25971ed0f1fdfa707cc3efbe5f7f22104e35ad57877778a08cf9f8b311017be8f40e255289e3249e35c1e3780ae231f9f66e08cbb6ac3
data/README.md CHANGED
@@ -9,8 +9,105 @@ Inspired by https://github.com/fbilhaut/gte-rs
9
9
  ```ruby
10
10
  require "gte"
11
11
 
12
- model = GTE.new(ENV.fetch("GTE_MODEL_DIR"))
13
- vector = model["query: hello world"]
12
+ model = GTE.config(ENV.fetch("GTE_MODEL_DIR"))
13
+
14
+ # String input => GTE::Tensor (1 row)
15
+ tensor = model.embed("query: hello world")
16
+ vector = tensor.row(0)
17
+
18
+ # [] with string => Array<Float> (single vector)
19
+ single = model["query: nearest coffee shop"]
20
+
21
+ # [] with array => GTE::Tensor (batch)
22
+ batch = model[["query: hello", "query: world"]]
23
+ ```
24
+
25
+ ## Embedding Config (`GTE.config`)
26
+
27
+ `GTE.config(model_dir)` builds (and caches) a `GTE::Model`.
28
+
29
+ ```ruby
30
+ default_model = GTE.config(ENV.fetch("GTE_MODEL_DIR"))
31
+
32
+ raw_model = GTE.config(ENV.fetch("GTE_MODEL_DIR")) do |config|
33
+ config.with(normalize: false)
34
+ end
35
+
36
+ full_throttle = GTE.config(ENV.fetch("GTE_MODEL_DIR")) do |config|
37
+ config.with(threads: 0)
38
+ end
39
+
40
+ custom = GTE.config(ENV.fetch("GTE_MODEL_DIR")) do |config|
41
+ config.with(
42
+ output_tensor: "last_hidden_state",
43
+ max_length: 256,
44
+ optimization_level: 3
45
+ )
46
+ end
47
+ ```
48
+
49
+ Config fields and defaults:
50
+
51
+ - `model_dir`: absolute path to model directory
52
+ - `threads`: `3` (set `0` for ONNX Runtime full-throttle threadpool)
53
+ - `optimization_level`: `3`
54
+ - `model_name`: `nil`
55
+ - `normalize`: `true` (L2 normalization at Ruby-facing API)
56
+ - `output_tensor`: `nil` (auto-select output tensor)
57
+ - `max_length`: `nil` (uses tokenizer/model defaults)
58
+
59
+ Notes:
60
+
61
+ - Return a `Config::Text` from the block (for example, `config.with(...)`).
62
+ - Model instances are cached by full config key; different config values create different cached instances.
63
+
64
+ ## Reranker
65
+
66
+ Use `GTE::Reranker.config(model_dir)` for cross-encoder reranking.
67
+
68
+ ```ruby
69
+ reranker = GTE::Reranker.config(ENV.fetch("GTE_RERANK_DIR")) do |config|
70
+ config.with(sigmoid: true, threads: 0)
71
+ end
72
+
73
+ query = "how to train a neural network?"
74
+ candidates = [
75
+ "Backpropagation and gradient descent are core techniques.",
76
+ "This recipe uses flour and eggs."
77
+ ]
78
+
79
+ # Raw scores aligned with input order
80
+ scores = reranker.score(query, candidates)
81
+ # => [0.93, 0.07]
82
+
83
+ # Ranked output sorted by score desc
84
+ ranked = reranker.rerank(query: query, candidates: candidates)
85
+ # => [
86
+ # { index: 0, score: 0.93, text: "Backpropagation and gradient descent are core techniques." },
87
+ # { index: 1, score: 0.07, text: "This recipe uses flour and eggs." }
88
+ # ]
89
+ ```
90
+
91
+ Reranker config fields and defaults:
92
+
93
+ - `model_dir`: absolute path to model directory
94
+ - `threads`: `3`
95
+ - `optimization_level`: `3`
96
+ - `model_name`: `nil`
97
+ - `sigmoid`: `false` (set `true` if you want bounded [0,1] style scores)
98
+ - `output_tensor`: `nil`
99
+ - `max_length`: `nil`
100
+
101
+ ## Runtime + Result Examples
102
+
103
+ Process-local reuse (recommended for Puma/web servers):
104
+
105
+ ```ruby
106
+ EMBEDDER = GTE.config(ENV.fetch("GTE_MODEL_DIR"))
107
+
108
+ def embed_query(text)
109
+ EMBEDDER[text] # Array<Float>
110
+ end
14
111
  ```
15
112
 
16
113
  ## Model Directory
@@ -22,14 +119,28 @@ A model directory must include `tokenizer.json` and one ONNX model, resolved in
22
119
  3. `onnx/model.onnx`
23
120
  4. `model.onnx`
24
121
 
122
+ Input policy is text-only. Graphs requiring unsupported multimodal inputs (such as `pixel_values`) are intentionally rejected.
123
+
124
+ ## Execution Providers
125
+
126
+ Default execution provider is `xnnpack` on all platforms (including macOS arm64).
127
+
128
+ To opt in to CoreML explicitly:
129
+
130
+ ```bash
131
+ export GTE_EXECUTION_PROVIDERS=xnnpack,coreml
132
+ ```
133
+
25
134
  ## Development
26
135
 
27
- Run commands inside `nix develop`.
136
+ Run commands inside `nix develop` via Make targets:
28
137
 
29
138
  ```bash
30
- bundle exec rake compile
31
- cargo test --manifest-path ext/gte/Cargo.toml --no-default-features
32
- bundle exec rspec
139
+ make setup
140
+ make compile
141
+ make test
142
+ make lint
143
+ make ci
33
144
  ```
34
145
 
35
146
  ## Benchmark
@@ -37,13 +148,14 @@ bundle exec rspec
37
148
  The repo includes two benchmark paths:
38
149
 
39
150
  ```bash
40
- bundle exec rake bench:pure_compare
41
- bundle exec rake bench:puma_compare
42
- bundle exec rake bench:matrix_sweep
151
+ make bench
152
+ nix develop -c bundle exec rake bench:pure_compare
153
+ nix develop -c bundle exec rake bench:matrix_sweep
154
+ nix develop -c bundle exec ruby bench/memory_probe.rb --compare-pure
43
155
  ```
44
156
 
45
157
  For release tracking and regression detection, record a run entry in `RUNS.md`:
46
158
 
47
159
  ```bash
48
- bundle exec rake bench:record_run
160
+ make bench-record
49
161
  ```
data/Rakefile CHANGED
@@ -48,6 +48,14 @@ namespace :bench do
48
48
  )
49
49
  end
50
50
 
51
+ desc 'Run memory probe for single-instance vs duplicate-instance behavior'
52
+ task :memory_probe do
53
+ run_in_nix(
54
+ 'bundle', 'exec', 'ruby', 'bench/memory_probe.rb',
55
+ '--compare-pure'
56
+ )
57
+ end
58
+
51
59
  desc 'Run Puma benchmark, append RUNS.md entry, and enforce goal/regression checks'
52
60
  task :record_run do
53
61
  run_in_nix(
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.3
1
+ 0.0.5
data/ext/gte/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "gte"
3
- version = "0.0.3"
3
+ version = "0.0.5"
4
4
  edition = "2021"
5
5
  authors = ["elcuervo <elcuervo@elcuervo.net>"]
6
6
  license = "MIT"
@@ -1,19 +1,15 @@
1
1
  use crate::error::{GteError, Result};
2
2
  use crate::model_config::{ExtractorMode, ModelConfig};
3
+ use crate::model_profile::{
4
+ has_input, infer_extraction_mode, read_max_length, resolve_default_text_model, resolve_named_model,
5
+ resolve_tokenizer_path, select_output_tensor, validate_supported_text_inputs,
6
+ };
3
7
  use crate::postprocess::normalize_l2 as normalize_l2_rows;
4
8
  use crate::session::{build_session, run_session};
5
9
  use crate::tokenizer::{Tokenized, Tokenizer};
6
10
  use ndarray::Array2;
7
11
  use ort::session::Session;
8
- use std::path::{Path, PathBuf};
9
-
10
- #[derive(Debug, Clone, Copy, PartialEq, Eq)]
11
- pub enum ModelFamily {
12
- E5Like,
13
- SiglipLike,
14
- ClipLike,
15
- Other,
16
- }
12
+ use std::path::Path;
17
13
 
18
14
  pub struct Embedder {
19
15
  tokenizer: Tokenizer,
@@ -41,23 +37,35 @@ impl Embedder {
41
37
  num_threads: usize,
42
38
  optimization_level: u8,
43
39
  model_name: Option<&str>,
40
+ output_tensor_override: Option<&str>,
41
+ max_length_override: Option<usize>,
44
42
  ) -> Result<Self> {
43
+ const PREFERRED_EMBEDDING_OUTPUTS: [&str; 4] = [
44
+ "pooler_output",
45
+ "text_embeds",
46
+ "sentence_embedding",
47
+ "last_hidden_state",
48
+ ];
49
+
45
50
  let dir = dir.as_ref();
46
- let tokenizer_path = dir.join("tokenizer.json");
51
+ let tokenizer_path = resolve_tokenizer_path(dir)?;
47
52
  let model_path = match model_name.filter(|s| !s.is_empty()) {
48
53
  Some(name) => resolve_named_model(dir, name)?,
49
- None => resolve_model_path(dir)?,
54
+ None => resolve_default_text_model(dir)?,
50
55
  };
51
56
 
52
- if !tokenizer_path.exists() {
53
- return Err(GteError::Tokenizer(format!(
54
- "tokenizer.json not found in {}",
55
- dir.display()
56
- )));
57
- }
57
+ let max_length = if let Some(override_value) = max_length_override {
58
+ if override_value == 0 {
59
+ return Err(GteError::Inference(
60
+ "max_length override must be greater than 0".to_string(),
61
+ ));
62
+ }
63
+ override_value
64
+ } else {
65
+ read_max_length(dir)
66
+ };
58
67
 
59
- let max_length = read_max_length(dir);
60
- let temp_config = ModelConfig {
68
+ let session_config = ModelConfig {
61
69
  max_length,
62
70
  output_tensor: String::new(),
63
71
  mode: ExtractorMode::Raw,
@@ -66,13 +74,13 @@ impl Embedder {
66
74
  num_threads,
67
75
  optimization_level,
68
76
  };
69
- let session = build_session(&model_path, &temp_config)?;
77
+ let session = build_session(&model_path, &session_config)?;
70
78
 
71
- validate_supported_inputs(&session)?;
72
- let with_type_ids = session.inputs.iter().any(|i| i.name == "token_type_ids");
73
- let with_attention_mask = session.inputs.iter().any(|i| i.name == "attention_mask");
74
- let output_tensor = select_output_tensor(&session)?;
75
- let output_base = output_basename(output_tensor.as_str()).to_string();
79
+ validate_supported_text_inputs(&session, "text embedding")?;
80
+ let with_type_ids = has_input(&session, "token_type_ids");
81
+ let with_attention_mask = has_input(&session, "attention_mask");
82
+ let output_tensor =
83
+ select_output_tensor(&session, output_tensor_override, &PREFERRED_EMBEDDING_OUTPUTS)?;
76
84
  let mode = infer_extraction_mode(&session, output_tensor.as_str())?;
77
85
  if matches!(mode, ExtractorMode::MeanPool) && !with_attention_mask {
78
86
  return Err(GteError::Inference(
@@ -80,29 +88,16 @@ impl Embedder {
80
88
  ));
81
89
  }
82
90
 
83
- let tuned_num_threads = tune_num_threads(
84
- num_threads,
85
- with_attention_mask,
86
- with_type_ids,
87
- output_base.as_str(),
88
- );
89
-
90
91
  let config = ModelConfig {
91
92
  max_length,
92
93
  output_tensor,
93
94
  mode,
94
95
  with_type_ids,
95
96
  with_attention_mask,
96
- num_threads: tuned_num_threads,
97
+ num_threads,
97
98
  optimization_level,
98
99
  };
99
100
 
100
- let session = if tuned_num_threads != num_threads {
101
- build_session(&model_path, &config)?
102
- } else {
103
- session
104
- };
105
-
106
101
  let tokenizer = Tokenizer::new(&tokenizer_path, config.max_length, config.with_type_ids)?;
107
102
 
108
103
  Ok(Self {
@@ -124,235 +119,6 @@ impl Embedder {
124
119
  pub fn run(&self, tokenized: &Tokenized) -> crate::error::Result<Array2<f32>> {
125
120
  run_session(&self.session, tokenized, &self.config)
126
121
  }
127
-
128
- }
129
-
130
- fn tune_num_threads(
131
- requested: usize,
132
- with_attention_mask: bool,
133
- with_type_ids: bool,
134
- output_name: &str,
135
- ) -> usize {
136
- if requested > 0 {
137
- return requested;
138
- }
139
-
140
- let family = infer_model_family(with_attention_mask, with_type_ids, output_name);
141
- let target_concurrency = puma_target_concurrency();
142
- let host_cores = host_parallelism();
143
- let budgeted_threads = (host_cores / target_concurrency).max(1);
144
-
145
- match family {
146
- // Puma-like workloads typically run many concurrent single-item requests where
147
- // one intra-op thread per request gives the best tail behavior.
148
- ModelFamily::E5Like | ModelFamily::ClipLike | ModelFamily::SiglipLike => {
149
- budgeted_threads.min(1)
150
- }
151
- ModelFamily::Other => 0,
152
- }
153
- }
154
-
155
- fn infer_model_family(
156
- with_attention_mask: bool,
157
- with_type_ids: bool,
158
- output_name: &str,
159
- ) -> ModelFamily {
160
- if output_name == "last_hidden_state" && with_attention_mask && with_type_ids {
161
- return ModelFamily::E5Like;
162
- }
163
- if output_name == "last_hidden_state" && with_attention_mask && !with_type_ids {
164
- return ModelFamily::SiglipLike;
165
- }
166
- if output_name == "text_embeds" && !with_attention_mask {
167
- return ModelFamily::ClipLike;
168
- }
169
- ModelFamily::Other
170
- }
171
-
172
- fn puma_target_concurrency() -> usize {
173
- std::env::var("GTE_PUMA_CONCURRENCY")
174
- .ok()
175
- .and_then(|raw| raw.parse::<usize>().ok())
176
- .filter(|value| *value > 0)
177
- .unwrap_or(16)
178
- }
179
-
180
- fn host_parallelism() -> usize {
181
- std::thread::available_parallelism()
182
- .map(|n| n.get())
183
- .unwrap_or(1)
184
- }
185
-
186
- fn resolve_named_model(dir: &Path, name: &str) -> Result<PathBuf> {
187
- let candidates = [dir.join("onnx").join(name), dir.join(name)];
188
- for path in &candidates {
189
- if path.exists() {
190
- return Ok(path.clone());
191
- }
192
- }
193
- Err(GteError::Inference(format!(
194
- "model '{}' not found in {} (checked onnx/{0} and {0})",
195
- name,
196
- dir.display()
197
- )))
198
- }
199
-
200
- fn resolve_model_path(dir: &Path) -> Result<PathBuf> {
201
- let candidates = [
202
- dir.join("onnx").join("text_model.onnx"),
203
- dir.join("text_model.onnx"),
204
- dir.join("onnx").join("model.onnx"),
205
- dir.join("model.onnx"),
206
- ];
207
- for path in &candidates {
208
- if path.exists() {
209
- return Ok(path.clone());
210
- }
211
- }
212
- Err(GteError::Inference(format!(
213
- "no ONNX model found in {} (checked text_model.onnx and model.onnx)",
214
- dir.display()
215
- )))
216
- }
217
-
218
- const SUPPORTED_INPUTS: [&str; 3] = ["input_ids", "attention_mask", "token_type_ids"];
219
-
220
- fn validate_supported_inputs(session: &Session) -> Result<()> {
221
- let unsupported: Vec<String> = session
222
- .inputs
223
- .iter()
224
- .filter(|i| !SUPPORTED_INPUTS.contains(&i.name.as_str()))
225
- .map(|i| i.name.clone())
226
- .collect();
227
-
228
- if unsupported.is_empty() {
229
- return Ok(());
230
- }
231
-
232
- let mut message = format!(
233
- "unsupported model inputs for text embedding API: {}",
234
- unsupported.join(", ")
235
- );
236
- if unsupported.iter().any(|n| n == "pixel_values") {
237
- message.push_str(
238
- ". This looks like a multimodal graph. Provide a text-only export (for example onnx/text_model.onnx).",
239
- );
240
- } else {
241
- message.push_str(". Supported inputs are: input_ids, attention_mask, token_type_ids.");
242
- }
243
- Err(GteError::Inference(message))
244
- }
245
-
246
- fn output_name_matches(name: &str, preferred: &str) -> bool {
247
- let lower = name.to_ascii_lowercase();
248
- lower == preferred || lower.ends_with(&format!("/{}", preferred))
249
- }
250
-
251
- fn select_output_tensor(session: &Session) -> Result<String> {
252
- const PREFERRED: [&str; 4] = [
253
- "text_embeds",
254
- "pooler_output",
255
- "sentence_embedding",
256
- "last_hidden_state",
257
- ];
258
-
259
- for preferred in PREFERRED {
260
- if let Some(output) = session
261
- .outputs
262
- .iter()
263
- .find(|o| output_name_matches(o.name.as_str(), preferred))
264
- {
265
- return Ok(output.name.clone());
266
- }
267
- }
268
-
269
- session
270
- .outputs
271
- .first()
272
- .map(|o| o.name.clone())
273
- .ok_or_else(|| GteError::Inference("model has no outputs".into()))
274
- }
275
-
276
- fn read_max_length(dir: &Path) -> usize {
277
- (|| -> Option<usize> {
278
- let contents = std::fs::read_to_string(dir.join("tokenizer_config.json")).ok()?;
279
- let json: serde_json::Value = serde_json::from_str(&contents).ok()?;
280
- let v = json.get("model_max_length")?;
281
- let n = v
282
- .as_u64()
283
- .or_else(|| v.as_f64().filter(|&f| f > 0.0 && f < 1e15).map(|f| f as u64))?;
284
- Some((n as usize).min(8192))
285
- })()
286
- .unwrap_or(512)
287
- }
288
-
289
- #[cfg(test)]
290
- mod tests {
291
- use super::{infer_model_family, tune_num_threads, ModelFamily};
292
-
293
- #[test]
294
- fn infer_model_family_recognizes_known_signatures() {
295
- assert_eq!(
296
- infer_model_family(true, true, "last_hidden_state"),
297
- ModelFamily::E5Like
298
- );
299
- assert_eq!(
300
- infer_model_family(true, false, "last_hidden_state"),
301
- ModelFamily::SiglipLike
302
- );
303
- assert_eq!(
304
- infer_model_family(false, false, "text_embeds"),
305
- ModelFamily::ClipLike
306
- );
307
- assert_eq!(infer_model_family(true, false, "pooler_output"), ModelFamily::Other);
308
- }
309
-
310
- #[test]
311
- fn tune_num_threads_respects_requested_value() {
312
- assert_eq!(tune_num_threads(7, true, true, "last_hidden_state"), 7);
313
- }
314
-
315
- #[test]
316
- fn tune_num_threads_returns_ort_default_for_other_family() {
317
- assert_eq!(tune_num_threads(0, true, false, "pooler_output"), 0);
318
- }
319
- }
320
-
321
- fn output_basename(name: &str) -> &str {
322
- name.rsplit('/').next().unwrap_or(name)
323
- }
324
-
325
- fn infer_extraction_mode(session: &Session, output_tensor: &str) -> Result<ExtractorMode> {
326
- let output = session
327
- .outputs
328
- .iter()
329
- .find(|o| o.name == output_tensor)
330
- .ok_or_else(|| {
331
- GteError::Inference(format!(
332
- "output tensor '{}' not found in model outputs",
333
- output_tensor
334
- ))
335
- })?;
336
-
337
- let ndims = match &output.output_type {
338
- ort::value::ValueType::Tensor { dimensions, .. } => dimensions.len(),
339
- other => {
340
- return Err(GteError::Inference(format!(
341
- "output is not a tensor: {:?}",
342
- other
343
- )))
344
- }
345
- };
346
-
347
- match (output_basename(output_tensor), ndims) {
348
- ("last_hidden_state", 3) => Ok(ExtractorMode::MeanPool),
349
- (_, 2) => Ok(ExtractorMode::Raw),
350
- (_, 3) => Ok(ExtractorMode::MeanPool),
351
- (_, n) => Err(GteError::Inference(format!(
352
- "unexpected output tensor rank {} for '{}': expected 2 (Raw) or 3 (MeanPool)",
353
- n, output_tensor
354
- ))),
355
- }
356
122
  }
357
123
 
358
124
  pub fn normalize_l2(embeddings: Array2<f32>) -> Array2<f32> {
data/ext/gte/src/lib.rs CHANGED
@@ -1,7 +1,10 @@
1
1
  pub mod embedder;
2
2
  pub mod error;
3
3
  pub mod model_config;
4
+ pub mod model_profile;
5
+ pub mod pipeline;
4
6
  pub mod postprocess;
7
+ pub mod reranker;
5
8
  pub mod session;
6
9
  pub mod tokenizer;
7
10