RubyGems - gte - Versions diffs - 0.0.13 → 0.0.14 - Mend

gte 0.0.13 → 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

checksums.yaml +4 -4
data/README.md +93 -27
data/VERSION +1 -1
data/ext/gte/Cargo.toml +26 -4
data/ext/gte/benches/hot_path.rs +20 -54
data/ext/gte/build.rs +2 -6
data/ext/gte/rustfmt.toml +5 -0
data/ext/gte/src/embedder.rs +71 -43
data/ext/gte/src/error.rs +4 -4
data/ext/gte/src/lib.rs +1 -1
data/ext/gte/src/model_config.rs +4 -0
data/ext/gte/src/model_profile.rs +26 -87
data/ext/gte/src/pipeline.rs +11 -30
data/ext/gte/src/postprocess.rs +8 -14
data/ext/gte/src/reranker.rs +50 -50
data/ext/gte/src/ruby_embedder.rs +48 -53
data/ext/gte/src/session.rs +136 -248
data/ext/gte/src/tokenizer.rs +51 -125
data/ext/gte/tests/inference_integration_test.rs +8 -18
data/ext/gte/tests/padding_regression_test.rs +13 -26
data/ext/gte/tests/tokenizer_unit_test.rs +10 -24
data/lib/gte/config.rb +2 -1
data/lib/gte/embedder.rb +6 -2
data/lib/gte/reranker.rb +3 -1
data/lib/gte.rb +6 -0
metadata +2 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 278028df09fbcdd14fd583f0af5e1a8c9553adb28fe7aa0bc67b67666dbbdccd
-  data.tar.gz: ce994e3f505200ed4654ca8f87f585ff88919201fe82dd79007622f07a3d1ea0
+  metadata.gz: 37ad2ef3f640b8bbaefa14cae29541f329c3b99950a2867b9054b8e8854ca242
+  data.tar.gz: 0b54c757ca510ccc8644e0d3c1519aa7b6576fa1da4ba9012535e0b0b7d598dc
 SHA512:
-  metadata.gz: 742f1830ff2b83f89726be527c4323a81649b04f341b7adc0544a9000373f6a097c0b4b4ba211ead5912ba45d876565fbaab6d723ef8f06c488ab7827323f827
-  data.tar.gz: 75e91b3d4c3980b166268c6468b96bebe4b74db999e0cee433a295e57d89bec95c7614b004c61e8b3ed88cff30f02f3b6aff74de710d3dd3bb34552f36fb3422
+  metadata.gz: 5d67fc8c73aa2b162bf804082accc849ce972b4df51e2ec86bb7d977cd760e2bf5b760d7de141b5fbd5b6f68dd0e51d2bd15c84d1cb8dc9a8f4bf424b62490ba
+  data.tar.gz: c6cefec4d42f7ca72980e4d3454447aa681f05f0b72f96ceec226b5efd5ae32bade13d85ae55f032098d7cb7f72b530d8332f1eaf25fa364d0c33703da13e043

data/README.md CHANGED Viewed

@@ -58,6 +58,8 @@ Notes:
 - Return a `Config::Text` from the block (for example, `config.with(...)`).
 - Model instances are cached by full config key; different config values create different cached instances.
+- `GTE.warmup(model, threads:)` pre-warms thread-local ONNX sessions eagerly at boot.
+  Useful in multi-threaded servers (Puma, Sidekiq) to avoid ~100-500ms cold-start latency.
 Common model presets:
@@ -73,7 +75,7 @@ end
 siglip2 = GTE.config(ENV.fetch("GTE_SIGLIP2_DIR")) do |config|
   config.with(
-    model_name: "text_model_int8.onnx",
+    model_name: "text_model.onnx",
     output_tensor: "pooler_output",
     max_length: 64,
     execution_providers: "cpu"
@@ -147,6 +149,53 @@ Session pool sizing:
 - `GTE_SESSION_POOL_CAP`: optional positive integer cap for internal ONNX session pool size.
 - Unset by default; runtime uses available CPU parallelism.
+## Automatic Tuning
+`gte` automatically adapts to the hardware — no configuration required.
+### ONNX Intra-op Threads
+- Auto-detected via `std::thread::available_parallelism()` capped at 4.
+- Prevents oversubscription on high-concurrency workloads.
+- Override with `GTE_INTRA_OP_NUM_THREADS` env var.
+### ONNX Inter-op Threads
+- Defaults to 1 (text embedding graphs are linear chains with no independent parallel nodes).
+- Override with `GTE_INTER_OP_NUM_THREADS` env var.
+### Execution Providers
+`gte` automatically tries XNNPACK for optimized CPU inference and falls back to
+ORT's default CPU provider if XNNPACK is unavailable.
+- **ARM64** (Apple Silicon, AWS Graviton): XNNPACK is typically **~25% faster**
+  than plain CPU while producing identical embeddings (cos=1.0, max_abs=0.0).
+- **x86/x64** (Intel, AMD): XNNPACK offers minimal benefit — ORT's default CPU
+  provider already uses its built-in MLAS kernels, which are better tuned for these chips.
+  The auto-detect silently falls back to the default provider.
+Configure providers explicitly with `GTE_EXECUTION_PROVIDERS` (comma-separated):
+```bash
+export GTE_EXECUTION_PROVIDERS=xnnpack,coreml
+```
+Set `cpu` or `none` to skip auto-detect and use ORT's default CPU provider.
+### Session Pre-Warming
+ONNX sessions are created lazily per OS thread. In multi-threaded servers (Puma, Sidekiq),
+each thread creates its own session on first use (~100-500ms cold start).
+Pre-warm sessions eagerly at boot:
+```ruby
+model = GTE.config(ENV.fetch("GTE_MODEL_DIR"))
+# Pre-warm thread-local sessions for a Puma server with 5 threads:
+GTE.warmup(model, threads: 5)
+```
 ## Runtime + Result Examples
 Process-local reuse (recommended for Puma/web servers):
@@ -170,47 +219,64 @@ A model directory must include `tokenizer.json` and one ONNX model, resolved in
 Input policy is text-only. Graphs requiring unsupported multimodal inputs (such as `pixel_values`) are intentionally rejected.
-## Execution Providers
-Default behavior is CPU fallback via ONNX Runtime's default provider (no explicit provider registration).
+## Development
+Run commands inside `nix develop` via Make targets:
+```bash
+make setup
+make compile
+make test
+make lint
+make ci
+```
+## Benchmarks
-Configure providers with `GTE_EXECUTION_PROVIDERS` (comma-separated, case-insensitive).
-Supported values:
+### Docker Rails+Puma+wrk (Real-World HTTP)
-- `cpu` or `none`: CPU fallback (skip explicit provider registration)
-- `xnnpack`
-- `coreml`
+The `bench/rails/` directory contains a full-stack benchmark: a Rails 7.1 API app served by Puma,
+load-tested with wrk (randomized text queries, 135 diverse texts).
-Examples:
+Run for all models:
 ```bash
-export GTE_EXECUTION_PROVIDERS=cpu
-export GTE_EXECUTION_PROVIDERS=xnnpack,coreml
+make bench-docker-compare
 ```
-Ruby per-instance override (takes precedence over `GTE_EXECUTION_PROVIDERS`):
+Run for a single model:
-```ruby
-model = GTE.config(ENV.fetch("GTE_MODEL_DIR")) do |config|
-  config.with(execution_providers: "cpu")
-end
+```bash
+make bench-docker-sweep-siglip2
+make bench-docker-validate  # cross-validation checks
 ```
-## Development
+#### Siglip2 (768-dim, pooler_output)
-Run commands inside `nix develop` via Make targets:
+| Concurrency | GTE p90 | Pure Ruby p90 | Ratio | GTE RPS | Pure Ruby RPS |
+|------------|---------|---------------|-------|---------|---------------|
+| c=1 | ~12ms | ~120ms | 9-10× | ~95 | ~10 |
+| c=4 | ~39ms | ~503ms | 10-13× | ~228 | ~10 |
+| c=8 | ~146ms | ~613ms | 3-4× | ~224 | ~10 |
+| c=16 | ~430ms | ~611ms | 1-1.5× | ~226 | ~11 |
-```bash
-make setup
-make compile
-make test
-make lint
-make ci
-```
+#### E5 (384-dim, last_hidden_state + mean pool)
+| Concurrency | GTE p90 | Pure Ruby p90 | Ratio | GTE RPS | Pure Ruby RPS |
+|------------|---------|---------------|-------|---------|---------------|
+| c=1 | ~7ms | ~120ms | 16-17× | ~160 | ~10 |
+| c=4 | ~12ms | ~430ms | 35-40× | ~477 | ~10 |
+| c=8 | ~64ms | ~530ms | 8-9× | ~503 | ~10 |
+| c=16 | ~205ms | ~534ms | 2-3× | ~509 | ~11 |
+GTE releases the GVL during ONNX inference, enabling true parallelism across Puma threads.
+Pure Ruby is GVL-bound (~10 RPS regardless of concurrency).
-## Benchmark
+The Puma thread pool (min=2, max=5) limits throughput at c=16+.
+GTE's pipelining and GVL release already saturate the available threads at c=4.
-The repo includes a shared multi-runtime benchmark harness:
+### In-Process Benchmarks
 ```bash
 make bench

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.0.13
1	+ 0.0.14

data/ext/gte/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "gte"
-version = "0.0.13"
+version = "0.0.14"
 edition = "2021"
 authors = ["elcuervo <elcuervo@elcuervo.net>"]
 license = "MIT"
@@ -22,11 +22,8 @@ ruby-ffi = ["dep:magnus", "dep:rb-sys"]
 rb-sys = { version = "0.9", features = ["stable-api-compiled-fallback"], optional = true }
 magnus = { version = "0.8", optional = true }
 ort = { version = "=2.0.0-rc.12", features = ["ndarray", "xnnpack"] }
-ort-sys = "=2.0.0-rc.12"
 tokenizers = "0.21.0"
 ndarray = "0.17"
-half = "2"
-serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 [dev-dependencies]
@@ -35,3 +32,28 @@ criterion = "0.5"
 [[bench]]
 name = "hot_path"
 harness = false
+[lints.rust]
+unsafe_code = "deny"
+rust_2018_idioms = "warn"
+unused_qualifications = "warn"
+unused_results = "warn"
+[lints.clippy]
+all = "warn"
+pedantic = "warn"
+# Reasonable exceptions to pedantic lints:
+module_name_repetitions = "allow"
+missing_errors_doc = "allow"
+must_use_candidate = "allow"
+cast_possible_truncation = "allow"
+cast_sign_loss = "allow"
+cast_precision_loss = "allow"
+similar_names = "allow"
+too_many_lines = "allow"
+# ndarray::ArrayView types are Copy — passing by value is idiomatic
+needless_pass_by_value = "allow"
+# ort::Outlet::name() is on a private type — closure is required
+redundant_closure_for_method_calls = "allow"
+# Transitive dep conflicts are not actionable in this crate
+multiple_crate_versions = "allow"

data/ext/gte/benches/hot_path.rs CHANGED Viewed

@@ -5,9 +5,7 @@ use gte::postprocess::{mean_pool, normalize_l2};
 use ndarray::{Array2, Array3};
 fn build_hidden_states(batch: usize, seq: usize, dim: usize) -> Array3<f32> {
-    Array3::from_shape_fn((batch, seq, dim), |(b, s, d)| {
-        (((b * 31 + s * 17 + d * 13) % 97) as f32) / 97.0
-    })
+    Array3::from_shape_fn((batch, seq, dim), |(b, s, d)| (((b * 31 + s * 17 + d * 13) % 97) as f32) / 97.0)
 }
 fn build_attention_mask(batch: usize, seq: usize) -> Array2<i64> {
@@ -22,15 +20,7 @@ fn bench_mean_pool(c: &mut Criterion) {
         group.bench_with_input(
             BenchmarkId::from_parameter(format!("{batch}x{seq}x{dim}")),
             &(batch, seq, dim),
-            |b, _| {
-                b.iter(|| {
-                    mean_pool(
-                        black_box(hidden_states.view()),
-                        black_box(attention_mask.view()),
-                    )
-                    .unwrap()
-                })
-            },
+            |b, _| b.iter(|| mean_pool(black_box(hidden_states.view()), black_box(attention_mask.view())).unwrap()),
         );
     }
     group.finish();
@@ -39,14 +29,10 @@ fn bench_mean_pool(c: &mut Criterion) {
 fn bench_normalize_l2(c: &mut Criterion) {
     let mut group = c.benchmark_group("normalize_l2");
     for (rows, dim) in [(1, 384), (8, 384), (32, 768), (128, 768)] {
-        let embeddings = Array2::from_shape_fn((rows, dim), |(row, col)| {
-            (((row * 19 + col * 7) % 113) as f32) / 113.0
+        let embeddings = Array2::from_shape_fn((rows, dim), |(row, col)| (((row * 19 + col * 7) % 113) as f32) / 113.0);
+        group.bench_with_input(BenchmarkId::from_parameter(format!("{rows}x{dim}")), &(rows, dim), |b, _| {
+            b.iter(|| normalize_l2(black_box(embeddings.clone())))
         });
-        group.bench_with_input(
-            BenchmarkId::from_parameter(format!("{rows}x{dim}")),
-            &(rows, dim),
-            |b, _| b.iter(|| normalize_l2(black_box(embeddings.clone()))),
-        );
     }
     group.finish();
 }
@@ -61,26 +47,14 @@ fn bench_padding_impact(c: &mut Criterion) {
     let dim = 768;
     let mut group = c.benchmark_group("padding_impact");
-    for (label, seq) in [
-        ("batch_longest/4tok", 4usize),
-        ("fixed/siglip2_max_64", 64usize),
-        ("fixed/e5_max_512", 512usize),
-    ] {
+    for (label, seq) in
+        [("batch_longest/4tok", 4usize), ("fixed/siglip2_max_64", 64usize), ("fixed/e5_max_512", 512usize)]
+    {
         let hidden_states = build_hidden_states(1, seq, dim);
         let attention_mask = build_attention_mask(1, seq);
-        group.bench_with_input(
-            BenchmarkId::from_parameter(label),
-            &seq,
-            |b, _| {
-                b.iter(|| {
-                    mean_pool(
-                        black_box(hidden_states.view()),
-                        black_box(attention_mask.view()),
-                    )
-                    .unwrap()
-                })
-            },
-        );
+        group.bench_with_input(BenchmarkId::from_parameter(label), &seq, |b, _| {
+            b.iter(|| mean_pool(black_box(hidden_states.view()), black_box(attention_mask.view())).unwrap())
+        });
     }
     group.finish();
 }
@@ -93,7 +67,12 @@ fn bench_padding_impact(c: &mut Criterion) {
 // Sweeps execution providers for quick local comparison.
 fn bench_embedding_e2e(c: &mut Criterion) {
     let cases = [
-        ("e5", "GTE_BENCH_E5_DIR", "query: cat", "query: ".to_string() + &"the quick brown fox jumps over the lazy dog ".repeat(20)),
+        (
+            "e5",
+            "GTE_BENCH_E5_DIR",
+            "query: cat",
+            "query: ".to_string() + &"the quick brown fox jumps over the lazy dog ".repeat(20),
+        ),
         ("siglip2", "GTE_BENCH_SIGLIP2_DIR", "cat", "a photo of ".to_string() + &"a cat sitting on a mat ".repeat(10)),
         ("clip", "GTE_BENCH_CLIP_DIR", "cat", "a photo of ".to_string() + &"a cat sitting on a mat ".repeat(10)),
     ];
@@ -107,10 +86,7 @@ fn bench_embedding_e2e(c: &mut Criterion) {
         };
         for provider in ["cpu", "xnnpack"] {
-            let overrides = ModelLoadOverrides {
-                execution_providers: Some(provider),
-                ..ModelLoadOverrides::default()
-            };
+            let overrides = ModelLoadOverrides { execution_providers: Some(provider), ..ModelLoadOverrides::default() };
             let embedder = match Embedder::from_dir(&dir, 3, overrides) {
                 Ok(e) => e,
                 Err(err) => {
@@ -122,11 +98,7 @@ fn bench_embedding_e2e(c: &mut Criterion) {
             for (input_label, input) in [("short", short_input.to_string()), ("long", long_input.clone())] {
                 let id = BenchmarkId::from_parameter(format!("{model_label}/{provider}/{input_label}"));
                 group.bench_with_input(id, &input, |b, text| {
-                    b.iter(|| {
-                        embedder
-                            .embed(black_box(vec![text.clone()]))
-                            .expect("embed succeeds")
-                    })
+                    b.iter(|| embedder.embed(black_box(&[text.clone()])).expect("embed succeeds"))
                 });
             }
         }
@@ -134,11 +106,5 @@ fn bench_embedding_e2e(c: &mut Criterion) {
     group.finish();
 }
-criterion_group!(
-    benches,
-    bench_mean_pool,
-    bench_normalize_l2,
-    bench_padding_impact,
-    bench_embedding_e2e
-);
+criterion_group!(benches, bench_mean_pool, bench_normalize_l2, bench_padding_impact, bench_embedding_e2e);
 criterion_main!(benches);

data/ext/gte/build.rs CHANGED Viewed

@@ -1,15 +1,11 @@
 fn main() {
-    let version = std::fs::read_to_string("../../VERSION")
-        .expect("VERSION file not found")
-        .trim()
-        .to_string();
+    let version = std::fs::read_to_string("../../VERSION").expect("VERSION file not found").trim().to_string();
     let cargo_version = env!("CARGO_PKG_VERSION");
     assert_eq!(
         version, cargo_version,
-        "VERSION file ({}) doesn't match Cargo.toml ({}). Update Cargo.toml to match.",
-        version, cargo_version
+        "VERSION file ({version}) doesn't match Cargo.toml ({cargo_version}). Update Cargo.toml to match.",
     );
     println!("cargo:rerun-if-changed=../../VERSION");

data/ext/gte/rustfmt.toml ADDED Viewed

@@ -0,0 +1,5 @@
+edition = "2021"
+max_width = 120
+use_small_heuristics = "Max"
+newline_style = "Unix"
+tab_spaces = 4

data/ext/gte/src/embedder.rs CHANGED Viewed

@@ -1,11 +1,11 @@
 use crate::error::{GteError, Result};
 use crate::model_config::{ExtractorMode, ModelConfig, ModelLoadOverrides, PaddingMode};
 use crate::model_profile::{
-    has_input, infer_extraction_mode, read_tokenizer_profile, resolve_default_text_model,
-    resolve_named_model, resolve_tokenizer_path, select_output_tensor, validate_supported_text_inputs,
+    has_input, infer_extraction_mode, read_tokenizer_profile, resolve_default_text_model, resolve_named_model,
+    resolve_tokenizer_path, select_output_tensor, validate_supported_text_inputs,
 };
 use crate::postprocess::normalize_l2 as normalize_l2_rows;
-use crate::session::{build_session, run_session, SessionPool};
+use crate::session::{build_session, SessionPool};
 use crate::tokenizer::{parse_padding_mode_override, Tokenized, Tokenizer};
 use ndarray::Array2;
 use std::path::{Path, PathBuf};
@@ -13,7 +13,7 @@ use std::path::{Path, PathBuf};
 pub struct Embedder {
     tokenizer: Tokenizer,
     pool: SessionPool,
-    config: ModelConfig,
+    pub config: ModelConfig,
 }
 impl Embedder {
@@ -22,30 +22,17 @@ impl Embedder {
         P1: AsRef<Path>,
         P2: AsRef<Path>,
     {
-        let tokenizer = Tokenizer::new(
-            tokenizer_path,
-            config.max_length,
-            config.with_type_ids,
-            config.padding_mode,
-            None,
-        )?;
-        let model_path = model_path.as_ref().to_path_buf();
-        let session = build_session(&model_path, &config)?;
-        let pool = SessionPool::new(session, model_path, config.clone());
+        let tokenizer =
+            Tokenizer::new(tokenizer_path, config.max_length, config.with_type_ids, config.padding_mode, None)?;
+        let model_path = model_path.as_ref();
+        let session = build_session(model_path, &config)?;
+        let pool = SessionPool::new(session, model_path, &config)?;
         Ok(Self { tokenizer, pool, config })
     }
-    pub fn from_dir<P: AsRef<Path>>(
-        dir: P,
-        optimization_level: u8,
-        overrides: ModelLoadOverrides<'_>,
-    ) -> Result<Self> {
-        const PREFERRED_EMBEDDING_OUTPUTS: [&str; 4] = [
-            "pooler_output",
-            "text_embeds",
-            "sentence_embedding",
-            "last_hidden_state",
-        ];
+    pub fn from_dir<P: AsRef<Path>>(dir: P, optimization_level: u8, overrides: ModelLoadOverrides<'_>) -> Result<Self> {
+        const PREFERRED_EMBEDDING_OUTPUTS: [&str; 4] =
+            ["pooler_output", "text_embeds", "sentence_embedding", "last_hidden_state"];
         let dir = dir.as_ref();
         let tokenizer_path = resolve_tokenizer_path(dir)?;
@@ -57,16 +44,13 @@ impl Embedder {
         let tokenizer_profile = read_tokenizer_profile(dir);
         let max_length = if let Some(override_value) = overrides.max_length {
             if override_value == 0 {
-                return Err(GteError::Inference(
-                    "max_length override must be greater than 0".to_string(),
-                ));
+                return Err(GteError::Inference("max_length override must be greater than 0".to_string()));
             }
             override_value.min(tokenizer_profile.safe_max_length)
         } else {
             tokenizer_profile.default_max_length
         };
-        let padding_mode =
-            parse_padding_mode_override(overrides.padding)?.unwrap_or(PaddingMode::Auto);
+        let padding_mode = parse_padding_mode_override(overrides.padding)?.unwrap_or(PaddingMode::Auto);
         let session_config = ModelConfig {
             max_length,
@@ -77,19 +61,18 @@ impl Embedder {
             with_attention_mask: true,
             optimization_level,
             execution_providers: overrides.execution_providers.map(str::to_string),
+            lowercase_input: overrides.lowercase_input.unwrap_or(false),
+            max_input_chars: overrides.max_input_chars,
         };
         let session = build_session(&model_path, &session_config)?;
         validate_supported_text_inputs(&session, "text embedding")?;
         let with_type_ids = has_input(&session, "token_type_ids");
         let with_attention_mask = has_input(&session, "attention_mask");
-        let output_tensor =
-            select_output_tensor(&session, overrides.output_tensor, &PREFERRED_EMBEDDING_OUTPUTS)?;
+        let output_tensor = select_output_tensor(&session, overrides.output_tensor, &PREFERRED_EMBEDDING_OUTPUTS)?;
         let mode = infer_extraction_mode(&session, output_tensor.as_str())?;
         if matches!(mode, ExtractorMode::MeanPool) && !with_attention_mask {
-            return Err(GteError::Inference(
-                "cannot use mean pooling without attention_mask input".to_string(),
-            ));
+            return Err(GteError::Inference("cannot use mean pooling without attention_mask input".to_string()));
         }
         let config = ModelConfig {
@@ -101,6 +84,8 @@ impl Embedder {
             with_attention_mask,
             optimization_level,
             execution_providers: overrides.execution_providers.map(str::to_string),
+            lowercase_input: overrides.lowercase_input.unwrap_or(false),
+            max_input_chars: overrides.max_input_chars,
         };
         let tokenizer = Tokenizer::new(
@@ -111,29 +96,72 @@ impl Embedder {
             tokenizer_profile.fixed_padding_length,
         )?;
-        let pool = SessionPool::new(session, model_path, session_config);
+        let pool = SessionPool::new(session, &model_path, &session_config)?;
         Ok(Self { tokenizer, pool, config })
     }
-    pub fn embed(&self, texts: Vec<String>) -> Result<Array2<f32>> {
-        self.embed_ref(&texts)
+    pub fn embed(&self, texts: &[String]) -> Result<Array2<f32>> {
+        self.embed_ref(texts)
     }
     pub fn embed_ref(&self, texts: &[String]) -> Result<Array2<f32>> {
-        let tokenized = self.tokenize(texts)?;
+        let sanitized: Vec<String>;
+        let input = if self.config.lowercase_input || self.config.max_input_chars.is_some() {
+            sanitized = texts
+                .iter()
+                .map(|t| {
+                    let mut s = if self.config.lowercase_input { t.to_lowercase() } else { t.clone() };
+                    if let Some(max_chars) = self.config.max_input_chars {
+                        s.truncate(max_chars.min(s.len()));
+                    }
+                    s
+                })
+                .collect();
+            &sanitized
+        } else {
+            texts
+        };
+        let tokenized = self.tokenize(input)?;
         self.run(&tokenized)
     }
-    pub fn tokenize(&self, texts: &[String]) -> crate::error::Result<Tokenized> {
+    pub fn tokenize(&self, texts: &[String]) -> Result<Tokenized> {
         self.tokenizer.tokenize(texts)
     }
-    pub fn run(&self, tokenized: &Tokenized) -> crate::error::Result<Array2<f32>> {
-        let mut session = self.pool.acquire()?;
-        run_session(&mut session, tokenized, &self.config)
+    pub fn run(&self, tokenized: &Tokenized) -> Result<Array2<f32>> {
+        self.pool.run(tokenized, &self.config)
     }
 }
 pub fn normalize_l2(embeddings: Array2<f32>) -> Array2<f32> {
     normalize_l2_rows(embeddings)
 }
+pub fn output_name_suggests_normalized(name: &str) -> bool {
+    let lower = name.to_ascii_lowercase();
+    let base = lower.rsplit('/').next().unwrap_or(&lower);
+    base.contains("normalized") || base.contains("l2_norm") || base.contains("l2norm")
+}
+#[cfg(test)]
+mod normalize_tests {
+    use super::output_name_suggests_normalized;
+    #[test]
+    fn detects_normalized_output_names() {
+        assert!(output_name_suggests_normalized("pooled_sentence_embeddings_debiased_normalized"));
+        assert!(output_name_suggests_normalized("embeddings/L2_Normalized"));
+        assert!(output_name_suggests_normalized("l2norm_output"));
+        assert!(output_name_suggests_normalized("norm/l2_norm_tensor"));
+    }
+    #[test]
+    fn does_not_detect_raw_output_names() {
+        assert!(!output_name_suggests_normalized("last_hidden_state"));
+        assert!(!output_name_suggests_normalized("text_embeds"));
+        assert!(!output_name_suggests_normalized("pooler_output"));
+        assert!(!output_name_suggests_normalized("sentence_embedding"));
+        assert!(!output_name_suggests_normalized("logits"));
+    }
+}

data/ext/gte/src/error.rs CHANGED Viewed

@@ -9,10 +9,10 @@ pub enum GteError {
 impl std::fmt::Display for GteError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
-            GteError::Tokenizer(msg) => write!(f, "GTE tokenizer error: {}", msg),
-            GteError::Inference(msg) => write!(f, "GTE inference error: {}", msg),
-            GteError::Ort(msg) => write!(f, "GTE ORT error: {}", msg),
-            GteError::Shape(msg) => write!(f, "GTE shape error: {}", msg),
+            GteError::Tokenizer(msg) => write!(f, "GTE tokenizer error: {msg}"),
+            GteError::Inference(msg) => write!(f, "GTE inference error: {msg}"),
+            GteError::Ort(msg) => write!(f, "GTE ORT error: {msg}"),
+            GteError::Shape(msg) => write!(f, "GTE shape error: {msg}"),
         }
     }
 }

data/ext/gte/src/lib.rs CHANGED Viewed

@@ -19,7 +19,7 @@ use magnus::{prelude::*, Error, Ruby};
 fn init(ruby: &Ruby) -> Result<(), Error> {
     let module = ruby.define_module("GTE")?;
     module.define_error("Error", ruby.exception_standard_error())?;
-    crate::ruby_embedder::register(ruby)?;
+    ruby_embedder::register(ruby)?;
     std::panic::set_hook(Box::new(|info| {
         let msg = info
             .payload()

data/ext/gte/src/model_config.rs CHANGED Viewed

@@ -23,6 +23,8 @@ pub struct ModelConfig {
     pub with_attention_mask: bool,
     pub optimization_level: u8,
     pub execution_providers: Option<String>,
+    pub lowercase_input: bool,
+    pub max_input_chars: Option<usize>,
 }
 #[derive(Debug, Clone, Copy, Default)]
@@ -32,4 +34,6 @@ pub struct ModelLoadOverrides<'a> {
     pub max_length: Option<usize>,
     pub padding: Option<&'a str>,
     pub execution_providers: Option<&'a str>,
+    pub lowercase_input: Option<bool>,
+    pub max_input_chars: Option<usize>,
 }