RubyGems - gte - Versions diffs - 0.0.14-x86_64-linux → 0.0.16-x86_64-linux - Mend

gte 0.0.14-x86_64-linux → 0.0.16-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

checksums.yaml +4 -4
data/Gemfile +0 -1
data/README.md +112 -82
data/Rakefile +0 -9
data/VERSION +1 -1
data/ext/gte/Cargo.toml +2 -1
data/ext/gte/src/embedder.rs +29 -65
data/ext/gte/src/lib.rs +1 -0
data/ext/gte/src/model_config.rs +0 -4
data/ext/gte/src/pipeline.rs +8 -9
data/ext/gte/src/postprocess.rs +8 -6
data/ext/gte/src/reranker.rs +7 -10
data/ext/gte/src/ruby_embedder.rs +10 -33
data/ext/gte/src/session.rs +58 -109
data/ext/gte/src/tokenizer.rs +45 -38
data/ext/gte/tests/embedder_unit_test.rs +1 -1
data/ext/gte/tests/padding_regression_test.rs +7 -25
data/ext/gte/tests/tokenizer_unit_test.rs +7 -7
data/lib/gte/config.rb +1 -2
data/lib/gte/embedder.rb +2 -14
data/lib/gte/gte.so +0 -0
data/lib/gte/model.rb +0 -7
data/lib/gte/reranker.rb +14 -33
data/lib/gte.rb +4 -25
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 89f684dfc37cc272603c6ba0cccadff558d743eb1a3d2b25ba5bc6eafe1efbeb
-  data.tar.gz: d819b6b3523bf19ff84f0b7f5203213b470ed7e83d6ed3dcf2713daaf42b9a32
+  metadata.gz: 20d9efdde7d7af021cc2940f5f1a2c6139f05530aa9504dbd94f6d3470fd0ff2
+  data.tar.gz: dd999664494476009790aa7c1431abf900eefae2d846818e22bda8735712ee7d
 SHA512:
-  metadata.gz: 153a2b6d1d8bdff7414ffa768f1ab0084f6a49578fa63e933a07f51eda21d8c4d10ae2be11ce13af314655488e2df060dad829446b19020a7b7d0d153e72ddc7
-  data.tar.gz: 899ad238106610f95cb76abf3e48ae5625cc3098a6477f168db8e29c29d22f1af2683acf4fa105e1a66ce7c9b0465564f836319aab9da1d966aaaf67e2a994e6
+  metadata.gz: 60a197aed55cde07447227d011c95dd835ee150bd0e2d16319d434367da9dd5f5ef54ccfe09e509547ad13ec92c0919360125e743a174910e8d1b309969889f8
+  data.tar.gz: c1e5f83a48ebfb0e8fcb43d02bf2c744daa959386c5499cd489d1e220fbdf9b7c2a5baf2a48c51f3495db648a5b20c5f7888647dd78d497d8b2abcece2bcf890

data/Gemfile CHANGED Viewed

@@ -8,7 +8,6 @@ gem 'rake'
 gem 'rake-compiler'
 gem 'rb_sys'
 gem 'rspec'
-gem 'rspec-benchmark'
 gem 'rubocop', require: false
 group :bench do

data/README.md CHANGED Viewed

@@ -15,32 +15,29 @@ model = GTE.config(ENV.fetch("GTE_MODEL_DIR"))
 tensor = model.embed("query: hello world")
 vector = tensor.row(0)
-# [] with string => Array<Float> (single vector)
-single = model["query: nearest coffee shop"]
-# [] with array => GTE::Tensor (batch)
-batch = model[["query: hello", "query: world"]]
+# Binary f32 bytes (zero-copy to Numo/NumPy)
+bytes = model.embed_binary("query: hello world")
 ```
-## Embedding Config (`GTE.config`)
+## Embedding Config (`GTE::Pool`)
-`GTE.config(model_dir)` builds (and caches) a `GTE::Model`.
+`GTE.config(model_dir)` creates a new pool with one ONNX session by default.
 ```ruby
-default_model = GTE.config(ENV.fetch("GTE_MODEL_DIR"))
-raw_model = GTE.config(ENV.fetch("GTE_MODEL_DIR")) do |config|
-  config.with(normalize: false)
-end
+default = GTE.config(ENV.fetch("GTE_MODEL_DIR"))
+default.embed("query: hello world")
-custom = GTE.config(ENV.fetch("GTE_MODEL_DIR")) do |config|
+# With config overrides
+configurable = GTE.config(ENV.fetch("GTE_MODEL_DIR")) do |config|
   config.with(
     output_tensor: "last_hidden_state",
-    max_length: 256,
-    padding: "batch_longest",
-    optimization_level: 3
+    max_length: 128,
+    execution_providers: "xnnpack"
   )
 end
+# Explicit pool size (each session costs ~120MB RSS)
+large = GTE.config(ENV.fetch("GTE_MODEL_DIR"), pool_size: 4)
 ```
 Config fields and defaults:
@@ -48,19 +45,11 @@ Config fields and defaults:
 - `model_dir`: absolute path to model directory
 - `optimization_level`: `3`
 - `model_name`: `nil`
-- `normalize`: `true` (L2 normalization at Ruby-facing API)
 - `output_tensor`: `nil` (auto-select output tensor)
 - `max_length`: `nil` (uses tokenizer/model defaults)
 - `padding`: `nil` (auto; accepts `auto`, `batch_longest`, `fixed`)
 - `execution_providers`: `nil` (falls back to `GTE_EXECUTION_PROVIDERS` / CPU default)
-Notes:
-- Return a `Config::Text` from the block (for example, `config.with(...)`).
-- Model instances are cached by full config key; different config values create different cached instances.
-- `GTE.warmup(model, threads:)` pre-warms thread-local ONNX sessions eagerly at boot.
-  Useful in multi-threaded servers (Puma, Sidekiq) to avoid ~100-500ms cold-start latency.
 Common model presets:
 ```ruby
@@ -91,27 +80,28 @@ clip = GTE.config(ENV.fetch("GTE_CLIP_DIR")) do |config|
 end
 ```
-Picking a specific layer:
+Output selection:
 - Use `output_tensor:` to request a named model output.
 - `last_hidden_state` gives token-level hidden states and is mean-pooled by `gte` when the tensor is rank 3.
-- `pooler_output`, `sentence_embedding`, and similar 2D tensors are returned directly and then L2-normalized by default.
+- `pooler_output`, `sentence_embedding`, and similar 2D tensors are returned directly and L2-normalized.
+- If the output tensor name suggests already-normalized output (e.g. `l2_norm`, `normalized`), normalization is skipped.
 - If the requested tensor is not present in the model, `gte` raises an error instead of silently falling back.
-Low-level embedder setup (without model cache):
+Low-level embedder setup (without Pool convenience):
 ```ruby
-embedder = GTE::Embedder.config(ENV.fetch("GTE_MODEL_DIR")) do |config|
-  config.with(execution_providers: "cpu")
-end
+embedder = GTE::Embedder.from_config(
+  GTE::Embedder.default_config(ENV.fetch("GTE_MODEL_DIR"))
+)
 ```
 ## Reranker
-Use `GTE::Reranker.config(model_dir)` for cross-encoder reranking.
+Use `GTE::Reranker.new(model_dir)` for cross-encoder reranking.
 ```ruby
-reranker = GTE::Reranker.config(ENV.fetch("GTE_RERANK_DIR")) do |config|
+reranker = GTE::Reranker.new(ENV.fetch("GTE_RERANK_DIR")) do |config|
   config.with(sigmoid: true)
 end
@@ -124,13 +114,6 @@ candidates = [
 # Raw scores aligned with input order
 scores = reranker.score(query, candidates)
 # => [0.93, 0.07]
-# Ranked output sorted by score desc
-ranked = reranker.rerank(query: query, candidates: candidates)
-# => [
-#      { index: 0, score: 0.93, text: "Backpropagation and gradient descent are core techniques." },
-#      { index: 1, score: 0.07, text: "This recipe uses flour and eggs." }
-#    ]
 ```
 Reranker config fields and defaults:
@@ -144,26 +127,10 @@ Reranker config fields and defaults:
 - `padding`: `nil` (auto; accepts `auto`, `batch_longest`, `fixed`)
 - `execution_providers`: `nil`
-Session pool sizing:
-- `GTE_SESSION_POOL_CAP`: optional positive integer cap for internal ONNX session pool size.
-- Unset by default; runtime uses available CPU parallelism.
 ## Automatic Tuning
 `gte` automatically adapts to the hardware — no configuration required.
-### ONNX Intra-op Threads
-- Auto-detected via `std::thread::available_parallelism()` capped at 4.
-- Prevents oversubscription on high-concurrency workloads.
-- Override with `GTE_INTRA_OP_NUM_THREADS` env var.
-### ONNX Inter-op Threads
-- Defaults to 1 (text embedding graphs are linear chains with no independent parallel nodes).
-- Override with `GTE_INTER_OP_NUM_THREADS` env var.
 ### Execution Providers
 `gte` automatically tries XNNPACK for optimized CPU inference. Falls back to
@@ -183,28 +150,61 @@ export GTE_EXECUTION_PROVIDERS=xnnpack,coreml
 Set `cpu` or `none` to skip auto-detect and use ORT's default CPU provider.
+### Session Pool
+gte uses a **pre-allocated session pool** per worker — it creates N sessions at
+construction time, where N is determined by:
+| Priority | Source | Description |
+|----------|--------|-------------|
+| 1 | `GTE_SESSION_POOL_SIZE` | Explicit size (e.g. `4`) |
+| 2 | `PUMA_MAX_THREADS` | Match Puma concurrency (capped at 8) |
+| 3 | Default | `1` (single session, matching the unsplash-api singleton pattern) |
+The pool is fixed-size: sessions are never created or destroyed after construction.
+When all sessions are busy, the calling thread blocks on `parking_lot::Mutex`
+until a session is released. This avoids the allocation and memory overhead of
+lazy-growing pools while matching the concurrency needs of application threads.
 ### Session Pre-Warming
-ONNX sessions are created lazily per OS thread. In multi-threaded servers (Puma, Sidekiq),
-each thread creates its own session on first use (~100-500ms cold start).
-Pre-warm sessions eagerly at boot:
+The pool is pre-warmed automatically in `GTE.config` — one inference per
+session is run on construction so the first production request never hits a cold
+cache. No manual warmup step needed.
-```ruby
-model = GTE.config(ENV.fetch("GTE_MODEL_DIR"))
+To re-warm (useful after fork in Puma's `on_worker_boot`):
-# Pre-warm thread-local sessions for a Puma server with 5 threads:
-GTE.warmup(model, threads: 5)
+```ruby
+pool.warmup
 ```
-## Runtime + Result Examples
+### Tuning Performance
+| Variable | Effect | Default |
+|----------|--------|---------|
+| `GTE_SESSION_POOL_SIZE` | Number of ONNX sessions per worker (fixed at construction) | `1` (or `PUMA_MAX_THREADS`, capped at 8) |
+| `GTE_INTRA_OP_NUM_THREADS` | Threads ONNX Runtime uses per inference op | `min(CPU cores, 4)` |
+| `GTE_INTER_OP_NUM_THREADS` | Threads for independent graph nodes (irrelevant for text models) | `1` |
+| `GTE_EXECUTION_PROVIDERS` | Comma-separated: `xnnpack`, `coreml`, `cpu` | Auto: `xnnpack` on arm64 |
+**To squeeze more throughput:**
+- Set `GTE_SESSION_POOL_SIZE` to match or slightly exceed your Puma max thread count (`PUMA_MAX_THREADS`).
+- On machines with many cores, reduce `GTE_INTRA_OP_NUM_THREADS` to `1` or `2`
+  to avoid CPU oversubscription when multiple sessions run concurrently.
+**Memory estimation per worker:**
+- Approximate memory for a pool of size N (default 1): **N × model file size × 3–5**
+- Each additional session adds ~120MB RSS on arm64 with XNNPACK.
+## Runtime
 Process-local reuse (recommended for Puma/web servers):
 ```ruby
-EMBEDDER = GTE.config(ENV.fetch("GTE_MODEL_DIR"))
+$gte = GTE.config(ENV.fetch("GTE_MODEL_DIR"))
 def embed_query(text)
-  EMBEDDER[text] # Array<Float>
+  $gte.embed(text).row(0) # Array<Float>
 end
 ```
@@ -219,7 +219,6 @@ A model directory must include `tokenizer.json` and one ONNX model, resolved in
 Input policy is text-only. Graphs requiring unsupported multimodal inputs (such as `pixel_values`) are intentionally rejected.
 ## Development
 Run commands inside `nix develop` via Make targets:
@@ -256,38 +255,37 @@ make bench-docker-validate  # cross-validation checks
 | Concurrency | GTE p90 | Pure Ruby p90 | Ratio | GTE RPS | Pure Ruby RPS |
 |------------|---------|---------------|-------|---------|---------------|
-| c=1 | ~12ms | ~120ms | 9-10× | ~95 | ~10 |
-| c=4 | ~39ms | ~503ms | 10-13× | ~228 | ~10 |
-| c=8 | ~146ms | ~613ms | 3-4× | ~224 | ~10 |
-| c=16 | ~430ms | ~611ms | 1-1.5× | ~226 | ~11 |
+| c=1 | ~14ms | ~92ms | 6.4× | ~89 | ~21 |
+| c=2 | ~15ms | ~175ms | 11.4× | ~163 | ~21 |
+| c=4 | ~39ms | ~293ms | 7.4× | ~219 | ~24 |
+| c=8 | ~75ms | ~502ms | 6.7× | ~195 | ~24 |
+| c=16 | ~279ms | ~606ms | 2.2× | ~219 | ~26 |
 #### E5 (384-dim, last_hidden_state + mean pool)
 | Concurrency | GTE p90 | Pure Ruby p90 | Ratio | GTE RPS | Pure Ruby RPS |
 |------------|---------|---------------|-------|---------|---------------|
-| c=1 | ~7ms | ~120ms | 16-17× | ~160 | ~10 |
-| c=4 | ~12ms | ~430ms | 35-40× | ~477 | ~10 |
-| c=8 | ~64ms | ~530ms | 8-9× | ~503 | ~10 |
-| c=16 | ~205ms | ~534ms | 2-3× | ~509 | ~11 |
+| c=1 | ~8ms | ~73ms | 9.3× | ~152 | ~32 |
+| c=2 | ~8ms | ~95ms | 11.8× | ~291 | ~36 |
+| c=4 | ~22ms | ~163ms | 7.5× | ~432 | ~45 |
+| c=8 | ~51ms | ~291ms | 5.7× | ~451 | ~43 |
+| c=16 | ~133ms | ~1080ms | 8.1× | ~467 | ~47 |
-GTE releases the GVL during ONNX inference, enabling true parallelism across Puma threads.
-Pure Ruby is GVL-bound (~10 RPS regardless of concurrency).
+GTE releases the GVL during ONNX inference, enabling true parallelism across
+Puma threads and worker processes. Pure Ruby is serialized
+(~21–47 RPS regardless of concurrency).
-The Puma thread pool (min=2, max=5) limits throughput at c=16+.
-GTE's pipelining and GVL release already saturate the available threads at c=4.
+Config: Puma workers=2, threads min=2/max=5, cpus=4, mem_limit=3g.
+Load generated with `wrk` in Docker using a random 135-text query set, 15s runs.
 ### In-Process Benchmarks
 ```bash
 make bench
-nix develop -c bundle exec rake bench:pure_compare
-nix develop -c bundle exec rake bench:matrix_sweep
 nix develop -c bundle exec ruby bench/memory_probe.rb --compare-pure
 ```
 - `make bench`: Puma-like single-request comparison at concurrency `16`
-- `rake bench:pure_compare`: batch amortization comparison
-- `rake bench:matrix_sweep`: GTE provider sweep using the shared result schema
 - Optional Python comparisons use `bench/python_onnxruntime.py` and are skipped automatically if local dependencies are unavailable.
 To run benchmark + append a `RUNS.md` entry + enforce goal checks:
@@ -300,3 +298,35 @@ make bench-record
 - Enforces the goal metric (`response_time_p95`) across every enabled competitor.
 - Does not require current-version coverage in `RUNS.md` unless explicitly enabled.
+## Fork Safety
+GTE uses ONNX Runtime sessions which maintain internal thread pools for parallelism
+(`GTE_INTRA_OP_NUM_THREADS`, default `min(cpus, 4)`). These thread pools are
+per-session and may not survive `fork()` on some platforms.
+**With Puma's `preload_app!`:**
+Sessions built before `fork()` share memory via COW, but the internal ORT threads
+created during `Session::builder().commit_from_file()` do not exist in the child
+process. When a forked worker calls `session.run()`, ORT must recreate these
+threads, which adds latency to the first inference call.
+**Recommendations:**
+1. **Set `GTE_INTRA_OP_NUM_THREADS=1`** in forked environments to avoid creating
+   per-session thread pools entirely. ORT will run inference single-threaded,
+   which is acceptable when multiple sessions handle concurrency.
+2. **Build sessions in `on_worker_boot`** instead of before fork to guarantee
+   fresh thread pools in each worker. This adds ~200ms to worker startup per
+   model but ensures consistent inference latency:
+   ```ruby
+   # config/puma.rb
+   on_worker_boot do
+     $gte_pool = GTE.config(ENV.fetch("GTE_MODEL_DIR"))
+   end
+   ```
+3. **If using `preload_app!`**, call `GTE.config` in `before_fork` and set
+   `GTE_INTRA_OP_NUM_THREADS=1` to avoid thread pool issues in child processes.

data/Rakefile CHANGED Viewed

@@ -74,15 +74,6 @@ namespace :bench do
     )
   end
-  desc 'Sweep execution-provider settings for Puma-like benchmark'
-  task :matrix_sweep do
-    run_in_nix(
-      'bundle', 'exec', 'ruby', 'bench/puma_matrix_sweep.rb',
-      '--iterations', '80',
-      '--runs', '3'
-    )
-  end
   desc 'Run memory probe for single-instance vs duplicate-instance behavior'
   task :memory_probe do
     run_in_nix(

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.0.14
1	+ 0.0.16

data/ext/gte/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "gte"
-version = "0.0.14"
+version = "0.0.16"
 edition = "2021"
 authors = ["elcuervo <elcuervo@elcuervo.net>"]
 license = "MIT"
@@ -22,6 +22,7 @@ ruby-ffi = ["dep:magnus", "dep:rb-sys"]
 rb-sys = { version = "0.9", features = ["stable-api-compiled-fallback"], optional = true }
 magnus = { version = "0.8", optional = true }
 ort = { version = "=2.0.0-rc.12", features = ["ndarray", "xnnpack"] }
+parking_lot = "0.12"
 tokenizers = "0.21.0"
 ndarray = "0.17"
 serde_json = "1"

data/ext/gte/src/embedder.rs CHANGED Viewed

@@ -5,8 +5,8 @@ use crate::model_profile::{
     resolve_tokenizer_path, select_output_tensor, validate_supported_text_inputs,
 };
 use crate::postprocess::normalize_l2 as normalize_l2_rows;
-use crate::session::{build_session, SessionPool};
-use crate::tokenizer::{parse_padding_mode_override, Tokenized, Tokenizer};
+use crate::session::{build_session, resolve_pool_size, run_session, SessionPool};
+use crate::tokenizer::{parse_padding_mode_override, Tokenizer};
 use ndarray::Array2;
 use std::path::{Path, PathBuf};
@@ -14,22 +14,10 @@ pub struct Embedder {
     tokenizer: Tokenizer,
     pool: SessionPool,
     pub config: ModelConfig,
+    normalize: bool,
 }
 impl Embedder {
-    pub fn new<P1, P2>(tokenizer_path: P1, model_path: P2, config: ModelConfig) -> Result<Self>
-    where
-        P1: AsRef<Path>,
-        P2: AsRef<Path>,
-    {
-        let tokenizer =
-            Tokenizer::new(tokenizer_path, config.max_length, config.with_type_ids, config.padding_mode, None)?;
-        let model_path = model_path.as_ref();
-        let session = build_session(model_path, &config)?;
-        let pool = SessionPool::new(session, model_path, &config)?;
-        Ok(Self { tokenizer, pool, config })
-    }
     pub fn from_dir<P: AsRef<Path>>(dir: P, optimization_level: u8, overrides: ModelLoadOverrides<'_>) -> Result<Self> {
         const PREFERRED_EMBEDDING_OUTPUTS: [&str; 4] =
             ["pooler_output", "text_embeds", "sentence_embedding", "last_hidden_state"];
@@ -52,7 +40,7 @@ impl Embedder {
         };
         let padding_mode = parse_padding_mode_override(overrides.padding)?.unwrap_or(PaddingMode::Auto);
-        let session_config = ModelConfig {
+        let probe_config = ModelConfig {
             max_length,
             padding_mode,
             output_tensor: String::new(),
@@ -61,10 +49,8 @@ impl Embedder {
             with_attention_mask: true,
             optimization_level,
             execution_providers: overrides.execution_providers.map(str::to_string),
-            lowercase_input: overrides.lowercase_input.unwrap_or(false),
-            max_input_chars: overrides.max_input_chars,
         };
-        let session = build_session(&model_path, &session_config)?;
+        let session = build_session(&model_path, &probe_config)?;
         validate_supported_text_inputs(&session, "text embedding")?;
         let with_type_ids = has_input(&session, "token_type_ids");
@@ -84,10 +70,10 @@ impl Embedder {
             with_attention_mask,
             optimization_level,
             execution_providers: overrides.execution_providers.map(str::to_string),
-            lowercase_input: overrides.lowercase_input.unwrap_or(false),
-            max_input_chars: overrides.max_input_chars,
         };
+        let normalize = should_normalize_output(&config.output_tensor);
         let tokenizer = Tokenizer::new(
             &tokenizer_path,
             config.max_length,
@@ -96,72 +82,50 @@ impl Embedder {
             tokenizer_profile.fixed_padding_length,
         )?;
-        let pool = SessionPool::new(session, &model_path, &session_config)?;
-        Ok(Self { tokenizer, pool, config })
+        let pool_size = resolve_pool_size();
+        let pool = SessionPool::new(&model_path, &config, pool_size)?;
+        Ok(Self { tokenizer, pool, config, normalize })
     }
     pub fn embed(&self, texts: &[String]) -> Result<Array2<f32>> {
-        self.embed_ref(texts)
-    }
-    pub fn embed_ref(&self, texts: &[String]) -> Result<Array2<f32>> {
-        let sanitized: Vec<String>;
-        let input = if self.config.lowercase_input || self.config.max_input_chars.is_some() {
-            sanitized = texts
-                .iter()
-                .map(|t| {
-                    let mut s = if self.config.lowercase_input { t.to_lowercase() } else { t.clone() };
-                    if let Some(max_chars) = self.config.max_input_chars {
-                        s.truncate(max_chars.min(s.len()));
-                    }
-                    s
-                })
-                .collect();
-            &sanitized
+        let tokenized = self.tokenizer.tokenize(texts)?;
+        let embeddings = self.pool.with_session(|session| run_session(session, &tokenized, &self.config))?;
+        if self.normalize {
+            Ok(normalize_l2_rows(embeddings))
         } else {
-            texts
-        };
-        let tokenized = self.tokenize(input)?;
-        self.run(&tokenized)
+            Ok(embeddings)
+        }
     }
-    pub fn tokenize(&self, texts: &[String]) -> Result<Tokenized> {
+    pub fn tokenize(&self, texts: &[String]) -> Result<crate::tokenizer::Tokenized> {
         self.tokenizer.tokenize(texts)
     }
-    pub fn run(&self, tokenized: &Tokenized) -> Result<Array2<f32>> {
-        self.pool.run(tokenized, &self.config)
-    }
-}
-pub fn normalize_l2(embeddings: Array2<f32>) -> Array2<f32> {
-    normalize_l2_rows(embeddings)
 }
-pub fn output_name_suggests_normalized(name: &str) -> bool {
+fn should_normalize_output(name: &str) -> bool {
     let lower = name.to_ascii_lowercase();
     let base = lower.rsplit('/').next().unwrap_or(&lower);
-    base.contains("normalized") || base.contains("l2_norm") || base.contains("l2norm")
+    !(base.contains("normalized") || base.contains("l2_norm") || base.contains("l2norm"))
 }
 #[cfg(test)]
 mod normalize_tests {
-    use super::output_name_suggests_normalized;
+    use super::should_normalize_output;
     #[test]
     fn detects_normalized_output_names() {
-        assert!(output_name_suggests_normalized("pooled_sentence_embeddings_debiased_normalized"));
-        assert!(output_name_suggests_normalized("embeddings/L2_Normalized"));
-        assert!(output_name_suggests_normalized("l2norm_output"));
-        assert!(output_name_suggests_normalized("norm/l2_norm_tensor"));
+        assert!(!should_normalize_output("pooled_sentence_embeddings_debiased_normalized"));
+        assert!(!should_normalize_output("embeddings/L2_Normalized"));
+        assert!(!should_normalize_output("l2norm_output"));
+        assert!(!should_normalize_output("norm/l2_norm_tensor"));
     }
     #[test]
     fn does_not_detect_raw_output_names() {
-        assert!(!output_name_suggests_normalized("last_hidden_state"));
-        assert!(!output_name_suggests_normalized("text_embeds"));
-        assert!(!output_name_suggests_normalized("pooler_output"));
-        assert!(!output_name_suggests_normalized("sentence_embedding"));
-        assert!(!output_name_suggests_normalized("logits"));
+        assert!(should_normalize_output("last_hidden_state"));
+        assert!(should_normalize_output("text_embeds"));
+        assert!(should_normalize_output("pooler_output"));
+        assert!(should_normalize_output("sentence_embedding"));
+        assert!(should_normalize_output("logits"));
     }
 }

data/ext/gte/src/lib.rs CHANGED Viewed

@@ -18,6 +18,7 @@ use magnus::{prelude::*, Error, Ruby};
 #[magnus::init]
 fn init(ruby: &Ruby) -> Result<(), Error> {
     let module = ruby.define_module("GTE")?;
+    #[allow(unused_results)]
     module.define_error("Error", ruby.exception_standard_error())?;
     ruby_embedder::register(ruby)?;
     std::panic::set_hook(Box::new(|info| {

data/ext/gte/src/model_config.rs CHANGED Viewed

@@ -23,8 +23,6 @@ pub struct ModelConfig {
     pub with_attention_mask: bool,
     pub optimization_level: u8,
     pub execution_providers: Option<String>,
-    pub lowercase_input: bool,
-    pub max_input_chars: Option<usize>,
 }
 #[derive(Debug, Clone, Copy, Default)]
@@ -34,6 +32,4 @@ pub struct ModelLoadOverrides<'a> {
     pub max_length: Option<usize>,
     pub padding: Option<&'a str>,
     pub execution_providers: Option<&'a str>,
-    pub lowercase_input: Option<bool>,
-    pub max_input_chars: Option<usize>,
 }

data/ext/gte/src/pipeline.rs CHANGED Viewed

@@ -11,21 +11,20 @@ pub struct InputTensors<'a> {
 impl<'a> InputTensors<'a> {
     pub fn from_tokenized(tokenized: &'a Tokenized, with_attention_mask: bool) -> Result<Self> {
-        let input_ids_view: ArrayView2<'_, i64> =
-            ArrayView2::from_shape((tokenized.rows, tokenized.cols), tokenized.input_ids.as_slice())?;
-        let attention_mask: ArrayView2<'_, i64> =
-            ArrayView2::from_shape((tokenized.rows, tokenized.cols), tokenized.attn_masks.as_slice())?;
+        let input_ids_view = tokenized.input_ids.view();
+        let attention_mask = tokenized.attn_masks.view();
-        let mut inputs = Vec::with_capacity(2 + usize::from(tokenized.type_ids.is_some()));
-        inputs.push(("input_ids", SessionInputValue::from(TensorRef::from_array_view(input_ids_view)?)));
+        let mut inputs = Vec::with_capacity(2);
         if with_attention_mask {
+            inputs.push(("input_ids", SessionInputValue::from(TensorRef::from_array_view(input_ids_view)?)));
             inputs.push(("attention_mask", SessionInputValue::from(TensorRef::from_array_view(attention_mask)?)));
+        } else {
+            inputs.push(("input_ids", SessionInputValue::from(TensorRef::from_array_view(input_ids_view)?)));
         }
-        if let Some(type_ids) = tokenized.type_ids.as_deref() {
-            let type_ids_view: ArrayView2<'_, i64> =
-                ArrayView2::from_shape((tokenized.rows, tokenized.cols), type_ids)?;
+        if let Some(ref type_ids) = tokenized.type_ids {
+            let type_ids_view = type_ids.view();
             inputs.push(("token_type_ids", SessionInputValue::from(TensorRef::from_array_view(type_ids_view)?)));
         }

data/ext/gte/src/postprocess.rs CHANGED Viewed

@@ -94,12 +94,13 @@ fn mean_pool_contiguous(
         if mask_row.iter().all(|&weight| weight == 1) {
             for token_index in 0..seq {
                 let token_base = hidden_base + token_index * dim;
-                for dim_index in 0..dim {
-                    output_row[dim_index] += hidden[token_base + dim_index];
+                let token_slice = &hidden[token_base..token_base + dim];
+                for (out, &h) in output_row.iter_mut().zip(token_slice.iter()) {
+                    *out += h;
                 }
             }
-            for value in output_row {
+            for value in output_row.iter_mut() {
                 *value *= seq_inverse;
             }
             continue;
@@ -114,15 +115,16 @@ fn mean_pool_contiguous(
             let weight = weight_raw as f32;
             let token_base = hidden_base + token_index * dim;
-            for dim_index in 0..dim {
-                output_row[dim_index] += hidden[token_base + dim_index] * weight;
+            let token_slice = &hidden[token_base..token_base + dim];
+            for (out, &h) in output_row.iter_mut().zip(token_slice.iter()) {
+                *out += h * weight;
             }
             weight_sum += weight;
         }
         if weight_sum > 0.0 {
             let inverse = weight_sum.recip();
-            for value in output_row {
+            for value in output_row.iter_mut() {
                 *value *= inverse;
             }
         }