gte 0.0.8 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2c754b4675ee105e9a280cd9deafa00a81b9e02ee629131f3e908400006b6ae4
4
- data.tar.gz: 40a0d3e04c3d2943ae50910164d644ecb763eac99a02044dc962cc141a0e13c5
3
+ metadata.gz: 245f038cae58b7059fcc966479eb642316ab2611e64acf555214eec54713ce00
4
+ data.tar.gz: d23d0b248eeffd2d24d6be8e3190bbc9b06681cb265ecf05a6b174bcaa7b55b4
5
5
  SHA512:
6
- metadata.gz: 16614e01e7a33a53339ba9fe7cf32fe7606041518a24177258d7a6e5550516e8cff741d0f0df02b7e5863fc763c02ae81b943dc4b18295701a4cafdec6627cb0
7
- data.tar.gz: 348e1fd1d9f4c44214b5101ba339109b5ececfbef18b48b7c11324a64481f476d8da831cc5148d17a85c41b525ee753c296d4421a4fb2adda269a3f5fe38cda6
6
+ metadata.gz: 188f5f5ee4320d1bead817a2d43b033eff2aeaed3e17b61bc9a49bef6c4e7edc0e4c9983f494b3b9eab314ea2171a6f95cff7dcffd3aeb3c5a6180f7c0b60a3c
7
+ data.tar.gz: 5d514fcb9fbed57b5ef1bb488003b474ecd6e1e497e97ad223ea7daecc4e78c396fe17269bc6da48d3dadf865d7c6b36a7878f5c006b0c2144e09212ff8435d4
data/Rakefile CHANGED
@@ -31,7 +31,7 @@ extension_task = Rake::ExtensionTask.new('gte', spec) do |ext|
31
31
  ext.cross_platform = cross_platforms
32
32
  end
33
33
 
34
- if cross_target && !cross_target.empty? && ENV['RUBY_CC_VERSION']
34
+ if cross_target && !cross_target.empty? && ENV.fetch('RUBY_CC_VERSION', nil) && cross_target != 'x86_64-linux'
35
35
  ruby_version = ENV['RUBY_CC_VERSION'].split(':').first
36
36
  lib_binary_path = File.join(extension_task.lib_dir, File.basename(extension_task.binary(cross_target)))
37
37
  copy_task = "copy:gte:#{cross_target}:#{ruby_version}"
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.8
1
+ 0.0.10
data/ext/gte/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "gte"
3
- version = "0.0.8"
3
+ version = "0.0.10"
4
4
  edition = "2021"
5
5
  authors = ["elcuervo <elcuervo@elcuervo.net>"]
6
6
  license = "MIT"
@@ -1,4 +1,6 @@
1
1
  use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
2
+ use gte::embedder::Embedder;
3
+ use gte::model_config::ModelLoadOverrides;
2
4
  use gte::postprocess::{mean_pool, normalize_l2};
3
5
  use ndarray::{Array2, Array3};
4
6
 
@@ -49,5 +51,90 @@ fn bench_normalize_l2(c: &mut Criterion) {
49
51
  group.finish();
50
52
  }
51
53
 
52
- criterion_group!(benches, bench_mean_pool, bench_normalize_l2);
54
+ // Replicates the fixed-padding regression: a short input (4 tokens, like "cat")
55
+ // padded to max_length costs proportionally more in every downstream operation.
56
+ // Siglip2 regressed from 7ms → 44ms when tokenizer.json had "padding.strategy.Fixed: 64".
57
+ // Each row here represents: (label, actual_tokens, padded_to)
58
+ // batch_longest → seq = actual_tokens
59
+ // fixed → seq = max_length regardless of input
60
+ fn bench_padding_impact(c: &mut Criterion) {
61
+ let dim = 768;
62
+ let mut group = c.benchmark_group("padding_impact");
63
+
64
+ for (label, seq) in [
65
+ ("batch_longest/4tok", 4usize),
66
+ ("fixed/siglip2_max_64", 64usize),
67
+ ("fixed/e5_max_512", 512usize),
68
+ ] {
69
+ let hidden_states = build_hidden_states(1, seq, dim);
70
+ let attention_mask = build_attention_mask(1, seq);
71
+ group.bench_with_input(
72
+ BenchmarkId::from_parameter(label),
73
+ &seq,
74
+ |b, _| {
75
+ b.iter(|| {
76
+ mean_pool(
77
+ black_box(hidden_states.view()),
78
+ black_box(attention_mask.view()),
79
+ )
80
+ .unwrap()
81
+ })
82
+ },
83
+ );
84
+ }
85
+ group.finish();
86
+ }
87
+
88
+ // End-to-end inference bench. Requires real ONNX models on disk. Skips
89
+ // silently when env vars not set so default `cargo bench` stays cheap.
90
+ // GTE_BENCH_E5_DIR — sentence-transformers / E5-style text model dir
91
+ // GTE_BENCH_SIGLIP2_DIR — siglip2 text encoder dir
92
+ // GTE_BENCH_CLIP_DIR — clip text encoder dir
93
+ // Sweeps threads ∈ {0 (auto/all-cores), 1, 2} to validate DEFAULT_THREADS=0.
94
+ fn bench_embedding_e2e(c: &mut Criterion) {
95
+ let cases = [
96
+ ("e5", "GTE_BENCH_E5_DIR", "query: cat", "query: ".to_string() + &"the quick brown fox jumps over the lazy dog ".repeat(20)),
97
+ ("siglip2", "GTE_BENCH_SIGLIP2_DIR", "cat", "a photo of ".to_string() + &"a cat sitting on a mat ".repeat(10)),
98
+ ("clip", "GTE_BENCH_CLIP_DIR", "cat", "a photo of ".to_string() + &"a cat sitting on a mat ".repeat(10)),
99
+ ];
100
+
101
+ let mut group = c.benchmark_group("embedding_e2e");
102
+ group.sample_size(20);
103
+
104
+ for (model_label, env_var, short_input, long_input) in cases.iter() {
105
+ let Some(dir) = std::env::var(env_var).ok().filter(|v| !v.is_empty()) else {
106
+ continue;
107
+ };
108
+
109
+ for &threads in &[0usize, 1, 2] {
110
+ let embedder = match Embedder::from_dir(&dir, threads, 3, ModelLoadOverrides::default()) {
111
+ Ok(e) => e,
112
+ Err(err) => {
113
+ eprintln!("skip {model_label} threads={threads}: {err}");
114
+ continue;
115
+ }
116
+ };
117
+
118
+ for (input_label, input) in [("short", short_input.to_string()), ("long", long_input.clone())] {
119
+ let id = BenchmarkId::from_parameter(format!("{model_label}/threads_{threads}/{input_label}"));
120
+ group.bench_with_input(id, &input, |b, text| {
121
+ b.iter(|| {
122
+ embedder
123
+ .embed(black_box(vec![text.clone()]))
124
+ .expect("embed succeeds")
125
+ })
126
+ });
127
+ }
128
+ }
129
+ }
130
+ group.finish();
131
+ }
132
+
133
+ criterion_group!(
134
+ benches,
135
+ bench_mean_pool,
136
+ bench_normalize_l2,
137
+ bench_padding_impact,
138
+ bench_embedding_e2e
139
+ );
53
140
  criterion_main!(benches);
@@ -4,6 +4,7 @@ use crate::embedder::{normalize_l2, Embedder};
4
4
  use crate::error::GteError;
5
5
  use crate::model_config::ModelLoadOverrides;
6
6
  use crate::reranker::Reranker;
7
+ use crate::tokenizer::Tokenized;
7
8
  use magnus::{function, method, prelude::*, wrap, Error, RArray, Ruby};
8
9
  use std::os::raw::c_void;
9
10
  use std::panic::{catch_unwind, AssertUnwindSafe};
@@ -32,9 +33,10 @@ pub struct RbTensor {
32
33
  // GVL-release helpers
33
34
  // ---------------------------------------------------------------------------
34
35
 
36
+ // Tokenized holds only Vec<i64> fields — safe to send across threads.
35
37
  struct InferArgs {
36
38
  embedder: *const Embedder,
37
- texts: *const Vec<String>,
39
+ tokenized: *const Tokenized,
38
40
  normalize: bool,
39
41
  result: Option<crate::error::Result<ndarray::Array2<f32>>>,
40
42
  }
@@ -63,8 +65,9 @@ fn panic_payload_to_string(payload: Box<dyn std::any::Any + Send>) -> String {
63
65
  unsafe extern "C" fn run_without_gvl(ptr: *mut c_void) -> *mut c_void {
64
66
  let args = &mut *(ptr as *mut InferArgs);
65
67
  let run_result = catch_unwind(AssertUnwindSafe(|| {
66
- let tokenized = (*args.embedder).tokenize(&*args.texts)?;
67
- let embeddings = (*args.embedder).run(&tokenized)?;
68
+ // Tokenization happens before GVL release (in rb_embed / rb_embed_one).
69
+ // Only ONNX inference runs here without the GVL.
70
+ let embeddings = (*args.embedder).run(&*args.tokenized)?;
68
71
  if args.normalize { Ok(normalize_l2(embeddings)) } else { Ok(embeddings) }
69
72
  }));
70
73
  args.result = Some(match run_result {
@@ -95,12 +98,12 @@ unsafe extern "C" fn run_score_without_gvl(ptr: *mut c_void) -> *mut c_void {
95
98
  fn infer_without_gvl(
96
99
  embedder: &Arc<Embedder>,
97
100
  normalize: bool,
98
- texts: Vec<String>,
101
+ tokenized: &Tokenized,
99
102
  ) -> Result<ndarray::Array2<f32>, Error> {
100
103
  let embeddings = unsafe {
101
104
  let mut args = InferArgs {
102
105
  embedder: Arc::as_ptr(embedder),
103
- texts: &texts as *const Vec<String>,
106
+ tokenized: tokenized as *const Tokenized,
104
107
  normalize,
105
108
  result: None,
106
109
  };
@@ -195,12 +198,14 @@ impl RbEmbedder {
195
198
 
196
199
  pub fn rb_embed(_ruby: &Ruby, rb_self: &Self, texts: RArray) -> Result<RbTensor, Error> {
197
200
  let texts: Vec<String> = texts.to_vec()?;
198
- let embeddings = infer_without_gvl(&rb_self.inner, rb_self.normalize, texts)?;
201
+ let tokenized = rb_self.inner.tokenize(&texts).map_err(magnus::Error::from)?;
202
+ let embeddings = infer_without_gvl(&rb_self.inner, rb_self.normalize, &tokenized)?;
199
203
  tensor_from_array(embeddings)
200
204
  }
201
205
 
202
206
  pub fn rb_embed_one(_ruby: &Ruby, rb_self: &Self, text: String) -> Result<RbTensor, Error> {
203
- let embeddings = infer_without_gvl(&rb_self.inner, rb_self.normalize, vec![text])?;
207
+ let tokenized = rb_self.inner.tokenize(&[text]).map_err(magnus::Error::from)?;
208
+ let embeddings = infer_without_gvl(&rb_self.inner, rb_self.normalize, &tokenized)?;
204
209
  tensor_from_array(embeddings)
205
210
  }
206
211
  }
@@ -292,6 +297,10 @@ impl RbTensor {
292
297
  Self::row(ruby, rb_self, 0)
293
298
  }
294
299
 
300
+ pub fn first_binary_f32(ruby: &Ruby, rb_self: &Self) -> Result<magnus::RString, Error> {
301
+ Self::row_binary_f32(ruby, rb_self, 0)
302
+ }
303
+
295
304
  pub fn row_binary_f32(
296
305
  ruby: &Ruby,
297
306
  rb_self: &Self,
@@ -354,6 +363,7 @@ pub fn register(ruby: &Ruby) -> Result<(), Error> {
354
363
  tensor_class.define_method("row", method!(RbTensor::row, 1))?;
355
364
  tensor_class.define_method("first", method!(RbTensor::first, 0))?;
356
365
  tensor_class.define_method("row_binary_f32", method!(RbTensor::row_binary_f32, 1))?;
366
+ tensor_class.define_method("first_binary_f32", method!(RbTensor::first_binary_f32, 0))?;
357
367
  tensor_class.define_method("to_a", method!(RbTensor::to_a, 0))?;
358
368
  tensor_class.define_method("to_binary_f32", method!(RbTensor::to_binary_f32, 0))?;
359
369
  Ok(())
@@ -28,7 +28,7 @@ pub fn build_session<P: AsRef<Path>>(model_path: P, config: &ModelConfig) -> Res
28
28
  .map_err(ort_err)?
29
29
  .with_optimization_level(opt_level)
30
30
  .map_err(ort_err)?
31
- .with_memory_pattern(true)
31
+ .with_memory_pattern(false)
32
32
  .map_err(ort_err)?;
33
33
 
34
34
  let providers = preferred_execution_providers(config.execution_providers.as_deref());
@@ -54,12 +54,6 @@ pub fn build_session<P: AsRef<Path>>(model_path: P, config: &ModelConfig) -> Res
54
54
  // Session pool
55
55
  // ---------------------------------------------------------------------------
56
56
 
57
- const AUTO_THREAD_POOL_CAP: usize = 6;
58
-
59
- /// Keep enough sessions to cover the configured thread budget without
60
- /// oversubscribing CPU parallelism. In ORT auto-thread mode (`num_threads == 0`)
61
- /// we still keep a modest pool because request-level concurrency benefits from
62
- /// more than one session even when ORT manages thread counts internally.
63
57
  fn pool_capacity(num_threads: usize) -> usize {
64
58
  let available_parallelism = std::thread::available_parallelism()
65
59
  .map(|n| n.get())
@@ -72,8 +66,10 @@ fn pool_capacity_with_parallelism(num_threads: usize, available_parallelism: usi
72
66
  return 1;
73
67
  }
74
68
 
69
+ // Auto-thread mode: ORT grabs all cores per session. One session avoids
70
+ // N² intra-op oversubscription when multiple Ruby threads call concurrently.
75
71
  if num_threads == 0 {
76
- return available_parallelism.clamp(1, AUTO_THREAD_POOL_CAP);
72
+ return 1;
77
73
  }
78
74
 
79
75
  available_parallelism.div_ceil(num_threads).max(1)
@@ -347,10 +343,12 @@ mod tests {
347
343
  }
348
344
 
349
345
  #[test]
350
- fn pool_capacity_uses_bounded_parallel_pool_for_auto_thread_mode() {
346
+ fn pool_capacity_uses_single_session_for_auto_thread_mode() {
347
+ // Auto-thread = ORT uses all cores per session. Pool=1 avoids N²
348
+ // intra-op oversubscription under concurrent Ruby threads.
351
349
  assert_eq!(pool_capacity_with_parallelism(0, 1), 1);
352
- assert_eq!(pool_capacity_with_parallelism(0, 4), 4);
353
- assert_eq!(pool_capacity_with_parallelism(0, 8), 6);
350
+ assert_eq!(pool_capacity_with_parallelism(0, 4), 1);
351
+ assert_eq!(pool_capacity_with_parallelism(0, 8), 1);
354
352
  }
355
353
 
356
354
  #[test]
@@ -113,18 +113,11 @@ pub fn parse_padding_mode_override(value: Option<&str>) -> Result<Option<Padding
113
113
  fn resolve_padding_strategy(
114
114
  padding_mode: PaddingMode,
115
115
  max_length: usize,
116
- fixed_padding_length: Option<usize>,
116
+ _fixed_padding_length: Option<usize>,
117
117
  ) -> PaddingStrategy {
118
118
  match padding_mode {
119
- PaddingMode::BatchLongest => PaddingStrategy::BatchLongest,
119
+ PaddingMode::BatchLongest | PaddingMode::Auto => PaddingStrategy::BatchLongest,
120
120
  PaddingMode::Fixed => PaddingStrategy::Fixed(max_length),
121
- PaddingMode::Auto => {
122
- if fixed_padding_length.is_some() {
123
- PaddingStrategy::Fixed(max_length)
124
- } else {
125
- PaddingStrategy::BatchLongest
126
- }
127
- }
128
121
  }
129
122
  }
130
123
 
@@ -225,10 +218,25 @@ mod tests {
225
218
  }
226
219
 
227
220
  #[test]
228
- fn resolve_padding_strategy_uses_fixed_for_auto_when_model_has_fixed_padding() {
229
- match resolve_padding_strategy(PaddingMode::Auto, 64, Some(64)) {
230
- PaddingStrategy::Fixed(64) => {}
231
- other => panic!("expected Fixed(64), got {:?}", other),
232
- }
221
+ fn resolve_padding_strategy_auto_always_uses_batch_longest() {
222
+ // Auto ignores fixed_padding_length from tokenizer.json — BatchLongest is
223
+ // always faster for inference and correct for variable-length inputs.
224
+ // Use PaddingMode::Fixed explicitly when fixed-length padding is required.
225
+ assert!(matches!(
226
+ resolve_padding_strategy(PaddingMode::Auto, 64, Some(64)),
227
+ PaddingStrategy::BatchLongest
228
+ ));
229
+ assert!(matches!(
230
+ resolve_padding_strategy(PaddingMode::Auto, 512, None),
231
+ PaddingStrategy::BatchLongest
232
+ ));
233
+ }
234
+
235
+ #[test]
236
+ fn resolve_padding_strategy_fixed_uses_max_length() {
237
+ assert!(matches!(
238
+ resolve_padding_strategy(PaddingMode::Fixed, 64, None),
239
+ PaddingStrategy::Fixed(64)
240
+ ));
233
241
  }
234
242
  }
@@ -1,12 +1,14 @@
1
1
  use gte::embedder::Embedder;
2
2
  use gte::model_config::ModelLoadOverrides;
3
3
 
4
+ fn model_dir(env_var: &str) -> Option<String> {
5
+ std::env::var(env_var).ok().filter(|v| !v.is_empty())
6
+ }
7
+
4
8
  #[test]
5
- #[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json and model.onnx"]
6
9
  fn test_e5_single_embedding_shape() {
7
- const DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/e5");
8
-
9
- let embedder = Embedder::from_dir(DIR, 0, 3, ModelLoadOverrides::default())
10
+ let Some(dir) = model_dir("GTE_BENCH_E5_DIR") else { return };
11
+ let embedder = Embedder::from_dir(&dir, 0, 3, ModelLoadOverrides::default())
10
12
  .expect("embedder should initialize");
11
13
  let result = embedder
12
14
  .embed(vec!["query: Hello world".to_string()])
@@ -17,11 +19,9 @@ fn test_e5_single_embedding_shape() {
17
19
  }
18
20
 
19
21
  #[test]
20
- #[ignore = "requires ext/gte/tests/fixtures/clip/tokenizer.json and model.onnx"]
21
22
  fn test_clip_single_embedding_shape() {
22
- const DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/clip");
23
-
24
- let embedder = Embedder::from_dir(DIR, 0, 3, ModelLoadOverrides::default())
23
+ let Some(dir) = model_dir("GTE_BENCH_CLIP_DIR") else { return };
24
+ let embedder = Embedder::from_dir(&dir, 0, 3, ModelLoadOverrides::default())
25
25
  .expect("embedder should initialize");
26
26
  let result = embedder
27
27
  .embed(vec!["a photo of a cat".to_string()])
@@ -32,11 +32,9 @@ fn test_clip_single_embedding_shape() {
32
32
  }
33
33
 
34
34
  #[test]
35
- #[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json and model.onnx"]
36
35
  fn test_e5_batch_embedding_shape() {
37
- const DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/e5");
38
-
39
- let embedder = Embedder::from_dir(DIR, 0, 3, ModelLoadOverrides::default())
36
+ let Some(dir) = model_dir("GTE_BENCH_E5_DIR") else { return };
37
+ let embedder = Embedder::from_dir(&dir, 0, 3, ModelLoadOverrides::default())
40
38
  .expect("embedder should initialize");
41
39
  let texts = vec![
42
40
  "query: first sentence".to_string(),
@@ -51,11 +49,9 @@ fn test_e5_batch_embedding_shape() {
51
49
  }
52
50
 
53
51
  #[test]
54
- #[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json and model.onnx"]
55
52
  fn test_e5_long_input_truncation_no_error() {
56
- const DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/e5");
57
-
58
- let embedder = Embedder::from_dir(DIR, 0, 3, ModelLoadOverrides::default())
53
+ let Some(dir) = model_dir("GTE_BENCH_E5_DIR") else { return };
54
+ let embedder = Embedder::from_dir(&dir, 0, 3, ModelLoadOverrides::default())
59
55
  .expect("embedder should initialize");
60
56
  let very_long_text = "word ".repeat(1000);
61
57
  let result = embedder
@@ -0,0 +1,94 @@
1
+ // Regression tests for the fixed-padding performance bug.
2
+ //
3
+ // Root cause: PaddingMode::Auto silently read "padding.strategy.Fixed: N" from
4
+ // tokenizer.json and applied it, padding every input to max_length tokens.
5
+ // A query like "cat" (1 token) was padded to 64 tokens for Siglip2, making
6
+ // inference ~6x slower (44ms vs 7ms measured on Heroku).
7
+ //
8
+ // These tests use tests/fixtures/minimal/tokenizer.json which has
9
+ // "padding.strategy.Fixed: 64" baked in — exactly the condition that triggered
10
+ // the regression in production models like Siglip2.
11
+
12
+ use gte::model_config::PaddingMode;
13
+ use gte::tokenizer::Tokenizer;
14
+
15
+ const TOKENIZER: &str = concat!(
16
+ env!("CARGO_MANIFEST_DIR"),
17
+ "/tests/fixtures/minimal/tokenizer.json"
18
+ );
19
+
20
+ // Short input tokenizes to 1 token with this vocabulary.
21
+ const SHORT_INPUT: &str = "cat";
22
+ const MAX_LENGTH: usize = 64;
23
+
24
+ #[test]
25
+ fn auto_padding_uses_batch_longest_regardless_of_tokenizer_json() {
26
+ // fixed_padding_length: Some(MAX_LENGTH) simulates what model_profile::read_tokenizer_profile
27
+ // returns when tokenizer.json has "padding.strategy.Fixed: 64".
28
+ let tokenizer = Tokenizer::new(TOKENIZER, MAX_LENGTH, false, PaddingMode::Auto, Some(MAX_LENGTH))
29
+ .expect("tokenizer should load");
30
+
31
+ let tokenized = tokenizer
32
+ .tokenize(&[SHORT_INPUT.to_string()])
33
+ .expect("tokenize should succeed");
34
+
35
+ // Old behavior: cols == 64 (silently padded to max_length)
36
+ // New behavior: cols == actual token count (1 for "cat")
37
+ assert!(
38
+ tokenized.cols < MAX_LENGTH,
39
+ "Auto padding should use batch_longest, got cols={} (expected < {}). \
40
+ This is the Siglip2 regression: short queries were padded to max_length, \
41
+ making inference ~6x slower.",
42
+ tokenized.cols,
43
+ MAX_LENGTH
44
+ );
45
+ }
46
+
47
+ #[test]
48
+ fn fixed_padding_mode_pads_to_max_length() {
49
+ let tokenizer = Tokenizer::new(TOKENIZER, MAX_LENGTH, false, PaddingMode::Fixed, None)
50
+ .expect("tokenizer should load");
51
+
52
+ let tokenized = tokenizer
53
+ .tokenize(&[SHORT_INPUT.to_string()])
54
+ .expect("tokenize should succeed");
55
+
56
+ assert_eq!(
57
+ tokenized.cols, MAX_LENGTH,
58
+ "Fixed mode should pad to max_length"
59
+ );
60
+ assert_eq!(tokenized.input_ids.len(), MAX_LENGTH);
61
+ assert_eq!(tokenized.attn_masks.len(), MAX_LENGTH);
62
+ }
63
+
64
+ #[test]
65
+ fn batch_longest_padding_uses_longest_sequence_in_batch() {
66
+ let tokenizer = Tokenizer::new(TOKENIZER, MAX_LENGTH, false, PaddingMode::BatchLongest, None)
67
+ .expect("tokenizer should load");
68
+
69
+ // "cat" = 1 token, "hello world" = 2 tokens — batch pads to 2, not 64
70
+ let tokenized = tokenizer
71
+ .tokenize(&["cat".to_string(), "hello world".to_string()])
72
+ .expect("tokenize should succeed");
73
+
74
+ assert_eq!(tokenized.rows, 2);
75
+ assert!(
76
+ tokenized.cols < MAX_LENGTH,
77
+ "BatchLongest should pad to longest in batch (2 tokens), not max_length ({}). Got cols={}",
78
+ MAX_LENGTH,
79
+ tokenized.cols
80
+ );
81
+ }
82
+
83
+ #[test]
84
+ fn auto_padding_with_no_fixed_hint_also_uses_batch_longest() {
85
+ // Sanity check: Auto with fixed_padding_length=None also uses BatchLongest
86
+ let tokenizer = Tokenizer::new(TOKENIZER, MAX_LENGTH, false, PaddingMode::Auto, None)
87
+ .expect("tokenizer should load");
88
+
89
+ let tokenized = tokenizer
90
+ .tokenize(&[SHORT_INPUT.to_string()])
91
+ .expect("tokenize should succeed");
92
+
93
+ assert!(tokenized.cols < MAX_LENGTH);
94
+ }
data/lib/gte/embedder.rb CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  module GTE
4
4
  class Embedder
5
- DEFAULT_THREADS = 1
5
+ DEFAULT_THREADS = 0
6
6
  DEFAULT_OPTIMIZATION_LEVEL = 3
7
7
 
8
8
  class << self
data/lib/gte/model.rb CHANGED
@@ -23,5 +23,9 @@ module GTE
23
23
  when Array then embed(input)
24
24
  end
25
25
  end
26
+
27
+ def embed_binary(text)
28
+ embed(text).row_binary_f32(0)
29
+ end
26
30
  end
27
31
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gte
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - elcuervo
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-04-28 00:00:00.000000000 Z
11
+ date: 2026-05-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -110,6 +110,7 @@ files:
110
110
  - ext/gte/src/tokenizer.rs
111
111
  - ext/gte/tests/embedder_unit_test.rs
112
112
  - ext/gte/tests/inference_integration_test.rs
113
+ - ext/gte/tests/padding_regression_test.rs
113
114
  - ext/gte/tests/postprocess_unit_test.rs
114
115
  - ext/gte/tests/tokenizer_unit_test.rs
115
116
  - lib/gte.rb