gte 0.0.8 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/ext/gte/Cargo.toml +1 -1
- data/ext/gte/benches/hot_path.rs +88 -1
- data/ext/gte/src/ruby_embedder.rs +17 -7
- data/ext/gte/src/session.rs +9 -11
- data/ext/gte/src/tokenizer.rs +22 -14
- data/ext/gte/tests/inference_integration_test.rs +12 -16
- data/ext/gte/tests/padding_regression_test.rs +94 -0
- data/lib/gte/embedder.rb +1 -1
- data/lib/gte/model.rb +4 -0
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 245f038cae58b7059fcc966479eb642316ab2611e64acf555214eec54713ce00
|
|
4
|
+
data.tar.gz: d23d0b248eeffd2d24d6be8e3190bbc9b06681cb265ecf05a6b174bcaa7b55b4
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 188f5f5ee4320d1bead817a2d43b033eff2aeaed3e17b61bc9a49bef6c4e7edc0e4c9983f494b3b9eab314ea2171a6f95cff7dcffd3aeb3c5a6180f7c0b60a3c
|
|
7
|
+
data.tar.gz: 5d514fcb9fbed57b5ef1bb488003b474ecd6e1e497e97ad223ea7daecc4e78c396fe17269bc6da48d3dadf865d7c6b36a7878f5c006b0c2144e09212ff8435d4
|
data/Rakefile
CHANGED
|
@@ -31,7 +31,7 @@ extension_task = Rake::ExtensionTask.new('gte', spec) do |ext|
|
|
|
31
31
|
ext.cross_platform = cross_platforms
|
|
32
32
|
end
|
|
33
33
|
|
|
34
|
-
if cross_target && !cross_target.empty? && ENV
|
|
34
|
+
if cross_target && !cross_target.empty? && ENV.fetch('RUBY_CC_VERSION', nil) && cross_target != 'x86_64-linux'
|
|
35
35
|
ruby_version = ENV['RUBY_CC_VERSION'].split(':').first
|
|
36
36
|
lib_binary_path = File.join(extension_task.lib_dir, File.basename(extension_task.binary(cross_target)))
|
|
37
37
|
copy_task = "copy:gte:#{cross_target}:#{ruby_version}"
|
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.0.8
|
|
1
|
+
0.0.10
|
data/ext/gte/Cargo.toml
CHANGED
data/ext/gte/benches/hot_path.rs
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
|
|
2
|
+
use gte::embedder::Embedder;
|
|
3
|
+
use gte::model_config::ModelLoadOverrides;
|
|
2
4
|
use gte::postprocess::{mean_pool, normalize_l2};
|
|
3
5
|
use ndarray::{Array2, Array3};
|
|
4
6
|
|
|
@@ -49,5 +51,90 @@ fn bench_normalize_l2(c: &mut Criterion) {
|
|
|
49
51
|
group.finish();
|
|
50
52
|
}
|
|
51
53
|
|
|
52
|
-
|
|
54
|
+
// Replicates the fixed-padding regression: a short input (4 tokens, like "cat")
|
|
55
|
+
// padded to max_length costs proportionally more in every downstream operation.
|
|
56
|
+
// Siglip2 regressed from 7ms → 44ms when tokenizer.json had "padding.strategy.Fixed: 64".
|
|
57
|
+
// Each row here represents: (label, seq), where seq is the padded sequence length:
|
|
58
|
+
// batch_longest → seq = actual_tokens
|
|
59
|
+
// fixed → seq = max_length regardless of input
|
|
60
|
+
fn bench_padding_impact(c: &mut Criterion) {
|
|
61
|
+
let dim = 768;
|
|
62
|
+
let mut group = c.benchmark_group("padding_impact");
|
|
63
|
+
|
|
64
|
+
for (label, seq) in [
|
|
65
|
+
("batch_longest/4tok", 4usize),
|
|
66
|
+
("fixed/siglip2_max_64", 64usize),
|
|
67
|
+
("fixed/e5_max_512", 512usize),
|
|
68
|
+
] {
|
|
69
|
+
let hidden_states = build_hidden_states(1, seq, dim);
|
|
70
|
+
let attention_mask = build_attention_mask(1, seq);
|
|
71
|
+
group.bench_with_input(
|
|
72
|
+
BenchmarkId::from_parameter(label),
|
|
73
|
+
&seq,
|
|
74
|
+
|b, _| {
|
|
75
|
+
b.iter(|| {
|
|
76
|
+
mean_pool(
|
|
77
|
+
black_box(hidden_states.view()),
|
|
78
|
+
black_box(attention_mask.view()),
|
|
79
|
+
)
|
|
80
|
+
.unwrap()
|
|
81
|
+
})
|
|
82
|
+
},
|
|
83
|
+
);
|
|
84
|
+
}
|
|
85
|
+
group.finish();
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
// End-to-end inference bench. Requires real ONNX models on disk. Skips
|
|
89
|
+
// silently when env vars not set so default `cargo bench` stays cheap.
|
|
90
|
+
// GTE_BENCH_E5_DIR — sentence-transformers / E5-style text model dir
|
|
91
|
+
// GTE_BENCH_SIGLIP2_DIR — siglip2 text encoder dir
|
|
92
|
+
// GTE_BENCH_CLIP_DIR — clip text encoder dir
|
|
93
|
+
// Sweeps threads ∈ {0 (auto/all-cores), 1, 2} to validate DEFAULT_THREADS=0.
|
|
94
|
+
fn bench_embedding_e2e(c: &mut Criterion) {
|
|
95
|
+
let cases = [
|
|
96
|
+
("e5", "GTE_BENCH_E5_DIR", "query: cat", "query: ".to_string() + &"the quick brown fox jumps over the lazy dog ".repeat(20)),
|
|
97
|
+
("siglip2", "GTE_BENCH_SIGLIP2_DIR", "cat", "a photo of ".to_string() + &"a cat sitting on a mat ".repeat(10)),
|
|
98
|
+
("clip", "GTE_BENCH_CLIP_DIR", "cat", "a photo of ".to_string() + &"a cat sitting on a mat ".repeat(10)),
|
|
99
|
+
];
|
|
100
|
+
|
|
101
|
+
let mut group = c.benchmark_group("embedding_e2e");
|
|
102
|
+
group.sample_size(20);
|
|
103
|
+
|
|
104
|
+
for (model_label, env_var, short_input, long_input) in cases.iter() {
|
|
105
|
+
let Some(dir) = std::env::var(env_var).ok().filter(|v| !v.is_empty()) else {
|
|
106
|
+
continue;
|
|
107
|
+
};
|
|
108
|
+
|
|
109
|
+
for &threads in &[0usize, 1, 2] {
|
|
110
|
+
let embedder = match Embedder::from_dir(&dir, threads, 3, ModelLoadOverrides::default()) {
|
|
111
|
+
Ok(e) => e,
|
|
112
|
+
Err(err) => {
|
|
113
|
+
eprintln!("skip {model_label} threads={threads}: {err}");
|
|
114
|
+
continue;
|
|
115
|
+
}
|
|
116
|
+
};
|
|
117
|
+
|
|
118
|
+
for (input_label, input) in [("short", short_input.to_string()), ("long", long_input.clone())] {
|
|
119
|
+
let id = BenchmarkId::from_parameter(format!("{model_label}/threads_{threads}/{input_label}"));
|
|
120
|
+
group.bench_with_input(id, &input, |b, text| {
|
|
121
|
+
b.iter(|| {
|
|
122
|
+
embedder
|
|
123
|
+
.embed(black_box(vec![text.clone()]))
|
|
124
|
+
.expect("embed succeeds")
|
|
125
|
+
})
|
|
126
|
+
});
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
group.finish();
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
criterion_group!(
|
|
134
|
+
benches,
|
|
135
|
+
bench_mean_pool,
|
|
136
|
+
bench_normalize_l2,
|
|
137
|
+
bench_padding_impact,
|
|
138
|
+
bench_embedding_e2e
|
|
139
|
+
);
|
|
53
140
|
criterion_main!(benches);
|
|
data/ext/gte/src/ruby_embedder.rs
CHANGED
|
@@ -4,6 +4,7 @@ use crate::embedder::{normalize_l2, Embedder};
|
|
|
4
4
|
use crate::error::GteError;
|
|
5
5
|
use crate::model_config::ModelLoadOverrides;
|
|
6
6
|
use crate::reranker::Reranker;
|
|
7
|
+
use crate::tokenizer::Tokenized;
|
|
7
8
|
use magnus::{function, method, prelude::*, wrap, Error, RArray, Ruby};
|
|
8
9
|
use std::os::raw::c_void;
|
|
9
10
|
use std::panic::{catch_unwind, AssertUnwindSafe};
|
|
@@ -32,9 +33,10 @@ pub struct RbTensor {
|
|
|
32
33
|
// GVL-release helpers
|
|
33
34
|
// ---------------------------------------------------------------------------
|
|
34
35
|
|
|
36
|
+
// Tokenized holds only owned plain data (Vec<i64> buffers plus rows/cols counts) — safe to send across threads.
|
|
35
37
|
struct InferArgs {
|
|
36
38
|
embedder: *const Embedder,
|
|
37
|
-
|
|
39
|
+
tokenized: *const Tokenized,
|
|
38
40
|
normalize: bool,
|
|
39
41
|
result: Option<crate::error::Result<ndarray::Array2<f32>>>,
|
|
40
42
|
}
|
|
@@ -63,8 +65,9 @@ fn panic_payload_to_string(payload: Box<dyn std::any::Any + Send>) -> String {
|
|
|
63
65
|
unsafe extern "C" fn run_without_gvl(ptr: *mut c_void) -> *mut c_void {
|
|
64
66
|
let args = &mut *(ptr as *mut InferArgs);
|
|
65
67
|
let run_result = catch_unwind(AssertUnwindSafe(|| {
|
|
66
|
-
|
|
67
|
-
|
|
68
|
+
// Tokenization happens before GVL release (in rb_embed / rb_embed_one).
|
|
69
|
+
// Only ONNX inference runs here without the GVL.
|
|
70
|
+
let embeddings = (*args.embedder).run(&*args.tokenized)?;
|
|
68
71
|
if args.normalize { Ok(normalize_l2(embeddings)) } else { Ok(embeddings) }
|
|
69
72
|
}));
|
|
70
73
|
args.result = Some(match run_result {
|
|
@@ -95,12 +98,12 @@ unsafe extern "C" fn run_score_without_gvl(ptr: *mut c_void) -> *mut c_void {
|
|
|
95
98
|
fn infer_without_gvl(
|
|
96
99
|
embedder: &Arc<Embedder>,
|
|
97
100
|
normalize: bool,
|
|
98
|
-
|
|
101
|
+
tokenized: &Tokenized,
|
|
99
102
|
) -> Result<ndarray::Array2<f32>, Error> {
|
|
100
103
|
let embeddings = unsafe {
|
|
101
104
|
let mut args = InferArgs {
|
|
102
105
|
embedder: Arc::as_ptr(embedder),
|
|
103
|
-
|
|
106
|
+
tokenized: tokenized as *const Tokenized,
|
|
104
107
|
normalize,
|
|
105
108
|
result: None,
|
|
106
109
|
};
|
|
@@ -195,12 +198,14 @@ impl RbEmbedder {
|
|
|
195
198
|
|
|
196
199
|
pub fn rb_embed(_ruby: &Ruby, rb_self: &Self, texts: RArray) -> Result<RbTensor, Error> {
|
|
197
200
|
let texts: Vec<String> = texts.to_vec()?;
|
|
198
|
-
let
|
|
201
|
+
let tokenized = rb_self.inner.tokenize(&texts).map_err(magnus::Error::from)?;
|
|
202
|
+
let embeddings = infer_without_gvl(&rb_self.inner, rb_self.normalize, &tokenized)?;
|
|
199
203
|
tensor_from_array(embeddings)
|
|
200
204
|
}
|
|
201
205
|
|
|
202
206
|
pub fn rb_embed_one(_ruby: &Ruby, rb_self: &Self, text: String) -> Result<RbTensor, Error> {
|
|
203
|
-
let
|
|
207
|
+
let tokenized = rb_self.inner.tokenize(&[text]).map_err(magnus::Error::from)?;
|
|
208
|
+
let embeddings = infer_without_gvl(&rb_self.inner, rb_self.normalize, &tokenized)?;
|
|
204
209
|
tensor_from_array(embeddings)
|
|
205
210
|
}
|
|
206
211
|
}
|
|
@@ -292,6 +297,10 @@ impl RbTensor {
|
|
|
292
297
|
Self::row(ruby, rb_self, 0)
|
|
293
298
|
}
|
|
294
299
|
|
|
300
|
+
pub fn first_binary_f32(ruby: &Ruby, rb_self: &Self) -> Result<magnus::RString, Error> {
|
|
301
|
+
Self::row_binary_f32(ruby, rb_self, 0)
|
|
302
|
+
}
|
|
303
|
+
|
|
295
304
|
pub fn row_binary_f32(
|
|
296
305
|
ruby: &Ruby,
|
|
297
306
|
rb_self: &Self,
|
|
@@ -354,6 +363,7 @@ pub fn register(ruby: &Ruby) -> Result<(), Error> {
|
|
|
354
363
|
tensor_class.define_method("row", method!(RbTensor::row, 1))?;
|
|
355
364
|
tensor_class.define_method("first", method!(RbTensor::first, 0))?;
|
|
356
365
|
tensor_class.define_method("row_binary_f32", method!(RbTensor::row_binary_f32, 1))?;
|
|
366
|
+
tensor_class.define_method("first_binary_f32", method!(RbTensor::first_binary_f32, 0))?;
|
|
357
367
|
tensor_class.define_method("to_a", method!(RbTensor::to_a, 0))?;
|
|
358
368
|
tensor_class.define_method("to_binary_f32", method!(RbTensor::to_binary_f32, 0))?;
|
|
359
369
|
Ok(())
|
data/ext/gte/src/session.rs
CHANGED
|
@@ -28,7 +28,7 @@ pub fn build_session<P: AsRef<Path>>(model_path: P, config: &ModelConfig) -> Res
|
|
|
28
28
|
.map_err(ort_err)?
|
|
29
29
|
.with_optimization_level(opt_level)
|
|
30
30
|
.map_err(ort_err)?
|
|
31
|
-
.with_memory_pattern(
|
|
31
|
+
.with_memory_pattern(false)
|
|
32
32
|
.map_err(ort_err)?;
|
|
33
33
|
|
|
34
34
|
let providers = preferred_execution_providers(config.execution_providers.as_deref());
|
|
@@ -54,12 +54,6 @@ pub fn build_session<P: AsRef<Path>>(model_path: P, config: &ModelConfig) -> Res
|
|
|
54
54
|
// Session pool
|
|
55
55
|
// ---------------------------------------------------------------------------
|
|
56
56
|
|
|
57
|
-
const AUTO_THREAD_POOL_CAP: usize = 6;
|
|
58
|
-
|
|
59
|
-
/// Keep enough sessions to cover the configured thread budget without
|
|
60
|
-
/// oversubscribing CPU parallelism. In ORT auto-thread mode (`num_threads == 0`)
|
|
61
|
-
/// we still keep a modest pool because request-level concurrency benefits from
|
|
62
|
-
/// more than one session even when ORT manages thread counts internally.
|
|
63
57
|
fn pool_capacity(num_threads: usize) -> usize {
|
|
64
58
|
let available_parallelism = std::thread::available_parallelism()
|
|
65
59
|
.map(|n| n.get())
|
|
@@ -72,8 +66,10 @@ fn pool_capacity_with_parallelism(num_threads: usize, available_parallelism: usi
|
|
|
72
66
|
return 1;
|
|
73
67
|
}
|
|
74
68
|
|
|
69
|
+
// Auto-thread mode: ORT grabs all cores per session. One session avoids
|
|
70
|
+
// N² intra-op oversubscription when multiple Ruby threads call concurrently.
|
|
75
71
|
if num_threads == 0 {
|
|
76
|
-
return
|
|
72
|
+
return 1;
|
|
77
73
|
}
|
|
78
74
|
|
|
79
75
|
available_parallelism.div_ceil(num_threads).max(1)
|
|
@@ -347,10 +343,12 @@ mod tests {
|
|
|
347
343
|
}
|
|
348
344
|
|
|
349
345
|
#[test]
|
|
350
|
-
fn
|
|
346
|
+
fn pool_capacity_uses_single_session_for_auto_thread_mode() {
|
|
347
|
+
// Auto-thread = ORT uses all cores per session. Pool=1 avoids N²
|
|
348
|
+
// intra-op oversubscription under concurrent Ruby threads.
|
|
351
349
|
assert_eq!(pool_capacity_with_parallelism(0, 1), 1);
|
|
352
|
-
assert_eq!(pool_capacity_with_parallelism(0, 4),
|
|
353
|
-
assert_eq!(pool_capacity_with_parallelism(0, 8),
|
|
350
|
+
assert_eq!(pool_capacity_with_parallelism(0, 4), 1);
|
|
351
|
+
assert_eq!(pool_capacity_with_parallelism(0, 8), 1);
|
|
354
352
|
}
|
|
355
353
|
|
|
356
354
|
#[test]
|
data/ext/gte/src/tokenizer.rs
CHANGED
|
@@ -113,18 +113,11 @@ pub fn parse_padding_mode_override(value: Option<&str>) -> Result<Option<Padding
|
|
|
113
113
|
fn resolve_padding_strategy(
|
|
114
114
|
padding_mode: PaddingMode,
|
|
115
115
|
max_length: usize,
|
|
116
|
-
|
|
116
|
+
_fixed_padding_length: Option<usize>,
|
|
117
117
|
) -> PaddingStrategy {
|
|
118
118
|
match padding_mode {
|
|
119
|
-
PaddingMode::BatchLongest => PaddingStrategy::BatchLongest,
|
|
119
|
+
PaddingMode::BatchLongest | PaddingMode::Auto => PaddingStrategy::BatchLongest,
|
|
120
120
|
PaddingMode::Fixed => PaddingStrategy::Fixed(max_length),
|
|
121
|
-
PaddingMode::Auto => {
|
|
122
|
-
if fixed_padding_length.is_some() {
|
|
123
|
-
PaddingStrategy::Fixed(max_length)
|
|
124
|
-
} else {
|
|
125
|
-
PaddingStrategy::BatchLongest
|
|
126
|
-
}
|
|
127
|
-
}
|
|
128
121
|
}
|
|
129
122
|
}
|
|
130
123
|
|
|
@@ -225,10 +218,25 @@ mod tests {
|
|
|
225
218
|
}
|
|
226
219
|
|
|
227
220
|
#[test]
|
|
228
|
-
fn
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
221
|
+
fn resolve_padding_strategy_auto_always_uses_batch_longest() {
|
|
222
|
+
// Auto ignores fixed_padding_length from tokenizer.json — BatchLongest is
|
|
223
|
+
// always faster for inference and correct for variable-length inputs.
|
|
224
|
+
// Use PaddingMode::Fixed explicitly when fixed-length padding is required.
|
|
225
|
+
assert!(matches!(
|
|
226
|
+
resolve_padding_strategy(PaddingMode::Auto, 64, Some(64)),
|
|
227
|
+
PaddingStrategy::BatchLongest
|
|
228
|
+
));
|
|
229
|
+
assert!(matches!(
|
|
230
|
+
resolve_padding_strategy(PaddingMode::Auto, 512, None),
|
|
231
|
+
PaddingStrategy::BatchLongest
|
|
232
|
+
));
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
#[test]
|
|
236
|
+
fn resolve_padding_strategy_fixed_uses_max_length() {
|
|
237
|
+
assert!(matches!(
|
|
238
|
+
resolve_padding_strategy(PaddingMode::Fixed, 64, None),
|
|
239
|
+
PaddingStrategy::Fixed(64)
|
|
240
|
+
));
|
|
233
241
|
}
|
|
234
242
|
}
|
|
data/ext/gte/tests/inference_integration_test.rs
CHANGED
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
use gte::embedder::Embedder;
|
|
2
2
|
use gte::model_config::ModelLoadOverrides;
|
|
3
3
|
|
|
4
|
+
fn model_dir(env_var: &str) -> Option<String> {
|
|
5
|
+
std::env::var(env_var).ok().filter(|v| !v.is_empty())
|
|
6
|
+
}
|
|
7
|
+
|
|
4
8
|
#[test]
|
|
5
|
-
#[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json and model.onnx"]
|
|
6
9
|
fn test_e5_single_embedding_shape() {
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
let embedder = Embedder::from_dir(DIR, 0, 3, ModelLoadOverrides::default())
|
|
10
|
+
let Some(dir) = model_dir("GTE_BENCH_E5_DIR") else { return };
|
|
11
|
+
let embedder = Embedder::from_dir(&dir, 0, 3, ModelLoadOverrides::default())
|
|
10
12
|
.expect("embedder should initialize");
|
|
11
13
|
let result = embedder
|
|
12
14
|
.embed(vec!["query: Hello world".to_string()])
|
|
@@ -17,11 +19,9 @@ fn test_e5_single_embedding_shape() {
|
|
|
17
19
|
}
|
|
18
20
|
|
|
19
21
|
#[test]
|
|
20
|
-
#[ignore = "requires ext/gte/tests/fixtures/clip/tokenizer.json and model.onnx"]
|
|
21
22
|
fn test_clip_single_embedding_shape() {
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
let embedder = Embedder::from_dir(DIR, 0, 3, ModelLoadOverrides::default())
|
|
23
|
+
let Some(dir) = model_dir("GTE_BENCH_CLIP_DIR") else { return };
|
|
24
|
+
let embedder = Embedder::from_dir(&dir, 0, 3, ModelLoadOverrides::default())
|
|
25
25
|
.expect("embedder should initialize");
|
|
26
26
|
let result = embedder
|
|
27
27
|
.embed(vec!["a photo of a cat".to_string()])
|
|
@@ -32,11 +32,9 @@ fn test_clip_single_embedding_shape() {
|
|
|
32
32
|
}
|
|
33
33
|
|
|
34
34
|
#[test]
|
|
35
|
-
#[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json and model.onnx"]
|
|
36
35
|
fn test_e5_batch_embedding_shape() {
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
let embedder = Embedder::from_dir(DIR, 0, 3, ModelLoadOverrides::default())
|
|
36
|
+
let Some(dir) = model_dir("GTE_BENCH_E5_DIR") else { return };
|
|
37
|
+
let embedder = Embedder::from_dir(&dir, 0, 3, ModelLoadOverrides::default())
|
|
40
38
|
.expect("embedder should initialize");
|
|
41
39
|
let texts = vec![
|
|
42
40
|
"query: first sentence".to_string(),
|
|
@@ -51,11 +49,9 @@ fn test_e5_batch_embedding_shape() {
|
|
|
51
49
|
}
|
|
52
50
|
|
|
53
51
|
#[test]
|
|
54
|
-
#[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json and model.onnx"]
|
|
55
52
|
fn test_e5_long_input_truncation_no_error() {
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
let embedder = Embedder::from_dir(DIR, 0, 3, ModelLoadOverrides::default())
|
|
53
|
+
let Some(dir) = model_dir("GTE_BENCH_E5_DIR") else { return };
|
|
54
|
+
let embedder = Embedder::from_dir(&dir, 0, 3, ModelLoadOverrides::default())
|
|
59
55
|
.expect("embedder should initialize");
|
|
60
56
|
let very_long_text = "word ".repeat(1000);
|
|
61
57
|
let result = embedder
|
|
data/ext/gte/tests/padding_regression_test.rs
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
// Regression tests for the fixed-padding performance bug.
|
|
2
|
+
//
|
|
3
|
+
// Root cause: PaddingMode::Auto silently read "padding.strategy.Fixed: N" from
|
|
4
|
+
// tokenizer.json and applied it, padding every input to max_length tokens.
|
|
5
|
+
// A query like "cat" (1 token) was padded to 64 tokens for Siglip2, making
|
|
6
|
+
// inference ~6x slower (44ms vs 7ms measured on Heroku).
|
|
7
|
+
//
|
|
8
|
+
// These tests use tests/fixtures/minimal/tokenizer.json which has
|
|
9
|
+
// "padding.strategy.Fixed: 64" baked in — exactly the condition that triggered
|
|
10
|
+
// the regression in production models like Siglip2.
|
|
11
|
+
|
|
12
|
+
use gte::model_config::PaddingMode;
|
|
13
|
+
use gte::tokenizer::Tokenizer;
|
|
14
|
+
|
|
15
|
+
const TOKENIZER: &str = concat!(
|
|
16
|
+
env!("CARGO_MANIFEST_DIR"),
|
|
17
|
+
"/tests/fixtures/minimal/tokenizer.json"
|
|
18
|
+
);
|
|
19
|
+
|
|
20
|
+
// Short input tokenizes to 1 token with this vocabulary.
|
|
21
|
+
const SHORT_INPUT: &str = "cat";
|
|
22
|
+
const MAX_LENGTH: usize = 64;
|
|
23
|
+
|
|
24
|
+
#[test]
|
|
25
|
+
fn auto_padding_uses_batch_longest_regardless_of_tokenizer_json() {
|
|
26
|
+
// fixed_padding_length: Some(MAX_LENGTH) simulates what model_profile::read_tokenizer_profile
|
|
27
|
+
// returns when tokenizer.json has "padding.strategy.Fixed: 64".
|
|
28
|
+
let tokenizer = Tokenizer::new(TOKENIZER, MAX_LENGTH, false, PaddingMode::Auto, Some(MAX_LENGTH))
|
|
29
|
+
.expect("tokenizer should load");
|
|
30
|
+
|
|
31
|
+
let tokenized = tokenizer
|
|
32
|
+
.tokenize(&[SHORT_INPUT.to_string()])
|
|
33
|
+
.expect("tokenize should succeed");
|
|
34
|
+
|
|
35
|
+
// Old behavior: cols == 64 (silently padded to max_length)
|
|
36
|
+
// New behavior: cols == actual token count (1 for "cat")
|
|
37
|
+
assert!(
|
|
38
|
+
tokenized.cols < MAX_LENGTH,
|
|
39
|
+
"Auto padding should use batch_longest, got cols={} (expected < {}). \
|
|
40
|
+
This is the Siglip2 regression: short queries were padded to max_length, \
|
|
41
|
+
making inference ~6x slower.",
|
|
42
|
+
tokenized.cols,
|
|
43
|
+
MAX_LENGTH
|
|
44
|
+
);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
#[test]
|
|
48
|
+
fn fixed_padding_mode_pads_to_max_length() {
|
|
49
|
+
let tokenizer = Tokenizer::new(TOKENIZER, MAX_LENGTH, false, PaddingMode::Fixed, None)
|
|
50
|
+
.expect("tokenizer should load");
|
|
51
|
+
|
|
52
|
+
let tokenized = tokenizer
|
|
53
|
+
.tokenize(&[SHORT_INPUT.to_string()])
|
|
54
|
+
.expect("tokenize should succeed");
|
|
55
|
+
|
|
56
|
+
assert_eq!(
|
|
57
|
+
tokenized.cols, MAX_LENGTH,
|
|
58
|
+
"Fixed mode should pad to max_length"
|
|
59
|
+
);
|
|
60
|
+
assert_eq!(tokenized.input_ids.len(), MAX_LENGTH);
|
|
61
|
+
assert_eq!(tokenized.attn_masks.len(), MAX_LENGTH);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
#[test]
|
|
65
|
+
fn batch_longest_padding_uses_longest_sequence_in_batch() {
|
|
66
|
+
let tokenizer = Tokenizer::new(TOKENIZER, MAX_LENGTH, false, PaddingMode::BatchLongest, None)
|
|
67
|
+
.expect("tokenizer should load");
|
|
68
|
+
|
|
69
|
+
// "cat" = 1 token, "hello world" = 2 tokens — batch pads to 2, not 64
|
|
70
|
+
let tokenized = tokenizer
|
|
71
|
+
.tokenize(&["cat".to_string(), "hello world".to_string()])
|
|
72
|
+
.expect("tokenize should succeed");
|
|
73
|
+
|
|
74
|
+
assert_eq!(tokenized.rows, 2);
|
|
75
|
+
assert!(
|
|
76
|
+
tokenized.cols < MAX_LENGTH,
|
|
77
|
+
"BatchLongest should pad to longest in batch (2 tokens), not max_length ({}). Got cols={}",
|
|
78
|
+
MAX_LENGTH,
|
|
79
|
+
tokenized.cols
|
|
80
|
+
);
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
#[test]
|
|
84
|
+
fn auto_padding_with_no_fixed_hint_also_uses_batch_longest() {
|
|
85
|
+
// Sanity check: Auto with fixed_padding_length=None also uses BatchLongest
|
|
86
|
+
let tokenizer = Tokenizer::new(TOKENIZER, MAX_LENGTH, false, PaddingMode::Auto, None)
|
|
87
|
+
.expect("tokenizer should load");
|
|
88
|
+
|
|
89
|
+
let tokenized = tokenizer
|
|
90
|
+
.tokenize(&[SHORT_INPUT.to_string()])
|
|
91
|
+
.expect("tokenize should succeed");
|
|
92
|
+
|
|
93
|
+
assert!(tokenized.cols < MAX_LENGTH);
|
|
94
|
+
}
|
data/lib/gte/embedder.rb
CHANGED
data/lib/gte/model.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: gte
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.8
|
|
4
|
+
version: 0.0.10
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- elcuervo
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-05-13 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rake
|
|
@@ -110,6 +110,7 @@ files:
|
|
|
110
110
|
- ext/gte/src/tokenizer.rs
|
|
111
111
|
- ext/gte/tests/embedder_unit_test.rs
|
|
112
112
|
- ext/gte/tests/inference_integration_test.rs
|
|
113
|
+
- ext/gte/tests/padding_regression_test.rs
|
|
113
114
|
- ext/gte/tests/postprocess_unit_test.rs
|
|
114
115
|
- ext/gte/tests/tokenizer_unit_test.rs
|
|
115
116
|
- lib/gte.rb
|