gte 0.0.3-arm64-darwin → 0.0.4-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +7 -0
- data/Rakefile +8 -0
- data/VERSION +1 -1
- data/ext/gte/Cargo.toml +1 -1
- data/ext/gte/src/embedder.rs +11 -27
- data/ext/gte/src/postprocess.rs +19 -2
- data/lib/gte/3.0/gte.bundle +0 -0
- data/lib/gte/3.1/gte.bundle +0 -0
- data/lib/gte/3.2/gte.bundle +0 -0
- data/lib/gte/3.3/gte.bundle +0 -0
- data/lib/gte/3.4/gte.bundle +0 -0
- data/lib/gte/4.0/gte.bundle +0 -0
- data/lib/gte.rb +21 -2
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 86bd8582b693e5d6b1b1d880dec05b21912b3cc45de1a09e143210fcdb76dc5d
|
|
4
|
+
data.tar.gz: 8785600d70d86d0f9de669631054f279105801f47ca6f6d227b1e93fe9e6ff9f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7ac75c64e28ca6c227be56439f15bea76355c775cec5ba203c95666b8f603d370b492647a4b9c242b84d9fdfa36f6f34fa2436895596f5b887863b193bf4e523
|
|
7
|
+
data.tar.gz: 2b9f12675b6e0fd461f611f199e7ee66f366e3bbdf93c15bd018a0b45eb2eef001b9adb9670b02a07f58589a915ab7391762c5fabb54c14254193ff11d1867ec
|
data/README.md
CHANGED
|
@@ -13,6 +13,12 @@ model = GTE.new(ENV.fetch("GTE_MODEL_DIR"))
|
|
|
13
13
|
vector = model["query: hello world"]
|
|
14
14
|
```
|
|
15
15
|
|
|
16
|
+
For Puma or other thread pools, prefer process-local reuse:
|
|
17
|
+
|
|
18
|
+
```ruby
|
|
19
|
+
MODEL = GTE.new(ENV.fetch("GTE_MODEL_DIR"))
|
|
20
|
+
```
|
|
21
|
+
|
|
16
22
|
## Model Directory
|
|
17
23
|
|
|
18
24
|
A model directory must include `tokenizer.json` and one ONNX model, resolved in this order:
|
|
@@ -40,6 +46,7 @@ The repo includes two benchmark paths:
|
|
|
40
46
|
bundle exec rake bench:pure_compare
|
|
41
47
|
bundle exec rake bench:puma_compare
|
|
42
48
|
bundle exec rake bench:matrix_sweep
|
|
49
|
+
bundle exec ruby bench/memory_probe.rb --compare-pure
|
|
43
50
|
```
|
|
44
51
|
|
|
45
52
|
For release tracking and regression detection, record a run entry in `RUNS.md`:
|
data/Rakefile
CHANGED
|
@@ -48,6 +48,14 @@ namespace :bench do
|
|
|
48
48
|
)
|
|
49
49
|
end
|
|
50
50
|
|
|
51
|
+
desc 'Run memory probe for single-instance vs duplicate-instance behavior'
|
|
52
|
+
task :memory_probe do
|
|
53
|
+
run_in_nix(
|
|
54
|
+
'bundle', 'exec', 'ruby', 'bench/memory_probe.rb',
|
|
55
|
+
'--compare-pure'
|
|
56
|
+
)
|
|
57
|
+
end
|
|
58
|
+
|
|
51
59
|
desc 'Run Puma benchmark, append RUNS.md entry, and enforce goal/regression checks'
|
|
52
60
|
task :record_run do
|
|
53
61
|
run_in_nix(
|
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.0.3
|
|
1
|
+
0.0.4
|
data/ext/gte/Cargo.toml
CHANGED
data/ext/gte/src/embedder.rs
CHANGED
|
@@ -57,16 +57,17 @@ impl Embedder {
|
|
|
57
57
|
}
|
|
58
58
|
|
|
59
59
|
let max_length = read_max_length(dir);
|
|
60
|
+
let probe_num_threads = if num_threads == 0 { 1 } else { num_threads };
|
|
60
61
|
let temp_config = ModelConfig {
|
|
61
62
|
max_length,
|
|
62
63
|
output_tensor: String::new(),
|
|
63
64
|
mode: ExtractorMode::Raw,
|
|
64
65
|
with_type_ids: false,
|
|
65
66
|
with_attention_mask: true,
|
|
66
|
-
num_threads,
|
|
67
|
+
num_threads: probe_num_threads,
|
|
67
68
|
optimization_level,
|
|
68
69
|
};
|
|
69
|
-
let session = build_session(&model_path, &temp_config)?;
|
|
70
|
+
let mut session = build_session(&model_path, &temp_config)?;
|
|
70
71
|
|
|
71
72
|
validate_supported_inputs(&session)?;
|
|
72
73
|
let with_type_ids = session.inputs.iter().any(|i| i.name == "token_type_ids");
|
|
@@ -97,11 +98,11 @@ impl Embedder {
|
|
|
97
98
|
optimization_level,
|
|
98
99
|
};
|
|
99
100
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
session
|
|
104
|
-
}
|
|
101
|
+
if tuned_num_threads != probe_num_threads {
|
|
102
|
+
// Release probe session before rebuilding to minimize transient peak RSS.
|
|
103
|
+
drop(session);
|
|
104
|
+
session = build_session(&model_path, &config)?;
|
|
105
|
+
}
|
|
105
106
|
|
|
106
107
|
let tokenizer = Tokenizer::new(&tokenizer_path, config.max_length, config.with_type_ids)?;
|
|
107
108
|
|
|
@@ -138,16 +139,13 @@ fn tune_num_threads(
|
|
|
138
139
|
}
|
|
139
140
|
|
|
140
141
|
let family = infer_model_family(with_attention_mask, with_type_ids, output_name);
|
|
141
|
-
let target_concurrency = puma_target_concurrency();
|
|
142
|
-
let host_cores = host_parallelism();
|
|
143
|
-
let budgeted_threads = (host_cores / target_concurrency).max(1);
|
|
144
142
|
|
|
145
143
|
match family {
|
|
146
144
|
// Puma-like workloads typically run many concurrent single-item requests where
|
|
147
145
|
// one intra-op thread per request gives the best tail behavior.
|
|
148
|
-
ModelFamily::E5Like | ModelFamily::ClipLike
|
|
149
|
-
|
|
150
|
-
|
|
146
|
+
ModelFamily::E5Like | ModelFamily::ClipLike => 1,
|
|
147
|
+
// Siglip2 text path benefits from a small intra-op pool under concurrency.
|
|
148
|
+
ModelFamily::SiglipLike => 3,
|
|
151
149
|
ModelFamily::Other => 0,
|
|
152
150
|
}
|
|
153
151
|
}
|
|
@@ -169,20 +167,6 @@ fn infer_model_family(
|
|
|
169
167
|
ModelFamily::Other
|
|
170
168
|
}
|
|
171
169
|
|
|
172
|
-
fn puma_target_concurrency() -> usize {
|
|
173
|
-
std::env::var("GTE_PUMA_CONCURRENCY")
|
|
174
|
-
.ok()
|
|
175
|
-
.and_then(|raw| raw.parse::<usize>().ok())
|
|
176
|
-
.filter(|value| *value > 0)
|
|
177
|
-
.unwrap_or(16)
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
fn host_parallelism() -> usize {
|
|
181
|
-
std::thread::available_parallelism()
|
|
182
|
-
.map(|n| n.get())
|
|
183
|
-
.unwrap_or(1)
|
|
184
|
-
}
|
|
185
|
-
|
|
186
170
|
fn resolve_named_model(dir: &Path, name: &str) -> Result<PathBuf> {
|
|
187
171
|
let candidates = [dir.join("onnx").join(name), dir.join(name)];
|
|
188
172
|
for path in &candidates {
|
data/ext/gte/src/postprocess.rs
CHANGED
|
@@ -87,10 +87,27 @@ fn mean_pool_contiguous(
|
|
|
87
87
|
let mask_base = batch_index * seq;
|
|
88
88
|
let hidden_base = batch_index * seq * dim;
|
|
89
89
|
let output_row = &mut output[batch_index * dim..(batch_index + 1) * dim];
|
|
90
|
+
let mask_row = &attention_mask[mask_base..mask_base + seq];
|
|
91
|
+
|
|
92
|
+
if mask_row.iter().all(|&weight| weight == 1) {
|
|
93
|
+
for token_index in 0..seq {
|
|
94
|
+
let token_base = hidden_base + token_index * dim;
|
|
95
|
+
for dim_index in 0..dim {
|
|
96
|
+
output_row[dim_index] += hidden[token_base + dim_index];
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
let inverse = (seq as f32).recip();
|
|
101
|
+
for value in output_row {
|
|
102
|
+
*value *= inverse;
|
|
103
|
+
}
|
|
104
|
+
continue;
|
|
105
|
+
}
|
|
106
|
+
|
|
90
107
|
let mut weight_sum = 0.0f32;
|
|
91
108
|
|
|
92
|
-
for token_index in
|
|
93
|
-
let weight =
|
|
109
|
+
for (token_index, &weight_raw) in mask_row.iter().enumerate() {
|
|
110
|
+
let weight = weight_raw;
|
|
94
111
|
if weight <= 0 {
|
|
95
112
|
continue;
|
|
96
113
|
}
|
data/lib/gte/3.0/gte.bundle
CHANGED
|
Binary file
|
data/lib/gte/3.1/gte.bundle
CHANGED
|
Binary file
|
data/lib/gte/3.2/gte.bundle
CHANGED
|
Binary file
|
data/lib/gte/3.3/gte.bundle
CHANGED
|
Binary file
|
data/lib/gte/3.4/gte.bundle
CHANGED
|
Binary file
|
data/lib/gte/4.0/gte.bundle
CHANGED
|
Binary file
|
data/lib/gte.rb
CHANGED
|
@@ -9,6 +9,9 @@ end
|
|
|
9
9
|
module GTE
|
|
10
10
|
VERSION = File.read(File.expand_path('../VERSION', __dir__)).strip
|
|
11
11
|
|
|
12
|
+
@model_cache_mutex = Mutex.new
|
|
13
|
+
@model_cache = {}
|
|
14
|
+
|
|
12
15
|
class Model
|
|
13
16
|
def initialize(dir, num_threads: 0, optimization_level: 3, model_name: nil)
|
|
14
17
|
@embedder = GTE::Embedder.new(dir, num_threads, optimization_level, model_name.to_s)
|
|
@@ -30,7 +33,23 @@ module GTE
|
|
|
30
33
|
end
|
|
31
34
|
end
|
|
32
35
|
|
|
33
|
-
def self.new(dir,
|
|
34
|
-
|
|
36
|
+
def self.new(dir, threads: 0, optimization: 3, model_name: nil)
|
|
37
|
+
key = [
|
|
38
|
+
File.expand_path(dir),
|
|
39
|
+
Integer(threads),
|
|
40
|
+
Integer(optimization),
|
|
41
|
+
model_name.to_s
|
|
42
|
+
].freeze
|
|
43
|
+
|
|
44
|
+
@model_cache_mutex.synchronize do
|
|
45
|
+
@model_cache[key] ||= Model.new(
|
|
46
|
+
key[0],
|
|
47
|
+
num_threads: key[1],
|
|
48
|
+
optimization_level: key[2],
|
|
49
|
+
model_name: key[3].empty? ? nil : key[3]
|
|
50
|
+
)
|
|
51
|
+
end
|
|
35
52
|
end
|
|
53
|
+
|
|
54
|
+
def self.fetch(*) = new(*)
|
|
36
55
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: gte
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.3
|
|
4
|
+
version: 0.0.4
|
|
5
5
|
platform: arm64-darwin
|
|
6
6
|
authors:
|
|
7
7
|
- elcuervo
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-13 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rake
|