gte 0.0.3-arm64-darwin → 0.0.4-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e86f2a3e303b2ad2dd726f2a14b04127f07591d94f220aae65a39e922818756f
4
- data.tar.gz: cccb448f456718089b73080ff29541e1c525fa28c0fc4e27674dede3bd59389f
3
+ metadata.gz: 86bd8582b693e5d6b1b1d880dec05b21912b3cc45de1a09e143210fcdb76dc5d
4
+ data.tar.gz: 8785600d70d86d0f9de669631054f279105801f47ca6f6d227b1e93fe9e6ff9f
5
5
  SHA512:
6
- metadata.gz: 72ec78ebb4c400bebf8cd52dc44aa034cf8ddbf5911e81547eb8120087ae5f72b32873f04be515aeb7170374eab938d1031d80ae6a680b1f37f43441c4acbaa4
7
- data.tar.gz: bc00deb09690233e212526e1cd5115af1a813d3e2f0e55445bbf90d94e169a25cef2a0c07c64d8ac6d98224eaa58166bdbdb26a6c715a584d7a634598e1e6ae8
6
+ metadata.gz: 7ac75c64e28ca6c227be56439f15bea76355c775cec5ba203c95666b8f603d370b492647a4b9c242b84d9fdfa36f6f34fa2436895596f5b887863b193bf4e523
7
+ data.tar.gz: 2b9f12675b6e0fd461f611f199e7ee66f366e3bbdf93c15bd018a0b45eb2eef001b9adb9670b02a07f58589a915ab7391762c5fabb54c14254193ff11d1867ec
data/README.md CHANGED
@@ -13,6 +13,12 @@ model = GTE.new(ENV.fetch("GTE_MODEL_DIR"))
13
13
  vector = model["query: hello world"]
14
14
  ```
15
15
 
16
+ For Puma or other thread pools, prefer process-local reuse:
17
+
18
+ ```ruby
19
+ MODEL = GTE.new(ENV.fetch("GTE_MODEL_DIR"))
20
+ ```
21
+
16
22
  ## Model Directory
17
23
 
18
24
  A model directory must include `tokenizer.json` and one ONNX model, resolved in this order:
@@ -40,6 +46,7 @@ The repo includes two benchmark paths:
40
46
  bundle exec rake bench:pure_compare
41
47
  bundle exec rake bench:puma_compare
42
48
  bundle exec rake bench:matrix_sweep
49
+ bundle exec ruby bench/memory_probe.rb --compare-pure
43
50
  ```
44
51
 
45
52
  For release tracking and regression detection, record a run entry in `RUNS.md`:
data/Rakefile CHANGED
@@ -48,6 +48,14 @@ namespace :bench do
48
48
  )
49
49
  end
50
50
 
51
+ desc 'Run memory probe for single-instance vs duplicate-instance behavior'
52
+ task :memory_probe do
53
+ run_in_nix(
54
+ 'bundle', 'exec', 'ruby', 'bench/memory_probe.rb',
55
+ '--compare-pure'
56
+ )
57
+ end
58
+
51
59
  desc 'Run Puma benchmark, append RUNS.md entry, and enforce goal/regression checks'
52
60
  task :record_run do
53
61
  run_in_nix(
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.3
1
+ 0.0.4
data/ext/gte/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "gte"
3
- version = "0.0.3"
3
+ version = "0.0.4"
4
4
  edition = "2021"
5
5
  authors = ["elcuervo <elcuervo@elcuervo.net>"]
6
6
  license = "MIT"
@@ -57,16 +57,17 @@ impl Embedder {
57
57
  }
58
58
 
59
59
  let max_length = read_max_length(dir);
60
+ let probe_num_threads = if num_threads == 0 { 1 } else { num_threads };
60
61
  let temp_config = ModelConfig {
61
62
  max_length,
62
63
  output_tensor: String::new(),
63
64
  mode: ExtractorMode::Raw,
64
65
  with_type_ids: false,
65
66
  with_attention_mask: true,
66
- num_threads,
67
+ num_threads: probe_num_threads,
67
68
  optimization_level,
68
69
  };
69
- let session = build_session(&model_path, &temp_config)?;
70
+ let mut session = build_session(&model_path, &temp_config)?;
70
71
 
71
72
  validate_supported_inputs(&session)?;
72
73
  let with_type_ids = session.inputs.iter().any(|i| i.name == "token_type_ids");
@@ -97,11 +98,11 @@ impl Embedder {
97
98
  optimization_level,
98
99
  };
99
100
 
100
- let session = if tuned_num_threads != num_threads {
101
- build_session(&model_path, &config)?
102
- } else {
103
- session
104
- };
101
+ if tuned_num_threads != probe_num_threads {
102
+ // Release probe session before rebuilding to minimize transient peak RSS.
103
+ drop(session);
104
+ session = build_session(&model_path, &config)?;
105
+ }
105
106
 
106
107
  let tokenizer = Tokenizer::new(&tokenizer_path, config.max_length, config.with_type_ids)?;
107
108
 
@@ -138,16 +139,13 @@ fn tune_num_threads(
138
139
  }
139
140
 
140
141
  let family = infer_model_family(with_attention_mask, with_type_ids, output_name);
141
- let target_concurrency = puma_target_concurrency();
142
- let host_cores = host_parallelism();
143
- let budgeted_threads = (host_cores / target_concurrency).max(1);
144
142
 
145
143
  match family {
146
144
  // Puma-like workloads typically run many concurrent single-item requests where
147
145
  // one intra-op thread per request gives the best tail behavior.
148
- ModelFamily::E5Like | ModelFamily::ClipLike | ModelFamily::SiglipLike => {
149
- budgeted_threads.min(1)
150
- }
146
+ ModelFamily::E5Like | ModelFamily::ClipLike => 1,
147
+ // Siglip2 text path benefits from a small intra-op pool under concurrency.
148
+ ModelFamily::SiglipLike => 3,
151
149
  ModelFamily::Other => 0,
152
150
  }
153
151
  }
@@ -169,20 +167,6 @@ fn infer_model_family(
169
167
  ModelFamily::Other
170
168
  }
171
169
 
172
- fn puma_target_concurrency() -> usize {
173
- std::env::var("GTE_PUMA_CONCURRENCY")
174
- .ok()
175
- .and_then(|raw| raw.parse::<usize>().ok())
176
- .filter(|value| *value > 0)
177
- .unwrap_or(16)
178
- }
179
-
180
- fn host_parallelism() -> usize {
181
- std::thread::available_parallelism()
182
- .map(|n| n.get())
183
- .unwrap_or(1)
184
- }
185
-
186
170
  fn resolve_named_model(dir: &Path, name: &str) -> Result<PathBuf> {
187
171
  let candidates = [dir.join("onnx").join(name), dir.join(name)];
188
172
  for path in &candidates {
@@ -87,10 +87,27 @@ fn mean_pool_contiguous(
87
87
  let mask_base = batch_index * seq;
88
88
  let hidden_base = batch_index * seq * dim;
89
89
  let output_row = &mut output[batch_index * dim..(batch_index + 1) * dim];
90
+ let mask_row = &attention_mask[mask_base..mask_base + seq];
91
+
92
+ if mask_row.iter().all(|&weight| weight == 1) {
93
+ for token_index in 0..seq {
94
+ let token_base = hidden_base + token_index * dim;
95
+ for dim_index in 0..dim {
96
+ output_row[dim_index] += hidden[token_base + dim_index];
97
+ }
98
+ }
99
+
100
+ let inverse = (seq as f32).recip();
101
+ for value in output_row {
102
+ *value *= inverse;
103
+ }
104
+ continue;
105
+ }
106
+
90
107
  let mut weight_sum = 0.0f32;
91
108
 
92
- for token_index in 0..seq {
93
- let weight = attention_mask[mask_base + token_index];
109
+ for (token_index, &weight_raw) in mask_row.iter().enumerate() {
110
+ let weight = weight_raw;
94
111
  if weight <= 0 {
95
112
  continue;
96
113
  }
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
data/lib/gte.rb CHANGED
@@ -9,6 +9,9 @@ end
9
9
  module GTE
10
10
  VERSION = File.read(File.expand_path('../VERSION', __dir__)).strip
11
11
 
12
+ @model_cache_mutex = Mutex.new
13
+ @model_cache = {}
14
+
12
15
  class Model
13
16
  def initialize(dir, num_threads: 0, optimization_level: 3, model_name: nil)
14
17
  @embedder = GTE::Embedder.new(dir, num_threads, optimization_level, model_name.to_s)
@@ -30,7 +33,23 @@ module GTE
30
33
  end
31
34
  end
32
35
 
33
- def self.new(dir, num_threads: 0, optimization_level: 3, model_name: nil)
34
- Model.new(dir, num_threads: num_threads, optimization_level: optimization_level, model_name: model_name)
36
+ def self.new(dir, threads: 0, optimization: 3, model_name: nil)
37
+ key = [
38
+ File.expand_path(dir),
39
+ Integer(threads),
40
+ Integer(optimization),
41
+ model_name.to_s
42
+ ].freeze
43
+
44
+ @model_cache_mutex.synchronize do
45
+ @model_cache[key] ||= Model.new(
46
+ key[0],
47
+ num_threads: key[1],
48
+ optimization_level: key[2],
49
+ model_name: key[3].empty? ? nil : key[3]
50
+ )
51
+ end
35
52
  end
53
+
54
+ def self.fetch(*) = new(*)
36
55
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gte
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: arm64-darwin
6
6
  authors:
7
7
  - elcuervo
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-04-10 00:00:00.000000000 Z
11
+ date: 2026-04-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake