gte 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,62 @@
1
+ use gte::embedder::Embedder;
2
+
3
+ #[test] // Checks that embedding one e5 input yields a result whose shape is [1, N] with N > 0.
4
+ #[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json and model.onnx"] // ignored unless explicitly requested: the fixture files must be present
5
+ fn test_e5_single_embedding_shape() {
6
+ const DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/e5"); // fixture directory under the crate root, assembled at compile time
7
+
8
+ let embedder = Embedder::from_dir(DIR, 0, 3).expect("embedder should initialize"); // NOTE(review): 0 and 3 are presumably num_threads and optimization_level — confirm against Embedder::from_dir
9
+ let result = embedder
10
+ .embed(vec!["query: Hello world".to_string()]) // a batch containing exactly one text
11
+ .expect("embed should succeed");
12
+
13
+ assert_eq!(result.shape()[0], 1); // first dimension is expected to match the number of inputs
14
+ assert!(result.shape()[1] > 0); // second dimension only has to be non-zero; its exact size is not pinned
15
+ }
16
+
17
+ #[test] // Same single-input shape check as the e5 test, run against the clip fixture directory.
18
+ #[ignore = "requires ext/gte/tests/fixtures/clip/tokenizer.json and model.onnx"] // ignored unless explicitly requested: the fixture files must be present
19
+ fn test_clip_single_embedding_shape() {
20
+ const DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/clip"); // fixture directory under the crate root, assembled at compile time
21
+
22
+ let embedder = Embedder::from_dir(DIR, 0, 3).expect("embedder should initialize"); // NOTE(review): 0 and 3 are presumably num_threads and optimization_level — confirm against Embedder::from_dir
23
+ let result = embedder
24
+ .embed(vec!["a photo of a cat".to_string()]) // a batch containing exactly one text
25
+ .expect("embed should succeed");
26
+
27
+ assert_eq!(result.shape()[0], 1); // first dimension is expected to match the number of inputs
28
+ assert!(result.shape()[1] > 0); // second dimension only has to be non-zero; its exact size is not pinned
29
+ }
30
+
31
+ #[test] // Checks that embedding three e5 inputs in one call yields a result whose shape is [3, N] with N > 0.
32
+ #[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json and model.onnx"] // ignored unless explicitly requested: the fixture files must be present
33
+ fn test_e5_batch_embedding_shape() {
34
+ const DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/e5"); // fixture directory under the crate root, assembled at compile time
35
+
36
+ let embedder = Embedder::from_dir(DIR, 0, 3).expect("embedder should initialize"); // NOTE(review): 0 and 3 are presumably num_threads and optimization_level — confirm against Embedder::from_dir
37
+ let texts = vec![ // three inputs of differing length
38
+ "query: first sentence".to_string(),
39
+ "query: second sentence".to_string(),
40
+ "query: third sentence for batch".to_string(),
41
+ ];
42
+
43
+ let result = embedder.embed(texts).expect("batch embed should succeed"); // texts is moved into embed
44
+
45
+ assert_eq!(result.shape()[0], 3); // one row is expected per input text
46
+ assert!(result.shape()[1] > 0); // second dimension only has to be non-zero; its exact size is not pinned
47
+ }
48
+
49
+ #[test] // Checks that a very long input is accepted by embed instead of producing an error.
50
+ #[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json and model.onnx"] // ignored unless explicitly requested: the fixture files must be present
51
+ fn test_e5_long_input_truncation_no_error() {
52
+ const DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/e5"); // fixture directory under the crate root, assembled at compile time
53
+
54
+ let embedder = Embedder::from_dir(DIR, 0, 3).expect("embedder should initialize"); // NOTE(review): 0 and 3 are presumably num_threads and optimization_level — confirm against Embedder::from_dir
55
+ let very_long_text = "word ".repeat(1000); // 5000 characters; presumably well past the model's sequence limit — confirm
56
+ let result = embedder
57
+ .embed(vec![very_long_text])
58
+ .expect("long input should be truncated without error"); // the test only observes success; truncation itself is not inspected here
59
+
60
+ assert_eq!(result.shape()[0], 1); // still a single row for the single input
61
+ assert!(result.shape()[1] > 0); // second dimension only has to be non-zero; its exact size is not pinned
62
+ }
@@ -0,0 +1,44 @@
1
+ use gte::tokenizer::Tokenizer;
2
+
3
+ #[test] // Checks the dimensions and buffer lengths of the tokenizer output for a two-text batch.
4
+ #[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json"] // ignored unless explicitly requested: the fixture file must be present
5
+ fn test_e5_tokenizer_output_shape() {
6
+ const TOKENIZER: &str = concat!( // tokenizer.json path under the crate root, assembled at compile time
7
+ env!("CARGO_MANIFEST_DIR"),
8
+ "/tests/fixtures/e5/tokenizer.json"
9
+ );
10
+
11
+ let tokenizer = Tokenizer::new(TOKENIZER, 512, true).expect("tokenizer should load"); // NOTE(review): 512 looks like the max sequence length; the meaning of `true` is not visible here — confirm
12
+ let texts = vec![ // two inputs of different length
13
+ "Hello, world!".to_string(),
14
+ "A second, longer sentence to test padding behavior.".to_string(),
15
+ ];
16
+
17
+ let tokenized = tokenizer.tokenize(&texts).expect("tokenize should succeed");
18
+
19
+ assert_eq!(tokenized.rows, 2, "batch size should be 2");
20
+ assert!(tokenized.cols > 0, "sequence length should be non-zero");
21
+ assert_eq!(tokenized.input_ids.len(), tokenized.rows * tokenized.cols); // buffer length must equal rows * cols
22
+ assert_eq!(tokenized.attn_masks.len(), tokenized.rows * tokenized.cols); // same length requirement for the attention masks
23
+
24
+ let type_ids = tokenized.type_ids.as_ref().expect("type_ids should exist"); // type_ids is optional; this configuration is expected to populate it
25
+ assert_eq!(type_ids.len(), tokenized.rows * tokenized.cols); // same length requirement for the type ids
26
+ }
27
+
28
+ #[test] // Checks that an over-long input is tokenized to exactly 16 columns.
29
+ #[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json"] // ignored unless explicitly requested: the fixture file must be present
30
+ fn test_e5_truncation_at_max_length() {
31
+ const TOKENIZER: &str = concat!( // tokenizer.json path under the crate root, assembled at compile time
32
+ env!("CARGO_MANIFEST_DIR"),
33
+ "/tests/fixtures/e5/tokenizer.json"
34
+ );
35
+
36
+ let tokenizer = Tokenizer::new(TOKENIZER, 16, false).expect("tokenizer should load"); // 16 is treated as max_length by the final assertion; NOTE(review): meaning of `false` is not visible here — confirm
37
+ let long_text = "word ".repeat(200); // 200 repetitions, intended to exceed the 16-token limit
38
+ let tokenized = tokenizer
39
+ .tokenize(&[long_text])
40
+ .expect("tokenize should not error on long input");
41
+
42
+ assert_eq!(tokenized.rows, 1); // single input, single row
43
+ assert_eq!(tokenized.cols, 16, "sequence length should be truncated to max_length");
44
+ }
data/lib/gte.rb ADDED
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'gte/gte'
4
+
5
+ module GTE # Ruby-facing API; GTE::Embedder is not defined in this file (presumably provided by require 'gte/gte' — confirm)
6
+ VERSION = File.read(File.expand_path('../VERSION', __dir__)).strip # contents of the VERSION file in the parent of this file's directory, whitespace-stripped
7
+
8
+ class Model # Wrapper that owns a single GTE::Embedder instance
9
+ def initialize(dir, num_threads: 0, optimization_level: 3) # keyword options are forwarded positionally to GTE::Embedder.new
10
+ @embedder = GTE::Embedder.new(dir, num_threads, optimization_level)
11
+ end
12
+
13
+ def embed(texts) # a String goes to embed_one; anything else goes to embed after Array() coercion
14
+ if texts.is_a?(String)
15
+ @embedder.embed_one(texts)
16
+ else
17
+ @embedder.embed(Array(texts)) # Array() wraps a non-array value and turns nil into []
18
+ end
19
+ end
20
+
21
+ def [](input) # bracket shorthand: dispatches on the class of input
22
+ case input
23
+ when String then embed(input).row(0) # takes row 0 of whatever embed_one returns
24
+ when Array then embed(input)
25
+ end # NOTE(review): no else branch, so any other input type makes this method return nil
26
+ end
27
+ end
28
+
29
+ def self.new(dir, num_threads: 0, optimization_level: 3) # lets callers write GTE.new(...) and receive a Model
30
+ Model.new(dir, num_threads: num_threads, optimization_level: optimization_level)
31
+ end
32
+ end
metadata ADDED
@@ -0,0 +1,136 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gte
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - elcuervo
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2026-04-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake-compiler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rb_sys
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec-benchmark
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description:
84
+ email:
85
+ - elcuervo@elcuervo.net
86
+ executables: []
87
+ extensions:
88
+ - ext/gte/extconf.rb
89
+ extra_rdoc_files: []
90
+ files:
91
+ - Gemfile
92
+ - LICENSE
93
+ - README.md
94
+ - Rakefile
95
+ - VERSION
96
+ - ext/gte/Cargo.toml
97
+ - ext/gte/benches/hot_path.rs
98
+ - ext/gte/build.rs
99
+ - ext/gte/extconf.rb
100
+ - ext/gte/src/embedder.rs
101
+ - ext/gte/src/error.rs
102
+ - ext/gte/src/lib.rs
103
+ - ext/gte/src/model_config.rs
104
+ - ext/gte/src/postprocess.rs
105
+ - ext/gte/src/ruby_embedder.rs
106
+ - ext/gte/src/session.rs
107
+ - ext/gte/src/tokenizer.rs
108
+ - ext/gte/tests/embedder_unit_test.rs
109
+ - ext/gte/tests/inference_integration_test.rs
110
+ - ext/gte/tests/tokenizer_unit_test.rs
111
+ - lib/gte.rb
112
+ homepage: https://github.com/elcuervo/gte
113
+ licenses:
114
+ - MIT
115
+ metadata:
116
+ rubygems_mfa_required: 'true'
117
+ post_install_message:
118
+ rdoc_options: []
119
+ require_paths:
120
+ - lib
121
+ required_ruby_version: !ruby/object:Gem::Requirement
122
+ requirements:
123
+ - - ">="
124
+ - !ruby/object:Gem::Version
125
+ version: '3.2'
126
+ required_rubygems_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ requirements: []
132
+ rubygems_version: 3.5.22
133
+ signing_key:
134
+ specification_version: 4
135
+ summary: General Text Embeddings
136
+ test_files: []