gte 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +17 -0
- data/LICENSE +21 -0
- data/README.md +49 -0
- data/Rakefile +76 -0
- data/VERSION +1 -0
- data/ext/gte/Cargo.toml +37 -0
- data/ext/gte/benches/hot_path.rs +53 -0
- data/ext/gte/build.rs +25 -0
- data/ext/gte/extconf.rb +6 -0
- data/ext/gte/src/embedder.rs +342 -0
- data/ext/gte/src/error.rs +48 -0
- data/ext/gte/src/lib.rs +31 -0
- data/ext/gte/src/model_config.rs +17 -0
- data/ext/gte/src/postprocess.rs +113 -0
- data/ext/gte/src/ruby_embedder.rs +222 -0
- data/ext/gte/src/session.rs +123 -0
- data/ext/gte/src/tokenizer.rs +130 -0
- data/ext/gte/tests/embedder_unit_test.rs +39 -0
- data/ext/gte/tests/inference_integration_test.rs +62 -0
- data/ext/gte/tests/tokenizer_unit_test.rs +44 -0
- data/lib/gte.rb +32 -0
- metadata +136 -0
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
use gte::embedder::Embedder;
|
|
2
|
+
|
|
3
|
+
/// Loads an `Embedder` from the `e5` fixture directory, embeds one string, and
/// checks that the result's first dimension is 1 and its second is non-zero.
///
/// Ignored by default; the `ignore` reason names the fixture files it needs.
#[test]
#[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json and model.onnx"]
fn test_e5_single_embedding_shape() {
    let fixture_dir: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/e5");
    let inputs = vec![String::from("query: Hello world")];

    let model = Embedder::from_dir(fixture_dir, 0, 3).expect("embedder should initialize");
    let embeddings = model.embed(inputs).expect("embed should succeed");

    // One input string in, so exactly one entry along the first axis.
    let dims = embeddings.shape();
    assert_eq!(dims[0], 1);
    assert!(dims[1] > 0);
}
|
|
16
|
+
|
|
17
|
+
/// Loads an `Embedder` from the `clip` fixture directory, embeds one string, and
/// checks that the result's first dimension is 1 and its second is non-zero.
///
/// Ignored by default; the `ignore` reason names the fixture files it needs.
#[test]
#[ignore = "requires ext/gte/tests/fixtures/clip/tokenizer.json and model.onnx"]
fn test_clip_single_embedding_shape() {
    let fixture_dir: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/clip");
    let inputs = vec![String::from("a photo of a cat")];

    let model = Embedder::from_dir(fixture_dir, 0, 3).expect("embedder should initialize");
    let embeddings = model.embed(inputs).expect("embed should succeed");

    // One input string in, so exactly one entry along the first axis.
    let dims = embeddings.shape();
    assert_eq!(dims[0], 1);
    assert!(dims[1] > 0);
}
|
|
30
|
+
|
|
31
|
+
/// Embeds a batch of three strings with the `e5` fixture model and checks that
/// the result's first dimension equals the batch size and its second is non-zero.
///
/// Ignored by default; the `ignore` reason names the fixture files it needs.
#[test]
#[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json and model.onnx"]
fn test_e5_batch_embedding_shape() {
    let fixture_dir: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/e5");
    let inputs: Vec<String> = [
        "query: first sentence",
        "query: second sentence",
        "query: third sentence for batch",
    ]
    .iter()
    .copied()
    .map(String::from)
    .collect();

    let model = Embedder::from_dir(fixture_dir, 0, 3).expect("embedder should initialize");
    let embeddings = model.embed(inputs).expect("batch embed should succeed");

    // Three inputs in, so three entries along the first axis.
    let dims = embeddings.shape();
    assert_eq!(dims[0], 3);
    assert!(dims[1] > 0);
}
|
|
48
|
+
|
|
49
|
+
/// Feeds a very long input ("word " repeated 1000 times) to the `e5` fixture
/// model and checks that embedding succeeds and still yields a single result
/// with a non-zero second dimension.
///
/// Ignored by default; the `ignore` reason names the fixture files it needs.
#[test]
#[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json and model.onnx"]
fn test_e5_long_input_truncation_no_error() {
    let fixture_dir: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/e5");
    let oversized_input = "word ".repeat(1000);

    let model = Embedder::from_dir(fixture_dir, 0, 3).expect("embedder should initialize");
    let embeddings = model
        .embed(vec![oversized_input])
        .expect("long input should be truncated without error");

    let dims = embeddings.shape();
    assert_eq!(dims[0], 1);
    assert!(dims[1] > 0);
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
use gte::tokenizer::Tokenizer;
|
|
2
|
+
|
|
3
|
+
/// Tokenizes two strings of different lengths with the `e5` tokenizer fixture and
/// checks that every flat output buffer has `rows * cols` elements.
///
/// Ignored by default; the `ignore` reason names the fixture file it needs.
#[test]
#[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json"]
fn test_e5_tokenizer_output_shape() {
    let tokenizer_path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/tests/fixtures/e5/tokenizer.json"
    );
    let inputs = vec![
        String::from("Hello, world!"),
        String::from("A second, longer sentence to test padding behavior."),
    ];

    // NOTE(review): the meaning of the `512` and `true` arguments is not visible
    // here; presumably max length and a type-ids switch — confirm in tokenizer.rs.
    let tok = Tokenizer::new(tokenizer_path, 512, true).expect("tokenizer should load");
    let batch = tok.tokenize(&inputs).expect("tokenize should succeed");

    assert_eq!(batch.rows, 2, "batch size should be 2");
    assert!(batch.cols > 0, "sequence length should be non-zero");

    // Every buffer is expected to hold one value per (row, col) position.
    let expected_len = batch.rows * batch.cols;
    assert_eq!(batch.input_ids.len(), expected_len);
    assert_eq!(batch.attn_masks.len(), expected_len);

    let type_ids = batch.type_ids.as_ref().expect("type_ids should exist");
    assert_eq!(type_ids.len(), expected_len);
}
|
|
27
|
+
|
|
28
|
+
/// Tokenizes an input far longer than the configured limit of 16 and checks that
/// the resulting sequence length is exactly 16.
///
/// Ignored by default; the `ignore` reason names the fixture file it needs.
#[test]
#[ignore = "requires ext/gte/tests/fixtures/e5/tokenizer.json"]
fn test_e5_truncation_at_max_length() {
    let tokenizer_path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/tests/fixtures/e5/tokenizer.json"
    );
    let oversized_input = "word ".repeat(200);

    let tok = Tokenizer::new(tokenizer_path, 16, false).expect("tokenizer should load");
    let batch = tok
        .tokenize(&[oversized_input])
        .expect("tokenize should not error on long input");

    assert_eq!(batch.rows, 1);
    assert_eq!(batch.cols, 16, "sequence length should be truncated to max_length");
}
|
data/lib/gte.rb
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'gte/gte'
|
|
4
|
+
|
|
5
|
+
# Ruby-facing entry point for the gem; wraps the native GTE::Embedder.
module GTE
  # Gem version, read from the VERSION file one directory above this file.
  VERSION = File.read(File.expand_path('../VERSION', __dir__)).strip

  # Thin wrapper that owns one GTE::Embedder and forwards embedding calls to it.
  class Model
    # dir, num_threads and optimization_level are passed positionally, in that
    # order, straight through to GTE::Embedder.new.
    def initialize(dir, num_threads: 0, optimization_level: 3)
      @embedder = GTE::Embedder.new(dir, num_threads, optimization_level)
    end

    # A single String goes to the embedder's embed_one; anything else is
    # coerced with Kernel#Array and sent to the embedder's batch embed.
    def embed(texts)
      return @embedder.embed_one(texts) if texts.is_a?(String)

      @embedder.embed(Array(texts))
    end

    # Index-style shorthand: a String yields row 0 of its embedding result,
    # an Array yields the batch result.
    #
    # NOTE(review): any other input type falls through and returns nil.
    # NOTE(review): the String branch assumes embed_one's result responds to
    # #row — confirm against the native extension.
    def [](input)
      if input.is_a?(String)
        embed(input).row(0)
      elsif input.is_a?(Array)
        embed(input)
      end
    end
  end

  # Convenience constructor: GTE.new(...) builds a GTE::Model with the same arguments.
  def self.new(dir, num_threads: 0, optimization_level: 3)
    Model.new(
      dir,
      num_threads: num_threads,
      optimization_level: optimization_level
    )
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: gte
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- elcuervo
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2026-04-10 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: rake
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - ">="
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0'
|
|
20
|
+
type: :development
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ">="
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: rake-compiler
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - ">="
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '0'
|
|
34
|
+
type: :development
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - ">="
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '0'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: rb_sys
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ">="
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0'
|
|
48
|
+
type: :development
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - ">="
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0'
|
|
55
|
+
- !ruby/object:Gem::Dependency
|
|
56
|
+
name: rspec
|
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
|
58
|
+
requirements:
|
|
59
|
+
- - ">="
|
|
60
|
+
- !ruby/object:Gem::Version
|
|
61
|
+
version: '0'
|
|
62
|
+
type: :development
|
|
63
|
+
prerelease: false
|
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
+
requirements:
|
|
66
|
+
- - ">="
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: '0'
|
|
69
|
+
- !ruby/object:Gem::Dependency
|
|
70
|
+
name: rspec-benchmark
|
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
|
72
|
+
requirements:
|
|
73
|
+
- - ">="
|
|
74
|
+
- !ruby/object:Gem::Version
|
|
75
|
+
version: '0'
|
|
76
|
+
type: :development
|
|
77
|
+
prerelease: false
|
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
79
|
+
requirements:
|
|
80
|
+
- - ">="
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
version: '0'
|
|
83
|
+
description:
|
|
84
|
+
email:
|
|
85
|
+
- elcuervo@elcuervo.net
|
|
86
|
+
executables: []
|
|
87
|
+
extensions:
|
|
88
|
+
- ext/gte/extconf.rb
|
|
89
|
+
extra_rdoc_files: []
|
|
90
|
+
files:
|
|
91
|
+
- Gemfile
|
|
92
|
+
- LICENSE
|
|
93
|
+
- README.md
|
|
94
|
+
- Rakefile
|
|
95
|
+
- VERSION
|
|
96
|
+
- ext/gte/Cargo.toml
|
|
97
|
+
- ext/gte/benches/hot_path.rs
|
|
98
|
+
- ext/gte/build.rs
|
|
99
|
+
- ext/gte/extconf.rb
|
|
100
|
+
- ext/gte/src/embedder.rs
|
|
101
|
+
- ext/gte/src/error.rs
|
|
102
|
+
- ext/gte/src/lib.rs
|
|
103
|
+
- ext/gte/src/model_config.rs
|
|
104
|
+
- ext/gte/src/postprocess.rs
|
|
105
|
+
- ext/gte/src/ruby_embedder.rs
|
|
106
|
+
- ext/gte/src/session.rs
|
|
107
|
+
- ext/gte/src/tokenizer.rs
|
|
108
|
+
- ext/gte/tests/embedder_unit_test.rs
|
|
109
|
+
- ext/gte/tests/inference_integration_test.rs
|
|
110
|
+
- ext/gte/tests/tokenizer_unit_test.rs
|
|
111
|
+
- lib/gte.rb
|
|
112
|
+
homepage: https://github.com/elcuervo/gte
|
|
113
|
+
licenses:
|
|
114
|
+
- MIT
|
|
115
|
+
metadata:
|
|
116
|
+
rubygems_mfa_required: 'true'
|
|
117
|
+
post_install_message:
|
|
118
|
+
rdoc_options: []
|
|
119
|
+
require_paths:
|
|
120
|
+
- lib
|
|
121
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
122
|
+
requirements:
|
|
123
|
+
- - ">="
|
|
124
|
+
- !ruby/object:Gem::Version
|
|
125
|
+
version: '3.2'
|
|
126
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
127
|
+
requirements:
|
|
128
|
+
- - ">="
|
|
129
|
+
- !ruby/object:Gem::Version
|
|
130
|
+
version: '0'
|
|
131
|
+
requirements: []
|
|
132
|
+
rubygems_version: 3.5.22
|
|
133
|
+
signing_key:
|
|
134
|
+
specification_version: 4
|
|
135
|
+
summary: General Text Embeddings
|
|
136
|
+
test_files: []
|