informers 1.0.2 → 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +90 -19
- data/lib/informers/model.rb +2 -7
- data/lib/informers/models.rb +32 -3
- data/lib/informers/pipelines.rb +13 -6
- data/lib/informers/tokenizers.rb +13 -1
- data/lib/informers/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f5340da0bce9d55a0339fac6b8806f09119df3e89567ecb37a77e1a5921b8fa2
|
4
|
+
data.tar.gz: 66a9d275cb2999ad14ba1cfd900bdcbf9fdc3d26ce29387acdd74452bf2050ef
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a4a0c3da3d8a3555a6f2debca8f2939b6536ac76386cdd6c7264890b2d00842d537ecfca352021fa349ff9c4636ba49c189f652a66676746d9ec2a8d97eecc2a
|
7
|
+
data.tar.gz: a06aa115b5966fd1b8da7a80d8481d3e61778f31c3bb0da143f329e81ae3f73d4a1d1b2ee01672f4e90742a35d68a23dd5c871c3b68ffad0c16d8e5de480a60f
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -30,10 +30,15 @@ Embedding
|
|
30
30
|
- [intfloat/e5-base-v2](#intfloate5-base-v2)
|
31
31
|
- [nomic-ai/nomic-embed-text-v1](#nomic-ainomic-embed-text-v1)
|
32
32
|
- [BAAI/bge-base-en-v1.5](#baaibge-base-en-v15)
|
33
|
+
- [jinaai/jina-embeddings-v2-base-en](#jinaaijina-embeddings-v2-base-en)
|
34
|
+
- [Snowflake/snowflake-arctic-embed-m-v1.5](#snowflakesnowflake-arctic-embed-m-v15)
|
35
|
+
- [Xenova/all-mpnet-base-v2](#xenovaall-mpnet-base-v2)
|
33
36
|
|
34
|
-
Reranking
|
37
|
+
Reranking
|
35
38
|
|
36
39
|
- [mixedbread-ai/mxbai-rerank-base-v1](#mixedbread-aimxbai-rerank-base-v1)
|
40
|
+
- [jinaai/jina-reranker-v1-turbo-en](#jinaaijina-reranker-v1-turbo-en)
|
41
|
+
- [BAAI/bge-reranker-base](#baaibge-reranker-base)
|
37
42
|
|
38
43
|
### sentence-transformers/all-MiniLM-L6-v2
|
39
44
|
|
@@ -72,18 +77,16 @@ doc_score_pairs = docs.zip(scores).sort_by { |d, s| -s }
|
|
72
77
|
[Docs](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1)
|
73
78
|
|
74
79
|
```ruby
|
75
|
-
|
76
|
-
"Represent this sentence for searching relevant passages: #{query}"
|
77
|
-
end
|
80
|
+
query_prefix = "Represent this sentence for searching relevant passages: "
|
78
81
|
|
79
|
-
|
80
|
-
transform_query("puppy"),
|
82
|
+
input = [
|
81
83
|
"The dog is barking",
|
82
|
-
"The cat is purring"
|
84
|
+
"The cat is purring",
|
85
|
+
query_prefix + "puppy"
|
83
86
|
]
|
84
87
|
|
85
88
|
model = Informers.pipeline("embedding", "mixedbread-ai/mxbai-embed-large-v1")
|
86
|
-
embeddings = model.(
|
89
|
+
embeddings = model.(input)
|
87
90
|
```
|
88
91
|
|
89
92
|
### Supabase/gte-small
|
@@ -102,9 +105,12 @@ embeddings = model.(sentences)
|
|
102
105
|
[Docs](https://huggingface.co/intfloat/e5-base-v2)
|
103
106
|
|
104
107
|
```ruby
|
108
|
+
doc_prefix = "passage: "
|
109
|
+
query_prefix = "query: "
|
110
|
+
|
105
111
|
input = [
|
106
|
-
"
|
107
|
-
"
|
112
|
+
doc_prefix + "Ruby is a programming language created by Matz",
|
113
|
+
query_prefix + "Ruby creator"
|
108
114
|
]
|
109
115
|
|
110
116
|
model = Informers.pipeline("embedding", "intfloat/e5-base-v2")
|
@@ -116,9 +122,13 @@ embeddings = model.(input)
|
|
116
122
|
[Docs](https://huggingface.co/nomic-ai/nomic-embed-text-v1)
|
117
123
|
|
118
124
|
```ruby
|
125
|
+
doc_prefix = "search_document: "
|
126
|
+
query_prefix = "search_query: "
|
127
|
+
|
119
128
|
input = [
|
120
|
-
"
|
121
|
-
"
|
129
|
+
doc_prefix + "The dog is barking",
|
130
|
+
doc_prefix + "The cat is purring",
|
131
|
+
query_prefix + "puppy"
|
122
132
|
]
|
123
133
|
|
124
134
|
model = Informers.pipeline("embedding", "nomic-ai/nomic-embed-text-v1")
|
@@ -130,20 +140,57 @@ embeddings = model.(input)
|
|
130
140
|
[Docs](https://huggingface.co/BAAI/bge-base-en-v1.5)
|
131
141
|
|
132
142
|
```ruby
|
133
|
-
|
134
|
-
"Represent this sentence for searching relevant passages: #{query}"
|
135
|
-
end
|
143
|
+
query_prefix = "Represent this sentence for searching relevant passages: "
|
136
144
|
|
137
145
|
input = [
|
138
|
-
transform_query("puppy"),
|
139
146
|
"The dog is barking",
|
140
|
-
"The cat is purring"
|
147
|
+
"The cat is purring",
|
148
|
+
query_prefix + "puppy"
|
141
149
|
]
|
142
150
|
|
143
151
|
model = Informers.pipeline("embedding", "BAAI/bge-base-en-v1.5")
|
144
152
|
embeddings = model.(input)
|
145
153
|
```
|
146
154
|
|
155
|
+
### jinaai/jina-embeddings-v2-base-en
|
156
|
+
|
157
|
+
[Docs](https://huggingface.co/jinaai/jina-embeddings-v2-base-en)
|
158
|
+
|
159
|
+
```ruby
|
160
|
+
sentences = ["How is the weather today?", "What is the current weather like today?"]
|
161
|
+
|
162
|
+
model = Informers.pipeline("embedding", "jinaai/jina-embeddings-v2-base-en", model_file_name: "../model")
|
163
|
+
embeddings = model.(sentences)
|
164
|
+
```
|
165
|
+
|
166
|
+
### Snowflake/snowflake-arctic-embed-m-v1.5
|
167
|
+
|
168
|
+
[Docs](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5)
|
169
|
+
|
170
|
+
```ruby
|
171
|
+
query_prefix = "Represent this sentence for searching relevant passages: "
|
172
|
+
|
173
|
+
input = [
|
174
|
+
"The dog is barking",
|
175
|
+
"The cat is purring",
|
176
|
+
query_prefix + "puppy"
|
177
|
+
]
|
178
|
+
|
179
|
+
model = Informers.pipeline("embedding", "Snowflake/snowflake-arctic-embed-m-v1.5")
|
180
|
+
embeddings = model.(input, model_output: "sentence_embedding", pooling: "none")
|
181
|
+
```
|
182
|
+
|
183
|
+
### Xenova/all-mpnet-base-v2
|
184
|
+
|
185
|
+
[Docs](https://huggingface.co/Xenova/all-mpnet-base-v2)
|
186
|
+
|
187
|
+
```ruby
|
188
|
+
sentences = ["This is an example sentence", "Each sentence is converted"]
|
189
|
+
|
190
|
+
model = Informers.pipeline("embedding", "Xenova/all-mpnet-base-v2")
|
191
|
+
embeddings = model.(sentences)
|
192
|
+
```
|
193
|
+
|
147
194
|
### mixedbread-ai/mxbai-rerank-base-v1
|
148
195
|
|
149
196
|
[Docs](https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1)
|
@@ -156,6 +203,30 @@ model = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-base-v1")
|
|
156
203
|
result = model.(query, docs)
|
157
204
|
```
|
158
205
|
|
206
|
+
### jinaai/jina-reranker-v1-turbo-en
|
207
|
+
|
208
|
+
[Docs](https://huggingface.co/jinaai/jina-reranker-v1-turbo-en)
|
209
|
+
|
210
|
+
```ruby
|
211
|
+
query = "How many people live in London?"
|
212
|
+
docs = ["Around 9 Million people live in London", "London is known for its financial district"]
|
213
|
+
|
214
|
+
model = Informers.pipeline("reranking", "jinaai/jina-reranker-v1-turbo-en")
|
215
|
+
result = model.(query, docs)
|
216
|
+
```
|
217
|
+
|
218
|
+
### BAAI/bge-reranker-base
|
219
|
+
|
220
|
+
[Docs](https://huggingface.co/BAAI/bge-reranker-base)
|
221
|
+
|
222
|
+
```ruby
|
223
|
+
query = "How many people live in London?"
|
224
|
+
docs = ["Around 9 Million people live in London", "London is known for its financial district"]
|
225
|
+
|
226
|
+
model = Informers.pipeline("reranking", "BAAI/bge-reranker-base")
|
227
|
+
result = model.(query, docs)
|
228
|
+
```
|
229
|
+
|
159
230
|
### Other
|
160
231
|
|
161
232
|
You can use the feature extraction pipeline directly.
|
@@ -165,7 +236,7 @@ model = Informers.pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2", quan
|
|
165
236
|
embeddings = model.(sentences, pooling: "mean", normalize: true)
|
166
237
|
```
|
167
238
|
|
168
|
-
The model
|
239
|
+
The model must include a `.onnx` file ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)). If the file is not at `onnx/model.onnx` or `onnx/model_quantized.onnx`, use the `model_file_name` option to specify the location.
|
169
240
|
|
170
241
|
## Pipelines
|
171
242
|
|
@@ -176,7 +247,7 @@ embed = Informers.pipeline("embedding")
|
|
176
247
|
embed.("We are very happy to show you the 🤗 Transformers library.")
|
177
248
|
```
|
178
249
|
|
179
|
-
Reranking
|
250
|
+
Reranking
|
180
251
|
|
181
252
|
```ruby
|
182
253
|
rerank = Informers.pipeline("reranking")
|
data/lib/informers/model.rb
CHANGED
@@ -6,19 +6,14 @@ module Informers
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def embed(texts)
|
9
|
-
is_batched = texts.is_a?(Array)
|
10
|
-
texts = [texts] unless is_batched
|
11
|
-
|
12
9
|
case @model_id
|
13
10
|
when "sentence-transformers/all-MiniLM-L6-v2", "Xenova/all-MiniLM-L6-v2", "Xenova/multi-qa-MiniLM-L6-cos-v1", "Supabase/gte-small"
|
14
|
-
|
11
|
+
@model.(texts)
|
15
12
|
when "mixedbread-ai/mxbai-embed-large-v1"
|
16
|
-
|
13
|
+
@model.(texts, pooling: "cls", normalize: false)
|
17
14
|
else
|
18
15
|
raise Error, "Use the embedding pipeline for this model: #{@model_id}"
|
19
16
|
end
|
20
|
-
|
21
|
-
is_batched ? output : output[0]
|
22
17
|
end
|
23
18
|
end
|
24
19
|
end
|
data/lib/informers/models.rb
CHANGED
@@ -135,7 +135,15 @@ module Informers
|
|
135
135
|
end
|
136
136
|
|
137
137
|
def self.construct_session(pretrained_model_name_or_path, file_name, **options)
|
138
|
-
|
138
|
+
prefix = "onnx/"
|
139
|
+
if file_name.start_with?("../")
|
140
|
+
prefix = ""
|
141
|
+
file_name = file_name[3..]
|
142
|
+
elsif file_name.start_with?("/")
|
143
|
+
prefix = ""
|
144
|
+
file_name = file_name[1..]
|
145
|
+
end
|
146
|
+
model_file_name = "#{prefix}#{file_name}#{options[:quantized] ? "_quantized" : ""}.onnx"
|
139
147
|
path = Utils::Hub.get_model_file(pretrained_model_name_or_path, model_file_name, true, **options)
|
140
148
|
|
141
149
|
OnnxRuntime::InferenceSession.new(path)
|
@@ -229,16 +237,37 @@ module Informers
|
|
229
237
|
end
|
230
238
|
end
|
231
239
|
|
240
|
+
class MPNetPreTrainedModel < PreTrainedModel
|
241
|
+
end
|
242
|
+
|
243
|
+
class MPNetModel < MPNetPreTrainedModel
|
244
|
+
end
|
245
|
+
|
246
|
+
class XLMRobertaPreTrainedModel < PreTrainedModel
|
247
|
+
end
|
248
|
+
|
249
|
+
class XLMRobertaModel < XLMRobertaPreTrainedModel
|
250
|
+
end
|
251
|
+
|
252
|
+
class XLMRobertaForSequenceClassification < XLMRobertaPreTrainedModel
|
253
|
+
def call(model_inputs)
|
254
|
+
SequenceClassifierOutput.new(*super(model_inputs))
|
255
|
+
end
|
256
|
+
end
|
257
|
+
|
232
258
|
MODEL_MAPPING_NAMES_ENCODER_ONLY = {
|
233
259
|
"bert" => ["BertModel", BertModel],
|
234
260
|
"nomic_bert" => ["NomicBertModel", NomicBertModel],
|
235
261
|
"deberta-v2" => ["DebertaV2Model", DebertaV2Model],
|
236
|
-
"
|
262
|
+
"mpnet" => ["MPNetModel", MPNetModel],
|
263
|
+
"distilbert" => ["DistilBertModel", DistilBertModel],
|
264
|
+
"xlm-roberta" => ["XLMRobertaModel", XLMRobertaModel]
|
237
265
|
}
|
238
266
|
|
239
267
|
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = {
|
240
268
|
"bert" => ["BertForSequenceClassification", BertForSequenceClassification],
|
241
|
-
"distilbert" => ["DistilBertForSequenceClassification", DistilBertForSequenceClassification]
|
269
|
+
"distilbert" => ["DistilBertForSequenceClassification", DistilBertForSequenceClassification],
|
270
|
+
"xlm-roberta" => ["XLMRobertaForSequenceClassification", XLMRobertaForSequenceClassification]
|
242
271
|
}
|
243
272
|
|
244
273
|
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = {
|
data/lib/informers/pipelines.rb
CHANGED
@@ -249,7 +249,8 @@ module Informers
|
|
249
249
|
pooling: "none",
|
250
250
|
normalize: false,
|
251
251
|
quantize: false,
|
252
|
-
precision: "binary"
|
252
|
+
precision: "binary",
|
253
|
+
model_output: nil
|
253
254
|
)
|
254
255
|
# Run tokenization
|
255
256
|
model_inputs = @tokenizer.(texts,
|
@@ -258,8 +259,10 @@ module Informers
|
|
258
259
|
)
|
259
260
|
model_options = {}
|
260
261
|
|
261
|
-
|
262
|
-
|
262
|
+
if !model_output.nil?
|
263
|
+
model_options[:output_names] = Array(model_output)
|
264
|
+
elsif @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
|
265
|
+
# optimization for sentence-transformers/all-MiniLM-L6-v2
|
263
266
|
model_options[:output_names] = ["sentence_embedding"]
|
264
267
|
pooling = "none"
|
265
268
|
normalize = false
|
@@ -271,7 +274,9 @@ module Informers
|
|
271
274
|
# TODO improve
|
272
275
|
result =
|
273
276
|
if outputs.is_a?(Array)
|
274
|
-
|
277
|
+
# TODO show returned instead of all
|
278
|
+
output_names = @model.instance_variable_get(:@session).outputs.map { |v| v[:name] }
|
279
|
+
raise Error, "unexpected outputs: #{output_names}" if outputs.size != 1
|
275
280
|
outputs[0]
|
276
281
|
else
|
277
282
|
outputs.logits
|
@@ -285,6 +290,7 @@ module Informers
|
|
285
290
|
when "cls"
|
286
291
|
result = result.map(&:first)
|
287
292
|
else
|
293
|
+
# TODO raise ArgumentError in 2.0
|
288
294
|
raise Error, "Pooling method '#{pooling}' not supported."
|
289
295
|
end
|
290
296
|
|
@@ -304,9 +310,10 @@ module Informers
|
|
304
310
|
def call(
|
305
311
|
texts,
|
306
312
|
pooling: "mean",
|
307
|
-
normalize: true
|
313
|
+
normalize: true,
|
314
|
+
model_output: nil
|
308
315
|
)
|
309
|
-
super(texts, pooling:, normalize:)
|
316
|
+
super(texts, pooling:, normalize:, model_output:)
|
310
317
|
end
|
311
318
|
end
|
312
319
|
|
data/lib/informers/tokenizers.rb
CHANGED
@@ -91,11 +91,23 @@ module Informers
|
|
91
91
|
class DistilBertTokenizer < PreTrainedTokenizer
|
92
92
|
end
|
93
93
|
|
94
|
+
class RobertaTokenizer < PreTrainedTokenizer
|
95
|
+
end
|
96
|
+
|
97
|
+
class XLMRobertaTokenizer < PreTrainedTokenizer
|
98
|
+
end
|
99
|
+
|
100
|
+
class MPNetTokenizer < PreTrainedTokenizer
|
101
|
+
end
|
102
|
+
|
94
103
|
class AutoTokenizer
|
95
104
|
TOKENIZER_CLASS_MAPPING = {
|
96
105
|
"BertTokenizer" => BertTokenizer,
|
97
106
|
"DebertaV2Tokenizer" => DebertaV2Tokenizer,
|
98
|
-
"DistilBertTokenizer" => DistilBertTokenizer
|
107
|
+
"DistilBertTokenizer" => DistilBertTokenizer,
|
108
|
+
"RobertaTokenizer" => RobertaTokenizer,
|
109
|
+
"XLMRobertaTokenizer" => XLMRobertaTokenizer,
|
110
|
+
"MPNetTokenizer" => MPNetTokenizer
|
99
111
|
}
|
100
112
|
|
101
113
|
def self.from_pretrained(
|
data/lib/informers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: informers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-
|
11
|
+
date: 2024-08-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: onnxruntime
|