informers 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +108 -10
- data/lib/informers/model.rb +4 -11
- data/lib/informers/models.rb +26 -6
- data/lib/informers/pipelines.rb +88 -20
- data/lib/informers/tokenizers.rb +6 -0
- data/lib/informers/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4ea317272c5054b01616643e7e0f0b2b2fe0c4a87fe8399350a6b8d0a279c5a1
|
4
|
+
data.tar.gz: 530f8aaab9a5ca71811a82adca0272e2ca84525bcf1f60f2209c394cbd0f9c2a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 76059b486e6f6c0b0054450f76813dd4bf12845da6f46e8089585cd1a69be7db86a0acf446cc5a18e48108393403324626f6656d09bdb69083f2651abc0d2448
|
7
|
+
data.tar.gz: f466f5382edd76a7092dc6ada349a3e58fe7eedcd481726ca765f8ddfb4543b7269dab96c00a93d10b0fd67f800afd70a619cfb15d78dde494b29cc13d21ef1a
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -21,6 +21,20 @@ gem "informers"
|
|
21
21
|
|
22
22
|
## Models
|
23
23
|
|
24
|
+
Embedding
|
25
|
+
|
26
|
+
- [sentence-transformers/all-MiniLM-L6-v2](#sentence-transformersall-MiniLM-L6-v2)
|
27
|
+
- [Xenova/multi-qa-MiniLM-L6-cos-v1](#xenovamulti-qa-MiniLM-L6-cos-v1)
|
28
|
+
- [mixedbread-ai/mxbai-embed-large-v1](#mixedbread-aimxbai-embed-large-v1)
|
29
|
+
- [Supabase/gte-small](#supabasegte-small)
|
30
|
+
- [intfloat/e5-base-v2](#intfloate5-base-v2)
|
31
|
+
- [nomic-ai/nomic-embed-text-v1](#nomic-ainomic-embed-text-v1)
|
32
|
+
- [BAAI/bge-base-en-v1.5](#baaibge-base-en-v15)
|
33
|
+
|
34
|
+
Reranking (experimental)
|
35
|
+
|
36
|
+
- [mixedbread-ai/mxbai-rerank-base-v1](#mixedbread-aimxbai-rerank-base-v1)
|
37
|
+
|
24
38
|
### sentence-transformers/all-MiniLM-L6-v2
|
25
39
|
|
26
40
|
[Docs](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
|
@@ -28,14 +42,14 @@ gem "informers"
|
|
28
42
|
```ruby
|
29
43
|
sentences = ["This is an example sentence", "Each sentence is converted"]
|
30
44
|
|
31
|
-
model = Informers::Model.new("sentence-transformers/all-MiniLM-L6-v2")
|
32
|
-
embeddings = model.embed(sentences)
|
45
|
+
model = Informers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
|
46
|
+
embeddings = model.(sentences)
|
33
47
|
```
|
34
48
|
|
35
49
|
For a quantized version, use:
|
36
50
|
|
37
51
|
```ruby
|
38
|
-
model = Informers::Model.new("Xenova/all-MiniLM-L6-v2", quantized: true)
|
52
|
+
model = Informers.pipeline("embedding", "Xenova/all-MiniLM-L6-v2", quantized: true)
|
39
53
|
```
|
40
54
|
|
41
55
|
### Xenova/multi-qa-MiniLM-L6-cos-v1
|
@@ -46,9 +60,9 @@ model = Informers::Model.new("Xenova/all-MiniLM-L6-v2", quantized: true)
|
|
46
60
|
query = "How many people live in London?"
|
47
61
|
docs = ["Around 9 Million people live in London", "London is known for its financial district"]
|
48
62
|
|
49
|
-
model = Informers::Model.new("Xenova/multi-qa-MiniLM-L6-cos-v1")
|
50
|
-
query_embedding = model.embed(query)
|
51
|
-
doc_embeddings = model.embed(docs)
|
63
|
+
model = Informers.pipeline("embedding", "Xenova/multi-qa-MiniLM-L6-cos-v1")
|
64
|
+
query_embedding = model.(query)
|
65
|
+
doc_embeddings = model.(docs)
|
52
66
|
scores = doc_embeddings.map { |e| e.zip(query_embedding).sum { |d, q| d * q } }
|
53
67
|
doc_score_pairs = docs.zip(scores).sort_by { |d, s| -s }
|
54
68
|
```
|
@@ -68,8 +82,8 @@ docs = [
|
|
68
82
|
"The cat is purring"
|
69
83
|
]
|
70
84
|
|
71
|
-
model = Informers::Model.new("mixedbread-ai/mxbai-embed-large-v1")
|
72
|
-
embeddings = model.embed(docs)
|
85
|
+
model = Informers.pipeline("embedding", "mixedbread-ai/mxbai-embed-large-v1")
|
86
|
+
embeddings = model.(docs)
|
73
87
|
```
|
74
88
|
|
75
89
|
### Supabase/gte-small
|
@@ -79,12 +93,96 @@ embeddings = model.embed(docs)
|
|
79
93
|
```ruby
|
80
94
|
sentences = ["That is a happy person", "That is a very happy person"]
|
81
95
|
|
82
|
-
model = Informers::Model.new("Supabase/gte-small")
|
83
|
-
embeddings = model.embed(sentences)
|
96
|
+
model = Informers.pipeline("embedding", "Supabase/gte-small")
|
97
|
+
embeddings = model.(sentences)
|
98
|
+
```
|
99
|
+
|
100
|
+
### intfloat/e5-base-v2
|
101
|
+
|
102
|
+
[Docs](https://huggingface.co/intfloat/e5-base-v2)
|
103
|
+
|
104
|
+
```ruby
|
105
|
+
input = [
|
106
|
+
"passage: Ruby is a programming language created by Matz",
|
107
|
+
"query: Ruby creator"
|
108
|
+
]
|
109
|
+
|
110
|
+
model = Informers.pipeline("embedding", "intfloat/e5-base-v2")
|
111
|
+
embeddings = model.(input)
|
112
|
+
```
|
113
|
+
|
114
|
+
### nomic-ai/nomic-embed-text-v1
|
115
|
+
|
116
|
+
[Docs](https://huggingface.co/nomic-ai/nomic-embed-text-v1)
|
117
|
+
|
118
|
+
```ruby
|
119
|
+
input = [
|
120
|
+
"search_document: The dog is barking",
|
121
|
+
"search_query: puppy"
|
122
|
+
]
|
123
|
+
|
124
|
+
model = Informers.pipeline("embedding", "nomic-ai/nomic-embed-text-v1")
|
125
|
+
embeddings = model.(input)
|
126
|
+
```
|
127
|
+
|
128
|
+
### BAAI/bge-base-en-v1.5
|
129
|
+
|
130
|
+
[Docs](https://huggingface.co/BAAI/bge-base-en-v1.5)
|
131
|
+
|
132
|
+
```ruby
|
133
|
+
def transform_query(query)
|
134
|
+
"Represent this sentence for searching relevant passages: #{query}"
|
135
|
+
end
|
136
|
+
|
137
|
+
input = [
|
138
|
+
transform_query("puppy"),
|
139
|
+
"The dog is barking",
|
140
|
+
"The cat is purring"
|
141
|
+
]
|
142
|
+
|
143
|
+
model = Informers.pipeline("embedding", "BAAI/bge-base-en-v1.5")
|
144
|
+
embeddings = model.(input)
|
145
|
+
```
|
146
|
+
|
147
|
+
### mixedbread-ai/mxbai-rerank-base-v1
|
148
|
+
|
149
|
+
[Docs](https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1)
|
150
|
+
|
151
|
+
```ruby
|
152
|
+
query = "How many people live in London?"
|
153
|
+
docs = ["Around 9 Million people live in London", "London is known for its financial district"]
|
154
|
+
|
155
|
+
model = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-base-v1")
|
156
|
+
result = model.(query, docs)
|
84
157
|
```
|
85
158
|
|
159
|
+
### Other
|
160
|
+
|
161
|
+
You can use the feature extraction pipeline directly.
|
162
|
+
|
163
|
+
```ruby
|
164
|
+
model = Informers.pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2", quantized: false)
|
165
|
+
embeddings = model.(sentences, pooling: "mean", normalize: true)
|
166
|
+
```
|
167
|
+
|
168
|
+
The model files must include `onnx/model.onnx` or `onnx/model_quantized.onnx` ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)).
|
169
|
+
|
86
170
|
## Pipelines
|
87
171
|
|
172
|
+
Embedding
|
173
|
+
|
174
|
+
```ruby
|
175
|
+
embed = Informers.pipeline("embedding")
|
176
|
+
embed.("We are very happy to show you the 🤗 Transformers library.")
|
177
|
+
```
|
178
|
+
|
179
|
+
Reranking (experimental)
|
180
|
+
|
181
|
+
```ruby
|
182
|
+
rerank = Informers.pipeline("reranking")
|
183
|
+
rerank.("Who created Ruby?", ["Matz created Ruby", "Another doc"])
|
184
|
+
```
|
185
|
+
|
88
186
|
Named-entity recognition
|
89
187
|
|
90
188
|
```ruby
|
data/lib/informers/model.rb
CHANGED
@@ -2,12 +2,7 @@ module Informers
|
|
2
2
|
class Model
|
3
3
|
def initialize(model_id, quantized: false)
|
4
4
|
@model_id = model_id
|
5
|
-
@model = Informers.pipeline("feature-extraction", model_id, quantized: quantized)
|
6
|
-
|
7
|
-
# TODO better pattern
|
8
|
-
if model_id == "sentence-transformers/all-MiniLM-L6-v2"
|
9
|
-
@model.instance_variable_get(:@model).instance_variable_set(:@output_names, ["sentence_embedding"])
|
10
|
-
end
|
5
|
+
@model = Informers.pipeline("embedding", model_id, quantized: quantized)
|
11
6
|
end
|
12
7
|
|
13
8
|
def embed(texts)
|
@@ -15,14 +10,12 @@ module Informers
|
|
15
10
|
texts = [texts] unless is_batched
|
16
11
|
|
17
12
|
case @model_id
|
18
|
-
when "sentence-transformers/all-MiniLM-L6-v2"
|
13
|
+
when "sentence-transformers/all-MiniLM-L6-v2", "Xenova/all-MiniLM-L6-v2", "Xenova/multi-qa-MiniLM-L6-cos-v1", "Supabase/gte-small"
|
19
14
|
output = @model.(texts)
|
20
|
-
when "Xenova/all-MiniLM-L6-v2", "Xenova/multi-qa-MiniLM-L6-cos-v1", "Supabase/gte-small"
|
21
|
-
output = @model.(texts, pooling: "mean", normalize: true)
|
22
15
|
when "mixedbread-ai/mxbai-embed-large-v1"
|
23
|
-
output = @model.(texts, pooling: "cls")
|
16
|
+
output = @model.(texts, pooling: "cls", normalize: false)
|
24
17
|
else
|
25
|
-
raise Error, "
|
18
|
+
raise Error, "Use the embedding pipeline for this model: #{@model_id}"
|
26
19
|
end
|
27
20
|
|
28
21
|
is_batched ? output : output[0]
|
data/lib/informers/models.rb
CHANGED
@@ -141,13 +141,13 @@ module Informers
|
|
141
141
|
OnnxRuntime::InferenceSession.new(path)
|
142
142
|
end
|
143
143
|
|
144
|
-
def call(model_inputs)
|
145
|
-
@forward.(model_inputs)
|
144
|
+
def call(model_inputs, **kwargs)
|
145
|
+
@forward.(model_inputs, **kwargs)
|
146
146
|
end
|
147
147
|
|
148
148
|
private
|
149
149
|
|
150
|
-
def encoder_forward(model_inputs)
|
150
|
+
def encoder_forward(model_inputs, output_names: nil)
|
151
151
|
encoder_feeds = {}
|
152
152
|
@session.inputs.each do |input|
|
153
153
|
key = input[:name].to_sym
|
@@ -156,13 +156,13 @@ module Informers
|
|
156
156
|
if @session.inputs.any? { |v| v[:name] == "token_type_ids" } && !encoder_feeds[:token_type_ids]
|
157
157
|
raise Todo
|
158
158
|
end
|
159
|
-
session_run(@session, encoder_feeds)
|
159
|
+
session_run(@session, encoder_feeds, output_names:)
|
160
160
|
end
|
161
161
|
|
162
|
-
def session_run(session, inputs)
|
162
|
+
def session_run(session, inputs, output_names:)
|
163
163
|
checked_inputs = validate_inputs(session, inputs)
|
164
164
|
begin
|
165
|
-
output = session.run(@output_names, checked_inputs)
|
165
|
+
output = session.run(output_names || @output_names, checked_inputs)
|
166
166
|
output = replace_tensors(output)
|
167
167
|
output
|
168
168
|
rescue => e
|
@@ -199,6 +199,18 @@ module Informers
|
|
199
199
|
end
|
200
200
|
end
|
201
201
|
|
202
|
+
class NomicBertPreTrainedModel < PreTrainedModel
|
203
|
+
end
|
204
|
+
|
205
|
+
class NomicBertModel < NomicBertPreTrainedModel
|
206
|
+
end
|
207
|
+
|
208
|
+
class DebertaV2PreTrainedModel < PreTrainedModel
|
209
|
+
end
|
210
|
+
|
211
|
+
class DebertaV2Model < DebertaV2PreTrainedModel
|
212
|
+
end
|
213
|
+
|
202
214
|
class DistilBertPreTrainedModel < PreTrainedModel
|
203
215
|
end
|
204
216
|
|
@@ -217,6 +229,13 @@ module Informers
|
|
217
229
|
end
|
218
230
|
end
|
219
231
|
|
232
|
+
MODEL_MAPPING_NAMES_ENCODER_ONLY = {
|
233
|
+
"bert" => ["BertModel", BertModel],
|
234
|
+
"nomic_bert" => ["NomicBertModel", NomicBertModel],
|
235
|
+
"deberta-v2" => ["DebertaV2Model", DebertaV2Model],
|
236
|
+
"distilbert" => ["DistilBertModel", DistilBertModel]
|
237
|
+
}
|
238
|
+
|
220
239
|
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = {
|
221
240
|
"bert" => ["BertForSequenceClassification", BertForSequenceClassification],
|
222
241
|
"distilbert" => ["DistilBertForSequenceClassification", DistilBertForSequenceClassification]
|
@@ -231,6 +250,7 @@ module Informers
|
|
231
250
|
}
|
232
251
|
|
233
252
|
MODEL_CLASS_TYPE_MAPPING = [
|
253
|
+
[MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES[:EncoderOnly]],
|
234
254
|
[MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
|
235
255
|
[MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
|
236
256
|
[MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]]
|
data/lib/informers/pipelines.rb
CHANGED
@@ -10,10 +10,6 @@ module Informers
|
|
10
10
|
end
|
11
11
|
|
12
12
|
class TextClassificationPipeline < Pipeline
|
13
|
-
def initialize(**options)
|
14
|
-
super(**options)
|
15
|
-
end
|
16
|
-
|
17
13
|
def call(texts, top_k: 1)
|
18
14
|
# Run tokenization
|
19
15
|
model_inputs = @tokenizer.(texts,
|
@@ -56,10 +52,6 @@ module Informers
|
|
56
52
|
end
|
57
53
|
|
58
54
|
class TokenClassificationPipeline < Pipeline
|
59
|
-
def initialize(**options)
|
60
|
-
super(**options)
|
61
|
-
end
|
62
|
-
|
63
55
|
def call(
|
64
56
|
texts,
|
65
57
|
ignore_labels: ["O"],
|
@@ -200,10 +192,6 @@ module Informers
|
|
200
192
|
end
|
201
193
|
|
202
194
|
class QuestionAnsweringPipeline < Pipeline
|
203
|
-
def initialize(**options)
|
204
|
-
super(**options)
|
205
|
-
end
|
206
|
-
|
207
195
|
def call(question, context, top_k: 1)
|
208
196
|
# Run tokenization
|
209
197
|
inputs = @tokenizer.(question,
|
@@ -256,10 +244,6 @@ module Informers
|
|
256
244
|
end
|
257
245
|
|
258
246
|
class FeatureExtractionPipeline < Pipeline
|
259
|
-
def initialize(**options)
|
260
|
-
super(**options)
|
261
|
-
end
|
262
|
-
|
263
247
|
def call(
|
264
248
|
texts,
|
265
249
|
pooling: "none",
|
@@ -272,12 +256,27 @@ module Informers
|
|
272
256
|
padding: true,
|
273
257
|
truncation: true
|
274
258
|
)
|
259
|
+
model_options = {}
|
260
|
+
|
261
|
+
# optimization for sentence-transformers/all-MiniLM-L6-v2
|
262
|
+
if @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
|
263
|
+
model_options[:output_names] = ["sentence_embedding"]
|
264
|
+
pooling = "none"
|
265
|
+
normalize = false
|
266
|
+
end
|
275
267
|
|
276
268
|
# Run model
|
277
|
-
outputs = @model.(model_inputs)
|
269
|
+
outputs = @model.(model_inputs, **model_options)
|
270
|
+
|
271
|
+
# TODO improve
|
272
|
+
result =
|
273
|
+
if outputs.is_a?(Array)
|
274
|
+
raise Error, "unexpected outputs" if outputs.size != 1
|
275
|
+
outputs[0]
|
276
|
+
else
|
277
|
+
outputs.logits
|
278
|
+
end
|
278
279
|
|
279
|
-
# TODO check outputs.last_hidden_state
|
280
|
-
result = outputs.logits
|
281
280
|
case pooling
|
282
281
|
when "none"
|
283
282
|
# Skip pooling
|
@@ -301,6 +300,46 @@ module Informers
|
|
301
300
|
end
|
302
301
|
end
|
303
302
|
|
303
|
+
class EmbeddingPipeline < FeatureExtractionPipeline
|
304
|
+
def call(
|
305
|
+
texts,
|
306
|
+
pooling: "mean",
|
307
|
+
normalize: true
|
308
|
+
)
|
309
|
+
super(texts, pooling:, normalize:)
|
310
|
+
end
|
311
|
+
end
|
312
|
+
|
313
|
+
class RerankingPipeline < Pipeline
|
314
|
+
def call(
|
315
|
+
query,
|
316
|
+
documents,
|
317
|
+
return_documents: false,
|
318
|
+
top_k: nil
|
319
|
+
)
|
320
|
+
model_inputs = @tokenizer.([query] * documents.size,
|
321
|
+
text_pair: documents,
|
322
|
+
padding: true,
|
323
|
+
truncation: true
|
324
|
+
)
|
325
|
+
|
326
|
+
outputs = @model.(model_inputs)
|
327
|
+
|
328
|
+
result =
|
329
|
+
Utils.sigmoid(outputs[0].map(&:first))
|
330
|
+
.map.with_index { |s, i| {doc_id: i, score: s} }
|
331
|
+
.sort_by { |v| -v[:score] }
|
332
|
+
|
333
|
+
if return_documents
|
334
|
+
result.each do |v|
|
335
|
+
v[:text] = documents[v[:doc_id]]
|
336
|
+
end
|
337
|
+
end
|
338
|
+
|
339
|
+
top_k ? result.first(top_k) : result
|
340
|
+
end
|
341
|
+
end
|
342
|
+
|
304
343
|
SUPPORTED_TASKS = {
|
305
344
|
"text-classification" => {
|
306
345
|
tokenizer: AutoTokenizer,
|
@@ -337,6 +376,24 @@ module Informers
|
|
337
376
|
model: "Xenova/all-MiniLM-L6-v2"
|
338
377
|
},
|
339
378
|
type: "text"
|
379
|
+
},
|
380
|
+
"embedding" => {
|
381
|
+
tokenizer: AutoTokenizer,
|
382
|
+
pipeline: EmbeddingPipeline,
|
383
|
+
model: AutoModel,
|
384
|
+
default: {
|
385
|
+
model: "sentence-transformers/all-MiniLM-L6-v2"
|
386
|
+
},
|
387
|
+
type: "text"
|
388
|
+
},
|
389
|
+
"reranking" => {
|
390
|
+
tokenizer: AutoTokenizer,
|
391
|
+
pipeline: RerankingPipeline,
|
392
|
+
model: AutoModel,
|
393
|
+
default: {
|
394
|
+
model: "mixedbread-ai/mxbai-rerank-base-v1"
|
395
|
+
},
|
396
|
+
type: "text"
|
340
397
|
}
|
341
398
|
}
|
342
399
|
|
@@ -361,11 +418,13 @@ module Informers
|
|
361
418
|
end
|
362
419
|
end
|
363
420
|
|
421
|
+
NO_DEFAULT = Object.new
|
422
|
+
|
364
423
|
class << self
|
365
424
|
def pipeline(
|
366
425
|
task,
|
367
426
|
model = nil,
|
368
|
-
quantized: true,
|
427
|
+
quantized: NO_DEFAULT,
|
369
428
|
progress_callback: DEFAULT_PROGRESS_CALLBACK,
|
370
429
|
config: nil,
|
371
430
|
cache_dir: nil,
|
@@ -373,6 +432,11 @@ module Informers
|
|
373
432
|
revision: "main",
|
374
433
|
model_file_name: nil
|
375
434
|
)
|
435
|
+
if quantized == NO_DEFAULT
|
436
|
+
# TODO move default to task class
|
437
|
+
quantized = !["embedding", "reranking"].include?(task)
|
438
|
+
end
|
439
|
+
|
376
440
|
# Apply aliases
|
377
441
|
task = TASK_ALIASES[task] || task
|
378
442
|
|
@@ -408,6 +472,10 @@ module Informers
|
|
408
472
|
results = load_items(classes, model, pretrained_options)
|
409
473
|
results[:task] = task
|
410
474
|
|
475
|
+
if model == "sentence-transformers/all-MiniLM-L6-v2"
|
476
|
+
results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
|
477
|
+
end
|
478
|
+
|
411
479
|
Utils.dispatch_callback(progress_callback, {
|
412
480
|
status: "ready",
|
413
481
|
task: task,
|
data/lib/informers/tokenizers.rb
CHANGED
@@ -83,12 +83,18 @@ module Informers
|
|
83
83
|
# self.return_token_type_ids = true
|
84
84
|
end
|
85
85
|
|
86
|
+
class DebertaV2Tokenizer < PreTrainedTokenizer
|
87
|
+
# TODO
|
88
|
+
# self.return_token_type_ids = true
|
89
|
+
end
|
90
|
+
|
86
91
|
class DistilBertTokenizer < PreTrainedTokenizer
|
87
92
|
end
|
88
93
|
|
89
94
|
class AutoTokenizer
|
90
95
|
TOKENIZER_CLASS_MAPPING = {
|
91
96
|
"BertTokenizer" => BertTokenizer,
|
97
|
+
"DebertaV2Tokenizer" => DebertaV2Tokenizer,
|
92
98
|
"DistilBertTokenizer" => DistilBertTokenizer
|
93
99
|
}
|
94
100
|
|
data/lib/informers/version.rb
CHANGED