informers 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +108 -10
- data/lib/informers/model.rb +4 -11
- data/lib/informers/models.rb +26 -6
- data/lib/informers/pipelines.rb +88 -20
- data/lib/informers/tokenizers.rb +6 -0
- data/lib/informers/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4ea317272c5054b01616643e7e0f0b2b2fe0c4a87fe8399350a6b8d0a279c5a1
+  data.tar.gz: 530f8aaab9a5ca71811a82adca0272e2ca84525bcf1f60f2209c394cbd0f9c2a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 76059b486e6f6c0b0054450f76813dd4bf12845da6f46e8089585cd1a69be7db86a0acf446cc5a18e48108393403324626f6656d09bdb69083f2651abc0d2448
+  data.tar.gz: f466f5382edd76a7092dc6ada349a3e58fe7eedcd481726ca765f8ddfb4543b7269dab96c00a93d10b0fd67f800afd70a619cfb15d78dde494b29cc13d21ef1a
```
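These are the digests RubyGems records for the two archives packed inside the `.gem` file. A minimal verification sketch, assuming `informers-1.0.2.gem` has already been fetched and unpacked (e.g. with `tar -xf informers-1.0.2.gem`) into the current directory:

```ruby
require "digest"

# metadata.gz and data.tar.gz are the archives inside the .gem;
# their digests should match the "+" lines above
%w[metadata.gz data.tar.gz].each do |file|
  puts "#{file} SHA256: #{Digest::SHA256.file(file).hexdigest}"
  puts "#{file} SHA512: #{Digest::SHA512.file(file).hexdigest}"
end
```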
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED

````diff
@@ -21,6 +21,20 @@ gem "informers"
 
 ## Models
 
+Embedding
+
+- [sentence-transformers/all-MiniLM-L6-v2](#sentence-transformersall-MiniLM-L6-v2)
+- [Xenova/multi-qa-MiniLM-L6-cos-v1](#xenovamulti-qa-MiniLM-L6-cos-v1)
+- [mixedbread-ai/mxbai-embed-large-v1](#mixedbread-aimxbai-embed-large-v1)
+- [Supabase/gte-small](#supabasegte-small)
+- [intfloat/e5-base-v2](#intfloate5-base-v2)
+- [nomic-ai/nomic-embed-text-v1](#nomic-ainomic-embed-text-v1)
+- [BAAI/bge-base-en-v1.5](#baaibge-base-en-v15)
+
+Reranking (experimental)
+
+- [mixedbread-ai/mxbai-rerank-base-v1](#mixedbread-aimxbai-rerank-base-v1)
+
 ### sentence-transformers/all-MiniLM-L6-v2
 
 [Docs](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
@@ -28,14 +42,14 @@ gem "informers"
 ```ruby
 sentences = ["This is an example sentence", "Each sentence is converted"]
 
-model = Informers::Model.new("sentence-transformers/all-MiniLM-L6-v2")
-embeddings = model.embed(sentences)
+model = Informers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
+embeddings = model.(sentences)
 ```
 
 For a quantized version, use:
 
 ```ruby
-model = Informers::Model.new("Xenova/all-MiniLM-L6-v2", quantized: true)
+model = Informers.pipeline("embedding", "Xenova/all-MiniLM-L6-v2", quantized: true)
 ```
 
 ### Xenova/multi-qa-MiniLM-L6-cos-v1
@@ -46,9 +60,9 @@ model = Informers::Model.new("Xenova/all-MiniLM-L6-v2", quantized: true)
 query = "How many people live in London?"
 docs = ["Around 9 Million people live in London", "London is known for its financial district"]
 
-model = Informers::Model.new("Xenova/multi-qa-MiniLM-L6-cos-v1")
-query_embedding = model.embed(query)
-doc_embeddings = model.embed(docs)
+model = Informers.pipeline("embedding", "Xenova/multi-qa-MiniLM-L6-cos-v1")
+query_embedding = model.(query)
+doc_embeddings = model.(docs)
 scores = doc_embeddings.map { |e| e.zip(query_embedding).sum { |d, q| d * q } }
 doc_score_pairs = docs.zip(scores).sort_by { |d, s| -s }
 ```
@@ -68,8 +82,8 @@ docs = [
   "The cat is purring"
 ]
 
-model = Informers::Model.new("mixedbread-ai/mxbai-embed-large-v1")
-embeddings = model.embed(docs)
+model = Informers.pipeline("embedding", "mixedbread-ai/mxbai-embed-large-v1")
+embeddings = model.(docs)
 ```
 
 ### Supabase/gte-small
@@ -79,12 +93,96 @@ embeddings = model.embed(docs)
 ```ruby
 sentences = ["That is a happy person", "That is a very happy person"]
 
-model = Informers::Model.new("Supabase/gte-small")
-embeddings = model.embed(sentences)
+model = Informers.pipeline("embedding", "Supabase/gte-small")
+embeddings = model.(sentences)
+```
+
+### intfloat/e5-base-v2
+
+[Docs](https://huggingface.co/intfloat/e5-base-v2)
+
+```ruby
+input = [
+  "passage: Ruby is a programming language created by Matz",
+  "query: Ruby creator"
+]
+
+model = Informers.pipeline("embedding", "intfloat/e5-base-v2")
+embeddings = model.(input)
+```
+
+### nomic-ai/nomic-embed-text-v1
+
+[Docs](https://huggingface.co/nomic-ai/nomic-embed-text-v1)
+
+```ruby
+input = [
+  "search_document: The dog is barking",
+  "search_query: puppy"
+]
+
+model = Informers.pipeline("embedding", "nomic-ai/nomic-embed-text-v1")
+embeddings = model.(input)
+```
+
+### BAAI/bge-base-en-v1.5
+
+[Docs](https://huggingface.co/BAAI/bge-base-en-v1.5)
+
+```ruby
+def transform_query(query)
+  "Represent this sentence for searching relevant passages: #{query}"
+end
+
+input = [
+  transform_query("puppy"),
+  "The dog is barking",
+  "The cat is purring"
+]
+
+model = Informers.pipeline("embedding", "BAAI/bge-base-en-v1.5")
+embeddings = model.(input)
+```
+
+### mixedbread-ai/mxbai-rerank-base-v1
+
+[Docs](https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1)
+
+```ruby
+query = "How many people live in London?"
+docs = ["Around 9 Million people live in London", "London is known for its financial district"]
+
+model = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-base-v1")
+result = model.(query, docs)
 ```
 
+### Other
+
+You can use the feature extraction pipeline directly.
+
+```ruby
+model = Informers.pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2", quantized: false)
+embeddings = model.(sentences, pooling: "mean", normalize: true)
+```
+
+The model files must include `onnx/model.onnx` or `onnx/model_quantized.onnx` ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)).
+
 ## Pipelines
 
+Embedding
+
+```ruby
+embed = Informers.pipeline("embedding")
+embed.("We are very happy to show you the 🤗 Transformers library.")
+```
+
+Reranking (experimental)
+
+```ruby
+rerank = Informers.pipeline("reranking")
+rerank.("Who created Ruby?", ["Matz created Ruby", "Another doc"])
+```
+
 Named-entity recognition
 
 ```ruby
````
data/lib/informers/model.rb
CHANGED

```diff
@@ -2,12 +2,7 @@ module Informers
   class Model
     def initialize(model_id, quantized: false)
       @model_id = model_id
-      @model = Informers.pipeline("feature-extraction", model_id, quantized: quantized)
-
-      # TODO better pattern
-      if model_id == "sentence-transformers/all-MiniLM-L6-v2"
-        @model.instance_variable_get(:@model).instance_variable_set(:@output_names, ["sentence_embedding"])
-      end
+      @model = Informers.pipeline("embedding", model_id, quantized: quantized)
     end
 
     def embed(texts)
@@ -15,14 +10,12 @@ module Informers
       texts = [texts] unless is_batched
 
       case @model_id
-      when "sentence-transformers/all-MiniLM-L6-v2"
+      when "sentence-transformers/all-MiniLM-L6-v2", "Xenova/all-MiniLM-L6-v2", "Xenova/multi-qa-MiniLM-L6-cos-v1", "Supabase/gte-small"
         output = @model.(texts)
-      when "Xenova/all-MiniLM-L6-v2", "Xenova/multi-qa-MiniLM-L6-cos-v1", "Supabase/gte-small"
-        output = @model.(texts, pooling: "mean", normalize: true)
       when "mixedbread-ai/mxbai-embed-large-v1"
-        output = @model.(texts, pooling: "cls")
+        output = @model.(texts, pooling: "cls", normalize: false)
       else
-        raise Error, "
+        raise Error, "Use the embedding pipeline for this model: #{@model_id}"
       end
 
       is_batched ? output : output[0]
```
data/lib/informers/models.rb
CHANGED

```diff
@@ -141,13 +141,13 @@ module Informers
       OnnxRuntime::InferenceSession.new(path)
     end
 
-    def call(model_inputs)
-      @forward.(model_inputs)
+    def call(model_inputs, **kwargs)
+      @forward.(model_inputs, **kwargs)
     end
 
     private
 
-    def encoder_forward(model_inputs)
+    def encoder_forward(model_inputs, output_names: nil)
       encoder_feeds = {}
       @session.inputs.each do |input|
         key = input[:name].to_sym
@@ -156,13 +156,13 @@ module Informers
       if @session.inputs.any? { |v| v[:name] == "token_type_ids" } && !encoder_feeds[:token_type_ids]
         raise Todo
       end
-      session_run(@session, encoder_feeds)
+      session_run(@session, encoder_feeds, output_names:)
     end
 
-    def session_run(session, inputs)
+    def session_run(session, inputs, output_names:)
       checked_inputs = validate_inputs(session, inputs)
       begin
-        output = session.run(@output_names, checked_inputs)
+        output = session.run(output_names || @output_names, checked_inputs)
         output = replace_tensors(output)
         output
       rescue => e
@@ -199,6 +199,18 @@ module Informers
     end
   end
 
+  class NomicBertPreTrainedModel < PreTrainedModel
+  end
+
+  class NomicBertModel < NomicBertPreTrainedModel
+  end
+
+  class DebertaV2PreTrainedModel < PreTrainedModel
+  end
+
+  class DebertaV2Model < DebertaV2PreTrainedModel
+  end
+
   class DistilBertPreTrainedModel < PreTrainedModel
   end
 
@@ -217,6 +229,13 @@ module Informers
     end
   end
 
+  MODEL_MAPPING_NAMES_ENCODER_ONLY = {
+    "bert" => ["BertModel", BertModel],
+    "nomic_bert" => ["NomicBertModel", NomicBertModel],
+    "deberta-v2" => ["DebertaV2Model", DebertaV2Model],
+    "distilbert" => ["DistilBertModel", DistilBertModel]
+  }
+
   MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = {
     "bert" => ["BertForSequenceClassification", BertForSequenceClassification],
     "distilbert" => ["DistilBertForSequenceClassification", DistilBertForSequenceClassification]
@@ -231,6 +250,7 @@ module Informers
   }
 
   MODEL_CLASS_TYPE_MAPPING = [
+    [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]]
```
data/lib/informers/pipelines.rb
CHANGED

```diff
@@ -10,10 +10,6 @@ module Informers
   end
 
   class TextClassificationPipeline < Pipeline
-    def initialize(**options)
-      super(**options)
-    end
-
     def call(texts, top_k: 1)
       # Run tokenization
       model_inputs = @tokenizer.(texts,
@@ -56,10 +52,6 @@ module Informers
   end
 
   class TokenClassificationPipeline < Pipeline
-    def initialize(**options)
-      super(**options)
-    end
-
     def call(
       texts,
       ignore_labels: ["O"],
@@ -200,10 +192,6 @@ module Informers
   end
 
   class QuestionAnsweringPipeline < Pipeline
-    def initialize(**options)
-      super(**options)
-    end
-
     def call(question, context, top_k: 1)
       # Run tokenization
       inputs = @tokenizer.(question,
@@ -256,10 +244,6 @@ module Informers
   end
 
   class FeatureExtractionPipeline < Pipeline
-    def initialize(**options)
-      super(**options)
-    end
-
     def call(
       texts,
       pooling: "none",
@@ -272,12 +256,27 @@ module Informers
         padding: true,
         truncation: true
       )
+      model_options = {}
+
+      # optimization for sentence-transformers/all-MiniLM-L6-v2
+      if @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
+        model_options[:output_names] = ["sentence_embedding"]
+        pooling = "none"
+        normalize = false
+      end
 
       # Run model
-      outputs = @model.(model_inputs)
+      outputs = @model.(model_inputs, **model_options)
+
+      # TODO improve
+      result =
+        if outputs.is_a?(Array)
+          raise Error, "unexpected outputs" if outputs.size != 1
+          outputs[0]
+        else
+          outputs.logits
+        end
 
-      # TODO check outputs.last_hidden_state
-      result = outputs.logits
       case pooling
       when "none"
         # Skip pooling
@@ -301,6 +300,46 @@ module Informers
     end
   end
 
+  class EmbeddingPipeline < FeatureExtractionPipeline
+    def call(
+      texts,
+      pooling: "mean",
+      normalize: true
+    )
+      super(texts, pooling:, normalize:)
+    end
+  end
+
+  class RerankingPipeline < Pipeline
+    def call(
+      query,
+      documents,
+      return_documents: false,
+      top_k: nil
+    )
+      model_inputs = @tokenizer.([query] * documents.size,
+        text_pair: documents,
+        padding: true,
+        truncation: true
+      )
+
+      outputs = @model.(model_inputs)
+
+      result =
+        Utils.sigmoid(outputs[0].map(&:first))
+          .map.with_index { |s, i| {doc_id: i, score: s} }
+          .sort_by { |v| -v[:score] }
+
+      if return_documents
+        result.each do |v|
+          v[:text] = documents[v[:doc_id]]
+        end
+      end
+
+      top_k ? result.first(top_k) : result
+    end
+  end
+
   SUPPORTED_TASKS = {
     "text-classification" => {
       tokenizer: AutoTokenizer,
@@ -337,6 +376,24 @@ module Informers
         model: "Xenova/all-MiniLM-L6-v2"
       },
       type: "text"
+    },
+    "embedding" => {
+      tokenizer: AutoTokenizer,
+      pipeline: EmbeddingPipeline,
+      model: AutoModel,
+      default: {
+        model: "sentence-transformers/all-MiniLM-L6-v2"
+      },
+      type: "text"
+    },
+    "reranking" => {
+      tokenizer: AutoTokenizer,
+      pipeline: RerankingPipeline,
+      model: AutoModel,
+      default: {
+        model: "mixedbread-ai/mxbai-rerank-base-v1"
+      },
+      type: "text"
     }
   }
 
@@ -361,11 +418,13 @@ module Informers
     end
   end
 
+  NO_DEFAULT = Object.new
+
   class << self
     def pipeline(
       task,
       model = nil,
-      quantized: true,
+      quantized: NO_DEFAULT,
       progress_callback: DEFAULT_PROGRESS_CALLBACK,
       config: nil,
       cache_dir: nil,
@@ -373,6 +432,11 @@ module Informers
       revision: "main",
       model_file_name: nil
     )
+      if quantized == NO_DEFAULT
+        # TODO move default to task class
+        quantized = !["embedding", "reranking"].include?(task)
+      end
+
       # Apply aliases
       task = TASK_ALIASES[task] || task
 
@@ -408,6 +472,10 @@ module Informers
       results = load_items(classes, model, pretrained_options)
       results[:task] = task
 
+      if model == "sentence-transformers/all-MiniLM-L6-v2"
+        results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
+      end
+
       Utils.dispatch_callback(progress_callback, {
         status: "ready",
         task: task,
```
data/lib/informers/tokenizers.rb
CHANGED

```diff
@@ -83,12 +83,18 @@ module Informers
     # self.return_token_type_ids = true
   end
 
+  class DebertaV2Tokenizer < PreTrainedTokenizer
+    # TODO
+    # self.return_token_type_ids = true
+  end
+
   class DistilBertTokenizer < PreTrainedTokenizer
   end
 
   class AutoTokenizer
     TOKENIZER_CLASS_MAPPING = {
       "BertTokenizer" => BertTokenizer,
+      "DebertaV2Tokenizer" => DebertaV2Tokenizer,
       "DistilBertTokenizer" => DistilBertTokenizer
     }
 
```
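The `DebertaV2Tokenizer` registration is what lets `AutoTokenizer` resolve tokenizers for DeBERTa-v2-family checkpoints, which appears to be what the default reranking model needs (mxbai-rerank-base-v1 is DeBERTa-v3-based, and DeBERTa-v3 reuses the v2 tokenizer class; this pairs with the `DebertaV2Model` registration in models.rb above). The mapping itself is a plain hash lookup:

```ruby
Informers::AutoTokenizer::TOKENIZER_CLASS_MAPPING["DebertaV2Tokenizer"]
# => Informers::DebertaV2Tokenizer
```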
data/lib/informers/version.rb
CHANGED