informers 1.0.2 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +213 -19
- data/lib/informers/configs.rb +10 -8
- data/lib/informers/model.rb +2 -14
- data/lib/informers/models.rb +1027 -13
- data/lib/informers/pipelines.rb +781 -14
- data/lib/informers/processors.rb +796 -0
- data/lib/informers/tokenizers.rb +166 -4
- data/lib/informers/utils/core.rb +4 -0
- data/lib/informers/utils/generation.rb +294 -0
- data/lib/informers/utils/image.rb +116 -0
- data/lib/informers/utils/math.rb +73 -0
- data/lib/informers/utils/tensor.rb +46 -0
- data/lib/informers/version.rb +1 -1
- data/lib/informers.rb +3 -0
- metadata +8 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ab4f19adb4d6ca0289784cee6c6cb5235b73a5184abffbeaf44391768be1f0ac
|
4
|
+
data.tar.gz: '0880ce4dced5ce47ceaaa5fee8d10e6324b3fc0a23e05c3da3728414dcc273d9'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: eb3ee6d16e4e20eca6fae3fae8f97d78ba6bb655d48e2012640d64538785e2a9ff2afb10269cf01db928553438e8fbd08584774ba3f3d08bc25f36cbb971a99a
|
7
|
+
data.tar.gz: '0008441293f2605ec8599135d715093053e21f67f56ba59b730a3bc1f46f04f4a7fabb7fef039f156cd4183011c93b7fc9cab6ba731bf78627244bc4dedcf18d'
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -30,10 +30,15 @@ Embedding
|
|
30
30
|
- [intfloat/e5-base-v2](#intfloate5-base-v2)
|
31
31
|
- [nomic-ai/nomic-embed-text-v1](#nomic-ainomic-embed-text-v1)
|
32
32
|
- [BAAI/bge-base-en-v1.5](#baaibge-base-en-v15)
|
33
|
+
- [jinaai/jina-embeddings-v2-base-en](#jinaaijina-embeddings-v2-base-en)
|
34
|
+
- [Snowflake/snowflake-arctic-embed-m-v1.5](#snowflakesnowflake-arctic-embed-m-v15)
|
35
|
+
- [Xenova/all-mpnet-base-v2](#xenovaall-mpnet-base-v2)
|
33
36
|
|
34
|
-
Reranking
|
37
|
+
Reranking
|
35
38
|
|
36
39
|
- [mixedbread-ai/mxbai-rerank-base-v1](#mixedbread-aimxbai-rerank-base-v1)
|
40
|
+
- [jinaai/jina-reranker-v1-turbo-en](#jinaaijina-reranker-v1-turbo-en)
|
41
|
+
- [BAAI/bge-reranker-base](#baaibge-reranker-base)
|
37
42
|
|
38
43
|
### sentence-transformers/all-MiniLM-L6-v2
|
39
44
|
|
@@ -72,18 +77,16 @@ doc_score_pairs = docs.zip(scores).sort_by { |d, s| -s }
|
|
72
77
|
[Docs](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1)
|
73
78
|
|
74
79
|
```ruby
|
75
|
-
|
76
|
-
"Represent this sentence for searching relevant passages: #{query}"
|
77
|
-
end
|
80
|
+
query_prefix = "Represent this sentence for searching relevant passages: "
|
78
81
|
|
79
|
-
|
80
|
-
transform_query("puppy"),
|
82
|
+
input = [
|
81
83
|
"The dog is barking",
|
82
|
-
"The cat is purring"
|
84
|
+
"The cat is purring",
|
85
|
+
query_prefix + "puppy"
|
83
86
|
]
|
84
87
|
|
85
88
|
model = Informers.pipeline("embedding", "mixedbread-ai/mxbai-embed-large-v1")
|
86
|
-
embeddings = model.(
|
89
|
+
embeddings = model.(input)
|
87
90
|
```
|
88
91
|
|
89
92
|
### Supabase/gte-small
|
@@ -102,9 +105,12 @@ embeddings = model.(sentences)
|
|
102
105
|
[Docs](https://huggingface.co/intfloat/e5-base-v2)
|
103
106
|
|
104
107
|
```ruby
|
108
|
+
doc_prefix = "passage: "
|
109
|
+
query_prefix = "query: "
|
110
|
+
|
105
111
|
input = [
|
106
|
-
"
|
107
|
-
"
|
112
|
+
doc_prefix + "Ruby is a programming language created by Matz",
|
113
|
+
query_prefix + "Ruby creator"
|
108
114
|
]
|
109
115
|
|
110
116
|
model = Informers.pipeline("embedding", "intfloat/e5-base-v2")
|
@@ -116,9 +122,13 @@ embeddings = model.(input)
|
|
116
122
|
[Docs](https://huggingface.co/nomic-ai/nomic-embed-text-v1)
|
117
123
|
|
118
124
|
```ruby
|
125
|
+
doc_prefix = "search_document: "
|
126
|
+
query_prefix = "search_query: "
|
127
|
+
|
119
128
|
input = [
|
120
|
-
"
|
121
|
-
"
|
129
|
+
doc_prefix + "The dog is barking",
|
130
|
+
doc_prefix + "The cat is purring",
|
131
|
+
query_prefix + "puppy"
|
122
132
|
]
|
123
133
|
|
124
134
|
model = Informers.pipeline("embedding", "nomic-ai/nomic-embed-text-v1")
|
@@ -130,20 +140,57 @@ embeddings = model.(input)
|
|
130
140
|
[Docs](https://huggingface.co/BAAI/bge-base-en-v1.5)
|
131
141
|
|
132
142
|
```ruby
|
133
|
-
|
134
|
-
"Represent this sentence for searching relevant passages: #{query}"
|
135
|
-
end
|
143
|
+
query_prefix = "Represent this sentence for searching relevant passages: "
|
136
144
|
|
137
145
|
input = [
|
138
|
-
transform_query("puppy"),
|
139
146
|
"The dog is barking",
|
140
|
-
"The cat is purring"
|
147
|
+
"The cat is purring",
|
148
|
+
query_prefix + "puppy"
|
141
149
|
]
|
142
150
|
|
143
151
|
model = Informers.pipeline("embedding", "BAAI/bge-base-en-v1.5")
|
144
152
|
embeddings = model.(input)
|
145
153
|
```
|
146
154
|
|
155
|
+
### jinaai/jina-embeddings-v2-base-en
|
156
|
+
|
157
|
+
[Docs](https://huggingface.co/jinaai/jina-embeddings-v2-base-en)
|
158
|
+
|
159
|
+
```ruby
|
160
|
+
sentences = ["How is the weather today?", "What is the current weather like today?"]
|
161
|
+
|
162
|
+
model = Informers.pipeline("embedding", "jinaai/jina-embeddings-v2-base-en", model_file_name: "../model")
|
163
|
+
embeddings = model.(sentences)
|
164
|
+
```
|
165
|
+
|
166
|
+
### Snowflake/snowflake-arctic-embed-m-v1.5
|
167
|
+
|
168
|
+
[Docs](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5)
|
169
|
+
|
170
|
+
```ruby
|
171
|
+
query_prefix = "Represent this sentence for searching relevant passages: "
|
172
|
+
|
173
|
+
input = [
|
174
|
+
"The dog is barking",
|
175
|
+
"The cat is purring",
|
176
|
+
query_prefix + "puppy"
|
177
|
+
]
|
178
|
+
|
179
|
+
model = Informers.pipeline("embedding", "Snowflake/snowflake-arctic-embed-m-v1.5")
|
180
|
+
embeddings = model.(input, model_output: "sentence_embedding", pooling: "none")
|
181
|
+
```
|
182
|
+
|
183
|
+
### Xenova/all-mpnet-base-v2
|
184
|
+
|
185
|
+
[Docs](https://huggingface.co/Xenova/all-mpnet-base-v2)
|
186
|
+
|
187
|
+
```ruby
|
188
|
+
sentences = ["This is an example sentence", "Each sentence is converted"]
|
189
|
+
|
190
|
+
model = Informers.pipeline("embedding", "Xenova/all-mpnet-base-v2")
|
191
|
+
embeddings = model.(sentences)
|
192
|
+
```
|
193
|
+
|
147
194
|
### mixedbread-ai/mxbai-rerank-base-v1
|
148
195
|
|
149
196
|
[Docs](https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1)
|
@@ -156,6 +203,30 @@ model = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-base-v1")
|
|
156
203
|
result = model.(query, docs)
|
157
204
|
```
|
158
205
|
|
206
|
+
### jinaai/jina-reranker-v1-turbo-en
|
207
|
+
|
208
|
+
[Docs](https://huggingface.co/jinaai/jina-reranker-v1-turbo-en)
|
209
|
+
|
210
|
+
```ruby
|
211
|
+
query = "How many people live in London?"
|
212
|
+
docs = ["Around 9 Million people live in London", "London is known for its financial district"]
|
213
|
+
|
214
|
+
model = Informers.pipeline("reranking", "jinaai/jina-reranker-v1-turbo-en")
|
215
|
+
result = model.(query, docs)
|
216
|
+
```
|
217
|
+
|
218
|
+
### BAAI/bge-reranker-base
|
219
|
+
|
220
|
+
[Docs](https://huggingface.co/BAAI/bge-reranker-base)
|
221
|
+
|
222
|
+
```ruby
|
223
|
+
query = "How many people live in London?"
|
224
|
+
docs = ["Around 9 Million people live in London", "London is known for its financial district"]
|
225
|
+
|
226
|
+
model = Informers.pipeline("reranking", "BAAI/bge-reranker-base")
|
227
|
+
result = model.(query, docs)
|
228
|
+
```
|
229
|
+
|
159
230
|
### Other
|
160
231
|
|
161
232
|
You can use the feature extraction pipeline directly.
|
@@ -165,10 +236,16 @@ model = Informers.pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2", quan
|
|
165
236
|
embeddings = model.(sentences, pooling: "mean", normalize: true)
|
166
237
|
```
|
167
238
|
|
168
|
-
The model
|
239
|
+
The model must include a `.onnx` file ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)). If the file is not at `onnx/model.onnx` or `onnx/model_quantized.onnx`, use the `model_file_name` option to specify the location.
|
169
240
|
|
170
241
|
## Pipelines
|
171
242
|
|
243
|
+
- [Text](#text)
|
244
|
+
- [Vision](#vision)
|
245
|
+
- [Multimodal](#multimodal)
|
246
|
+
|
247
|
+
### Text
|
248
|
+
|
172
249
|
Embedding
|
173
250
|
|
174
251
|
```ruby
|
@@ -176,7 +253,7 @@ embed = Informers.pipeline("embedding")
|
|
176
253
|
embed.("We are very happy to show you the 🤗 Transformers library.")
|
177
254
|
```
|
178
255
|
|
179
|
-
Reranking
|
256
|
+
Reranking
|
180
257
|
|
181
258
|
```ruby
|
182
259
|
rerank = Informers.pipeline("reranking")
|
@@ -204,6 +281,48 @@ qa = Informers.pipeline("question-answering")
|
|
204
281
|
qa.("Who invented Ruby?", "Ruby is a programming language created by Matz")
|
205
282
|
```
|
206
283
|
|
284
|
+
Zero-shot classification
|
285
|
+
|
286
|
+
```ruby
|
287
|
+
classifier = Informers.pipeline("zero-shot-classification")
|
288
|
+
classifier.("text", ["label1", "label2", "label3"])
|
289
|
+
```
|
290
|
+
|
291
|
+
Text generation
|
292
|
+
|
293
|
+
```ruby
|
294
|
+
generator = Informers.pipeline("text-generation")
|
295
|
+
generator.("I enjoy walking with my cute dog,")
|
296
|
+
```
|
297
|
+
|
298
|
+
Text-to-text generation
|
299
|
+
|
300
|
+
```ruby
|
301
|
+
text2text = Informers.pipeline("text2text-generation")
|
302
|
+
text2text.("translate from English to French: I'm very happy")
|
303
|
+
```
|
304
|
+
|
305
|
+
Translation
|
306
|
+
|
307
|
+
```ruby
|
308
|
+
translator = Informers.pipeline("translation", "Xenova/nllb-200-distilled-600M")
|
309
|
+
translator.("जीवन एक चॉकलेट बॉक्स की तरह है।", src_lang: "hin_Deva", tgt_lang: "fra_Latn")
|
310
|
+
```
|
311
|
+
|
312
|
+
Summarization
|
313
|
+
|
314
|
+
```ruby
|
315
|
+
summarizer = Informers.pipeline("summarization")
|
316
|
+
summarizer.("Many paragraphs of text")
|
317
|
+
```
|
318
|
+
|
319
|
+
Fill mask
|
320
|
+
|
321
|
+
```ruby
|
322
|
+
unmasker = Informers.pipeline("fill-mask")
|
323
|
+
unmasker.("Paris is the [MASK] of France.")
|
324
|
+
```
|
325
|
+
|
207
326
|
Feature extraction
|
208
327
|
|
209
328
|
```ruby
|
@@ -211,6 +330,80 @@ extractor = Informers.pipeline("feature-extraction")
|
|
211
330
|
extractor.("We are very happy to show you the 🤗 Transformers library.")
|
212
331
|
```
|
213
332
|
|
333
|
+
### Vision
|
334
|
+
|
335
|
+
Image classification
|
336
|
+
|
337
|
+
```ruby
|
338
|
+
classifier = Informers.pipeline("image-classification")
|
339
|
+
classifier.("image.jpg")
|
340
|
+
```
|
341
|
+
|
342
|
+
Zero-shot image classification
|
343
|
+
|
344
|
+
```ruby
|
345
|
+
classifier = Informers.pipeline("zero-shot-image-classification")
|
346
|
+
classifier.("image.jpg", ["label1", "label2", "label3"])
|
347
|
+
```
|
348
|
+
|
349
|
+
Image segmentation
|
350
|
+
|
351
|
+
```ruby
|
352
|
+
segmenter = Informers.pipeline("image-segmentation")
|
353
|
+
segmenter.("image.jpg")
|
354
|
+
```
|
355
|
+
|
356
|
+
Object detection
|
357
|
+
|
358
|
+
```ruby
|
359
|
+
detector = Informers.pipeline("object-detection")
|
360
|
+
detector.("image.jpg")
|
361
|
+
```
|
362
|
+
|
363
|
+
Zero-shot object detection
|
364
|
+
|
365
|
+
```ruby
|
366
|
+
detector = Informers.pipeline("zero-shot-object-detection")
|
367
|
+
detector.("image.jpg", ["label1", "label2", "label3"])
|
368
|
+
```
|
369
|
+
|
370
|
+
Depth estimation
|
371
|
+
|
372
|
+
```ruby
|
373
|
+
estimator = Informers.pipeline("depth-estimation")
|
374
|
+
estimator.("image.jpg")
|
375
|
+
```
|
376
|
+
|
377
|
+
Image-to-image
|
378
|
+
|
379
|
+
```ruby
|
380
|
+
upscaler = Informers.pipeline("image-to-image")
|
381
|
+
upscaler.("image.jpg")
|
382
|
+
```
|
383
|
+
|
384
|
+
Image feature extraction
|
385
|
+
|
386
|
+
```ruby
|
387
|
+
extractor = Informers.pipeline("image-feature-extraction")
|
388
|
+
extractor.("image.jpg")
|
389
|
+
```
|
390
|
+
|
391
|
+
### Multimodal
|
392
|
+
|
393
|
+
Image captioning
|
394
|
+
|
395
|
+
```ruby
|
396
|
+
captioner = Informers.pipeline("image-to-text")
|
397
|
+
captioner.("image.jpg")
|
398
|
+
```
|
399
|
+
|
400
|
+
Document question answering
|
401
|
+
|
402
|
+
```ruby
|
403
|
+
qa = Informers.pipeline("document-question-answering")
|
404
|
+
qa.("image.jpg", "What is the invoice number?")
|
405
|
+
```
|
406
|
+
|
214
407
|
## Credits
|
215
408
|
|
216
409
|
This library was ported from [Transformers.js](https://github.com/xenova/transformers.js) and is available under the same license.
|
@@ -250,5 +443,6 @@ To get started with development:
|
|
250
443
|
git clone https://github.com/ankane/informers.git
|
251
444
|
cd informers
|
252
445
|
bundle install
|
446
|
+
bundle exec rake download:files
|
253
447
|
bundle exec rake test
|
254
448
|
```
|
data/lib/informers/configs.rb
CHANGED
@@ -1,17 +1,19 @@
|
|
1
1
|
module Informers
|
2
2
|
class PretrainedConfig
|
3
|
-
attr_reader :model_type, :problem_type, :id2label
|
4
|
-
|
5
3
|
def initialize(config_json)
|
6
|
-
@
|
7
|
-
|
8
|
-
@model_type = config_json["model_type"]
|
9
|
-
@problem_type = config_json["problem_type"]
|
10
|
-
@id2label = config_json["id2label"]
|
4
|
+
@config_json = config_json.to_h
|
11
5
|
end
|
12
6
|
|
13
7
|
def [](key)
|
14
|
-
|
8
|
+
@config_json[key.to_s]
|
9
|
+
end
|
10
|
+
|
11
|
+
def []=(key, value)
|
12
|
+
@config_json[key.to_s] = value
|
13
|
+
end
|
14
|
+
|
15
|
+
def to_h
|
16
|
+
@config_json.to_h
|
15
17
|
end
|
16
18
|
|
17
19
|
def self.from_pretrained(
|
data/lib/informers/model.rb
CHANGED
@@ -1,24 +1,12 @@
|
|
1
1
|
module Informers
|
2
2
|
class Model
|
3
3
|
def initialize(model_id, quantized: false)
|
4
|
-
@model_id = model_id
|
5
4
|
@model = Informers.pipeline("embedding", model_id, quantized: quantized)
|
5
|
+
@options = model_id == "mixedbread-ai/mxbai-embed-large-v1" ? {pooling: "cls", normalize: false} : {}
|
6
6
|
end
|
7
7
|
|
8
8
|
def embed(texts)
|
9
|
-
|
10
|
-
texts = [texts] unless is_batched
|
11
|
-
|
12
|
-
case @model_id
|
13
|
-
when "sentence-transformers/all-MiniLM-L6-v2", "Xenova/all-MiniLM-L6-v2", "Xenova/multi-qa-MiniLM-L6-cos-v1", "Supabase/gte-small"
|
14
|
-
output = @model.(texts)
|
15
|
-
when "mixedbread-ai/mxbai-embed-large-v1"
|
16
|
-
output = @model.(texts, pooling: "cls", normalize: false)
|
17
|
-
else
|
18
|
-
raise Error, "Use the embedding pipeline for this model: #{@model_id}"
|
19
|
-
end
|
20
|
-
|
21
|
-
is_batched ? output : output[0]
|
9
|
+
@model.(texts, **@options)
|
22
10
|
end
|
23
11
|
end
|
24
12
|
end
|