informers 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +213 -19
- data/lib/informers/configs.rb +10 -8
- data/lib/informers/model.rb +2 -14
- data/lib/informers/models.rb +1027 -13
- data/lib/informers/pipelines.rb +781 -14
- data/lib/informers/processors.rb +796 -0
- data/lib/informers/tokenizers.rb +166 -4
- data/lib/informers/utils/core.rb +4 -0
- data/lib/informers/utils/generation.rb +294 -0
- data/lib/informers/utils/image.rb +116 -0
- data/lib/informers/utils/math.rb +73 -0
- data/lib/informers/utils/tensor.rb +46 -0
- data/lib/informers/version.rb +1 -1
- data/lib/informers.rb +3 -0
- metadata +8 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ab4f19adb4d6ca0289784cee6c6cb5235b73a5184abffbeaf44391768be1f0ac
+  data.tar.gz: '0880ce4dced5ce47ceaaa5fee8d10e6324b3fc0a23e05c3da3728414dcc273d9'
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: eb3ee6d16e4e20eca6fae3fae8f97d78ba6bb655d48e2012640d64538785e2a9ff2afb10269cf01db928553438e8fbd08584774ba3f3d08bc25f36cbb971a99a
+  data.tar.gz: '0008441293f2605ec8599135d715093053e21f67f56ba59b730a3bc1f46f04f4a7fabb7fef039f156cd4183011c93b7fc9cab6ba731bf78627244bc4dedcf18d'
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -30,10 +30,15 @@ Embedding
 - [intfloat/e5-base-v2](#intfloate5-base-v2)
 - [nomic-ai/nomic-embed-text-v1](#nomic-ainomic-embed-text-v1)
 - [BAAI/bge-base-en-v1.5](#baaibge-base-en-v15)
+- [jinaai/jina-embeddings-v2-base-en](#jinaaijina-embeddings-v2-base-en)
+- [Snowflake/snowflake-arctic-embed-m-v1.5](#snowflakesnowflake-arctic-embed-m-v15)
+- [Xenova/all-mpnet-base-v2](#xenovaall-mpnet-base-v2)

-Reranking
+Reranking

 - [mixedbread-ai/mxbai-rerank-base-v1](#mixedbread-aimxbai-rerank-base-v1)
+- [jinaai/jina-reranker-v1-turbo-en](#jinaaijina-reranker-v1-turbo-en)
+- [BAAI/bge-reranker-base](#baaibge-reranker-base)

 ### sentence-transformers/all-MiniLM-L6-v2

@@ -72,18 +77,16 @@ doc_score_pairs = docs.zip(scores).sort_by { |d, s| -s }
 [Docs](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1)

 ```ruby
-
-  "Represent this sentence for searching relevant passages: #{query}"
-end
+query_prefix = "Represent this sentence for searching relevant passages: "

-
-  transform_query("puppy"),
+input = [
   "The dog is barking",
-  "The cat is purring"
+  "The cat is purring",
+  query_prefix + "puppy"
 ]

 model = Informers.pipeline("embedding", "mixedbread-ai/mxbai-embed-large-v1")
-embeddings = model.(
+embeddings = model.(input)
 ```

 ### Supabase/gte-small
@@ -102,9 +105,12 @@ embeddings = model.(sentences)
 [Docs](https://huggingface.co/intfloat/e5-base-v2)

 ```ruby
+doc_prefix = "passage: "
+query_prefix = "query: "
+
 input = [
-  "
-  "
+  doc_prefix + "Ruby is a programming language created by Matz",
+  query_prefix + "Ruby creator"
 ]

 model = Informers.pipeline("embedding", "intfloat/e5-base-v2")
@@ -116,9 +122,13 @@ embeddings = model.(input)
 [Docs](https://huggingface.co/nomic-ai/nomic-embed-text-v1)

 ```ruby
+doc_prefix = "search_document: "
+query_prefix = "search_query: "
+
 input = [
-  "
-  "
+  doc_prefix + "The dog is barking",
+  doc_prefix + "The cat is purring",
+  query_prefix + "puppy"
 ]

 model = Informers.pipeline("embedding", "nomic-ai/nomic-embed-text-v1")
@@ -130,20 +140,57 @@ embeddings = model.(input)
 [Docs](https://huggingface.co/BAAI/bge-base-en-v1.5)

 ```ruby
-
-  "Represent this sentence for searching relevant passages: #{query}"
-end
+query_prefix = "Represent this sentence for searching relevant passages: "

 input = [
-  transform_query("puppy"),
   "The dog is barking",
-  "The cat is purring"
+  "The cat is purring",
+  query_prefix + "puppy"
 ]

 model = Informers.pipeline("embedding", "BAAI/bge-base-en-v1.5")
 embeddings = model.(input)
 ```

+### jinaai/jina-embeddings-v2-base-en
+
+[Docs](https://huggingface.co/jinaai/jina-embeddings-v2-base-en)
+
+```ruby
+sentences = ["How is the weather today?", "What is the current weather like today?"]
+
+model = Informers.pipeline("embedding", "jinaai/jina-embeddings-v2-base-en", model_file_name: "../model")
+embeddings = model.(sentences)
+```
+
+### Snowflake/snowflake-arctic-embed-m-v1.5
+
+[Docs](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5)
+
+```ruby
+query_prefix = "Represent this sentence for searching relevant passages: "
+
+input = [
+  "The dog is barking",
+  "The cat is purring",
+  query_prefix + "puppy"
+]
+
+model = Informers.pipeline("embedding", "Snowflake/snowflake-arctic-embed-m-v1.5")
+embeddings = model.(input, model_output: "sentence_embedding", pooling: "none")
+```
+
+### Xenova/all-mpnet-base-v2
+
+[Docs](https://huggingface.co/Xenova/all-mpnet-base-v2)
+
+```ruby
+sentences = ["This is an example sentence", "Each sentence is converted"]
+
+model = Informers.pipeline("embedding", "Xenova/all-mpnet-base-v2")
+embeddings = model.(sentences)
+```
+
 ### mixedbread-ai/mxbai-rerank-base-v1

 [Docs](https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1)
@@ -156,6 +203,30 @@ model = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-base-v1")
 result = model.(query, docs)
 ```

+### jinaai/jina-reranker-v1-turbo-en
+
+[Docs](https://huggingface.co/jinaai/jina-reranker-v1-turbo-en)
+
+```ruby
+query = "How many people live in London?"
+docs = ["Around 9 Million people live in London", "London is known for its financial district"]
+
+model = Informers.pipeline("reranking", "jinaai/jina-reranker-v1-turbo-en")
+result = model.(query, docs)
+```
+
+### BAAI/bge-reranker-base
+
+[Docs](https://huggingface.co/BAAI/bge-reranker-base)
+
+```ruby
+query = "How many people live in London?"
+docs = ["Around 9 Million people live in London", "London is known for its financial district"]
+
+model = Informers.pipeline("reranking", "BAAI/bge-reranker-base")
+result = model.(query, docs)
+```
+
 ### Other

 You can use the feature extraction pipeline directly.
@@ -165,10 +236,16 @@ model = Informers.pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2", quan
 embeddings = model.(sentences, pooling: "mean", normalize: true)
 ```

-The model
+The model must include a `.onnx` file ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)). If the file is not at `onnx/model.onnx` or `onnx/model_quantized.onnx`, use the `model_file_name` option to specify the location.

 ## Pipelines

+- [Text](#text)
+- [Vision](#vision)
+- [Multimodel](#multimodal)
+
+### Text
+
 Embedding

 ```ruby
@@ -176,7 +253,7 @@ embed = Informers.pipeline("embedding")
 embed.("We are very happy to show you the 🤗 Transformers library.")
 ```

-Reranking
+Reranking

 ```ruby
 rerank = Informers.pipeline("reranking")
@@ -204,6 +281,48 @@ qa = Informers.pipeline("question-answering")
 qa.("Who invented Ruby?", "Ruby is a programming language created by Matz")
 ```

+Zero-shot classification
+
+```ruby
+classifier = Informers.pipeline("zero-shot-classification")
+classifier.("text", ["label1", "label2", "label3"])
+```
+
+Text generation
+
+```ruby
+generator = Informers.pipeline("text-generation")
+generator.("I enjoy walking with my cute dog,")
+```
+
+Text-to-text generation
+
+```ruby
+text2text = Informers.pipeline("text2text-generation")
+text2text.("translate from English to French: I'm very happy")
+```
+
+Translation
+
+```ruby
+translator = Informers.pipeline("translation", "Xenova/nllb-200-distilled-600M")
+translator.("जीवन एक चॉकलेट बॉक्स की तरह है।", src_lang: "hin_Deva", tgt_lang: "fra_Latn")
+```
+
+Summarization
+
+```ruby
+summarizer = Informers.pipeline("summarization")
+summarizer.("Many paragraphs of text")
+```
+
+Fill mask
+
+```ruby
+unmasker = Informers.pipeline("fill-mask")
+unmasker.("Paris is the [MASK] of France.")
+```
+
 Feature extraction

 ```ruby
@@ -211,6 +330,80 @@ extractor = Informers.pipeline("feature-extraction")
 extractor.("We are very happy to show you the 🤗 Transformers library.")
 ```

+### Vision
+
+Image classification
+
+```ruby
+classifier = Informers.pipeline("image-classification")
+classifier.("image.jpg")
+```
+
+Zero-shot image classification
+
+```ruby
+classifier = Informers.pipeline("zero-shot-image-classification")
+classifier.("image.jpg", ["label1", "label2", "label3"])
+```
+
+Image segmentation
+
+```ruby
+segmenter = Informers.pipeline("image-segmentation")
+segmenter.("image.jpg")
+```
+
+Object detection
+
+```ruby
+detector = Informers.pipeline("object-detection")
+detector.("image.jpg")
+```
+
+Zero-shot object detection
+
+```ruby
+detector = Informers.pipeline("zero-shot-object-detection")
+detector.("image.jpg", ["label1", "label2", "label3"])
+```
+
+Depth estimation
+
+```ruby
+estimator = Informers.pipeline("depth-estimation")
+estimator.("image.jpg")
+```
+
+Image-to-image
+
+```ruby
+upscaler = Informers.pipeline("image-to-image")
+upscaler.("image.jpg")
+```
+
+Image feature extraction
+
+```ruby
+extractor = Informers.pipeline("image-feature-extraction")
+extractor.("image.jpg")
+```
+
+### Multimodal
+
+Image captioning
+
+```ruby
+captioner = Informers.pipeline("image-to-text")
+captioner.("image.jpg")
+```
+
+Document question answering
+
+```ruby
+qa = Informers.pipeline("document-question-answering")
+qa.("image.jpg", "What is the invoice number?")
+```
+
 ## Credits

 This library was ported from [Transformers.js](https://github.com/xenova/transformers.js) and is available under the same license.
@@ -250,5 +443,6 @@ To get started with development:
 git clone https://github.com/ankane/informers.git
 cd informers
 bundle install
+bundle exec rake download:files
 bundle exec rake test
 ```
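The README diff above also documents where the ONNX weights must live and points to the `model_file_name` option for non-default locations. A rough sketch of that option follows; the repo id and file name are hypothetical, and resolving the name inside the repo's `onnx/` directory is an assumption based on the `"../model"` usage in the jina-embeddings example above.

```ruby
# Hypothetical repo whose ONNX export lives at onnx/model_fp16.onnx
# instead of the default onnx/model.onnx or onnx/model_quantized.onnx.
model = Informers.pipeline(
  "feature-extraction",
  "your-org/your-model",          # hypothetical model id
  model_file_name: "model_fp16"   # assumed to resolve to onnx/model_fp16.onnx
)
embeddings = model.("Some text", pooling: "mean", normalize: true)
```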
data/lib/informers/configs.rb
CHANGED
@@ -1,17 +1,19 @@
 module Informers
   class PretrainedConfig
-    attr_reader :model_type, :problem_type, :id2label
-
     def initialize(config_json)
-      @
-
-      @model_type = config_json["model_type"]
-      @problem_type = config_json["problem_type"]
-      @id2label = config_json["id2label"]
+      @config_json = config_json.to_h
     end

     def [](key)
-
+      @config_json[key.to_s]
+    end
+
+    def []=(key, value)
+      @config_json[key.to_s] = value
+    end
+
+    def to_h
+      @config_json.to_h
     end

     def self.from_pretrained(
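This rework turns `PretrainedConfig` into a thin wrapper over the raw config hash, with string-keyed reads and writes instead of a fixed set of attr_readers. A minimal sketch of the resulting behavior (constructing the config directly here purely for illustration; it is normally loaded via `from_pretrained`):

```ruby
config = Informers::PretrainedConfig.new("model_type" => "bert", "id2label" => {"0" => "NEGATIVE", "1" => "POSITIVE"})

config[:model_type]      # => "bert" (symbol and string keys both map to string keys)
config["problem_type"]   # => nil (any key can be read, not just the old attr_readers)
config["problem_type"] = "single_label_classification"
config.to_h              # => the underlying hash, including the value written above
```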
data/lib/informers/model.rb
CHANGED
@@ -1,24 +1,12 @@
 module Informers
   class Model
     def initialize(model_id, quantized: false)
-      @model_id = model_id
       @model = Informers.pipeline("embedding", model_id, quantized: quantized)
+      @options = model_id == "mixedbread-ai/mxbai-embed-large-v1" ? {pooling: "cls", normalize: false} : {}
     end

     def embed(texts)
-
-      texts = [texts] unless is_batched
-
-      case @model_id
-      when "sentence-transformers/all-MiniLM-L6-v2", "Xenova/all-MiniLM-L6-v2", "Xenova/multi-qa-MiniLM-L6-cos-v1", "Supabase/gte-small"
-        output = @model.(texts)
-      when "mixedbread-ai/mxbai-embed-large-v1"
-        output = @model.(texts, pooling: "cls", normalize: false)
-      else
-        raise Error, "Use the embedding pipeline for this model: #{@model_id}"
-      end
-
-      is_batched ? output : output[0]
+      @model.(texts, **@options)
     end
   end
 end
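After this change, `Informers::Model` reads as a thin wrapper around the embedding pipeline, keeping the `pooling: "cls", normalize: false` defaults only for mixedbread-ai/mxbai-embed-large-v1. A sketch of the equivalence implied by the new `#embed` (model name taken from the README examples above; `quantized: false` is spelled out to mirror `Model.new`'s default):

```ruby
sentences = ["This is an example sentence", "Each sentence is converted"]

# Wrapper API
model = Informers::Model.new("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.embed(sentences)

# Per the refactored #embed above, this is the same as calling the pipeline directly
pipeline = Informers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2", quantized: false)
embeddings = pipeline.(sentences)
```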