informers 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4ea317272c5054b01616643e7e0f0b2b2fe0c4a87fe8399350a6b8d0a279c5a1
4
- data.tar.gz: 530f8aaab9a5ca71811a82adca0272e2ca84525bcf1f60f2209c394cbd0f9c2a
3
+ metadata.gz: ab4f19adb4d6ca0289784cee6c6cb5235b73a5184abffbeaf44391768be1f0ac
4
+ data.tar.gz: '0880ce4dced5ce47ceaaa5fee8d10e6324b3fc0a23e05c3da3728414dcc273d9'
5
5
  SHA512:
6
- metadata.gz: 76059b486e6f6c0b0054450f76813dd4bf12845da6f46e8089585cd1a69be7db86a0acf446cc5a18e48108393403324626f6656d09bdb69083f2651abc0d2448
7
- data.tar.gz: f466f5382edd76a7092dc6ada349a3e58fe7eedcd481726ca765f8ddfb4543b7269dab96c00a93d10b0fd67f800afd70a619cfb15d78dde494b29cc13d21ef1a
6
+ metadata.gz: eb3ee6d16e4e20eca6fae3fae8f97d78ba6bb655d48e2012640d64538785e2a9ff2afb10269cf01db928553438e8fbd08584774ba3f3d08bc25f36cbb971a99a
7
+ data.tar.gz: '0008441293f2605ec8599135d715093053e21f67f56ba59b730a3bc1f46f04f4a7fabb7fef039f156cd4183011c93b7fc9cab6ba731bf78627244bc4dedcf18d'
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
1
+ ## 1.1.0 (2024-09-17)
2
+
3
+ - Added more pipelines
4
+
5
+ ## 1.0.3 (2024-08-29)
6
+
7
+ - Added `model_output` option
8
+ - Improved `model_file_name` option
9
+
1
10
  ## 1.0.2 (2024-08-28)
2
11
 
3
12
  - Added `embedding` pipeline
data/README.md CHANGED
@@ -30,10 +30,15 @@ Embedding
30
30
  - [intfloat/e5-base-v2](#intfloate5-base-v2)
31
31
  - [nomic-ai/nomic-embed-text-v1](#nomic-ainomic-embed-text-v1)
32
32
  - [BAAI/bge-base-en-v1.5](#baaibge-base-en-v15)
33
+ - [jinaai/jina-embeddings-v2-base-en](#jinaaijina-embeddings-v2-base-en)
34
+ - [Snowflake/snowflake-arctic-embed-m-v1.5](#snowflakesnowflake-arctic-embed-m-v15)
35
+ - [Xenova/all-mpnet-base-v2](#xenovaall-mpnet-base-v2)
33
36
 
34
- Reranking (experimental)
37
+ Reranking
35
38
 
36
39
  - [mixedbread-ai/mxbai-rerank-base-v1](#mixedbread-aimxbai-rerank-base-v1)
40
+ - [jinaai/jina-reranker-v1-turbo-en](#jinaaijina-reranker-v1-turbo-en)
41
+ - [BAAI/bge-reranker-base](#baaibge-reranker-base)
37
42
 
38
43
  ### sentence-transformers/all-MiniLM-L6-v2
39
44
 
@@ -72,18 +77,16 @@ doc_score_pairs = docs.zip(scores).sort_by { |d, s| -s }
72
77
  [Docs](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1)
73
78
 
74
79
  ```ruby
75
- def transform_query(query)
76
- "Represent this sentence for searching relevant passages: #{query}"
77
- end
80
+ query_prefix = "Represent this sentence for searching relevant passages: "
78
81
 
79
- docs = [
80
- transform_query("puppy"),
82
+ input = [
81
83
  "The dog is barking",
82
- "The cat is purring"
84
+ "The cat is purring",
85
+ query_prefix + "puppy"
83
86
  ]
84
87
 
85
88
  model = Informers.pipeline("embedding", "mixedbread-ai/mxbai-embed-large-v1")
86
- embeddings = model.(docs)
89
+ embeddings = model.(input)
87
90
  ```
88
91
 
89
92
  ### Supabase/gte-small
@@ -102,9 +105,12 @@ embeddings = model.(sentences)
102
105
  [Docs](https://huggingface.co/intfloat/e5-base-v2)
103
106
 
104
107
  ```ruby
108
+ doc_prefix = "passage: "
109
+ query_prefix = "query: "
110
+
105
111
  input = [
106
- "passage: Ruby is a programming language created by Matz",
107
- "query: Ruby creator"
112
+ doc_prefix + "Ruby is a programming language created by Matz",
113
+ query_prefix + "Ruby creator"
108
114
  ]
109
115
 
110
116
  model = Informers.pipeline("embedding", "intfloat/e5-base-v2")
@@ -116,9 +122,13 @@ embeddings = model.(input)
116
122
  [Docs](https://huggingface.co/nomic-ai/nomic-embed-text-v1)
117
123
 
118
124
  ```ruby
125
+ doc_prefix = "search_document: "
126
+ query_prefix = "search_query: "
127
+
119
128
  input = [
120
- "search_document: The dog is barking",
121
- "search_query: puppy"
129
+ doc_prefix + "The dog is barking",
130
+ doc_prefix + "The cat is purring",
131
+ query_prefix + "puppy"
122
132
  ]
123
133
 
124
134
  model = Informers.pipeline("embedding", "nomic-ai/nomic-embed-text-v1")
@@ -130,20 +140,57 @@ embeddings = model.(input)
130
140
  [Docs](https://huggingface.co/BAAI/bge-base-en-v1.5)
131
141
 
132
142
  ```ruby
133
- def transform_query(query)
134
- "Represent this sentence for searching relevant passages: #{query}"
135
- end
143
+ query_prefix = "Represent this sentence for searching relevant passages: "
136
144
 
137
145
  input = [
138
- transform_query("puppy"),
139
146
  "The dog is barking",
140
- "The cat is purring"
147
+ "The cat is purring",
148
+ query_prefix + "puppy"
141
149
  ]
142
150
 
143
151
  model = Informers.pipeline("embedding", "BAAI/bge-base-en-v1.5")
144
152
  embeddings = model.(input)
145
153
  ```
146
154
 
155
+ ### jinaai/jina-embeddings-v2-base-en
156
+
157
+ [Docs](https://huggingface.co/jinaai/jina-embeddings-v2-base-en)
158
+
159
+ ```ruby
160
+ sentences = ["How is the weather today?", "What is the current weather like today?"]
161
+
162
+ model = Informers.pipeline("embedding", "jinaai/jina-embeddings-v2-base-en", model_file_name: "../model")
163
+ embeddings = model.(sentences)
164
+ ```
165
+
166
+ ### Snowflake/snowflake-arctic-embed-m-v1.5
167
+
168
+ [Docs](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5)
169
+
170
+ ```ruby
171
+ query_prefix = "Represent this sentence for searching relevant passages: "
172
+
173
+ input = [
174
+ "The dog is barking",
175
+ "The cat is purring",
176
+ query_prefix + "puppy"
177
+ ]
178
+
179
+ model = Informers.pipeline("embedding", "Snowflake/snowflake-arctic-embed-m-v1.5")
180
+ embeddings = model.(input, model_output: "sentence_embedding", pooling: "none")
181
+ ```
182
+
183
+ ### Xenova/all-mpnet-base-v2
184
+
185
+ [Docs](https://huggingface.co/Xenova/all-mpnet-base-v2)
186
+
187
+ ```ruby
188
+ sentences = ["This is an example sentence", "Each sentence is converted"]
189
+
190
+ model = Informers.pipeline("embedding", "Xenova/all-mpnet-base-v2")
191
+ embeddings = model.(sentences)
192
+ ```
193
+
147
194
  ### mixedbread-ai/mxbai-rerank-base-v1
148
195
 
149
196
  [Docs](https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1)
@@ -156,6 +203,30 @@ model = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-base-v1")
156
203
  result = model.(query, docs)
157
204
  ```
158
205
 
206
+ ### jinaai/jina-reranker-v1-turbo-en
207
+
208
+ [Docs](https://huggingface.co/jinaai/jina-reranker-v1-turbo-en)
209
+
210
+ ```ruby
211
+ query = "How many people live in London?"
212
+ docs = ["Around 9 Million people live in London", "London is known for its financial district"]
213
+
214
+ model = Informers.pipeline("reranking", "jinaai/jina-reranker-v1-turbo-en")
215
+ result = model.(query, docs)
216
+ ```
217
+
218
+ ### BAAI/bge-reranker-base
219
+
220
+ [Docs](https://huggingface.co/BAAI/bge-reranker-base)
221
+
222
+ ```ruby
223
+ query = "How many people live in London?"
224
+ docs = ["Around 9 Million people live in London", "London is known for its financial district"]
225
+
226
+ model = Informers.pipeline("reranking", "BAAI/bge-reranker-base")
227
+ result = model.(query, docs)
228
+ ```
229
+
159
230
  ### Other
160
231
 
161
232
  You can use the feature extraction pipeline directly.
@@ -165,10 +236,16 @@ model = Informers.pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2", quan
165
236
  embeddings = model.(sentences, pooling: "mean", normalize: true)
166
237
  ```
167
238
 
168
- The model files must include `onnx/model.onnx` or `onnx/model_quantized.onnx` ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)).
239
+ The model must include a `.onnx` file ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)). If the file is not at `onnx/model.onnx` or `onnx/model_quantized.onnx`, use the `model_file_name` option to specify the location.
169
240
 
170
241
  ## Pipelines
171
242
 
243
+ - [Text](#text)
244
+ - [Vision](#vision)
245
+ - [Multimodal](#multimodal)
246
+
247
+ ### Text
248
+
172
249
  Embedding
173
250
 
174
251
  ```ruby
@@ -176,7 +253,7 @@ embed = Informers.pipeline("embedding")
176
253
  embed.("We are very happy to show you the 🤗 Transformers library.")
177
254
  ```
178
255
 
179
- Reranking (experimental)
256
+ Reranking
180
257
 
181
258
  ```ruby
182
259
  rerank = Informers.pipeline("reranking")
@@ -204,6 +281,48 @@ qa = Informers.pipeline("question-answering")
204
281
  qa.("Who invented Ruby?", "Ruby is a programming language created by Matz")
205
282
  ```
206
283
 
284
+ Zero-shot classification
285
+
286
+ ```ruby
287
+ classifier = Informers.pipeline("zero-shot-classification")
288
+ classifier.("text", ["label1", "label2", "label3"])
289
+ ```
290
+
291
+ Text generation
292
+
293
+ ```ruby
294
+ generator = Informers.pipeline("text-generation")
295
+ generator.("I enjoy walking with my cute dog,")
296
+ ```
297
+
298
+ Text-to-text generation
299
+
300
+ ```ruby
301
+ text2text = Informers.pipeline("text2text-generation")
302
+ text2text.("translate from English to French: I'm very happy")
303
+ ```
304
+
305
+ Translation
306
+
307
+ ```ruby
308
+ translator = Informers.pipeline("translation", "Xenova/nllb-200-distilled-600M")
309
+ translator.("जीवन एक चॉकलेट बॉक्स की तरह है।", src_lang: "hin_Deva", tgt_lang: "fra_Latn")
310
+ ```
311
+
312
+ Summarization
313
+
314
+ ```ruby
315
+ summarizer = Informers.pipeline("summarization")
316
+ summarizer.("Many paragraphs of text")
317
+ ```
318
+
319
+ Fill mask
320
+
321
+ ```ruby
322
+ unmasker = Informers.pipeline("fill-mask")
323
+ unmasker.("Paris is the [MASK] of France.")
324
+ ```
325
+
207
326
  Feature extraction
208
327
 
209
328
  ```ruby
@@ -211,6 +330,80 @@ extractor = Informers.pipeline("feature-extraction")
211
330
  extractor.("We are very happy to show you the 🤗 Transformers library.")
212
331
  ```
213
332
 
333
+ ### Vision
334
+
335
+ Image classification
336
+
337
+ ```ruby
338
+ classifier = Informers.pipeline("image-classification")
339
+ classifier.("image.jpg")
340
+ ```
341
+
342
+ Zero-shot image classification
343
+
344
+ ```ruby
345
+ classifier = Informers.pipeline("zero-shot-image-classification")
346
+ classifier.("image.jpg", ["label1", "label2", "label3"])
347
+ ```
348
+
349
+ Image segmentation
350
+
351
+ ```ruby
352
+ segmenter = Informers.pipeline("image-segmentation")
353
+ segmenter.("image.jpg")
354
+ ```
355
+
356
+ Object detection
357
+
358
+ ```ruby
359
+ detector = Informers.pipeline("object-detection")
360
+ detector.("image.jpg")
361
+ ```
362
+
363
+ Zero-shot object detection
364
+
365
+ ```ruby
366
+ detector = Informers.pipeline("zero-shot-object-detection")
367
+ detector.("image.jpg", ["label1", "label2", "label3"])
368
+ ```
369
+
370
+ Depth estimation
371
+
372
+ ```ruby
373
+ estimator = Informers.pipeline("depth-estimation")
374
+ estimator.("image.jpg")
375
+ ```
376
+
377
+ Image-to-image
378
+
379
+ ```ruby
380
+ upscaler = Informers.pipeline("image-to-image")
381
+ upscaler.("image.jpg")
382
+ ```
383
+
384
+ Image feature extraction
385
+
386
+ ```ruby
387
+ extractor = Informers.pipeline("image-feature-extraction")
388
+ extractor.("image.jpg")
389
+ ```
390
+
391
+ ### Multimodal
392
+
393
+ Image captioning
394
+
395
+ ```ruby
396
+ captioner = Informers.pipeline("image-to-text")
397
+ captioner.("image.jpg")
398
+ ```
399
+
400
+ Document question answering
401
+
402
+ ```ruby
403
+ qa = Informers.pipeline("document-question-answering")
404
+ qa.("image.jpg", "What is the invoice number?")
405
+ ```
406
+
214
407
  ## Credits
215
408
 
216
409
  This library was ported from [Transformers.js](https://github.com/xenova/transformers.js) and is available under the same license.
@@ -250,5 +443,6 @@ To get started with development:
250
443
  git clone https://github.com/ankane/informers.git
251
444
  cd informers
252
445
  bundle install
446
+ bundle exec rake download:files
253
447
  bundle exec rake test
254
448
  ```
@@ -1,17 +1,19 @@
1
1
  module Informers
2
2
  class PretrainedConfig
3
- attr_reader :model_type, :problem_type, :id2label
4
-
5
3
  def initialize(config_json)
6
- @is_encoder_decoder = false
7
-
8
- @model_type = config_json["model_type"]
9
- @problem_type = config_json["problem_type"]
10
- @id2label = config_json["id2label"]
4
+ @config_json = config_json.to_h
11
5
  end
12
6
 
13
7
  def [](key)
14
- instance_variable_get("@#{key}")
8
+ @config_json[key.to_s]
9
+ end
10
+
11
+ def []=(key, value)
12
+ @config_json[key.to_s] = value
13
+ end
14
+
15
+ def to_h
16
+ @config_json.to_h
15
17
  end
16
18
 
17
19
  def self.from_pretrained(
@@ -1,24 +1,12 @@
1
1
  module Informers
2
2
  class Model
3
3
  def initialize(model_id, quantized: false)
4
- @model_id = model_id
5
4
  @model = Informers.pipeline("embedding", model_id, quantized: quantized)
5
+ @options = model_id == "mixedbread-ai/mxbai-embed-large-v1" ? {pooling: "cls", normalize: false} : {}
6
6
  end
7
7
 
8
8
  def embed(texts)
9
- is_batched = texts.is_a?(Array)
10
- texts = [texts] unless is_batched
11
-
12
- case @model_id
13
- when "sentence-transformers/all-MiniLM-L6-v2", "Xenova/all-MiniLM-L6-v2", "Xenova/multi-qa-MiniLM-L6-cos-v1", "Supabase/gte-small"
14
- output = @model.(texts)
15
- when "mixedbread-ai/mxbai-embed-large-v1"
16
- output = @model.(texts, pooling: "cls", normalize: false)
17
- else
18
- raise Error, "Use the embedding pipeline for this model: #{@model_id}"
19
- end
20
-
21
- is_batched ? output : output[0]
9
+ @model.(texts, **@options)
22
10
  end
23
11
  end
24
12
  end