informers 1.0.2 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4ea317272c5054b01616643e7e0f0b2b2fe0c4a87fe8399350a6b8d0a279c5a1
4
- data.tar.gz: 530f8aaab9a5ca71811a82adca0272e2ca84525bcf1f60f2209c394cbd0f9c2a
3
+ metadata.gz: ab4f19adb4d6ca0289784cee6c6cb5235b73a5184abffbeaf44391768be1f0ac
4
+ data.tar.gz: '0880ce4dced5ce47ceaaa5fee8d10e6324b3fc0a23e05c3da3728414dcc273d9'
5
5
  SHA512:
6
- metadata.gz: 76059b486e6f6c0b0054450f76813dd4bf12845da6f46e8089585cd1a69be7db86a0acf446cc5a18e48108393403324626f6656d09bdb69083f2651abc0d2448
7
- data.tar.gz: f466f5382edd76a7092dc6ada349a3e58fe7eedcd481726ca765f8ddfb4543b7269dab96c00a93d10b0fd67f800afd70a619cfb15d78dde494b29cc13d21ef1a
6
+ metadata.gz: eb3ee6d16e4e20eca6fae3fae8f97d78ba6bb655d48e2012640d64538785e2a9ff2afb10269cf01db928553438e8fbd08584774ba3f3d08bc25f36cbb971a99a
7
+ data.tar.gz: '0008441293f2605ec8599135d715093053e21f67f56ba59b730a3bc1f46f04f4a7fabb7fef039f156cd4183011c93b7fc9cab6ba731bf78627244bc4dedcf18d'
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
1
+ ## 1.1.0 (2024-09-17)
2
+
3
+ - Added more pipelines
4
+
5
+ ## 1.0.3 (2024-08-29)
6
+
7
+ - Added `model_output` option
8
+ - Improved `model_file_name` option
9
+
1
10
  ## 1.0.2 (2024-08-28)
2
11
 
3
12
  - Added `embedding` pipeline
data/README.md CHANGED
@@ -30,10 +30,15 @@ Embedding
30
30
  - [intfloat/e5-base-v2](#intfloate5-base-v2)
31
31
  - [nomic-ai/nomic-embed-text-v1](#nomic-ainomic-embed-text-v1)
32
32
  - [BAAI/bge-base-en-v1.5](#baaibge-base-en-v15)
33
+ - [jinaai/jina-embeddings-v2-base-en](#jinaaijina-embeddings-v2-base-en)
34
+ - [Snowflake/snowflake-arctic-embed-m-v1.5](#snowflakesnowflake-arctic-embed-m-v15)
35
+ - [Xenova/all-mpnet-base-v2](#xenovaall-mpnet-base-v2)
33
36
 
34
- Reranking (experimental)
37
+ Reranking
35
38
 
36
39
  - [mixedbread-ai/mxbai-rerank-base-v1](#mixedbread-aimxbai-rerank-base-v1)
40
+ - [jinaai/jina-reranker-v1-turbo-en](#jinaaijina-reranker-v1-turbo-en)
41
+ - [BAAI/bge-reranker-base](#baaibge-reranker-base)
37
42
 
38
43
  ### sentence-transformers/all-MiniLM-L6-v2
39
44
 
@@ -72,18 +77,16 @@ doc_score_pairs = docs.zip(scores).sort_by { |d, s| -s }
72
77
  [Docs](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1)
73
78
 
74
79
  ```ruby
75
- def transform_query(query)
76
- "Represent this sentence for searching relevant passages: #{query}"
77
- end
80
+ query_prefix = "Represent this sentence for searching relevant passages: "
78
81
 
79
- docs = [
80
- transform_query("puppy"),
82
+ input = [
81
83
  "The dog is barking",
82
- "The cat is purring"
84
+ "The cat is purring",
85
+ query_prefix + "puppy"
83
86
  ]
84
87
 
85
88
  model = Informers.pipeline("embedding", "mixedbread-ai/mxbai-embed-large-v1")
86
- embeddings = model.(docs)
89
+ embeddings = model.(input)
87
90
  ```
88
91
 
89
92
  ### Supabase/gte-small
@@ -102,9 +105,12 @@ embeddings = model.(sentences)
102
105
  [Docs](https://huggingface.co/intfloat/e5-base-v2)
103
106
 
104
107
  ```ruby
108
+ doc_prefix = "passage: "
109
+ query_prefix = "query: "
110
+
105
111
  input = [
106
- "passage: Ruby is a programming language created by Matz",
107
- "query: Ruby creator"
112
+ doc_prefix + "Ruby is a programming language created by Matz",
113
+ query_prefix + "Ruby creator"
108
114
  ]
109
115
 
110
116
  model = Informers.pipeline("embedding", "intfloat/e5-base-v2")
@@ -116,9 +122,13 @@ embeddings = model.(input)
116
122
  [Docs](https://huggingface.co/nomic-ai/nomic-embed-text-v1)
117
123
 
118
124
  ```ruby
125
+ doc_prefix = "search_document: "
126
+ query_prefix = "search_query: "
127
+
119
128
  input = [
120
- "search_document: The dog is barking",
121
- "search_query: puppy"
129
+ doc_prefix + "The dog is barking",
130
+ doc_prefix + "The cat is purring",
131
+ query_prefix + "puppy"
122
132
  ]
123
133
 
124
134
  model = Informers.pipeline("embedding", "nomic-ai/nomic-embed-text-v1")
@@ -130,20 +140,57 @@ embeddings = model.(input)
130
140
  [Docs](https://huggingface.co/BAAI/bge-base-en-v1.5)
131
141
 
132
142
  ```ruby
133
- def transform_query(query)
134
- "Represent this sentence for searching relevant passages: #{query}"
135
- end
143
+ query_prefix = "Represent this sentence for searching relevant passages: "
136
144
 
137
145
  input = [
138
- transform_query("puppy"),
139
146
  "The dog is barking",
140
- "The cat is purring"
147
+ "The cat is purring",
148
+ query_prefix + "puppy"
141
149
  ]
142
150
 
143
151
  model = Informers.pipeline("embedding", "BAAI/bge-base-en-v1.5")
144
152
  embeddings = model.(input)
145
153
  ```
146
154
 
155
+ ### jinaai/jina-embeddings-v2-base-en
156
+
157
+ [Docs](https://huggingface.co/jinaai/jina-embeddings-v2-base-en)
158
+
159
+ ```ruby
160
+ sentences = ["How is the weather today?", "What is the current weather like today?"]
161
+
162
+ model = Informers.pipeline("embedding", "jinaai/jina-embeddings-v2-base-en", model_file_name: "../model")
163
+ embeddings = model.(sentences)
164
+ ```
165
+
166
+ ### Snowflake/snowflake-arctic-embed-m-v1.5
167
+
168
+ [Docs](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5)
169
+
170
+ ```ruby
171
+ query_prefix = "Represent this sentence for searching relevant passages: "
172
+
173
+ input = [
174
+ "The dog is barking",
175
+ "The cat is purring",
176
+ query_prefix + "puppy"
177
+ ]
178
+
179
+ model = Informers.pipeline("embedding", "Snowflake/snowflake-arctic-embed-m-v1.5")
180
+ embeddings = model.(input, model_output: "sentence_embedding", pooling: "none")
181
+ ```
182
+
183
+ ### Xenova/all-mpnet-base-v2
184
+
185
+ [Docs](https://huggingface.co/Xenova/all-mpnet-base-v2)
186
+
187
+ ```ruby
188
+ sentences = ["This is an example sentence", "Each sentence is converted"]
189
+
190
+ model = Informers.pipeline("embedding", "Xenova/all-mpnet-base-v2")
191
+ embeddings = model.(sentences)
192
+ ```
193
+
147
194
  ### mixedbread-ai/mxbai-rerank-base-v1
148
195
 
149
196
  [Docs](https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1)
@@ -156,6 +203,30 @@ model = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-base-v1")
156
203
  result = model.(query, docs)
157
204
  ```
158
205
 
206
+ ### jinaai/jina-reranker-v1-turbo-en
207
+
208
+ [Docs](https://huggingface.co/jinaai/jina-reranker-v1-turbo-en)
209
+
210
+ ```ruby
211
+ query = "How many people live in London?"
212
+ docs = ["Around 9 Million people live in London", "London is known for its financial district"]
213
+
214
+ model = Informers.pipeline("reranking", "jinaai/jina-reranker-v1-turbo-en")
215
+ result = model.(query, docs)
216
+ ```
217
+
218
+ ### BAAI/bge-reranker-base
219
+
220
+ [Docs](https://huggingface.co/BAAI/bge-reranker-base)
221
+
222
+ ```ruby
223
+ query = "How many people live in London?"
224
+ docs = ["Around 9 Million people live in London", "London is known for its financial district"]
225
+
226
+ model = Informers.pipeline("reranking", "BAAI/bge-reranker-base")
227
+ result = model.(query, docs)
228
+ ```
229
+
159
230
  ### Other
160
231
 
161
232
  You can use the feature extraction pipeline directly.
@@ -165,10 +236,16 @@ model = Informers.pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2", quan
165
236
  embeddings = model.(sentences, pooling: "mean", normalize: true)
166
237
  ```
167
238
 
168
- The model files must include `onnx/model.onnx` or `onnx/model_quantized.onnx` ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)).
239
+ The model must include a `.onnx` file ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)). If the file is not at `onnx/model.onnx` or `onnx/model_quantized.onnx`, use the `model_file_name` option to specify the location.
169
240
 
170
241
  ## Pipelines
171
242
 
243
+ - [Text](#text)
244
+ - [Vision](#vision)
245
+ - [Multimodal](#multimodal)
246
+
247
+ ### Text
248
+
172
249
  Embedding
173
250
 
174
251
  ```ruby
@@ -176,7 +253,7 @@ embed = Informers.pipeline("embedding")
176
253
  embed.("We are very happy to show you the 🤗 Transformers library.")
177
254
  ```
178
255
 
179
- Reranking (experimental)
256
+ Reranking
180
257
 
181
258
  ```ruby
182
259
  rerank = Informers.pipeline("reranking")
@@ -204,6 +281,48 @@ qa = Informers.pipeline("question-answering")
204
281
  qa.("Who invented Ruby?", "Ruby is a programming language created by Matz")
205
282
  ```
206
283
 
284
+ Zero-shot classification
285
+
286
+ ```ruby
287
+ classifier = Informers.pipeline("zero-shot-classification")
288
+ classifier.("text", ["label1", "label2", "label3"])
289
+ ```
290
+
291
+ Text generation
292
+
293
+ ```ruby
294
+ generator = Informers.pipeline("text-generation")
295
+ generator.("I enjoy walking with my cute dog,")
296
+ ```
297
+
298
+ Text-to-text generation
299
+
300
+ ```ruby
301
+ text2text = Informers.pipeline("text2text-generation")
302
+ text2text.("translate from English to French: I'm very happy")
303
+ ```
304
+
305
+ Translation
306
+
307
+ ```ruby
308
+ translator = Informers.pipeline("translation", "Xenova/nllb-200-distilled-600M")
309
+ translator.("जीवन एक चॉकलेट बॉक्स की तरह है।", src_lang: "hin_Deva", tgt_lang: "fra_Latn")
310
+ ```
311
+
312
+ Summarization
313
+
314
+ ```ruby
315
+ summarizer = Informers.pipeline("summarization")
316
+ summarizer.("Many paragraphs of text")
317
+ ```
318
+
319
+ Fill mask
320
+
321
+ ```ruby
322
+ unmasker = Informers.pipeline("fill-mask")
323
+ unmasker.("Paris is the [MASK] of France.")
324
+ ```
325
+
207
326
  Feature extraction
208
327
 
209
328
  ```ruby
@@ -211,6 +330,80 @@ extractor = Informers.pipeline("feature-extraction")
211
330
  extractor.("We are very happy to show you the 🤗 Transformers library.")
212
331
  ```
213
332
 
333
+ ### Vision
334
+
335
+ Image classification
336
+
337
+ ```ruby
338
+ classifier = Informers.pipeline("image-classification")
339
+ classifier.("image.jpg")
340
+ ```
341
+
342
+ Zero-shot image classification
343
+
344
+ ```ruby
345
+ classifier = Informers.pipeline("zero-shot-image-classification")
346
+ classifier.("image.jpg", ["label1", "label2", "label3"])
347
+ ```
348
+
349
+ Image segmentation
350
+
351
+ ```ruby
352
+ segmenter = Informers.pipeline("image-segmentation")
353
+ segmenter.("image.jpg")
354
+ ```
355
+
356
+ Object detection
357
+
358
+ ```ruby
359
+ detector = Informers.pipeline("object-detection")
360
+ detector.("image.jpg")
361
+ ```
362
+
363
+ Zero-shot object detection
364
+
365
+ ```ruby
366
+ detector = Informers.pipeline("zero-shot-object-detection")
367
+ detector.("image.jpg", ["label1", "label2", "label3"])
368
+ ```
369
+
370
+ Depth estimation
371
+
372
+ ```ruby
373
+ estimator = Informers.pipeline("depth-estimation")
374
+ estimator.("image.jpg")
375
+ ```
376
+
377
+ Image-to-image
378
+
379
+ ```ruby
380
+ upscaler = Informers.pipeline("image-to-image")
381
+ upscaler.("image.jpg")
382
+ ```
383
+
384
+ Image feature extraction
385
+
386
+ ```ruby
387
+ extractor = Informers.pipeline("image-feature-extraction")
388
+ extractor.("image.jpg")
389
+ ```
390
+
391
+ ### Multimodal
392
+
393
+ Image captioning
394
+
395
+ ```ruby
396
+ captioner = Informers.pipeline("image-to-text")
397
+ captioner.("image.jpg")
398
+ ```
399
+
400
+ Document question answering
401
+
402
+ ```ruby
403
+ qa = Informers.pipeline("document-question-answering")
404
+ qa.("image.jpg", "What is the invoice number?")
405
+ ```
406
+
214
407
  ## Credits
215
408
 
216
409
  This library was ported from [Transformers.js](https://github.com/xenova/transformers.js) and is available under the same license.
@@ -250,5 +443,6 @@ To get started with development:
250
443
  git clone https://github.com/ankane/informers.git
251
444
  cd informers
252
445
  bundle install
446
+ bundle exec rake download:files
253
447
  bundle exec rake test
254
448
  ```
@@ -1,17 +1,19 @@
1
1
  module Informers
2
2
  class PretrainedConfig
3
- attr_reader :model_type, :problem_type, :id2label
4
-
5
3
  def initialize(config_json)
6
- @is_encoder_decoder = false
7
-
8
- @model_type = config_json["model_type"]
9
- @problem_type = config_json["problem_type"]
10
- @id2label = config_json["id2label"]
4
+ @config_json = config_json.to_h
11
5
  end
12
6
 
13
7
  def [](key)
14
- instance_variable_get("@#{key}")
8
+ @config_json[key.to_s]
9
+ end
10
+
11
+ def []=(key, value)
12
+ @config_json[key.to_s] = value
13
+ end
14
+
15
+ def to_h
16
+ @config_json.to_h
15
17
  end
16
18
 
17
19
  def self.from_pretrained(
@@ -1,24 +1,12 @@
1
1
  module Informers
2
2
  class Model
3
3
  def initialize(model_id, quantized: false)
4
- @model_id = model_id
5
4
  @model = Informers.pipeline("embedding", model_id, quantized: quantized)
5
+ @options = model_id == "mixedbread-ai/mxbai-embed-large-v1" ? {pooling: "cls", normalize: false} : {}
6
6
  end
7
7
 
8
8
  def embed(texts)
9
- is_batched = texts.is_a?(Array)
10
- texts = [texts] unless is_batched
11
-
12
- case @model_id
13
- when "sentence-transformers/all-MiniLM-L6-v2", "Xenova/all-MiniLM-L6-v2", "Xenova/multi-qa-MiniLM-L6-cos-v1", "Supabase/gte-small"
14
- output = @model.(texts)
15
- when "mixedbread-ai/mxbai-embed-large-v1"
16
- output = @model.(texts, pooling: "cls", normalize: false)
17
- else
18
- raise Error, "Use the embedding pipeline for this model: #{@model_id}"
19
- end
20
-
21
- is_batched ? output : output[0]
9
+ @model.(texts, **@options)
22
10
  end
23
11
  end
24
12
  end