informers 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3abc738d8975839b873bc5e07bb95305d455a9ac1eec94c432415b713411f20b
-  data.tar.gz: b9c36794c33316378752dd816fb517714c6d8186062562a778d3c8539ba7d79a
+  metadata.gz: 4ea317272c5054b01616643e7e0f0b2b2fe0c4a87fe8399350a6b8d0a279c5a1
+  data.tar.gz: 530f8aaab9a5ca71811a82adca0272e2ca84525bcf1f60f2209c394cbd0f9c2a
 SHA512:
-  metadata.gz: ce05bfcdebce333fd6b5abefca703850d3a6d6a50c3c1589bf675e91ae24b424f2e43e6bc0270ad4ea8a520f5be9d636c5e8a5a66deae2c0183adae6cbc517aa
-  data.tar.gz: 6cc9b08b6e0f9e8ea23f306c0c460dc2557e4ee5113ef26300b517608485ea528fcb9254d51f395c37b557bf1728051c2c3dd8a20a25b5bd4826832a4ff30bf8
+  metadata.gz: 76059b486e6f6c0b0054450f76813dd4bf12845da6f46e8089585cd1a69be7db86a0acf446cc5a18e48108393403324626f6656d09bdb69083f2651abc0d2448
+  data.tar.gz: f466f5382edd76a7092dc6ada349a3e58fe7eedcd481726ca765f8ddfb4543b7269dab96c00a93d10b0fd67f800afd70a619cfb15d78dde494b29cc13d21ef1a
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+## 1.0.2 (2024-08-28)
+
+- Added `embedding` pipeline
+- Added experimental `reranking` pipeline
+- Added support for `nomic-ai/nomic-embed-text-v1`
+
 ## 1.0.1 (2024-08-27)
 
 - Added support for `Supabase/gte-small` to `Model`
data/README.md CHANGED
@@ -21,6 +21,20 @@ gem "informers"
 
 ## Models
 
+Embedding
+
+- [sentence-transformers/all-MiniLM-L6-v2](#sentence-transformersall-MiniLM-L6-v2)
+- [Xenova/multi-qa-MiniLM-L6-cos-v1](#xenovamulti-qa-MiniLM-L6-cos-v1)
+- [mixedbread-ai/mxbai-embed-large-v1](#mixedbread-aimxbai-embed-large-v1)
+- [Supabase/gte-small](#supabasegte-small)
+- [intfloat/e5-base-v2](#intfloate5-base-v2)
+- [nomic-ai/nomic-embed-text-v1](#nomic-ainomic-embed-text-v1)
+- [BAAI/bge-base-en-v1.5](#baaibge-base-en-v15)
+
+Reranking (experimental)
+
+- [mixedbread-ai/mxbai-rerank-base-v1](#mixedbread-aimxbai-rerank-base-v1)
+
 ### sentence-transformers/all-MiniLM-L6-v2
 
 [Docs](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
@@ -28,14 +42,14 @@ gem "informers"
 ```ruby
 sentences = ["This is an example sentence", "Each sentence is converted"]
 
-model = Informers::Model.new("sentence-transformers/all-MiniLM-L6-v2")
-embeddings = model.embed(sentences)
+model = Informers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
+embeddings = model.(sentences)
 ```
 
 For a quantized version, use:
 
 ```ruby
-model = Informers::Model.new("Xenova/all-MiniLM-L6-v2", quantized: true)
+model = Informers.pipeline("embedding", "Xenova/all-MiniLM-L6-v2", quantized: true)
 ```
 
 ### Xenova/multi-qa-MiniLM-L6-cos-v1
@@ -46,9 +60,9 @@ model = Informers::Model.new("Xenova/all-MiniLM-L6-v2", quantized: true)
 query = "How many people live in London?"
 docs = ["Around 9 Million people live in London", "London is known for its financial district"]
 
-model = Informers::Model.new("Xenova/multi-qa-MiniLM-L6-cos-v1")
-query_embedding = model.embed(query)
-doc_embeddings = model.embed(docs)
+model = Informers.pipeline("embedding", "Xenova/multi-qa-MiniLM-L6-cos-v1")
+query_embedding = model.(query)
+doc_embeddings = model.(docs)
 scores = doc_embeddings.map { |e| e.zip(query_embedding).sum { |d, q| d * q } }
 doc_score_pairs = docs.zip(scores).sort_by { |d, s| -s }
 ```
@@ -68,8 +82,8 @@ docs = [
   "The cat is purring"
 ]
 
-model = Informers::Model.new("mixedbread-ai/mxbai-embed-large-v1")
-embeddings = model.embed(docs)
+model = Informers.pipeline("embedding", "mixedbread-ai/mxbai-embed-large-v1")
+embeddings = model.(docs)
 ```
 
 ### Supabase/gte-small
@@ -79,12 +93,96 @@ embeddings = model.embed(docs)
 ```ruby
 sentences = ["That is a happy person", "That is a very happy person"]
 
-model = Informers::Model.new("Supabase/gte-small")
-embeddings = model.embed(sentences)
+model = Informers.pipeline("embedding", "Supabase/gte-small")
+embeddings = model.(sentences)
+```
+
+### intfloat/e5-base-v2
+
+[Docs](https://huggingface.co/intfloat/e5-base-v2)
+
+```ruby
+input = [
+  "passage: Ruby is a programming language created by Matz",
+  "query: Ruby creator"
+]
+
+model = Informers.pipeline("embedding", "intfloat/e5-base-v2")
+embeddings = model.(input)
+```
+
+### nomic-ai/nomic-embed-text-v1
+
+[Docs](https://huggingface.co/nomic-ai/nomic-embed-text-v1)
+
+```ruby
+input = [
+  "search_document: The dog is barking",
+  "search_query: puppy"
+]
+
+model = Informers.pipeline("embedding", "nomic-ai/nomic-embed-text-v1")
+embeddings = model.(input)
+```
+
+### BAAI/bge-base-en-v1.5
+
+[Docs](https://huggingface.co/BAAI/bge-base-en-v1.5)
+
+```ruby
+def transform_query(query)
+  "Represent this sentence for searching relevant passages: #{query}"
+end
+
+input = [
+  transform_query("puppy"),
+  "The dog is barking",
+  "The cat is purring"
+]
+
+model = Informers.pipeline("embedding", "BAAI/bge-base-en-v1.5")
+embeddings = model.(input)
+```
+
+### mixedbread-ai/mxbai-rerank-base-v1
+
+[Docs](https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1)
+
+```ruby
+query = "How many people live in London?"
+docs = ["Around 9 Million people live in London", "London is known for its financial district"]
+
+model = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-base-v1")
+result = model.(query, docs)
 ```
 
+### Other
+
+You can use the feature extraction pipeline directly.
+
+```ruby
+model = Informers.pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2", quantized: false)
+embeddings = model.(sentences, pooling: "mean", normalize: true)
+```
+
+The model files must include `onnx/model.onnx` or `onnx/model_quantized.onnx` ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)).
+
 ## Pipelines
 
+Embedding
+
+```ruby
+embed = Informers.pipeline("embedding")
+embed.("We are very happy to show you the 🤗 Transformers library.")
+```
+
+Reranking (experimental)
+
+```ruby
+rerank = Informers.pipeline("reranking")
+rerank.("Who created Ruby?", ["Matz created Ruby", "Another doc"])
+```
+
 Named-entity recognition
 
 ```ruby
data/lib/informers/model.rb CHANGED
@@ -2,12 +2,7 @@ module Informers
   class Model
     def initialize(model_id, quantized: false)
       @model_id = model_id
-      @model = Informers.pipeline("feature-extraction", model_id, quantized: quantized)
-
-      # TODO better pattern
-      if model_id == "sentence-transformers/all-MiniLM-L6-v2"
-        @model.instance_variable_get(:@model).instance_variable_set(:@output_names, ["sentence_embedding"])
-      end
+      @model = Informers.pipeline("embedding", model_id, quantized: quantized)
     end
 
     def embed(texts)
@@ -15,14 +10,12 @@ module Informers
       texts = [texts] unless is_batched
 
       case @model_id
-      when "sentence-transformers/all-MiniLM-L6-v2"
+      when "sentence-transformers/all-MiniLM-L6-v2", "Xenova/all-MiniLM-L6-v2", "Xenova/multi-qa-MiniLM-L6-cos-v1", "Supabase/gte-small"
        output = @model.(texts)
-      when "Xenova/all-MiniLM-L6-v2", "Xenova/multi-qa-MiniLM-L6-cos-v1", "Supabase/gte-small"
-        output = @model.(texts, pooling: "mean", normalize: true)
       when "mixedbread-ai/mxbai-embed-large-v1"
-        output = @model.(texts, pooling: "cls")
+        output = @model.(texts, pooling: "cls", normalize: false)
       else
-        raise Error, "model not supported: #{@model_id}"
+        raise Error, "Use the embedding pipeline for this model: #{@model_id}"
       end
 
       is_batched ? output : output[0]
data/lib/informers/models.rb CHANGED
@@ -141,13 +141,13 @@ module Informers
       OnnxRuntime::InferenceSession.new(path)
     end
 
-    def call(model_inputs)
-      @forward.(model_inputs)
+    def call(model_inputs, **kwargs)
+      @forward.(model_inputs, **kwargs)
     end
 
     private
 
-    def encoder_forward(model_inputs)
+    def encoder_forward(model_inputs, output_names: nil)
       encoder_feeds = {}
       @session.inputs.each do |input|
         key = input[:name].to_sym
@@ -156,13 +156,13 @@ module Informers
       if @session.inputs.any? { |v| v[:name] == "token_type_ids" } && !encoder_feeds[:token_type_ids]
         raise Todo
       end
-      session_run(@session, encoder_feeds)
+      session_run(@session, encoder_feeds, output_names:)
     end
 
-    def session_run(session, inputs)
+    def session_run(session, inputs, output_names:)
       checked_inputs = validate_inputs(session, inputs)
       begin
-        output = session.run(@output_names, checked_inputs)
+        output = session.run(output_names || @output_names, checked_inputs)
         output = replace_tensors(output)
         output
       rescue => e
@@ -199,6 +199,18 @@ module Informers
     end
   end
 
+  class NomicBertPreTrainedModel < PreTrainedModel
+  end
+
+  class NomicBertModel < NomicBertPreTrainedModel
+  end
+
+  class DebertaV2PreTrainedModel < PreTrainedModel
+  end
+
+  class DebertaV2Model < DebertaV2PreTrainedModel
+  end
+
   class DistilBertPreTrainedModel < PreTrainedModel
   end
 
@@ -217,6 +229,13 @@ module Informers
     end
   end
 
+  MODEL_MAPPING_NAMES_ENCODER_ONLY = {
+    "bert" => ["BertModel", BertModel],
+    "nomic_bert" => ["NomicBertModel", NomicBertModel],
+    "deberta-v2" => ["DebertaV2Model", DebertaV2Model],
+    "distilbert" => ["DistilBertModel", DistilBertModel]
+  }
+
   MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = {
     "bert" => ["BertForSequenceClassification", BertForSequenceClassification],
     "distilbert" => ["DistilBertForSequenceClassification", DistilBertForSequenceClassification]
@@ -231,6 +250,7 @@ module Informers
   }
 
   MODEL_CLASS_TYPE_MAPPING = [
+    [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
     [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]]
data/lib/informers/pipelines.rb CHANGED
@@ -10,10 +10,6 @@ module Informers
   end
 
   class TextClassificationPipeline < Pipeline
-    def initialize(**options)
-      super(**options)
-    end
-
     def call(texts, top_k: 1)
       # Run tokenization
       model_inputs = @tokenizer.(texts,
@@ -56,10 +52,6 @@ module Informers
   end
 
   class TokenClassificationPipeline < Pipeline
-    def initialize(**options)
-      super(**options)
-    end
-
     def call(
       texts,
       ignore_labels: ["O"],
@@ -200,10 +192,6 @@ module Informers
   end
 
   class QuestionAnsweringPipeline < Pipeline
-    def initialize(**options)
-      super(**options)
-    end
-
     def call(question, context, top_k: 1)
       # Run tokenization
       inputs = @tokenizer.(question,
@@ -256,10 +244,6 @@ module Informers
   end
 
   class FeatureExtractionPipeline < Pipeline
-    def initialize(**options)
-      super(**options)
-    end
-
     def call(
       texts,
       pooling: "none",
@@ -272,12 +256,27 @@ module Informers
         padding: true,
         truncation: true
       )
+      model_options = {}
+
+      # optimization for sentence-transformers/all-MiniLM-L6-v2
+      if @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
+        model_options[:output_names] = ["sentence_embedding"]
+        pooling = "none"
+        normalize = false
+      end
 
       # Run model
-      outputs = @model.(model_inputs)
+      outputs = @model.(model_inputs, **model_options)
+
+      # TODO improve
+      result =
+        if outputs.is_a?(Array)
+          raise Error, "unexpected outputs" if outputs.size != 1
+          outputs[0]
+        else
+          outputs.logits
+        end
 
-      # TODO check outputs.last_hidden_state
-      result = outputs.logits
       case pooling
       when "none"
         # Skip pooling
@@ -301,6 +300,46 @@ module Informers
     end
   end
 
+  class EmbeddingPipeline < FeatureExtractionPipeline
+    def call(
+      texts,
+      pooling: "mean",
+      normalize: true
+    )
+      super(texts, pooling:, normalize:)
+    end
+  end
+
+  class RerankingPipeline < Pipeline
+    def call(
+      query,
+      documents,
+      return_documents: false,
+      top_k: nil
+    )
+      model_inputs = @tokenizer.([query] * documents.size,
+        text_pair: documents,
+        padding: true,
+        truncation: true
+      )
+
+      outputs = @model.(model_inputs)
+
+      result =
+        Utils.sigmoid(outputs[0].map(&:first))
+          .map.with_index { |s, i| {doc_id: i, score: s} }
+          .sort_by { |v| -v[:score] }
+
+      if return_documents
+        result.each do |v|
+          v[:text] = documents[v[:doc_id]]
+        end
+      end
+
+      top_k ? result.first(top_k) : result
+    end
+  end
+
   SUPPORTED_TASKS = {
     "text-classification" => {
       tokenizer: AutoTokenizer,
@@ -337,6 +376,24 @@ module Informers
         model: "Xenova/all-MiniLM-L6-v2"
       },
       type: "text"
+    },
+    "embedding" => {
+      tokenizer: AutoTokenizer,
+      pipeline: EmbeddingPipeline,
+      model: AutoModel,
+      default: {
+        model: "sentence-transformers/all-MiniLM-L6-v2"
+      },
+      type: "text"
+    },
+    "reranking" => {
+      tokenizer: AutoTokenizer,
+      pipeline: RerankingPipeline,
+      model: AutoModel,
+      default: {
+        model: "mixedbread-ai/mxbai-rerank-base-v1"
+      },
+      type: "text"
     }
   }
 
@@ -361,11 +418,13 @@ module Informers
     end
   end
 
+  NO_DEFAULT = Object.new
+
   class << self
     def pipeline(
       task,
       model = nil,
-      quantized: true,
+      quantized: NO_DEFAULT,
       progress_callback: DEFAULT_PROGRESS_CALLBACK,
       config: nil,
       cache_dir: nil,
@@ -373,6 +432,11 @@ module Informers
       revision: "main",
       model_file_name: nil
     )
+      if quantized == NO_DEFAULT
+        # TODO move default to task class
+        quantized = !["embedding", "reranking"].include?(task)
+      end
+
       # Apply aliases
       task = TASK_ALIASES[task] || task
 
@@ -408,6 +472,10 @@ module Informers
       results = load_items(classes, model, pretrained_options)
       results[:task] = task
 
+      if model == "sentence-transformers/all-MiniLM-L6-v2"
+        results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
+      end
+
       Utils.dispatch_callback(progress_callback, {
         status: "ready",
         task: task,
data/lib/informers/tokenizers.rb CHANGED
@@ -83,12 +83,18 @@ module Informers
     # self.return_token_type_ids = true
   end
 
+  class DebertaV2Tokenizer < PreTrainedTokenizer
+    # TODO
+    # self.return_token_type_ids = true
+  end
+
   class DistilBertTokenizer < PreTrainedTokenizer
   end
 
   class AutoTokenizer
     TOKENIZER_CLASS_MAPPING = {
       "BertTokenizer" => BertTokenizer,
+      "DebertaV2Tokenizer" => DebertaV2Tokenizer,
       "DistilBertTokenizer" => DistilBertTokenizer
     }
 
data/lib/informers/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Informers
-  VERSION = "1.0.1"
+  VERSION = "1.0.2"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: informers
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.0.2
 platform: ruby
 authors:
 - Andrew Kane