informers 1.0.1 → 1.0.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3abc738d8975839b873bc5e07bb95305d455a9ac1eec94c432415b713411f20b
-  data.tar.gz: b9c36794c33316378752dd816fb517714c6d8186062562a778d3c8539ba7d79a
+  metadata.gz: 4ea317272c5054b01616643e7e0f0b2b2fe0c4a87fe8399350a6b8d0a279c5a1
+  data.tar.gz: 530f8aaab9a5ca71811a82adca0272e2ca84525bcf1f60f2209c394cbd0f9c2a
 SHA512:
-  metadata.gz: ce05bfcdebce333fd6b5abefca703850d3a6d6a50c3c1589bf675e91ae24b424f2e43e6bc0270ad4ea8a520f5be9d636c5e8a5a66deae2c0183adae6cbc517aa
-  data.tar.gz: 6cc9b08b6e0f9e8ea23f306c0c460dc2557e4ee5113ef26300b517608485ea528fcb9254d51f395c37b557bf1728051c2c3dd8a20a25b5bd4826832a4ff30bf8
+  metadata.gz: 76059b486e6f6c0b0054450f76813dd4bf12845da6f46e8089585cd1a69be7db86a0acf446cc5a18e48108393403324626f6656d09bdb69083f2651abc0d2448
+  data.tar.gz: f466f5382edd76a7092dc6ada349a3e58fe7eedcd481726ca765f8ddfb4543b7269dab96c00a93d10b0fd67f800afd70a619cfb15d78dde494b29cc13d21ef1a
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+## 1.0.2 (2024-08-28)
+
+- Added `embedding` pipeline
+- Added experimental `reranking` pipeline
+- Added support for `nomic-ai/nomic-embed-text-v1`
+
 ## 1.0.1 (2024-08-27)
 
 - Added support for `Supabase/gte-small` to `Model`
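
For context, the two new pipelines are invoked like any other Informers pipeline. A minimal usage sketch based on the README changes below (the default models should be downloaded on first use):

```ruby
require "informers"

# Embedding: returns one vector (an Array of Floats) per input text
embed = Informers.pipeline("embedding")
embeddings = embed.(["The dog is barking", "The cat is purring"])

# Reranking (experimental): scores candidate documents against a query
rerank = Informers.pipeline("reranking")
results = rerank.("Who created Ruby?", ["Matz created Ruby", "Another doc"])
```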
data/README.md CHANGED
@@ -21,6 +21,20 @@ gem "informers"
 
 ## Models
 
+Embedding
+
+- [sentence-transformers/all-MiniLM-L6-v2](#sentence-transformersall-MiniLM-L6-v2)
+- [Xenova/multi-qa-MiniLM-L6-cos-v1](#xenovamulti-qa-MiniLM-L6-cos-v1)
+- [mixedbread-ai/mxbai-embed-large-v1](#mixedbread-aimxbai-embed-large-v1)
+- [Supabase/gte-small](#supabasegte-small)
+- [intfloat/e5-base-v2](#intfloate5-base-v2)
+- [nomic-ai/nomic-embed-text-v1](#nomic-ainomic-embed-text-v1)
+- [BAAI/bge-base-en-v1.5](#baaibge-base-en-v15)
+
+Reranking (experimental)
+
+- [mixedbread-ai/mxbai-rerank-base-v1](#mixedbread-aimxbai-rerank-base-v1)
+
 ### sentence-transformers/all-MiniLM-L6-v2
 
 [Docs](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
@@ -28,14 +42,14 @@ gem "informers"
 ```ruby
 sentences = ["This is an example sentence", "Each sentence is converted"]
 
-model = Informers::Model.new("sentence-transformers/all-MiniLM-L6-v2")
-embeddings = model.embed(sentences)
+model = Informers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
+embeddings = model.(sentences)
 ```
 
 For a quantized version, use:
 
 ```ruby
-model = Informers::Model.new("Xenova/all-MiniLM-L6-v2", quantized: true)
+model = Informers.pipeline("embedding", "Xenova/all-MiniLM-L6-v2", quantized: true)
 ```
 
 ### Xenova/multi-qa-MiniLM-L6-cos-v1
@@ -46,9 +60,9 @@ model = Informers::Model.new("Xenova/all-MiniLM-L6-v2", quantized: true)
 query = "How many people live in London?"
 docs = ["Around 9 Million people live in London", "London is known for its financial district"]
 
-model = Informers::Model.new("Xenova/multi-qa-MiniLM-L6-cos-v1")
-query_embedding = model.embed(query)
-doc_embeddings = model.embed(docs)
+model = Informers.pipeline("embedding", "Xenova/multi-qa-MiniLM-L6-cos-v1")
+query_embedding = model.(query)
+doc_embeddings = model.(docs)
 scores = doc_embeddings.map { |e| e.zip(query_embedding).sum { |d, q| d * q } }
 doc_score_pairs = docs.zip(scores).sort_by { |d, s| -s }
 ```
@@ -68,8 +82,8 @@ docs = [
   "The cat is purring"
 ]
 
-model = Informers::Model.new("mixedbread-ai/mxbai-embed-large-v1")
-embeddings = model.embed(docs)
+model = Informers.pipeline("embedding", "mixedbread-ai/mxbai-embed-large-v1")
+embeddings = model.(docs)
 ```
 
 ### Supabase/gte-small
@@ -79,12 +93,96 @@ embeddings = model.embed(docs)
 ```ruby
 sentences = ["That is a happy person", "That is a very happy person"]
 
-model = Informers::Model.new("Supabase/gte-small")
-embeddings = model.embed(sentences)
+model = Informers.pipeline("embedding", "Supabase/gte-small")
+embeddings = model.(sentences)
+```
+
+### intfloat/e5-base-v2
+
+[Docs](https://huggingface.co/intfloat/e5-base-v2)
+
+```ruby
+input = [
+  "passage: Ruby is a programming language created by Matz",
+  "query: Ruby creator"
+]
+
+model = Informers.pipeline("embedding", "intfloat/e5-base-v2")
+embeddings = model.(input)
+```
+
+### nomic-ai/nomic-embed-text-v1
+
+[Docs](https://huggingface.co/nomic-ai/nomic-embed-text-v1)
+
+```ruby
+input = [
+  "search_document: The dog is barking",
+  "search_query: puppy"
+]
+
+model = Informers.pipeline("embedding", "nomic-ai/nomic-embed-text-v1")
+embeddings = model.(input)
+```
+
+### BAAI/bge-base-en-v1.5
+
+[Docs](https://huggingface.co/BAAI/bge-base-en-v1.5)
+
+```ruby
+def transform_query(query)
+  "Represent this sentence for searching relevant passages: #{query}"
+end
+
+input = [
+  transform_query("puppy"),
+  "The dog is barking",
+  "The cat is purring"
+]
+
+model = Informers.pipeline("embedding", "BAAI/bge-base-en-v1.5")
+embeddings = model.(input)
+```
+
+### mixedbread-ai/mxbai-rerank-base-v1
+
+[Docs](https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1)
+
+```ruby
+query = "How many people live in London?"
+docs = ["Around 9 Million people live in London", "London is known for its financial district"]
+
+model = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-base-v1")
+result = model.(query, docs)
 ```
 
+### Other
+
+You can use the feature extraction pipeline directly.
+
+```ruby
+model = Informers.pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2", quantized: false)
+embeddings = model.(sentences, pooling: "mean", normalize: true)
+```
+
+The model files must include `onnx/model.onnx` or `onnx/model_quantized.onnx` ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)).
+
 ## Pipelines
 
+Embedding
+
+```ruby
+embed = Informers.pipeline("embedding")
+embed.("We are very happy to show you the 🤗 Transformers library.")
+```
+
+Reranking (experimental)
+
+```ruby
+rerank = Informers.pipeline("reranking")
+rerank.("Who created Ruby?", ["Matz created Ruby", "Another doc"])
+```
+
 Named-entity recognition
 
 ```ruby
data/lib/informers/model.rb CHANGED
@@ -2,12 +2,7 @@ module Informers
   class Model
     def initialize(model_id, quantized: false)
       @model_id = model_id
-      @model = Informers.pipeline("feature-extraction", model_id, quantized: quantized)
-
-      # TODO better pattern
-      if model_id == "sentence-transformers/all-MiniLM-L6-v2"
-        @model.instance_variable_get(:@model).instance_variable_set(:@output_names, ["sentence_embedding"])
-      end
+      @model = Informers.pipeline("embedding", model_id, quantized: quantized)
     end
 
     def embed(texts)
@@ -15,14 +10,12 @@ module Informers
       texts = [texts] unless is_batched
 
       case @model_id
-      when "sentence-transformers/all-MiniLM-L6-v2"
+      when "sentence-transformers/all-MiniLM-L6-v2", "Xenova/all-MiniLM-L6-v2", "Xenova/multi-qa-MiniLM-L6-cos-v1", "Supabase/gte-small"
         output = @model.(texts)
-      when "Xenova/all-MiniLM-L6-v2", "Xenova/multi-qa-MiniLM-L6-cos-v1", "Supabase/gte-small"
-        output = @model.(texts, pooling: "mean", normalize: true)
       when "mixedbread-ai/mxbai-embed-large-v1"
-        output = @model.(texts, pooling: "cls")
+        output = @model.(texts, pooling: "cls", normalize: false)
       else
-        raise Error, "model not supported: #{@model_id}"
+        raise Error, "Use the embedding pipeline for this model: #{@model_id}"
       end
 
       is_batched ? output : output[0]
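
With this refactor, `Informers::Model` becomes a thin wrapper over the new `embedding` pipeline, and the two call styles below should return the same embeddings for the models the `case` statement still supports (a sketch; newly added models raise and must go through the pipeline directly):

```ruby
sentences = ["That is a happy person", "That is a very happy person"]

# legacy API, kept for backward compatibility
model = Informers::Model.new("Supabase/gte-small")
embeddings = model.embed(sentences)

# new API, used for all models going forward
model = Informers.pipeline("embedding", "Supabase/gte-small")
embeddings = model.(sentences)
```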
data/lib/informers/models.rb CHANGED
@@ -141,13 +141,13 @@ module Informers
       OnnxRuntime::InferenceSession.new(path)
     end
 
-    def call(model_inputs)
-      @forward.(model_inputs)
+    def call(model_inputs, **kwargs)
+      @forward.(model_inputs, **kwargs)
     end
 
     private
 
-    def encoder_forward(model_inputs)
+    def encoder_forward(model_inputs, output_names: nil)
       encoder_feeds = {}
       @session.inputs.each do |input|
         key = input[:name].to_sym
@@ -156,13 +156,13 @@ module Informers
       if @session.inputs.any? { |v| v[:name] == "token_type_ids" } && !encoder_feeds[:token_type_ids]
         raise Todo
       end
-      session_run(@session, encoder_feeds)
+      session_run(@session, encoder_feeds, output_names:)
     end
 
-    def session_run(session, inputs)
+    def session_run(session, inputs, output_names:)
       checked_inputs = validate_inputs(session, inputs)
       begin
-        output = session.run(@output_names, checked_inputs)
+        output = session.run(output_names || @output_names, checked_inputs)
         output = replace_tensors(output)
         output
       rescue => e
@@ -199,6 +199,18 @@ module Informers
     end
   end
 
+  class NomicBertPreTrainedModel < PreTrainedModel
+  end
+
+  class NomicBertModel < NomicBertPreTrainedModel
+  end
+
+  class DebertaV2PreTrainedModel < PreTrainedModel
+  end
+
+  class DebertaV2Model < DebertaV2PreTrainedModel
+  end
+
   class DistilBertPreTrainedModel < PreTrainedModel
   end
 
@@ -217,6 +229,13 @@ module Informers
     end
   end
 
+  MODEL_MAPPING_NAMES_ENCODER_ONLY = {
+    "bert" => ["BertModel", BertModel],
+    "nomic_bert" => ["NomicBertModel", NomicBertModel],
+    "deberta-v2" => ["DebertaV2Model", DebertaV2Model],
+    "distilbert" => ["DistilBertModel", DistilBertModel]
+  }
+
   MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = {
     "bert" => ["BertForSequenceClassification", BertForSequenceClassification],
     "distilbert" => ["DistilBertForSequenceClassification", DistilBertForSequenceClassification]
@@ -231,6 +250,7 @@ module Informers
   }
 
   MODEL_CLASS_TYPE_MAPPING = [
+    [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES[:EncoderOnly]],
    [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
    [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
    [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]]
data/lib/informers/pipelines.rb CHANGED
@@ -10,10 +10,6 @@ module Informers
   end
 
   class TextClassificationPipeline < Pipeline
-    def initialize(**options)
-      super(**options)
-    end
-
     def call(texts, top_k: 1)
       # Run tokenization
       model_inputs = @tokenizer.(texts,
@@ -56,10 +52,6 @@ module Informers
   end
 
   class TokenClassificationPipeline < Pipeline
-    def initialize(**options)
-      super(**options)
-    end
-
     def call(
       texts,
       ignore_labels: ["O"],
@@ -200,10 +192,6 @@ module Informers
   end
 
   class QuestionAnsweringPipeline < Pipeline
-    def initialize(**options)
-      super(**options)
-    end
-
     def call(question, context, top_k: 1)
       # Run tokenization
       inputs = @tokenizer.(question,
@@ -256,10 +244,6 @@ module Informers
   end
 
   class FeatureExtractionPipeline < Pipeline
-    def initialize(**options)
-      super(**options)
-    end
-
     def call(
       texts,
       pooling: "none",
@@ -272,12 +256,27 @@ module Informers
         padding: true,
         truncation: true
       )
+      model_options = {}
+
+      # optimization for sentence-transformers/all-MiniLM-L6-v2
+      if @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize
+        model_options[:output_names] = ["sentence_embedding"]
+        pooling = "none"
+        normalize = false
+      end
 
       # Run model
-      outputs = @model.(model_inputs)
+      outputs = @model.(model_inputs, **model_options)
+
+      # TODO improve
+      result =
+        if outputs.is_a?(Array)
+          raise Error, "unexpected outputs" if outputs.size != 1
+          outputs[0]
+        else
+          outputs.logits
+        end
 
-      # TODO check outputs.last_hidden_state
-      result = outputs.logits
       case pooling
       when "none"
        # Skip pooling
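
The `model_options` check above is a fast path: when the loaded model graph already exposes a pooled, normalized `sentence_embedding` output (signaled by `@output_names == ["token_embeddings"]`, which `Informers.pipeline` sets for `sentence-transformers/all-MiniLM-L6-v2`), the pipeline requests that output directly and skips Ruby-side pooling and normalization. A sketch of a call that takes this path, since `EmbeddingPipeline` defaults to `pooling: "mean", normalize: true`:

```ruby
# Pooling and normalization happen inside the ONNX graph for this model
model = Informers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.(["This is an example sentence"])
```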
@@ -301,6 +300,46 @@ module Informers
     end
   end
 
+  class EmbeddingPipeline < FeatureExtractionPipeline
+    def call(
+      texts,
+      pooling: "mean",
+      normalize: true
+    )
+      super(texts, pooling:, normalize:)
+    end
+  end
+
+  class RerankingPipeline < Pipeline
+    def call(
+      query,
+      documents,
+      return_documents: false,
+      top_k: nil
+    )
+      model_inputs = @tokenizer.([query] * documents.size,
+        text_pair: documents,
+        padding: true,
+        truncation: true
+      )
+
+      outputs = @model.(model_inputs)
+
+      result =
+        Utils.sigmoid(outputs[0].map(&:first))
+          .map.with_index { |s, i| {doc_id: i, score: s} }
+          .sort_by { |v| -v[:score] }
+
+      if return_documents
+        result.each do |v|
+          v[:text] = documents[v[:doc_id]]
+        end
+      end
+
+      top_k ? result.first(top_k) : result
+    end
+  end
+
   SUPPORTED_TASKS = {
     "text-classification" => {
       tokenizer: AutoTokenizer,
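
`RerankingPipeline` tokenizes the query paired with each document, takes the model's single logit per pair, and maps it through a sigmoid, so every score lands in (0, 1) before sorting descending by score. A usage sketch (the score shown is illustrative, not real model output):

```ruby
query = "How many people live in London?"
docs = ["Around 9 Million people live in London", "London is known for its financial district"]

rerank = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-base-v1")
rerank.(query, docs, return_documents: true, top_k: 1)
# => [{doc_id: 0, score: 0.98, text: "Around 9 Million people live in London"}]
```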
@@ -337,6 +376,24 @@ module Informers
       model: "Xenova/all-MiniLM-L6-v2"
     },
     type: "text"
+    },
+    "embedding" => {
+      tokenizer: AutoTokenizer,
+      pipeline: EmbeddingPipeline,
+      model: AutoModel,
+      default: {
+        model: "sentence-transformers/all-MiniLM-L6-v2"
+      },
+      type: "text"
+    },
+    "reranking" => {
+      tokenizer: AutoTokenizer,
+      pipeline: RerankingPipeline,
+      model: AutoModel,
+      default: {
+        model: "mixedbread-ai/mxbai-rerank-base-v1"
+      },
+      type: "text"
     }
   }
 
@@ -361,11 +418,13 @@ module Informers
     end
   end
 
+  NO_DEFAULT = Object.new
+
   class << self
     def pipeline(
       task,
       model = nil,
-      quantized: true,
+      quantized: NO_DEFAULT,
       progress_callback: DEFAULT_PROGRESS_CALLBACK,
       config: nil,
       cache_dir: nil,
@@ -373,6 +432,11 @@ module Informers
       revision: "main",
       model_file_name: nil
     )
+      if quantized == NO_DEFAULT
+        # TODO move default to task class
+        quantized = !["embedding", "reranking"].include?(task)
+      end
+
       # Apply aliases
       task = TASK_ALIASES[task] || task
@@ -408,6 +472,10 @@ module Informers
       results = load_items(classes, model, pretrained_options)
       results[:task] = task
 
+      if model == "sentence-transformers/all-MiniLM-L6-v2"
+        results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
+      end
+
       Utils.dispatch_callback(progress_callback, {
         status: "ready",
         task: task,
data/lib/informers/tokenizers.rb CHANGED
@@ -83,12 +83,18 @@ module Informers
     # self.return_token_type_ids = true
   end
 
+  class DebertaV2Tokenizer < PreTrainedTokenizer
+    # TODO
+    # self.return_token_type_ids = true
+  end
+
   class DistilBertTokenizer < PreTrainedTokenizer
   end
 
   class AutoTokenizer
     TOKENIZER_CLASS_MAPPING = {
       "BertTokenizer" => BertTokenizer,
+      "DebertaV2Tokenizer" => DebertaV2Tokenizer,
       "DistilBertTokenizer" => DistilBertTokenizer
     }
 
data/lib/informers/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Informers
-  VERSION = "1.0.1"
+  VERSION = "1.0.2"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: informers
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.0.2
 platform: ruby
 authors:
 - Andrew Kane