informers 0.2.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,439 @@
+ module Informers
+   class Pipeline
+     def initialize(task:, model:, tokenizer: nil, processor: nil)
+       super()
+       @task = task
+       @model = model
+       @tokenizer = tokenizer
+       @processor = processor
+     end
+   end
+
+   class TextClassificationPipeline < Pipeline
+     def initialize(**options)
+       super(**options)
+     end
+
+     def call(texts, top_k: 1)
+       # Run tokenization
+       model_inputs = @tokenizer.(texts,
+         padding: true,
+         truncation: true
+       )
+
+       # Run model
+       outputs = @model.(model_inputs)
+
+       function_to_apply =
+         if @model.config.problem_type == "multi_label_classification"
+           ->(batch) { Utils.sigmoid(batch) }
+         else
+           ->(batch) { Utils.softmax(batch) } # single_label_classification (default)
+         end
+
+       id2label = @model.config.id2label
+
+       to_return = []
+       outputs.logits.each do |batch|
+         output = function_to_apply.(batch)
+         scores = Utils.get_top_items(output, top_k)
+
+         vals = scores.map do |x|
+           {
+             label: id2label[x[0].to_s],
+             score: x[1]
+           }
+         end
+         if top_k == 1
+           to_return.concat(vals)
+         else
+           to_return << vals
+         end
+       end
+
+       texts.is_a?(Array) ? to_return : to_return[0]
+     end
+   end
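
A minimal usage sketch for the class above, reached through the Informers.pipeline factory defined near the bottom of this file (output values are illustrative; the default model is the one listed in SUPPORTED_TASKS below):

    classifier = Informers.pipeline("text-classification")
    classifier.("This movie was great!")
    # single string with top_k: 1 => one {label:, score:} hash, e.g. {label: "POSITIVE", score: 0.99}

    classifier.(["I loved it", "I hated it"], top_k: 2)
    # array input => one array of {label:, score:} hashes per text
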
+
+   class TokenClassificationPipeline < Pipeline
+     def initialize(**options)
+       super(**options)
+     end
+
+     def call(
+       texts,
+       ignore_labels: ["O"],
+       aggregation_strategy: "simple"
+     )
+       is_batched = texts.is_a?(Array)
+
+       # Run tokenization
+       model_inputs = @tokenizer.(is_batched ? texts : [texts],
+         padding: true,
+         truncation: true,
+         return_offsets: true
+       )
+
+       # Run model
+       outputs = @model.(model_inputs)
+
+       logits = outputs.logits
+       id2label = @model.config.id2label
+
+       to_return = []
+       logits.length.times do |i|
+         ids = model_inputs[:input_ids][i]
+         batch = logits[i]
+         offsets = model_inputs[:offsets][i]
+
+         # List of tokens that aren't ignored
+         tokens = []
+         batch.length.times do |j|
+           token_data = batch[j]
+           top_score_index = Utils.max(token_data)[1]
+
+           entity = id2label ? id2label[top_score_index.to_s] : "LABEL_#{top_score_index}"
+           if ignore_labels.include?(entity)
+             # We predicted a token that should be ignored. So, we skip it.
+             next
+           end
+
+           # TODO add option to keep special tokens?
+           word = @tokenizer.decode([ids[j]], skip_special_tokens: true)
+           if word == ""
+             # Was a special token. So, we skip it.
+             next
+           end
+
+           scores = Utils.softmax(token_data)
+
+           tokens << {
+             entity: entity,
+             score: scores[top_score_index],
+             index: j,
+             word: word,
+             start: offsets[j][0],
+             end: offsets[j][1]
+           }
+         end
+
+         case aggregation_strategy
+         when "simple"
+           tokens = group_entities(tokens)
+         when "none"
+           # do nothing
+         else
+           raise ArgumentError, "Invalid aggregation_strategy"
+         end
+
+         to_return << tokens
+       end
+       is_batched ? to_return : to_return[0]
+     end
+
+     def group_sub_entities(entities)
+       # Get the first entity in the entity group
+       entity = entities[0][:entity].split("-", 2)[-1]
+       scores = entities.map { |entity| entity[:score] }
+       tokens = entities.map { |entity| entity[:word] }
+
+       entity_group = {
+         entity_group: entity,
+         score: scores.sum / scores.count.to_f,
+         word: @tokenizer.convert_tokens_to_string(tokens),
+         start: entities[0][:start],
+         end: entities[-1][:end]
+       }
+       entity_group
+     end
+
+     def get_tag(entity_name)
+       if entity_name.start_with?("B-")
+         bi = "B"
+         tag = entity_name[2..]
+       elsif entity_name.start_with?("I-")
+         bi = "I"
+         tag = entity_name[2..]
+       else
+         # It's not in B-, I- format
+         # Default to I- for continuation.
+         bi = "I"
+         tag = entity_name
+       end
+       [bi, tag]
+     end
+
+     def group_entities(entities)
+       entity_groups = []
+       entity_group_disagg = []
+
+       entities.each do |entity|
+         if entity_group_disagg.empty?
+           entity_group_disagg << entity
+           next
+         end
+
+         # If the current entity is similar and adjacent to the previous entity,
+         # append it to the disaggregated entity group
+         # The split is meant to account for the "B" and "I" prefixes
+         # Shouldn't merge if both entities are B-type
+         bi, tag = get_tag(entity[:entity])
+         _last_bi, last_tag = get_tag(entity_group_disagg[-1][:entity])
+
+         if tag == last_tag && bi != "B"
+           # Modify subword type to be previous_type
+           entity_group_disagg << entity
+         else
+           # If the current entity is different from the previous entity
+           # aggregate the disaggregated entity group
+           entity_groups << group_sub_entities(entity_group_disagg)
+           entity_group_disagg = [entity]
+         end
+       end
+       if entity_group_disagg.any?
+         # it's the last entity, add it to the entity groups
+         entity_groups << group_sub_entities(entity_group_disagg)
+       end
+
+       entity_groups
+     end
+   end
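
A sketch of using the token-classification pipeline above through its "ner" alias (entity values are illustrative; with the default aggregation_strategy: "simple", adjacent B-/I- tokens are merged by group_entities):

    ner = Informers.pipeline("ner")
    ner.("Ruby is a programming language created by Matz")
    # => e.g. [{entity_group: "PER", score: 0.99, word: "Matz", start: 42, end: 46}]
    # pass aggregation_strategy: "none" to get one hash per surviving token instead
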
+
+   class QuestionAnsweringPipeline < Pipeline
+     def initialize(**options)
+       super(**options)
+     end
+
+     def call(question, context, top_k: 1)
+       # Run tokenization
+       inputs = @tokenizer.(question,
+         text_pair: context,
+         padding: true,
+         truncation: true,
+         return_offsets: true
+       )
+
+       output = @model.(inputs)
+
+       to_return = []
+       output.start_logits.length.times do |j|
+         ids = inputs[:input_ids][j]
+         sep_index = ids.index(@tokenizer.sep_token_id)
+         offsets = inputs[:offsets][j]
+
+         s1 = Utils.softmax(output.start_logits[j])
+           .map.with_index
+           .select { |x| x[1] > sep_index }
+         e1 = Utils.softmax(output.end_logits[j])
+           .map.with_index
+           .select { |x| x[1] > sep_index }
+
+         options = s1.product(e1)
+           .select { |x| x[0][1] <= x[1][1] }
+           .map { |x| [x[0][1], x[1][1], x[0][0] * x[1][0]] }
+           .sort_by { |v| -v[2] }
+
+         [options.length, top_k].min.times do |k|
+           start, end_, score = options[k]
+
+           answer_tokens = ids.slice(start, end_ + 1)
+
+           answer = @tokenizer.decode(answer_tokens,
+             skip_special_tokens: true
+           )
+
+           to_return << {
+             answer:,
+             score:,
+             start: offsets[start][0],
+             end: offsets[end_][1]
+           }
+         end
+       end
+
+       question.is_a?(Array) ? to_return : to_return[0]
+     end
+   end
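
A sketch of calling the question-answering pipeline above; start and end are character offsets into the context, and the score is the product of the start and end probabilities (values are illustrative):

    qa = Informers.pipeline("question-answering")
    qa.("Who created Ruby?", "Ruby is a programming language created by Matz")
    # => {answer: "Matz", score: 0.99, start: 42, end: 46}
    # top_k: 3 would collect up to three candidate spans per example
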
+
+   class FeatureExtractionPipeline < Pipeline
+     def initialize(**options)
+       super(**options)
+     end
+
+     def call(
+       texts,
+       pooling: "none",
+       normalize: false,
+       quantize: false,
+       precision: "binary"
+     )
+       # Run tokenization
+       model_inputs = @tokenizer.(texts,
+         padding: true,
+         truncation: true
+       )
+
+       # Run model
+       outputs = @model.(model_inputs)
+
+       # TODO check outputs.last_hidden_state
+       result = outputs.logits
+       case pooling
+       when "none"
+         # Skip pooling
+       when "mean"
+         result = Utils.mean_pooling(result, model_inputs[:attention_mask])
+       when "cls"
+         result = result.map(&:first)
+       else
+         raise Error, "Pooling method '#{pooling}' not supported."
+       end
+
+       if normalize
+         result = Utils.normalize(result)
+       end
+
+       if quantize
+         result = quantize_embeddings(result, precision)
+       end
+
+       texts.is_a?(Array) ? result : result[0]
+     end
+   end
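
A sketch of the feature-extraction pipeline above used for sentence embeddings; mean pooling and normalization are the opt-in keywords handled in call:

    embedder = Informers.pipeline("feature-extraction")
    embeddings = embedder.(["first text", "second text"], pooling: "mean", normalize: true)
    # => one embedding vector (array of floats) per input text
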
+
+   SUPPORTED_TASKS = {
+     "text-classification" => {
+       tokenizer: AutoTokenizer,
+       pipeline: TextClassificationPipeline,
+       model: AutoModelForSequenceClassification,
+       default: {
+         model: "Xenova/distilbert-base-uncased-finetuned-sst-2-english"
+       },
+       type: "text"
+     },
+     "token-classification" => {
+       tokenizer: AutoTokenizer,
+       pipeline: TokenClassificationPipeline,
+       model: AutoModelForTokenClassification,
+       default: {
+         model: "Xenova/bert-base-multilingual-cased-ner-hrl"
+       },
+       type: "text"
+     },
+     "question-answering" => {
+       tokenizer: AutoTokenizer,
+       pipeline: QuestionAnsweringPipeline,
+       model: AutoModelForQuestionAnswering,
+       default: {
+         model: "Xenova/distilbert-base-cased-distilled-squad"
+       },
+       type: "text"
+     },
+     "feature-extraction" => {
+       tokenizer: AutoTokenizer,
+       pipeline: FeatureExtractionPipeline,
+       model: AutoModel,
+       default: {
+         model: "Xenova/all-MiniLM-L6-v2"
+       },
+       type: "text"
+     }
+   }
+
+   TASK_ALIASES = {
+     "sentiment-analysis" => "text-classification",
+     "ner" => "token-classification"
+   }
+
+   DEFAULT_PROGRESS_CALLBACK = lambda do |msg|
+     stream = $stderr
+     tty = stream.tty?
+     width = tty ? stream.winsize[1] : 80
+
+     if msg[:status] == "progress" && tty
+       stream.print "\r#{Utils::Hub.display_progress(msg[:file], width, msg[:size], msg[:total_size])}"
+     elsif msg[:status] == "done" && !msg[:cache_hit]
+       if tty
+         stream.puts
+       else
+         stream.puts Utils::Hub.display_progress(msg[:file], width, 1, 1)
+       end
+     end
+   end
+
+   class << self
+     def pipeline(
+       task,
+       model = nil,
+       quantized: true,
+       progress_callback: DEFAULT_PROGRESS_CALLBACK,
+       config: nil,
+       cache_dir: nil,
+       local_files_only: false,
+       revision: "main",
+       model_file_name: nil
+     )
+       # Apply aliases
+       task = TASK_ALIASES[task] || task
+
+       # Get pipeline info
+       pipeline_info = SUPPORTED_TASKS[task.split("_", 1)[0]]
+       if !pipeline_info
+         raise Error, "Unsupported pipeline: #{task}. Must be one of #{SUPPORTED_TASKS.keys}"
+       end
+
+       # Use model if specified, otherwise, use default
+       if !model
+         model = pipeline_info[:default][:model]
+         warn "No model specified. Using default model: #{model.inspect}."
+       end
+
+       pretrained_options = {
+         quantized:,
+         progress_callback:,
+         config:,
+         cache_dir:,
+         local_files_only:,
+         revision:,
+         model_file_name:
+       }
+
+       classes = {
+         tokenizer: pipeline_info[:tokenizer],
+         model: pipeline_info[:model],
+         processor: pipeline_info[:processor]
+       }
+
+       # Load model, tokenizer, and processor (if they exist)
+       results = load_items(classes, model, pretrained_options)
+       results[:task] = task
+
+       Utils.dispatch_callback(progress_callback, {
+         status: "ready",
+         task: task,
+         model: model
+       })
+
+       pipeline_class = pipeline_info.fetch(:pipeline)
+       pipeline_class.new(**results)
+     end
+
+     private
+
+     def load_items(mapping, model, pretrained_options)
+       result = {}
+
+       mapping.each do |name, cls|
+         next if !cls
+
+         if cls.is_a?(Array)
+           raise Todo
+         else
+           result[name] = cls.from_pretrained(model, **pretrained_options)
+         end
+       end
+
+       result
+     end
+   end
+ end
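
A sketch of the factory defined in class << self above with its keyword options spelled out; everything besides the task is forwarded to the from_pretrained calls in load_items (the model name is the text-classification default, and the callback is illustrative):

    pipe = Informers.pipeline(
      "text-classification",
      "Xenova/distilbert-base-uncased-finetuned-sst-2-english",
      quantized: true,
      revision: "main",
      progress_callback: ->(msg) { warn msg[:status] }
    )
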
@@ -0,0 +1,141 @@
+ module Informers
+   class PreTrainedTokenizer
+     attr_reader :sep_token_id
+
+     def initialize(tokenizer_json, tokenizer_config)
+       super()
+
+       @tokenizer = Tokenizers::Tokenizer.from_file(tokenizer_json)
+
+       @sep_token = tokenizer_config["sep_token"]
+       @sep_token_id = @tokenizer.token_to_id(@sep_token)
+
+       @model_max_length = tokenizer_config["model_max_length"]
+     end
+
+     def call(
+       text,
+       text_pair: nil,
+       add_special_tokens: true,
+       padding: false,
+       truncation: nil,
+       max_length: nil,
+       return_tensor: true,
+       return_token_type_ids: true, # TODO change default
+       return_offsets: false
+     )
+       is_batched = text.is_a?(Array)
+
+       if is_batched
+         if text.length == 0
+           raise Error, "text array must be non-empty"
+         end
+
+         if !text_pair.nil?
+           if !text_pair.is_a?(Array)
+             raise Error, "text_pair must also be an array"
+           elsif text.length != text_pair.length
+             raise Error, "text and text_pair must have the same length"
+           end
+         end
+       end
+
+       if padding
+         @tokenizer.enable_padding
+       else
+         @tokenizer.no_padding
+       end
+
+       if truncation
+         @tokenizer.enable_truncation(max_length || @model_max_length)
+       else
+         @tokenizer.no_truncation
+       end
+
+       if is_batched
+         input = text_pair ? text.zip(text_pair) : text
+         encoded = @tokenizer.encode_batch(input, add_special_tokens: add_special_tokens)
+       else
+         encoded = [@tokenizer.encode(text, text_pair, add_special_tokens: add_special_tokens)]
+       end
+
+       result = {input_ids: encoded.map(&:ids), attention_mask: encoded.map(&:attention_mask)}
+       if return_token_type_ids
+         result[:token_type_ids] = encoded.map(&:type_ids)
+       end
+       if return_offsets
+         result[:offsets] = encoded.map(&:offsets)
+       end
+       result
+     end
+
+     def decode(tokens, skip_special_tokens:)
+       @tokenizer.decode(tokens, skip_special_tokens: skip_special_tokens)
+     end
+
+     def convert_tokens_to_string(tokens)
+       @tokenizer.decoder.decode(tokens)
+     end
+   end
+
+   class BertTokenizer < PreTrainedTokenizer
+     # TODO
+     # self.return_token_type_ids = true
+   end
+
+   class DistilBertTokenizer < PreTrainedTokenizer
+   end
+
+   class AutoTokenizer
+     TOKENIZER_CLASS_MAPPING = {
+       "BertTokenizer" => BertTokenizer,
+       "DistilBertTokenizer" => DistilBertTokenizer
+     }
+
+     def self.from_pretrained(
+       pretrained_model_name_or_path,
+       quantized: true,
+       progress_callback: nil,
+       config: nil,
+       cache_dir: nil,
+       local_files_only: false,
+       revision: "main",
+       legacy: nil,
+       **kwargs
+     )
+       tokenizer_json, tokenizer_config = load_tokenizer(
+         pretrained_model_name_or_path,
+         quantized:,
+         progress_callback:,
+         config:,
+         cache_dir:,
+         local_files_only:,
+         revision:,
+         legacy:
+       )
+
+       # Some tokenizers are saved with the "Fast" suffix, so we remove that if present.
+       tokenizer_name = tokenizer_config["tokenizer_class"]&.delete_suffix("Fast") || "PreTrainedTokenizer"
+
+       cls = TOKENIZER_CLASS_MAPPING[tokenizer_name]
+       if !cls
+         warn "Unknown tokenizer class #{tokenizer_name.inspect}, attempting to construct from base class."
+         cls = PreTrainedTokenizer
+       end
+       cls.new(tokenizer_json, tokenizer_config)
+     end
+
+     def self.load_tokenizer(pretrained_model_name_or_path, **options)
+       info = [
+         Utils::Hub.get_model_file(pretrained_model_name_or_path, "tokenizer.json", true, **options),
+         Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options),
+       ]
+
+       # Override legacy option if `options.legacy` is not null
+       if !options[:legacy].nil?
+         info[1]["legacy"] = options[:legacy]
+       end
+       info
+     end
+   end
+ end
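
A sketch of loading and calling a tokenizer directly with the classes above (the model name is the feature-extraction default from the pipelines file; decoded text depends on the tokenizer's casing):

    tokenizer = Informers::AutoTokenizer.from_pretrained("Xenova/all-MiniLM-L6-v2")
    enc = tokenizer.(["Hello world", "Hi"], padding: true, truncation: true, return_offsets: true)
    enc[:input_ids]      # one array of token ids per text
    enc[:attention_mask] # 1 for real tokens, 0 for padding
    enc[:offsets]        # [start, end] character offsets per token
    tokenizer.decode(enc[:input_ids][0], skip_special_tokens: true)
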
@@ -0,0 +1,7 @@
+ module Informers
+   module Utils
+     def self.dispatch_callback(progress_callback, data)
+       progress_callback.(data) if progress_callback
+     end
+   end
+ end
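
A small sketch of the helper above: dispatch_callback is a nil-safe invoke, so call sites such as Informers.pipeline never have to guard the callback themselves:

    Informers::Utils.dispatch_callback(nil, {status: "ready"})                        # no-op
    Informers::Utils.dispatch_callback(->(d) { warn d[:status] }, {status: "ready"})  # invokes the lambda
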