gliner 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,124 @@
+ # frozen_string_literal: true
+
+ require 'gliner/inference/session_validator'
+
+ module Gliner
+   class Inference
+     TASK_TYPE_ENTITIES = 0
+     TASK_TYPE_CLASSIFICATION = 1
+     TASK_TYPE_JSON = 2
+
+     SCHEMA_PREFIX_LENGTH = 4
+     LABEL_SPACING = 2
+
+     Request = Data.define(
+       :input_ids,
+       :attention_mask,
+       :words_mask,
+       :text_lengths,
+       :task_type,
+       :label_positions,
+       :label_mask,
+       :want_cls
+     )
+
+     IOValidation = Data.define(:input_names, :output_name, :label_index_mode, :has_cls_logits)
+
+     attr_reader :label_index_mode, :has_cls_logits
+
+     def initialize(session)
+       @session = session
+
+       validation = SessionValidator[session]
+
+       @input_names = validation.input_names
+       @output_name = validation.output_name
+       @label_index_mode = validation.label_index_mode
+       @has_cls_logits = validation.has_cls_logits
+     end
+
+     def run(request)
+       outputs = output_names_for(request)
+       out = @session.run(outputs, build_inputs(request))
+       format_outputs(out, outputs)
+     end
+
+     def label_positions_for(word_ids, label_count)
+       label_count.times.map do |i|
+         combined_idx = SCHEMA_PREFIX_LENGTH + (i * LABEL_SPACING)
+         pos = word_ids.index(combined_idx)
+
+         raise Error, "Could not locate label position at combined index #{combined_idx}" if pos.nil?
+
+         pos
+       end
+     end
+
+     def label_logit(logits, pos, width, label_index, label_positions)
+       if @label_index_mode == :label_position
+         raise Error, 'Label positions required for span_logits output' if label_positions.nil?
+
+         label_pos = label_positions.fetch(label_index)
+         logits[0][pos][width][label_pos]
+       else
+         logits[0][pos][width][label_index]
+       end
+     end
+
+     def sigmoid(value)
+       1.0 / (1.0 + Math.exp(-value))
+     end
+
+     def softmax(values)
+       max_value = values.max
+       exps = values.map { |value| Math.exp(value - max_value) }
+       sum = exps.sum
+       exps.map { |value| value / sum }
+     end
+
+     private
+
+     def build_inputs(request)
+       inputs = base_inputs(request)
+       add_token_type_ids(inputs, request)
+       filter_inputs(inputs)
+     end
+
+     def base_inputs(request)
+       {
+         input_ids: [request.input_ids],
+         attention_mask: [request.attention_mask],
+         words_mask: [request.words_mask],
+         text_lengths: Array(request.text_lengths).flatten,
+         task_type: [request.task_type],
+         label_positions: [request.label_positions],
+         label_mask: [request.label_mask]
+       }
+     end
+
+     def add_token_type_ids(inputs, request)
+       return inputs unless @input_names&.include?('token_type_ids')
+
+       inputs[:token_type_ids] = [Array.new(request.input_ids.length, 0)]
+       inputs
+     end
+
+     def filter_inputs(inputs)
+       return inputs unless @input_names
+
+       inputs.select { |name, _| @input_names.include?(name.to_s) }
+     end
+
+     def output_names_for(request)
+       output_names = [@output_name]
+       output_names << 'cls_logits' if request.want_cls && @has_cls_logits
+       output_names
+     end
+
+     def format_outputs(out, output_names)
+       return { logits: out.fetch(0), cls_logits: out.fetch(1) } if output_names.length > 1
+
+       out.fetch(0)
+     end
+   end
+ end
@@ -0,0 +1,117 @@
+ # frozen_string_literal: true
+
+ require 'set'
+
+ module Gliner
+   class InputBuilder
+     def initialize(text_processor, max_seq_len:)
+       @text_processor = text_processor
+       @max_seq_len = max_seq_len
+     end
+
+     def prepare(text, schema_tokens, already_normalized: false)
+       normalized_text = normalize_text(text, already_normalized: already_normalized)
+       words, start_map, end_map = @text_processor.split_words(normalized_text)
+       input_ids, word_ids = encode_tokens(schema_tokens, words)
+       input_ids, word_ids = truncate_inputs(input_ids, word_ids, max_len: @max_seq_len)
+
+       text_start_index = schema_tokens.length + 1
+       text_len = infer_effective_text_len(word_ids, text_start_index, words.length)
+
+       context = {
+         input_ids: input_ids,
+         word_ids: word_ids,
+         text_start_index: text_start_index,
+         start_map: start_map,
+         end_map: end_map,
+         original_text: normalized_text,
+         text_len: text_len
+       }
+
+       build_prepared_input(context)
+     end
+
+     def schema_tokens_for(prompt:, labels:, label_prefix:)
+       tokens = ['(', '[P]', prompt.to_s, '(']
+
+       labels.each do |label|
+         tokens << label_prefix
+         tokens << label.to_s
+       end
+
+       tokens.push(')', ')')
+       tokens
+     end
+
+     private
+
+     def normalize_text(text, already_normalized:)
+       already_normalized ? text.to_s : @text_processor.normalize_text(text)
+     end
+
+     def encode_tokens(schema_tokens, words)
+       combined_tokens = schema_tokens + ['[SEP_TEXT]'] + words
+       encoded = @text_processor.encode_pretokenized(combined_tokens)
+       [encoded[:ids], encoded[:word_ids]]
+     end
+
+     def truncate_inputs(input_ids, word_ids, max_len:)
+       return [input_ids, word_ids] if input_ids.length <= max_len
+
+       [input_ids.take(max_len), word_ids.take(max_len)]
+     end
+
+     def build_prepared_input(context)
+       input_ids = context.fetch(:input_ids)
+       word_ids = context.fetch(:word_ids)
+       text_start_index = context.fetch(:text_start_index)
+
+       word_analysis = analyze_words(word_ids, text_start_index)
+
+       PreparedInput.new(
+         input_ids: input_ids,
+         word_ids: word_ids,
+         attention_mask: Array.new(input_ids.length, 1),
+         words_mask: word_analysis[:mask],
+         pos_to_word_index: word_analysis[:index_map],
+         start_map: context.fetch(:start_map),
+         end_map: context.fetch(:end_map),
+         original_text: context.fetch(:original_text),
+         text_len: context.fetch(:text_len)
+       )
+     end
+
+     def analyze_words(word_ids, text_start_index)
+       mask = Array.new(word_ids.length, 0)
+       index_map = Array.new(word_ids.length)
+       last_word_id = nil
+       seen = Set.new
+
+       word_ids.each_with_index do |word_id, i|
+         next unless word_id
+
+         # Build mask (word boundaries)
+         if word_id != last_word_id && word_id >= text_start_index
+           mask[i] = 1
+           last_word_id = word_id
+         end
+
+         # Build index map (first occurrence)
+         unless seen.include?(word_id)
+           seen << word_id
+           index_map[i] = word_id - text_start_index if word_id >= text_start_index
+         end
+       end
+
+       { mask: mask, index_map: index_map }
+     end
+
+     def infer_effective_text_len(word_ids, text_start_index, full_text_len)
+       max_text_word_id = word_ids.compact.select { |word_id| word_id >= text_start_index }.max
+       return full_text_len if max_text_word_id.nil?
+
+       present = (max_text_word_id - text_start_index) + 1
+       [present, full_text_len].min
+     end
+   end
+ end
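
For orientation, here is what schema_tokens_for produces and how it lines up with the constants in Inference. The prompt, labels, and the '[E]' prefix below are hypothetical placeholders; each task supplies its own label_prefix, which this diff does not show.

builder = Gliner::InputBuilder.new(text_processor, max_seq_len: 512)  # text_processor as wired up in Model

builder.schema_tokens_for(prompt: 'extract entities', labels: %w[person location], label_prefix: '[E]')
# => ['(', '[P]', 'extract entities', '(', '[E]', 'person', '[E]', 'location', ')', ')']
#
# Indices 0..3 are the fixed prefix ('(', '[P]', prompt, '('); the prefix marker for label i
# sits at index 4 + 2 * i, i.e. SCHEMA_PREFIX_LENGTH + i * LABEL_SPACING, which is exactly the
# combined index that Inference#label_positions_for searches for in word_ids after encode_tokens
# glues schema_tokens, '[SEP_TEXT]' and the text words together.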
@@ -0,0 +1,142 @@
+ # frozen_string_literal: true
+
+ require 'json'
+ require 'onnxruntime'
+ require 'tokenizers'
+
+ require 'gliner/text_processor'
+ require 'gliner/config_parser'
+ require 'gliner/inference'
+ require 'gliner/input_builder'
+ require 'gliner/span_extractor'
+ require 'gliner/classifier'
+ require 'gliner/structured_extractor'
+ require 'gliner/task'
+ require 'gliner/pipeline'
+ require 'gliner/tasks/entity_extraction'
+ require 'gliner/tasks/classification'
+ require 'gliner/tasks/json_extraction'
+
+ module Gliner
+   class Model
+     DEFAULT_MAX_WIDTH = 8
+     DEFAULT_MAX_SEQ_LEN = 512
+
+     def self.from_dir(dir, file: 'model_int8.onnx')
+       config_path = File.join(dir, 'config.json')
+       config = File.exist?(config_path) ? JSON.parse(File.read(config_path)) : {}
+
+       new(
+         model_path: File.join(dir, file),
+         tokenizer_path: File.join(dir, 'tokenizer.json'),
+         max_width: config.fetch('max_width', DEFAULT_MAX_WIDTH),
+         max_seq_len: config.fetch('max_seq_len', DEFAULT_MAX_SEQ_LEN)
+       )
+     end
+
+     def initialize(model_path:, tokenizer_path:, max_width: DEFAULT_MAX_WIDTH, max_seq_len: DEFAULT_MAX_SEQ_LEN)
+       @model_path = model_path
+       @tokenizer_path = tokenizer_path
+       @max_width = Integer(max_width)
+       @max_seq_len = Integer(max_seq_len)
+
+       tokenizer = Tokenizers.from_file(@tokenizer_path)
+       session = OnnxRuntime::InferenceSession.new(@model_path)
+
+       @text_processor = TextProcessor.new(tokenizer)
+       @inference = Inference.new(session)
+     end
+
+     def config_parser
+       @config_parser ||= ConfigParser.new
+     end
+
+     def pipeline
+       @pipeline ||= Pipeline.new(text_processor: @text_processor, inference: @inference)
+     end
+
+     def input_builder
+       @input_builder ||= InputBuilder.new(@text_processor, max_seq_len: @max_seq_len)
+     end
+
+     def span_extractor
+       @span_extractor ||= SpanExtractor.new(@inference, max_width: @max_width)
+     end
+
+     def structured_extractor
+       @structured_extractor ||= StructuredExtractor.new(span_extractor)
+     end
+
+     def classifier
+       @classifier ||= Classifier.new(@inference, max_width: @max_width)
+     end
+
+     def entity_task
+       @entity_task ||= Tasks::EntityExtraction.new(
+         config_parser: config_parser,
+         inference: @inference,
+         input_builder: input_builder,
+         span_extractor: span_extractor
+       )
+     end
+
+     def classification_task
+       @classification_task ||= Tasks::Classification.new(
+         config_parser: config_parser,
+         inference: @inference,
+         input_builder: input_builder,
+         classifier: classifier
+       )
+     end
+
+     def json_task
+       @json_task ||= Tasks::JsonExtraction.new(
+         config_parser: config_parser,
+         inference: @inference,
+         input_builder: input_builder,
+         span_extractor: span_extractor,
+         structured_extractor: structured_extractor
+       )
+     end
+
+     def extract_entities(text, entity_types, **options)
+       threshold = options.fetch(:threshold, 0.5)
+       include_confidence = options.fetch(:include_confidence, false)
+       include_spans = options.fetch(:include_spans, false)
+
+       pipeline.execute(
+         entity_task,
+         text,
+         entity_types,
+         threshold: threshold,
+         include_confidence: include_confidence,
+         include_spans: include_spans
+       )
+     end
+
+     def classify_text(text, tasks, **options)
+       include_confidence = options.fetch(:include_confidence, false)
+       threshold = options[:threshold]
+
+       task_options = { include_confidence: include_confidence }
+       task_options[:threshold] = threshold unless threshold.nil?
+
+       classification_task.execute_all(pipeline, text, tasks, **task_options)
+     end
+
+     def extract_json(text, structures, **options)
+       threshold = options.fetch(:threshold, 0.5)
+       include_confidence = options.fetch(:include_confidence, false)
+       include_spans = options.fetch(:include_spans, false)
+
+       json_task.execute_all(
+         pipeline,
+         text,
+         structures,
+         threshold: threshold,
+         include_confidence: include_confidence,
+         include_spans: include_spans
+       )
+     end
+   end
+ end
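
A minimal usage sketch of the surface above. The directory layout (model_int8.onnx, tokenizer.json, optional config.json) is what Model.from_dir reads; the path, text, and entity types are placeholders, and the shape of the returned results is determined by the task classes, which are not part of this diff.

require 'gliner'  # assumed gem entrypoint; the individual files are required at the top of the class above

model = Gliner::Model.from_dir('/path/to/gliner-model')  # picks up model_int8.onnx by default

model.extract_entities(
  'Ada Lovelace was born in London.',  # placeholder text
  ['person', 'location'],              # entity types to look for
  threshold: 0.5,                      # defaults shown; all three options may be omitted
  include_confidence: true,
  include_spans: false
)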
@@ -0,0 +1,64 @@
+ # frozen_string_literal: true
+
+ module Gliner
+   class Pipeline
+     def initialize(text_processor:, inference:)
+       @text_processor = text_processor
+       @inference = inference
+     end
+
+     def execute(task, text, config, **options)
+       parsed = task.parse_config(config)
+       prepared_text = prepare_text(task, text)
+       labels = task.labels(parsed)
+       prepared = prepare_input(task, prepared_text, parsed, labels)
+       label_positions = label_positions_for(prepared, labels.length)
+       logits = run_inference(task, prepared, labels, label_positions)
+
+       task.process_output(logits, parsed, prepared, options.merge(label_positions: label_positions))
+     end
+
+     private
+
+     def prepare_text(task, text)
+       return @text_processor.normalize_text(text) if task.normalize_text?
+
+       text.to_s
+     end
+
+     def prepare_input(task, prepared_text, parsed, labels)
+       schema_tokens = task.input_builder.schema_tokens_for(
+         prompt: task.build_prompt(parsed),
+         labels: labels,
+         label_prefix: task.label_prefix
+       )
+
+       task.input_builder.prepare(
+         prepared_text,
+         schema_tokens,
+         already_normalized: task.normalize_text?
+       )
+     end
+
+     def label_positions_for(prepared, label_count)
+       @inference.label_positions_for(prepared.word_ids, label_count)
+     end
+
+     def run_inference(task, prepared, labels, label_positions)
+       @inference.run(build_request(task, prepared, labels, label_positions))
+     end
+
+     def build_request(task, prepared, labels, label_positions)
+       Inference::Request.new(
+         input_ids: prepared.input_ids,
+         attention_mask: prepared.attention_mask,
+         words_mask: prepared.words_mask,
+         text_lengths: [prepared.text_len],
+         task_type: task.task_type,
+         label_positions: label_positions,
+         label_mask: Array.new(labels.length, 1),
+         want_cls: task.needs_cls_logits?
+       )
+     end
+   end
+ end
@@ -0,0 +1,21 @@
+ # frozen_string_literal: true
+
+ module Gliner
+   module PositionIteration
+     def each_position_width(seq_len, prepared, max_width)
+       return enum_for(:each_position_width, seq_len, prepared, max_width) unless block_given?
+
+       (0...seq_len).each do |pos|
+         start_word = prepared.pos_to_word_index[pos]
+         next unless start_word
+
+         (0...max_width).each do |width|
+           end_word = start_word + width
+           next if end_word >= prepared.text_len
+
+           yield pos, start_word, width
+         end
+       end
+     end
+   end
+ end
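
To make the iteration order concrete, here is a small sketch with a hand-built stand-in for the prepared input (only the two attributes the module reads are stubbed); the values are hypothetical.

PreparedStub = Struct.new(:pos_to_word_index, :text_len)
prepared = PreparedStub.new([nil, 0, 1, 2, nil], 3)  # token positions 1..3 start words 0..2; 3 text words

include Gliner::PositionIteration

each_position_width(prepared.pos_to_word_index.length, prepared, 3).to_a
# => [[1, 0, 0], [1, 0, 1], [1, 0, 2], [2, 1, 0], [2, 1, 1], [3, 2, 0]]
#
# Each triple is [token position, starting word index, width]; candidate spans whose end word
# (start_word + width) would reach text_len are skipped.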
@@ -0,0 +1,26 @@
+ # frozen_string_literal: true
+
+ module Gliner
+   module Runners
+     class ClassificationRunner
+       def self.[](tasks)
+         new(Gliner.model!, tasks)
+       end
+
+       def initialize(model, tasks_config)
+         raise Error, 'tasks must be a Hash' unless tasks_config.is_a?(Hash)
+
+         @tasks = tasks_config.to_h do |name, config|
+           parsed = model.classification_task.parse_config(name: name, config: config)
+           [name.to_s, PreparedTask.new(model.classification_task, parsed)]
+         end
+       end
+
+       def [](text, **options)
+         @tasks.transform_values { |task| task.call(text, **options) }
+       end
+
+       alias call []
+     end
+   end
+ end
@@ -0,0 +1,19 @@
+ # frozen_string_literal: true
+
+ module Gliner
+   module Runners
+     class EntityRunner
+       def initialize(model, config)
+         parsed = model.entity_task.parse_config(config)
+         @task = PreparedTask.new(model.entity_task, parsed)
+       end
+
+       def [](text, **options)
+         result = @task.call(text, **options)
+         result.fetch('entities')
+       end
+
+       alias call []
+     end
+   end
+ end
@@ -0,0 +1,55 @@
+ # frozen_string_literal: true
+
+ module Gliner
+   module Runners
+     class PreparedTask
+       def initialize(task, parsed)
+         @task = task
+         @parsed = parsed
+         @labels = task.labels(parsed)
+
+         @schema_tokens = task.input_builder.schema_tokens_for(
+           prompt: task.build_prompt(parsed),
+           labels: @labels,
+           label_prefix: task.label_prefix
+         )
+
+         @label_mask = Array.new(@labels.length, 1)
+         @label_positions_template = precompute_label_positions
+       end
+
+       def call(text, **options)
+         prepared = @task.input_builder.prepare(text, @schema_tokens)
+         label_positions = @label_positions_template
+
+         if label_positions.any? { |pos| pos.nil? || pos >= prepared.input_ids.length }
+           label_positions = @task.inference.label_positions_for(prepared.word_ids, @labels.length)
+         end
+
+         logits = @task.inference.run(
+           Inference::Request.new(
+             input_ids: prepared.input_ids,
+             attention_mask: prepared.attention_mask,
+             words_mask: prepared.words_mask,
+             text_lengths: [prepared.text_len],
+             task_type: @task.task_type,
+             label_positions: label_positions,
+             label_mask: @label_mask,
+             want_cls: @task.needs_cls_logits?
+           )
+         )
+
+         @task.process_output(logits, @parsed, prepared, options.merge(label_positions: label_positions))
+       end
+
+       private
+
+       def precompute_label_positions
+         return [] if @labels.empty?
+
+         prepared = @task.input_builder.prepare('.', @schema_tokens)
+         @task.inference.label_positions_for(prepared.word_ids, @labels.length)
+       end
+     end
+   end
+ end
@@ -0,0 +1,36 @@
+ # frozen_string_literal: true
+
+ module Gliner
+   module Runners
+     class StructuredRunner
+       def initialize(model, config)
+         @tasks = build_tasks(model, config)
+       end
+
+       def [](text, **options)
+         @tasks.transform_values do |task|
+           task.call(text, **options)
+         end
+       end
+
+       alias call []
+
+       private
+
+       def build_tasks(model, config)
+         raise Error, 'structures must be a Hash' unless config.is_a?(Hash)
+
+         if config.key?(:name) || config.key?('name')
+           parsed = model.json_task.parse_config(config)
+
+           { parsed[:name].to_s => PreparedTask.new(model.json_task, parsed) }
+         else
+           config.each_with_object({}) do |(name, fields), tasks|
+             parsed = model.json_task.parse_config(name: name, fields: fields)
+             tasks[name.to_s] = PreparedTask.new(model.json_task, parsed)
+           end
+         end
+       end
+     end
+   end
+ end
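
Finally, a sketch of the runner above in use. The hash of structure name to field list mirrors the (name, fields) destructuring in build_tasks; the exact field format accepted by JsonExtraction#parse_config is not shown in this diff, so the array of field names and the threshold option are assumptions.

model  = Gliner::Model.from_dir('/path/to/gliner-model')
runner = Gliner::Runners::StructuredRunner.new(model, { 'invoice' => %w[number total due_date] })

runner.call('Invoice 1042 for $300 is due on 2024-07-01.', threshold: 0.5)  # options assumed to mirror extract_json's
# => results keyed by structure name, e.g. { 'invoice' => ... }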