RubyGems - gliner - Versions diffs - 0.1.0 - Mend

gliner 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

checksums.yaml +7 -0
data/LICENSE +21 -0
data/README.md +145 -0
data/bin/console +81 -0
data/gliner.gemspec +26 -0
data/lib/gliner/classifier.rb +68 -0
data/lib/gliner/config/classification_task.rb +74 -0
data/lib/gliner/config/entity_types.rb +74 -0
data/lib/gliner/config/field_spec.rb +87 -0
data/lib/gliner/config_parser.rb +37 -0
data/lib/gliner/inference/session_validator.rb +67 -0
data/lib/gliner/inference.rb +124 -0
data/lib/gliner/input_builder.rb +117 -0
data/lib/gliner/model.rb +142 -0
data/lib/gliner/pipeline.rb +64 -0
data/lib/gliner/position_iteration.rb +21 -0
data/lib/gliner/runners/classification_runner.rb +26 -0
data/lib/gliner/runners/entity_runner.rb +19 -0
data/lib/gliner/runners/prepared_task.rb +55 -0
data/lib/gliner/runners/structured_runner.rb +36 -0
data/lib/gliner/span_extractor.rb +117 -0
data/lib/gliner/structured_extractor.rb +94 -0
data/lib/gliner/task.rb +29 -0
data/lib/gliner/tasks/classification.rb +101 -0
data/lib/gliner/tasks/entity_extraction.rb +72 -0
data/lib/gliner/tasks/json_extraction.rb +91 -0
data/lib/gliner/text_processor.rb +42 -0
data/lib/gliner/version.rb +5 -0
data/lib/gliner.rb +97 -0
metadata +150 -0

data/lib/gliner/span_extractor.rb ADDED Viewed

@@ -0,0 +1,117 @@
+# frozen_string_literal: true
+require 'gliner/position_iteration'
+module Gliner
+  class SpanExtractor
+    include PositionIteration
+    SCORE_SIMILARITY_THRESHOLD = 0.02
+    def initialize(inference, max_width:)
+      @inference = inference
+      @max_width = max_width
+    end
+    def extract_spans_by_label(logits, labels, label_positions, prepared, threshold: 0.5, thresholds_by_label: nil)
+      labels.each_with_index.with_object({}) do |(label, label_index), out|
+        out[label.to_s] = find_spans_for_label(
+          logits: logits,
+          label_index: label_index,
+          label_positions: label_positions,
+          prepared: prepared,
+          threshold: threshold_for(label, threshold, thresholds_by_label)
+        )
+      end
+    end
+    def find_spans_for_label(logits:, label_index:, label_positions:, prepared:, threshold:)
+      seq_len = logits.first.length
+      each_position_width(seq_len, prepared, @max_width).filter_map do |pos, start_word, width|
+        score = calculate_span_score(logits, pos, width, label_index, label_positions)
+        next if score < threshold
+        build_span(prepared, start_word, start_word + width, score)
+      end
+    end
+    def choose_best_span(spans)
+      return nil if spans.empty?
+      sorted = spans.sort_by { |s| [-s.score, (s.end - s.start), s.text.length] }
+      best = sorted.first
+      best_score = best.score
+      near = spans_within_threshold(sorted, best_score)
+      near.min_by { |s| [(s.end - s.start), -s.score, s.text.length] } || best
+    end
+    def format_single_span(span, opts)
+      format_span(span, opts)
+    end
+    def format_spans(spans, opts)
+      return [] if spans.empty?
+      sorted = spans.sort_by { |s| -s.score }
+      selected = []
+      sorted.each do |span|
+        overlaps = selected.any? { |s| span.overlaps?(s) }
+        next if overlaps
+        selected << span
+      end
+      selected.map { |span| format_span(span, opts) }
+    end
+    private
+    def calculate_span_score(logits, pos, width, label_index, label_positions)
+      logit = @inference.label_logit(logits, pos, width, label_index, label_positions)
+      @inference.sigmoid(logit)
+    end
+    def spans_within_threshold(sorted_spans, best_score)
+      sorted_spans.take_while { |span| (best_score - span.score) <= SCORE_SIMILARITY_THRESHOLD }
+    end
+    def threshold_for(label, default_threshold, thresholds_by_label)
+      return default_threshold unless thresholds_by_label&.key?(label.to_s)
+      thresholds_by_label.fetch(label.to_s)
+    end
+    def build_span(prepared, start_word, end_word, score)
+      char_start = prepared.start_map[start_word]
+      char_end = prepared.end_map[end_word]
+      return nil if char_start.nil? || char_end.nil?
+      text_span = prepared.original_text[char_start...char_end].to_s.strip
+      return nil if text_span.empty?
+      Span.new(text: text_span, score: score, start: char_start, end: char_end)
+    end
+    def format_span(span, opts)
+      return nil if span.nil?
+      format_opts = FormatOptions.from(opts)
+      return span.text unless format_opts.include_confidence || format_opts.include_spans
+      result = { 'text' => span.text }
+      result['confidence'] = span.score if format_opts.include_confidence
+      if format_opts.include_spans
+        result['start'] = span.start
+        result['end'] = span.end
+      end
+      result
+    end
+  end
+end

data/lib/gliner/structured_extractor.rb ADDED Viewed

@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+module Gliner
+  class StructuredExtractor
+    def initialize(span_extractor)
+      @span_extractor = span_extractor
+    end
+    def apply_choice_filters(spans_by_label, parsed_fields)
+      filtered = spans_by_label.transform_values(&:dup)
+      parsed_fields.each do |field|
+        next unless field[:choices]&.any?
+        label = field[:name]
+        spans = filtered.fetch(label, [])
+        filtered[label] = filter_spans_by_choices(spans, field[:choices])
+      end
+      filtered
+    end
+    def filter_spans_by_choices(spans, choices)
+      return spans if spans.empty? || choices.nil? || choices.empty?
+      normalized_choices = choices.map { |choice| normalize_choice(choice) }
+      matched = spans.select { |span| normalized_choices.include?(normalize_choice(span.text)) }
+      return spans if matched.empty?
+      matched
+    end
+    def build_structure_instances(parsed_fields, spans_by_label, opts)
+      format_opts = FormatOptions.from(opts)
+      anchor_field = anchor_field_for(parsed_fields)
+      return [{}] unless anchor_field
+      anchors = spans_by_label.fetch(anchor_field[:name], [])
+      return [format_structure_object(parsed_fields, spans_by_label, format_opts)] if anchors.empty?
+      instance_spans = build_instance_spans(anchors, spans_by_label)
+      format_instances(parsed_fields, instance_spans, format_opts)
+    end
+    def format_structure_object(parsed_fields, spans_by_label, opts)
+      obj = {}
+      parsed_fields.each do |field|
+        key = field[:name]
+        spans = spans_by_label.fetch(key, [])
+        if field[:dtype] == :str
+          best = @span_extractor.choose_best_span(spans)
+          obj[key] = @span_extractor.format_single_span(best, opts)
+        else
+          obj[key] = @span_extractor.format_spans(spans, opts)
+        end
+      end
+      obj
+    end
+    private
+    def anchor_field_for(parsed_fields)
+      parsed_fields.find { |field| field[:dtype] == :str } || parsed_fields.first
+    end
+    def build_instance_spans(anchors, spans_by_label)
+      anchors_sorted = anchors.sort_by(&:start)
+      instance_spans = anchors_sorted.map { Hash.new { |hash, key| hash[key] = [] } }
+      spans_by_label.each do |label, spans|
+        spans.each do |span|
+          anchor_index = anchors_sorted.rindex { |anchor| anchor.start <= span.start } || 0
+          instance_spans[anchor_index][label] << span
+        end
+      end
+      instance_spans
+    end
+    def format_instances(parsed_fields, instance_spans, opts)
+      instance_spans.map do |field_spans|
+        format_structure_object(parsed_fields, field_spans, opts)
+      end
+    end
+    def normalize_choice(value)
+      value.to_s.strip.downcase
+    end
+  end
+end

data/lib/gliner/task.rb ADDED Viewed

@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+module Gliner
+  class Task
+    attr_reader :config_parser, :inference, :input_builder
+    def initialize(config_parser:, inference:, input_builder:)
+      @config_parser = config_parser
+      @inference = inference
+      @input_builder = input_builder
+    end
+    def parse_config(input) = raise NotImplementedError
+    def task_type = raise NotImplementedError
+    def label_prefix = raise NotImplementedError
+    def build_prompt(parsed) = raise NotImplementedError
+    def labels(parsed) = raise NotImplementedError
+    def process_output(logits, parsed, prepared, options) = raise NotImplementedError
+    def normalize_text? = false
+    def needs_cls_logits? = false
+  end
+end

data/lib/gliner/tasks/classification.rb ADDED Viewed

@@ -0,0 +1,101 @@
+# frozen_string_literal: true
+module Gliner
+  module Tasks
+    class Classification < Task
+      def initialize(config_parser:, inference:, input_builder:, classifier:)
+        super(config_parser: config_parser, inference: inference, input_builder: input_builder)
+        @classifier = classifier
+      end
+      def parse_config(input)
+        raise Error, 'classification config must be a Hash' unless input.is_a?(Hash)
+        name, task_config = extract_task_config(input)
+        parsed = config_parser.parse_classification_task(name, task_config)
+        parsed.merge(name: name.to_s)
+      end
+      def task_type
+        Inference::TASK_TYPE_CLASSIFICATION
+      end
+      def label_prefix
+        '[L]'
+      end
+      def build_prompt(parsed)
+        config_parser.build_prompt(parsed[:name], parsed[:label_descs])
+      end
+      def labels(parsed)
+        parsed[:labels]
+      end
+      def needs_cls_logits?
+        inference.has_cls_logits
+      end
+      def process_output(logits, parsed, prepared, options)
+        include_confidence = options.fetch(:include_confidence, false)
+        threshold_override = options[:threshold]
+        cls_threshold = threshold_override.nil? ? parsed[:cls_threshold] : threshold_override
+        scores = classification_scores(logits, parsed, prepared, options)
+        @classifier.format_classification(
+          scores,
+          labels: parsed[:labels],
+          multi_label: parsed[:multi_label],
+          include_confidence: include_confidence,
+          cls_threshold: cls_threshold
+        )
+      end
+      def execute_all(pipeline, text, tasks_config, **options)
+        raise Error, 'tasks must be a Hash' unless tasks_config.is_a?(Hash)
+        tasks_config.each_with_object({}) do |(task_name, task_config), results|
+          parsed_config = { name: task_name, config: task_config }
+          results[task_name.to_s] = pipeline.execute(self, text, parsed_config, **options)
+        end
+      end
+      private
+      def extract_task_config(input)
+        name = input[:name] || input['name']
+        task_config = input[:config] || input['config']
+        return [name, task_config] if name && task_config
+        return input.first if name.nil? && task_config.nil? && input.length == 1
+        raise Error, 'classification config must include :name and :config'
+      end
+      def classification_scores(logits, parsed, prepared, options)
+        return cls_scores(logits, parsed) if cls_logits?(logits)
+        label_positions = options.fetch(:label_positions) do
+          inference.label_positions_for(prepared.word_ids, parsed[:labels].length)
+        end
+        @classifier.classification_scores(
+          logits,
+          parsed[:labels],
+          label_positions,
+          prepared
+        )
+      end
+      def cls_logits?(logits)
+        logits.is_a?(Hash) && logits.key?(:cls_logits)
+      end
+      def cls_scores(logits, parsed)
+        cls_logits = Array(logits.fetch(:cls_logits).fetch(0))
+        parsed[:multi_label] ? cls_logits.map { |value| inference.sigmoid(value) } : inference.softmax(cls_logits)
+      end
+    end
+  end
+end

data/lib/gliner/tasks/entity_extraction.rb ADDED Viewed

@@ -0,0 +1,72 @@
+# frozen_string_literal: true
+module Gliner
+  module Tasks
+    class EntityExtraction < Task
+      def initialize(config_parser:, inference:, input_builder:, span_extractor:)
+        super(config_parser: config_parser, inference: inference, input_builder: input_builder)
+        @span_extractor = span_extractor
+      end
+      def parse_config(input)
+        config_parser.parse_entity_types(input)
+      end
+      def task_type
+        Inference::TASK_TYPE_ENTITIES
+      end
+      def label_prefix
+        '[E]'
+      end
+      def build_prompt(parsed)
+        config_parser.build_prompt('entities', parsed[:descriptions])
+      end
+      def labels(parsed)
+        parsed[:labels]
+      end
+      def process_output(logits, parsed, prepared, options)
+        threshold = options.fetch(:threshold, 0.5)
+        format_opts = FormatOptions.from(options)
+        label_positions = options[:label_positions] || inference.label_positions_for(prepared.word_ids, parsed[:labels].length)
+        spans_by_label = extract_spans(logits, parsed, prepared, label_positions, threshold)
+        { 'entities' => format_entities(parsed, spans_by_label, format_opts) }
+      end
+      private
+      def extract_spans(logits, parsed, prepared, label_positions, threshold)
+        @span_extractor.extract_spans_by_label(
+          logits,
+          parsed[:labels],
+          label_positions,
+          prepared,
+          threshold: threshold,
+          thresholds_by_label: parsed[:thresholds]
+        )
+      end
+      def format_entities(parsed, spans_by_label, format_opts)
+        parsed[:labels].each_with_object({}) do |label, entities|
+          spans = spans_by_label.fetch(label)
+          dtype = parsed[:dtypes].fetch(label, :list)
+          entities[label] = format_entity_value(spans, dtype, format_opts)
+        end
+      end
+      def format_entity_value(spans, dtype, format_opts)
+        if dtype == :str
+          @span_extractor.format_single_span(@span_extractor.choose_best_span(spans), format_opts)
+        else
+          @span_extractor.format_spans(spans, format_opts)
+        end
+      end
+    end
+  end
+end

data/lib/gliner/tasks/json_extraction.rb ADDED Viewed

@@ -0,0 +1,91 @@
+# frozen_string_literal: true
+module Gliner
+  module Tasks
+    class JsonExtraction < Task
+      def initialize(config_parser:, inference:, input_builder:, span_extractor:, structured_extractor:)
+        super(config_parser: config_parser, inference: inference, input_builder: input_builder)
+        @span_extractor = span_extractor
+        @structured_extractor = structured_extractor
+      end
+      def parse_config(input)
+        raise Error, 'structure config must be a Hash' unless input.is_a?(Hash)
+        name, fields = extract_structure_config(input)
+        parsed_fields = Array(fields).map { |spec| config_parser.parse_field_spec(spec.to_s) }
+        {
+          name: name.to_s,
+          parsed_fields: parsed_fields,
+          labels: parsed_fields.map { |field| field[:name] },
+          descriptions: config_parser.build_field_descriptions(parsed_fields)
+        }
+      end
+      def task_type
+        Inference::TASK_TYPE_JSON
+      end
+      def label_prefix
+        '[C]'
+      end
+      def normalize_text?
+        true
+      end
+      def build_prompt(parsed)
+        config_parser.build_prompt(parsed[:name], parsed[:descriptions])
+      end
+      def labels(parsed)
+        parsed[:labels]
+      end
+      def process_output(logits, parsed, prepared, options)
+        spans_by_label = extract_spans(logits, parsed, prepared, options)
+        filtered_spans = @structured_extractor.apply_choice_filters(spans_by_label, parsed[:parsed_fields])
+        format_opts = FormatOptions.from(options)
+        @structured_extractor.build_structure_instances(parsed[:parsed_fields], filtered_spans, format_opts)
+      end
+      def execute_all(pipeline, text, structures_config, **options)
+        raise Error, 'structures must be a Hash' unless structures_config.is_a?(Hash)
+        structures_config.each_with_object({}) do |(parent, fields), results|
+          parsed_config = { name: parent, fields: fields }
+          results[parent.to_s] = pipeline.execute(self, text, parsed_config, **options)
+        end
+      end
+      private
+      def extract_structure_config(input)
+        name = input[:name] || input['name']
+        fields = input[:fields] || input['fields']
+        return [name, fields] if name && fields
+        return input.first if name.nil? && fields.nil? && input.length == 1
+        raise Error, 'structure config must include :name and :fields'
+      end
+      def extract_spans(logits, parsed, prepared, options)
+        label_positions = options.fetch(:label_positions) do
+          inference.label_positions_for(prepared.word_ids, parsed[:labels].length)
+        end
+        @span_extractor.extract_spans_by_label(
+          logits,
+          parsed[:labels],
+          label_positions,
+          prepared,
+          threshold: options.fetch(:threshold, 0.5)
+        )
+      end
+    end
+  end
+end

data/lib/gliner/text_processor.rb ADDED Viewed

@@ -0,0 +1,42 @@
+# frozen_string_literal: true
+module Gliner
+  class TextProcessor
+    def initialize(tokenizer)
+      @tokenizer = tokenizer
+      @word_pre_tokenizer = Tokenizers::PreTokenizers::BertPreTokenizer.new
+    end
+    def normalize_text(text)
+      str = text.to_s
+      str = '.' if str.empty?
+      str.end_with?('.', '!', '?') ? str : "#{str}."
+    end
+    def split_words(text)
+      text = text.to_s
+      tokens = []
+      starts = []
+      ends = []
+      @word_pre_tokenizer.pre_tokenize_str(text).each do |(token, (start_pos, end_pos))|
+        token = token.to_s.downcase
+        next if token.empty?
+        tokens << token
+        starts << start_pos
+        ends << end_pos
+      end
+      [tokens, starts, ends]
+    end
+    def encode_pretokenized(tokens)
+      enc = @tokenizer.encode(tokens, is_pretokenized: true, add_special_tokens: false)
+      { ids: enc.ids, word_ids: enc.word_ids }
+    end
+  end
+end

data/lib/gliner/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module Gliner
+  VERSION = '0.1.0' # Version string of the gliner gem.
+end

data/lib/gliner.rb ADDED Viewed

@@ -0,0 +1,97 @@
+# frozen_string_literal: true
+require 'gliner/version'
+require 'gliner/model'
+require 'gliner/runners/prepared_task'
+require 'gliner/runners/entity_runner'
+require 'gliner/runners/structured_runner'
+require 'gliner/runners/classification_runner'
+# Top-level namespace: shared value objects plus module-level helpers for
+# loading a model and choosing a runner for a config.
+module Gliner
+  # Error class raised by this library for invalid input or missing setup.
+  Error = Class.new(StandardError)
+  # Model file used when neither `file:` nor GLINER_MODEL_FILE names one.
+  DEFAULT_MODEL_FILE = 'model_int8.onnx'
+  # Immutable bundle of the values prepared for one inference call.
+  PreparedInput = Data.define(
+    :input_ids,
+    :word_ids,
+    :attention_mask,
+    :words_mask,
+    :pos_to_word_index,
+    :start_map,
+    :end_map,
+    :original_text,
+    :text_len
+  )
+  # A scored piece of text together with its start/end offsets.
+  Span = Data.define(:text, :score, :start, :end) do
+    # True when the two ranges share at least one position; spans that merely
+    # touch (one ends exactly where the other starts) do not overlap.
+    def overlaps?(other)
+      !(self.end <= other.start || start >= other.end)
+    end
+  end
+  # Output-formatting flags.
+  FormatOptions = Data.define(:include_confidence, :include_spans) do
+    # Returns `input` unchanged when it already is a FormatOptions; otherwise
+    # reads both flags from it via `fetch`, defaulting each to false.
+    def self.from(input)
+      return input if input.is_a?(FormatOptions)
+      new(
+        include_confidence: input.fetch(:include_confidence, false),
+        include_spans: input.fetch(:include_spans, false)
+      )
+    end
+  end
+  class << self
+    attr_writer :model
+    # Loads a model from `dir` and stores it as the module-wide model.
+    def load(dir, file: DEFAULT_MODEL_FILE)
+      self.model = Model.from_dir(dir, file: file)
+    end
+    # The stored model, loaded on first use from the environment when
+    # GLINER_MODEL_DIR is set; nil when no model is available.
+    def model
+      @model ||= model_from_env
+    end
+    # Like #model, but raises instead of returning nil.
+    #
+    # @raise [Error] when no model is loaded or configured
+    def model!
+      fetch_model!
+    end
+    # Builds a runner for `config` bound to the current model.
+    #
+    # @raise [Error] when no model is loaded or configured
+    def [](config)
+      runner_for(config).new(fetch_model!, config)
+    end
+    # Returns the ClassificationRunner class itself, not an instance.
+    def classify
+      Runners::ClassificationRunner
+    end
+    private
+    # Builds a model from GLINER_MODEL_DIR / GLINER_MODEL_FILE. An unset or
+    # empty directory yields nil; an unset or empty file name falls back to
+    # DEFAULT_MODEL_FILE, so both variables treat "empty" as "not set".
+    def model_from_env
+      dir = ENV.fetch('GLINER_MODEL_DIR', nil)
+      return nil if dir.nil? || dir.empty?
+      file = ENV.fetch('GLINER_MODEL_FILE', nil)
+      file = DEFAULT_MODEL_FILE if file.nil? || file.empty?
+      Model.from_dir(dir, file: file)
+    end
+    def fetch_model!
+      model = self.model
+      return model if model
+      raise Error, 'No model loaded. Call Gliner.load("/path/to/model") or set GLINER_MODEL_DIR.'
+    end
+    # StructuredRunner for structured configs, EntityRunner for everything else.
+    def runner_for(config)
+      return Runners::StructuredRunner if structured_config?(config)
+      Runners::EntityRunner
+    end
+    # A Hash counts as structured when it has both 'name' and 'fields' keys
+    # (symbol or string), or when every value is an Array.
+    # NOTE(review): `all?` is true for an empty Hash, so `{}` is routed to
+    # StructuredRunner; confirm that this is intended.
+    def structured_config?(config)
+      return false unless config.is_a?(Hash)
+      keys = config.transform_keys(&:to_s)
+      return true if keys.key?('name') && keys.key?('fields')
+      config.values.all? { |value| value.is_a?(Array) }
+    end
+  end
+end