RubyGems - gliner - Versions diffs - 0.2.3 → 0.3.0 - Mend

gliner 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

checksums.yaml +4 -4
data/README.md +56 -7
data/lib/gliner/classifier.rb +11 -9
data/lib/gliner/model.rb +8 -8
data/lib/gliner/runners/classification_runner.rb +2 -1
data/lib/gliner/runners/entity_runner.rb +2 -1
data/lib/gliner/runners/inspectable.rb +3 -3
data/lib/gliner/runners/structured_runner.rb +2 -1
data/lib/gliner/span_extractor.rb +20 -15
data/lib/gliner/structured_extractor.rb +19 -8
data/lib/gliner/tasks/classification.rb +2 -2
data/lib/gliner/tasks/entity_extraction.rb +16 -9
data/lib/gliner/tasks/json_extraction.rb +2 -4
data/lib/gliner/version.rb +1 -1
data/lib/gliner.rb +19 -13
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 88bb40bab466141ca3e1852059a0ba526a1928cc40f7a6adc5117d0da8e6c70a
-  data.tar.gz: e901cb0b6b544a359816d8c0eb7abd50fc20cd342ffd4c20f0a2bf52728e97c8
+  metadata.gz: eac979a64c4acb302685c0390322c688dcdf35097741ef5b201c83827013f6ce
+  data.tar.gz: cc9fd10929e4dffe94ca1e4ce49ec0cea8e7569b22c7c9956cfad8d1bd0d5afc
 SHA512:
-  metadata.gz: b1e6bb6047a668641389f5e1559e4a61712223a5a4a3169e3ce4102a0502642bd53b0e7b7f263aed4a3fb813d6a507f5a8a34dff92da8474de583d047e8f109a
-  data.tar.gz: 9f9595a03255085f666ae045081a99b5293e3165e6214cd71f21e78e3338072d551ce25d753a8d6778084340e9535a2b95de6387678c7673a8fe4d7b16cfd844
+  metadata.gz: 4b75a4a0610a52d9364d988ba94277b1d55f0a116691bb6994ea571f9e2ab415aa006f22f2118c4268ef7a365e04614bde7ff8353b01ac4b5413d3545295d5bb
+  data.tar.gz: 1498c299f0d522974a0bea27c4ae71a6fee01cd93b832105b197fe227a0a33c6f751220efb128982e96e934c5feea382fe46450c3ffa4f3c6973dfd2f73e3ca4

data/README.md CHANGED Viewed

@@ -29,9 +29,19 @@ text = "Apple CEO Tim Cook announced iPhone 15 in Cupertino yesterday."
 labels = ["company", "person", "product", "location"]
 model = Gliner[labels]
-pp model[text]
+entities = model[text]
-# => {"company"=>["Apple"], "person"=>["Tim Cook"], "product"=>["iPhone 15"], "location"=>["Cupertino"]}
+pp entities["person"]
+# => [#<data Gliner::Entity ...>]
+entities["person"].first.text
+# => "Tim Cook"
+entities["person"].first.probability
+# => 92.4
+entities["person"].first.offsets
+# => [10, 18]
 ```
 You can also pass per-entity configs:
@@ -43,9 +53,13 @@ labels = {
 }
 model = Gliner[labels]
-pp model["Email John Doe at john@example.com.", threshold: 0.5]
+entities = model["Email John Doe at john@example.com.", threshold: 0.5]
-# => {"email"=>["john@example.com"], "person"=>"John Doe"}
+entities["person"].text
+# => "John Doe"
+entities["email"].map(&:text)
+# => ["john@example.com"]
 ```
 ### Classification
@@ -59,7 +73,30 @@ result = model["This laptop has amazing performance but terrible battery life!"]
 pp result
-# => {"sentiment"=>"negative"}
+# => { sentiment: #<data Gliner::Label ...> }
+result["sentiment"].label
+# => "negative"
+result["sentiment"].probability
+# => 87.1
+```
+Multiple classification tasks:
+```ruby
+text = "Breaking: Tech giant announces major layoffs amid market downturn"
+tasks = {
+  sentiment: %w[positive negative neutral],
+  urgency: %w[high medium low],
+  category: { labels: %w[tech finance politics sports], multi_label: false }
+}
+results = Gliner.classify[tasks][text]
+results.transform_values { |value| value.label }
+# => { sentiment: "negative", urgency: "high", category: "tech" }
 ```
 ### Structured extraction
@@ -77,10 +114,21 @@ structure = {
 }
 result = Gliner[structure][text]
+product = result.fetch("product").first
 pp result
-# => {"product"=>[{"name"=>"iPhone 15 Pro Max", "storage"=>"256GB", "processor"=>"A17 Pro", "price"=>"1199"}]}
+product["name"].text
+# => "iPhone 15 Pro Max"
+product["storage"].text
+# => "256GB"
+product["processor"].text
+# => "A17 Pro"
+product["price"].text
+# => "1199"
 ```
 Choices can be included in field specs:
@@ -88,7 +136,8 @@ Choices can be included in field specs:
 ```ruby
 result = Gliner[{ order: ["status::[pending|processing|shipped]::str"] }]["Status: shipped"]
-# => {"order"=>[{"status"=>"shipped"}]}
+result.fetch("order").first["status"].text
+# shipped
 ```
 ## Model files

data/lib/gliner/classifier.rb CHANGED Viewed

@@ -17,12 +17,12 @@ module Gliner
       end
     end
-    def format_classification(scores, labels:, multi_label:, include_confidence:, cls_threshold:)
+    def format_classification(scores, labels:, multi_label:, include_probability:, cls_threshold:)
       label_scores = sorted_label_scores(scores, labels)
-      return format_multi_label(label_scores, cls_threshold, include_confidence) if multi_label
+      return format_multi_label(label_scores, cls_threshold, include_probability) if multi_label
-      format_single_label(label_scores.first, include_confidence)
+      format_single_label(label_scores.first, include_probability)
     end
     private
@@ -44,10 +44,12 @@ module Gliner
         .sort_by { |(_label, score)| -score }
     end
-    def format_multi_label(label_scores, cls_threshold, include_confidence)
+    def format_multi_label(label_scores, cls_threshold, include_probability)
       chosen = labels_above_threshold(label_scores, cls_threshold)
-      chosen.map { |label, score| format_label(label, score, include_confidence) }
+      chosen
+        .sort_by { |(_label, score)| -score }
+        .map { |label, score| format_label(label, score, include_probability) }
     end
     def labels_above_threshold(label_scores, threshold)
@@ -55,14 +57,14 @@ module Gliner
       above.empty? && label_scores.first ? [label_scores.first] : above
     end
-    def format_single_label(label_score, include_confidence)
+    def format_single_label(label_score, include_probability)
       label, score = label_score
-      format_label(label, score, include_confidence)
+      format_label(label, score, include_probability)
     end
-    def format_label(label, score, include_confidence)
-      include_confidence ? { 'label' => label, 'confidence' => score } : label
+    def format_label(label, score, _include_probability)
+      Gliner::Label.new(label: label, probability: score * 100.0)
     end
   end
 end

data/lib/gliner/model.rb CHANGED Viewed

@@ -72,7 +72,7 @@ module Gliner
     end
     def entity_task
-      @entity_task ||= Tasks::EntityExtraction.new(
+      @entity_task ||= Tasks::Entity.new(
         config_parser: config_parser,
         inference: @inference,
         input_builder: input_builder,
@@ -90,7 +90,7 @@ module Gliner
     end
     def json_task
-      @json_task ||= Tasks::JsonExtraction.new(
+      @json_task ||= Tasks::Json.new(
         config_parser: config_parser,
         inference: @inference,
         input_builder: input_builder,
@@ -101,7 +101,7 @@ module Gliner
     def extract_entities(text, entity_types, **options)
       threshold = options.fetch(:threshold, Gliner.config.threshold)
-      include_confidence = options.fetch(:include_confidence, false)
+      include_probability = options.fetch(:include_probability, false)
       include_spans = options.fetch(:include_spans, false)
       pipeline.execute(
@@ -109,16 +109,16 @@ module Gliner
         text,
         entity_types,
         threshold: threshold,
-        include_confidence: include_confidence,
+        include_probability: include_probability,
         include_spans: include_spans
       )
     end
     def classify_text(text, tasks, **options)
-      include_confidence = options.fetch(:include_confidence, false)
+      include_probability = options.fetch(:include_probability, false)
       threshold = options[:threshold]
-      task_options = { include_confidence: include_confidence }
+      task_options = { include_probability: include_probability }
       task_options[:threshold] = threshold unless threshold.nil?
       classification_task.execute_all(pipeline, text, tasks, **task_options)
@@ -126,7 +126,7 @@ module Gliner
     def extract_json(text, structures, **options)
       threshold = options.fetch(:threshold, Gliner.config.threshold)
-      include_confidence = options.fetch(:include_confidence, false)
+      include_probability = options.fetch(:include_probability, false)
       include_spans = options.fetch(:include_spans, false)
       json_task.execute_all(
@@ -134,7 +134,7 @@ module Gliner
         text,
         structures,
         threshold: threshold,
-        include_confidence: include_confidence,
+        include_probability: include_probability,
         include_spans: include_spans
       )
     end

data/lib/gliner/runners/classification_runner.rb CHANGED Viewed

@@ -2,7 +2,7 @@
 module Gliner
   module Runners
-    class ClassificationRunner
+    class Classification
       include Inspectable
       def self.[](tasks)
@@ -12,6 +12,7 @@ module Gliner
       def initialize(model, tasks_config)
         raise Error, 'tasks must be a Hash' unless tasks_config.is_a?(Hash)
+        @config = tasks_config
         @tasks = tasks_config.to_h do |name, config|
           parsed = model.classification_task.parse_config(name: name, config: config)
           [name.to_s, PreparedTask.new(model.classification_task, parsed)]

data/lib/gliner/runners/entity_runner.rb CHANGED Viewed

@@ -2,12 +2,13 @@
 module Gliner
   module Runners
-    class EntityRunner
+    class Entity
       include Inspectable
       def initialize(model, config)
         parsed = model.entity_task.parse_config(config)
+        @config = config
         @labels = parsed[:labels]
         @task = PreparedTask.new(model.entity_task, parsed)
       end

data/lib/gliner/runners/inspectable.rb CHANGED Viewed

@@ -3,10 +3,10 @@
 module Gliner
   module Runners
     module Inspectable
-      def inspect
-        items = Array(inspect_items).map(&:to_s)
+      attr_reader :config
-        "#<Gliner(#{inspect_label}) input=#{items.inspect}>"
+      def inspect
+        "#<Gliner(#{inspect_label}) config=#{config.inspect}>"
       end
     end
   end

data/lib/gliner/runners/structured_runner.rb CHANGED Viewed

@@ -2,10 +2,11 @@
 module Gliner
   module Runners
-    class StructuredRunner
+    class Structure
       include Inspectable
       def initialize(model, config)
+        @config = config
         @tasks = build_tasks(model, config)
       end

data/lib/gliner/span_extractor.rb CHANGED Viewed

@@ -47,11 +47,13 @@ module Gliner
       near.min_by { |s| [(s.end - s.start), -s.score, s.text.length] } || best
     end
-    def format_single_span(span, opts)
-      format_span(span, opts)
+    def format_single_span(span, opts = nil)
+      label = extract_label(opts)
+      format_span(span, opts, label: label, index: 0)
     end
-    def format_spans(spans, opts)
+    def format_spans(spans, opts = nil)
+      label = extract_label(opts)
       return [] if spans.empty?
       sorted = spans.sort_by { |s| -s.score }
@@ -64,7 +66,9 @@ module Gliner
         selected << span
       end
-      selected.map { |span| format_span(span, opts) }
+      selected.each_with_index.map do |span, index|
+        format_span(span, opts, label: label, index: index)
+      end
     end
     private
@@ -97,21 +101,22 @@ module Gliner
       Span.new(text: text_span, score: score, start: char_start, end: char_end)
     end
-    def format_span(span, opts)
+    def format_span(span, _opts, label:, index:)
       return nil if span.nil?
-      format_opts = FormatOptions.from(opts)
-      return span.text unless format_opts.include_confidence || format_opts.include_spans
-      result = { 'text' => span.text }
-      result['confidence'] = span.score if format_opts.include_confidence
+      Gliner::Entity.new(
+        index: index,
+        offsets: [span.start, span.end],
+        text: span.text,
+        name: label&.to_s,
+        probability: span.score * 100.0
+      )
+    end
-      if format_opts.include_spans
-        result['start'] = span.start
-        result['end'] = span.end
-      end
+    def extract_label(opts)
+      return nil unless opts.is_a?(Hash)
-      result
+      opts[:label] || opts['label']
     end
   end
 end

data/lib/gliner/structured_extractor.rb CHANGED Viewed

@@ -1,6 +1,18 @@
 # frozen_string_literal: true
 module Gliner
+  Structure = Data.define(:fields) do
+    include Enumerable
+    def [](key) = fields[key]
+    def fetch(key, ...) = fields.fetch(key, ...)
+    def to_h = fields
+    def to_hash = fields
+    def keys = fields.keys
+    def values = fields.values
+    def each(&block) = fields.each(&block)
+  end
   class StructuredExtractor
     def initialize(span_extractor)
       @span_extractor = span_extractor
@@ -32,18 +44,17 @@ module Gliner
     end
     def build_structure_instances(parsed_fields, spans_by_label, opts)
-      format_opts = FormatOptions.from(opts)
       anchor_field = anchor_field_for(parsed_fields)
-      return [{}] unless anchor_field
+      return [Gliner::Structure.new(fields: {})] unless anchor_field
       anchors = spans_by_label.fetch(anchor_field[:name], [])
-      return [format_structure_object(parsed_fields, spans_by_label, format_opts)] if anchors.empty?
+      return [format_structure_object(parsed_fields, spans_by_label, opts)] if anchors.empty?
       instance_spans = build_instance_spans(anchors, spans_by_label)
-      format_instances(parsed_fields, instance_spans, format_opts)
+      format_instances(parsed_fields, instance_spans, opts)
     end
-    def format_structure_object(parsed_fields, spans_by_label, opts)
+    def format_structure_object(parsed_fields, spans_by_label, _opts)
       obj = {}
       parsed_fields.each do |field|
@@ -52,13 +63,13 @@ module Gliner
         if field[:dtype] == :str
           best = @span_extractor.choose_best_span(spans)
-          obj[key] = @span_extractor.format_single_span(best, opts)
+          obj[key] = @span_extractor.format_single_span(best, label: key)
         else
-          obj[key] = @span_extractor.format_spans(spans, opts)
+          obj[key] = @span_extractor.format_spans(spans, label: key)
         end
       end
-      obj
+      Gliner::Structure.new(fields: obj)
     end
     private

data/lib/gliner/tasks/classification.rb CHANGED Viewed

@@ -38,7 +38,7 @@ module Gliner
       end
       def process_output(logits, parsed, prepared, options)
-        include_confidence = options.fetch(:include_confidence, false)
+        include_probability = options.fetch(:include_probability, false)
         threshold_override = options[:threshold]
         cls_threshold = threshold_override.nil? ? parsed[:cls_threshold] : threshold_override
@@ -47,7 +47,7 @@ module Gliner
           scores,
           labels: parsed[:labels],
           multi_label: parsed[:multi_label],
-          include_confidence: include_confidence,
+          include_probability: include_probability,
           cls_threshold: cls_threshold
         )
       end

data/lib/gliner/tasks/entity_extraction.rb CHANGED Viewed

@@ -2,7 +2,7 @@
 module Gliner
   module Tasks
-    class EntityExtraction < Task
+    class Entity < Task
       def initialize(config_parser:, inference:, input_builder:, span_extractor:)
         super(config_parser: config_parser, inference: inference, input_builder: input_builder)
         @span_extractor = span_extractor
@@ -30,12 +30,11 @@ module Gliner
       def process_output(logits, parsed, prepared, options)
         threshold = options.fetch(:threshold, Gliner.config.threshold)
-        format_opts = FormatOptions.from(options)
         label_positions = options[:label_positions] || inference.label_positions_for(prepared.word_ids, parsed[:labels].length)
         spans_by_label = extract_spans(logits, parsed, prepared, label_positions, threshold)
-        { 'entities' => format_entities(parsed, spans_by_label, format_opts) }
+        { 'entities' => format_entities(parsed, spans_by_label) }
       end
       private
@@ -51,20 +50,28 @@ module Gliner
         )
       end
-      def format_entities(parsed, spans_by_label, format_opts)
-        parsed[:labels].each_with_object({}) do |label, entities|
+      def format_entities(parsed, spans_by_label)
+        entities = parsed[:labels].each_with_object({}) do |label, entries|
           spans = spans_by_label.fetch(label)
           dtype = parsed[:dtypes].fetch(label, :list)
-          entities[label] = format_entity_value(spans, dtype, format_opts)
+          value = format_entity_value(label, spans, dtype)
+          next if value.is_a?(Array) && value.empty?
+          entries[label] = value
         end
+        Gliner::Entities.new(entities)
       end
-      def format_entity_value(spans, dtype, format_opts)
+      def format_entity_value(label, spans, dtype)
         if dtype == :str
-          @span_extractor.format_single_span(@span_extractor.choose_best_span(spans), format_opts)
+          @span_extractor.format_single_span(
+            @span_extractor.choose_best_span(spans),
+            label: label
+          )
         else
-          @span_extractor.format_spans(spans, format_opts)
+          @span_extractor.format_spans(spans, label: label)
         end
       end
     end

data/lib/gliner/tasks/json_extraction.rb CHANGED Viewed

@@ -2,7 +2,7 @@
 module Gliner
   module Tasks
-    class JsonExtraction < Task
+    class Json < Task
       def initialize(config_parser:, inference:, input_builder:, span_extractor:, structured_extractor:)
         super(config_parser: config_parser, inference: inference, input_builder: input_builder)
@@ -47,9 +47,7 @@ module Gliner
       def process_output(logits, parsed, prepared, options)
         spans_by_label = extract_spans(logits, parsed, prepared, options)
         filtered_spans = @structured_extractor.apply_choice_filters(spans_by_label, parsed[:parsed_fields])
-        format_opts = FormatOptions.from(options)
-        @structured_extractor.build_structure_instances(parsed[:parsed_fields], filtered_spans, format_opts)
+        @structured_extractor.build_structure_instances(parsed[:parsed_fields], filtered_spans, options)
       end
       def execute_all(pipeline, text, structures_config, **options)

data/lib/gliner/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Gliner
-  VERSION = '0.2.3'
+  VERSION = '0.3.0'
 end

data/lib/gliner.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 # frozen_string_literal: true
+require 'delegate'
 require 'fileutils'
 require 'httpx'
 require 'gliner/version'
@@ -31,20 +32,25 @@ module Gliner
     :text_len
   )
-  Span = Data.define(:text, :score, :start, :end) do
-    def overlaps?(other)
-      !(self.end <= other.start || start >= other.end)
+  Entity = Data.define(:index, :offsets, :text, :name, :probability) do
+    def to_s = text.to_s
+    def to_str = text.to_s
+  end
+  class Entities < SimpleDelegator
+    def list
+      __getobj__.values.flat_map { |value| Array(value) }
     end
   end
-  FormatOptions = Data.define(:include_confidence, :include_spans) do
-    def self.from(input)
-      return input if input.is_a?(FormatOptions)
+  Label = Data.define(:label, :probability) do
+    def to_s = label.to_s
+    def to_str = label.to_s
+  end
-      new(
-        include_confidence: input.fetch(:include_confidence, false),
-        include_spans: input.fetch(:include_spans, false)
-      )
+  Span = Data.define(:text, :score, :start, :end) do
+    def overlaps?(other)
+      !(self.end <= other.start || start >= other.end)
     end
   end
@@ -80,7 +86,7 @@ module Gliner
     end
     def classify
-      Runners::ClassificationRunner
+      Runners::Classification
     end
     def model!
@@ -111,9 +117,9 @@ module Gliner
     end
     def runner_for(config)
-      return Runners::StructuredRunner if structured_config?(config)
+      return Runners::Structure if structured_config?(config)
-      Runners::EntityRunner
+      Runners::Entity
     end
     def structured_config?(config)

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: gliner
 version: !ruby/object:Gem::Version
-  version: 0.2.3
+  version: 0.3.0
 platform: ruby
 authors:
 - elcuervo