ner-ruby 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c1a6a8375878db28b788d78ae9c92ac2628321d1aa65a5368daf6fcd1701b569
4
- data.tar.gz: 922ace5c18cf9949d5f92998b31d2c7bdf8097589ffc355c0f7b3a1b7151bf05
3
+ metadata.gz: 6c55a89d813528d2927bdf620869fa10c5660155aad6b15033f806426142c171
4
+ data.tar.gz: e0e58c4ac730b697b8e3b24c4228ca64bc2be048c346df63c8a9713d58fa0da1
5
5
  SHA512:
6
- metadata.gz: 11f6b9feee7c04a79ed35c0eba290ce7d74b680d12dde7fddd403542e567180539328e1b85b80d6687984f51438ef17e5b26dfa9dcadbcf0265834ec076b006b
7
- data.tar.gz: ac6dcd6c2292c65750d34cd76a379cf0597597e9733b2142529427f60ad6f9bb27beef42e32fddbdc09cf8a96ae58cc38c6b26f32d0fbd9bb5891c26acd0f38d
6
+ metadata.gz: 328af572e8556ebf0527288cab60bebb9e11441c29bdee5a68eeb835324d76d1d8f70cc70b2ff7b5190e569c7ab848690304492492183cbf41dc93b62a9d27fe
7
+ data.tar.gz: 603134f836d15da0af2068a26d1c99e47c311ada80a80167257b30841b3e4cf730e5ac062ce8090d32b5c180bfc6aa4c3027094b3ecf77299228a7405acb59ae
data/README.md CHANGED
@@ -1,11 +1,11 @@
1
1
  # ner-ruby
2
2
 
3
- Named Entity Recognition for Ruby using ONNX models.
3
+ Named Entity Recognition for Ruby. Extract entities (people, places, organizations) from text using ONNX models or API backends.
4
4
 
5
5
  ## Installation
6
6
 
7
7
  ```ruby
8
- gem "ner-ruby", "~> 0.1"
8
+ gem "ner-ruby"
9
9
  ```
10
10
 
11
11
  ## Usage
@@ -13,19 +13,34 @@ gem "ner-ruby", "~> 0.1"
13
13
  ```ruby
14
14
  require "ner_ruby"
15
15
 
16
- ner = NerRuby::Recognizer.new(
17
- model: "path/to/ner.onnx",
18
- tokenizer: "path/to/tokenizer.json"
16
+ # ONNX backend
17
+ recognizer = NerRuby::Recognizer.new(
18
+ model: "path/to/model.onnx",
19
+ tokenizer: "path/to/tokenizer.json"
19
20
  )
20
21
 
21
- entities = ner.recognize("Jokowi visited Jakarta on Monday")
22
- # => [Entity(text: "Jokowi", label: :PER), Entity(text: "Jakarta", label: :LOC)]
22
+ entities = recognizer.recognize("John works at Google in Mountain View")
23
+ entities.each do |e|
24
+ puts "#{e.text} (#{e.label}) [#{e.start_offset}:#{e.end_offset}] score=#{e.score}"
25
+ end
23
26
 
24
- entities = ner.recognize(text, labels: [:PER, :ORG])
25
-
26
- results = ner.recognize_batch(["Text one", "Text two"])
27
+ # API backend
28
+ recognizer = NerRuby::Recognizer.new(
29
+ backend: :api,
30
+ provider: :openai,
31
+ api_key: ENV["OPENAI_API_KEY"]
32
+ )
27
33
  ```
28
34
 
35
+ ## Features
36
+
37
+ - ONNX Runtime inference with auto label map from config.json
38
+ - API backend support (OpenAI, etc.)
39
+ - IOB/BIO tag decoding with wordpiece token merging
40
+ - Character span offsets (start_offset, end_offset)
41
+ - Numerically stable softmax
42
+ - Empty/nil text guards
43
+
29
44
  ## License
30
45
 
31
46
  MIT
@@ -3,7 +3,9 @@
3
3
  module NerRuby
4
4
  class Configuration
5
5
  attr_accessor :default_model_path, :default_tokenizer_path,
6
- :default_labels, :min_score, :batch_size
6
+ :default_labels, :min_score, :batch_size,
7
+ :min_scores_per_type, :enable_cache,
8
+ :max_length, :stride, :merge_adjacent
7
9
 
8
10
  def initialize
9
11
  @default_model_path = nil
@@ -11,6 +13,19 @@ module NerRuby
11
13
  @default_labels = nil
12
14
  @min_score = 0.5
13
15
  @batch_size = 32
16
+ @min_scores_per_type = {}
17
+ @enable_cache = true
18
+ @max_length = 512
19
+ @stride = 128
20
+ @merge_adjacent = true
21
+ end
22
+
23
# Returns the registry of named models attached to this configuration.
#
# The registry is built on first access; every later call hands back the
# same instance.
#
# @return [ModelRegistry]
def model_registry
  return @model_registry if @model_registry

  @model_registry = ModelRegistry.new
end
26
+
27
# Registers a named model on this configuration's registry.
#
# @param name [String, Symbol] name handed through to the registry
# @param opts [Hash] keyword options forwarded unchanged to the registry's +register+
# @return [Object] whatever the registry's +register+ returns
def register_model(name, **opts)
  registry = model_registry
  registry.register(name, **opts)
end
15
30
  end
16
31
  end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
module NerRuby
  # In-memory key/value store in which every operation runs while holding a
  # single Mutex.
  class ModelCache
    def initialize
      @lock = Mutex.new
      @entries = {}
    end

    # @param key [Object]
    # @return [Object, nil] the stored value, or nil when the key is absent
    def get(key)
      with_lock { @entries[key] }
    end

    # Stores +value+ under +key+, replacing any previous value.
    #
    # @return [Object] the value that was stored
    def set(key, value)
      with_lock { @entries[key] = value }
    end

    # @return [Boolean] true when +key+ is present, even if its value is nil
    def has?(key)
      with_lock { @entries.key?(key) }
    end

    # Removes every entry.
    def clear
      with_lock { @entries.clear }
    end

    # @return [Integer] number of stored entries
    def size
      with_lock { @entries.size }
    end

    private

    # Runs the given block while holding the cache's mutex and returns its value.
    def with_lock(&block)
      @lock.synchronize(&block)
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
module NerRuby
  # Lookup table of named NER models: a fixed set of built-in entries plus
  # entries registered at runtime.
  #
  # Built-in entries carry the keys :repo_id, :model_file, :tokenizer and
  # :label_map. Entries added through #register carry :repo_id, :model_path,
  # :tokenizer_path and :label_map.
  class ModelRegistry
    # Built-in model descriptions. Every nested hash is frozen as well, so a
    # caller holding the result of #get cannot mutate the shared constant.
    BUILT_IN = {
      english: {
        repo_id: "dslim/bert-base-NER",
        model_file: "model.onnx",
        tokenizer: "dslim/bert-base-NER",
        label_map: { 0 => "O", 1 => "B-MISC", 2 => "I-MISC", 3 => "B-PER",
                     4 => "I-PER", 5 => "B-ORG", 6 => "I-ORG", 7 => "B-LOC", 8 => "I-LOC" }.freeze
      }.freeze,
      indonesian: {
        repo_id: "cahya/bert-base-indonesian-NER",
        model_file: "model.onnx",
        tokenizer: "cahya/bert-base-indonesian-NER",
        label_map: { 0 => "O", 1 => "B-PER", 2 => "I-PER", 3 => "B-LOC",
                     4 => "I-LOC", 5 => "B-ORG", 6 => "I-ORG" }.freeze
      }.freeze,
      multilingual: {
        repo_id: "Davlan/bert-base-multilingual-cased-ner-hrl",
        model_file: "model.onnx",
        tokenizer: "Davlan/bert-base-multilingual-cased-ner-hrl",
        label_map: { 0 => "O", 1 => "B-PER", 2 => "I-PER", 3 => "B-ORG",
                     4 => "I-ORG", 5 => "B-LOC", 6 => "I-LOC", 7 => "B-DATE", 8 => "I-DATE" }.freeze
      }.freeze
    }.freeze

    def initialize
      @custom = {}
    end

    # Adds or replaces a custom entry. A custom entry registered under a
    # built-in name takes precedence over the built-in one in #get.
    #
    # @param name [String, Symbol] entry name, stored as a Symbol
    # @param repo_id [String, nil]
    # @param model_path [String, nil]
    # @param tokenizer_path [String, nil]
    # @param label_map [Hash, nil]
    # @return [Hash] the stored entry
    def register(name, repo_id: nil, model_path: nil, tokenizer_path: nil, label_map: nil)
      @custom[name.to_sym] = {
        repo_id: repo_id,
        model_path: model_path,
        tokenizer_path: tokenizer_path,
        label_map: label_map
      }
    end

    # Looks up an entry by name, preferring custom entries over built-ins.
    #
    # @param name [String, Symbol, nil]
    # @return [Hash, nil] the entry, or nil when the name is unknown or cannot
    #   be converted to a Symbol (e.g. nil)
    def get(name)
      # Treat names without #to_sym as "unknown" so callers see their own
      # not-found handling instead of a NoMethodError.
      return nil unless name.respond_to?(:to_sym)

      key = name.to_sym
      @custom[key] || BUILT_IN[key]
    end

    # @return [Array<Symbol>] built-in names followed by custom names, without duplicates
    def available
      (BUILT_IN.keys + @custom.keys).uniq
    end
  end
end
@@ -2,6 +2,8 @@
2
2
 
3
3
  module NerRuby
4
4
  class Recognizer
5
+ @@cache = ModelCache.new
6
+
5
7
  def initialize(model: nil, tokenizer: nil, label_map: nil, backend: nil, provider: nil, api_key: nil)
6
8
  config = NerRuby.configuration
7
9
 
@@ -14,8 +16,16 @@ module NerRuby
14
16
  if model_path && tokenizer_path
15
17
  raise ModelNotFoundError, "Model not found: #{model_path}" unless File.exist?(model_path)
16
18
 
17
- @model = Models::Onnx.new(model_path: model_path)
18
- @tokenizer = load_tokenizer(tokenizer_path)
19
+ if config.enable_cache && @@cache.has?(model_path)
20
+ cached = @@cache.get(model_path)
21
+ @model = cached[:model]
22
+ @tokenizer = cached[:tokenizer]
23
+ else
24
+ @model = Models::Onnx.new(model_path: model_path)
25
+ @tokenizer = load_tokenizer(tokenizer_path)
26
+ @@cache.set(model_path, { model: @model, tokenizer: @tokenizer }) if config.enable_cache
27
+ end
28
+
19
29
  detected_label_map = label_map || @model.label_map
20
30
  @decoder = Decoder.new(label_map: detected_label_map)
21
31
  @pipeline = Pipeline.new(model: @model, tokenizer: @tokenizer, decoder: @decoder)
@@ -23,6 +33,24 @@ module NerRuby
23
33
  end
24
34
  end
25
35
 
36
# Builds a recognizer for a model name known to the configured registry.
#
# When the registry entry has both :model_path and :tokenizer_path, those
# (plus the entry's :label_map) are passed to the constructor. Any other
# entry falls back to the API backend with the :huggingface provider.
#
# @param name [String, Symbol] registered model name
# @return [Recognizer]
# @raise [Error] when the registry has no entry for +name+
def self.from_pretrained(name)
  registry = NerRuby.configuration.model_registry
  entry = registry.get(name)

  unless entry
    known = registry.available.join(', ')
    raise Error, "Unknown model: #{name}. Available: #{known}"
  end

  has_local_files = entry[:model_path] && entry[:tokenizer_path]
  # API-based fallback
  return new(backend: :api, provider: :huggingface) unless has_local_files

  new(
    model: entry[:model_path],
    tokenizer: entry[:tokenizer_path],
    label_map: entry[:label_map]
  )
end
53
+
26
54
  def recognize(text, labels: nil)
27
55
  return [] if text.nil? || text.strip.empty?
28
56
  validate_labels!(labels) if labels
@@ -34,19 +62,29 @@ module NerRuby
34
62
  entities = @pipeline.call(text)
35
63
  end
36
64
 
65
+ # Merge adjacent entities of the same type
66
+ if NerRuby.configuration.merge_adjacent
67
+ entities = merge_adjacent_entities(entities)
68
+ end
69
+
70
+ # Filter by labels
37
71
  if labels
38
72
  label_syms = labels.map(&:to_sym)
39
73
  entities = entities.select { |e| label_syms.include?(e.label) }
40
74
  end
41
75
 
42
- min = NerRuby.configuration.min_score
43
- entities.select { |e| e.score >= min }
76
+ # Filter by per-type or global min_score
77
+ filter_by_score(entities)
44
78
  end
45
79
 
46
80
  def recognize_batch(texts, labels: nil)
47
81
  texts.map { |text| recognize(text, labels: labels) }
48
82
  end
49
83
 
84
# Empties the class-level cache that the constructor uses to reuse loaded
# model/tokenizer pairs.
#
# @return [Object] the result of the cache's +clear+
def self.clear_cache
  @@cache.clear
end
87
+
50
88
  private
51
89
 
52
90
  def load_tokenizer(path)
@@ -59,5 +97,43 @@ module NerRuby
59
97
  raise ValidationError, "labels must be an array of symbols or strings"
60
98
  end
61
99
  end
100
+
101
# Collapses runs of neighbouring entities that share a label into one entity.
#
# Entities are walked in the given order. Each one is compared with the most
# recent result: if the labels match and #adjacent? holds, the two are
# replaced by a new Entity whose text is both texts joined by a space, whose
# span runs from the first start to the second end, and whose score is the
# mean of the two scores rounded to 4 decimals.
#
# @param entities [Array<Entity>]
# @return [Array<Entity>] the merged list (the input itself when it is empty)
def merge_adjacent_entities(entities)
  return entities if entities.empty?

  entities.drop(1).each_with_object([entities.first]) do |current, acc|
    last = acc.last

    unless last.label == current.label && adjacent?(last, current)
      acc << current
      next
    end

    # Replace the tail of the result with the combined entity.
    mean_score = ((last.score + current.score) / 2.0).round(4)
    acc[-1] = Entity.new(
      text: "#{last.text} #{current.text}",
      label: last.label,
      start_offset: last.start_offset,
      end_offset: current.end_offset,
      score: mean_score
    )
  end
end
122
+
123
# Tells whether +b+ starts within one position of where +a+ ends.
#
# @param a [#end_offset] earlier entity
# @param b [#start_offset] later entity
# @return [Boolean] false when either offset is missing, otherwise whether
#   the absolute distance between a's end and b's start is at most 1
def adjacent?(a, b)
  return false unless a.end_offset && b.start_offset

  (b.start_offset - a.end_offset).abs <= 1
end
127
+
128
# Keeps only the entities whose score reaches the threshold for their label.
#
# The threshold for an entity is the per-type value configured in
# +min_scores_per_type+ for its label, falling back to the global
# +min_score+. Per-type keys may be Symbols or Strings: an exact key match
# wins, otherwise keys are compared by their string form, so
# { "PER" => 0.9 } applies to entities labelled :PER. A nil
# +min_scores_per_type+ is treated as "no per-type thresholds".
#
# @param entities [Array<#label, #score>]
# @return [Array] the entities that pass their threshold, in original order
def filter_by_score(entities)
  config = NerRuby.configuration
  global_min = config.min_score
  per_type = config.min_scores_per_type || {}
  # Name-keyed copy so String-keyed thresholds are not silently ignored.
  by_name = per_type.transform_keys(&:to_s)

  entities.select do |entity|
    threshold = per_type[entity.label] || by_name[entity.label.to_s] || global_min
    entity.score >= threshold
  end
end
62
138
  end
63
139
  end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
module NerRuby
  # Splits long token sequences into overlapping windows and reconciles the
  # entities reported for overlapping spans.
  class SlidingWindow
    DEFAULT_MAX_LENGTH = 512
    DEFAULT_STRIDE = 128

    # @param max_length [Integer] maximum number of tokens per window
    # @param stride [Integer] number of tokens consecutive windows share;
    #   windows start +max_length - stride+ tokens apart
    def initialize(max_length: DEFAULT_MAX_LENGTH, stride: DEFAULT_STRIDE)
      @max_length = max_length
      @stride = stride
    end

    # Split tokens into overlapping windows.
    #
    # @param tokens [Array] token sequence
    # @param ids [Array] ids sliced with the same ranges as +tokens+
    # @return [Array<Hash>] windows of the form
    #   { tokens:, ids:, offset: } where :offset is the window's start index
    #   in +tokens+; a single window at offset 0 when everything fits
    def split(tokens, ids)
      return [{ tokens: tokens, ids: ids, offset: 0 }] if tokens.length <= @max_length

      # A stride >= max_length would give a step of zero or less and the loop
      # below would never finish; always advance by at least one token.
      step = [@max_length - @stride, 1].max
      windows = []
      start = 0

      while start < tokens.length
        window_end = [start + @max_length, tokens.length].min
        windows << {
          tokens: tokens[start...window_end],
          ids: ids[start...window_end],
          offset: start
        }
        # The window that reaches the end of the input is the last one.
        break if window_end >= tokens.length

        start += step
      end

      windows
    end

    # Merge entities from overlapping windows, preferring higher scores.
    #
    # Each incoming entity is compared with the first already-kept entity it
    # overlaps; the higher-scoring of the two is kept (ties keep the earlier).
    #
    # @param window_results [Array<Array>] one entity list per window
    # @return [Array] kept entities sorted by start_offset (nil sorts as 0)
    def merge_entities(window_results)
      all_entities = []

      window_results.each do |entities|
        entities.each do |entity|
          existing = all_entities.find { |e| overlaps?(e, entity) }
          if existing.nil?
            all_entities << entity
          elsif entity.score > existing.score
            # Keep the one with higher score
            all_entities.delete(existing)
            all_entities << entity
          end
        end
      end

      all_entities.sort_by { |e| e.start_offset || 0 }
    end

    private

    # True when both entities have complete offsets and their spans intersect.
    def overlaps?(a, b)
      return false unless a.start_offset && b.start_offset && a.end_offset && b.end_offset

      a.start_offset < b.end_offset && b.start_offset < a.end_offset
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module NerRuby
4
- VERSION = "0.1.1"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/ner_ruby.rb CHANGED
@@ -9,6 +9,9 @@ require_relative "ner_ruby/pipeline"
9
9
  require_relative "ner_ruby/models/base"
10
10
  require_relative "ner_ruby/models/onnx"
11
11
  require_relative "ner_ruby/models/api"
12
+ require_relative "ner_ruby/model_registry"
13
+ require_relative "ner_ruby/model_cache"
14
+ require_relative "ner_ruby/sliding_window"
12
15
  require_relative "ner_ruby/recognizer"
13
16
 
14
17
  module NerRuby
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ner-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Johannes Dwi Cahyo
@@ -96,11 +96,14 @@ files:
96
96
  - lib/ner_ruby/decoder.rb
97
97
  - lib/ner_ruby/entity.rb
98
98
  - lib/ner_ruby/error.rb
99
+ - lib/ner_ruby/model_cache.rb
100
+ - lib/ner_ruby/model_registry.rb
99
101
  - lib/ner_ruby/models/api.rb
100
102
  - lib/ner_ruby/models/base.rb
101
103
  - lib/ner_ruby/models/onnx.rb
102
104
  - lib/ner_ruby/pipeline.rb
103
105
  - lib/ner_ruby/recognizer.rb
106
+ - lib/ner_ruby/sliding_window.rb
104
107
  - lib/ner_ruby/version.rb
105
108
  - ner-ruby.gemspec
106
109
  homepage: https://github.com/johannesdwicahyo/ner-ruby