ner-ruby 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +25 -10
- data/lib/ner_ruby/configuration.rb +16 -1
- data/lib/ner_ruby/model_cache.rb +30 -0
- data/lib/ner_ruby/model_registry.rb +50 -0
- data/lib/ner_ruby/recognizer.rb +80 -4
- data/lib/ner_ruby/sliding_window.rb +63 -0
- data/lib/ner_ruby/version.rb +1 -1
- data/lib/ner_ruby.rb +3 -0
- metadata +4 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 6c55a89d813528d2927bdf620869fa10c5660155aad6b15033f806426142c171
|
|
4
|
+
data.tar.gz: e0e58c4ac730b697b8e3b24c4228ca64bc2be048c346df63c8a9713d58fa0da1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 328af572e8556ebf0527288cab60bebb9e11441c29bdee5a68eeb835324d76d1d8f70cc70b2ff7b5190e569c7ab848690304492492183cbf41dc93b62a9d27fe
|
|
7
|
+
data.tar.gz: 603134f836d15da0af2068a26d1c99e47c311ada80a80167257b30841b3e4cf730e5ac062ce8090d32b5c180bfc6aa4c3027094b3ecf77299228a7405acb59ae
|
data/README.md
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
# ner-ruby
|
|
2
2
|
|
|
3
|
-
Named Entity Recognition for Ruby using ONNX models.
|
|
3
|
+
Named Entity Recognition for Ruby. Extract entities (people, places, organizations) from text using ONNX models or API backends.
|
|
4
4
|
|
|
5
5
|
## Installation
|
|
6
6
|
|
|
7
7
|
```ruby
|
|
8
|
-
gem "ner-ruby"
|
|
8
|
+
gem "ner-ruby"
|
|
9
9
|
```
|
|
10
10
|
|
|
11
11
|
## Usage
|
|
@@ -13,19 +13,34 @@ gem "ner-ruby", "~> 0.1"
|
|
|
13
13
|
```ruby
|
|
14
14
|
require "ner_ruby"
|
|
15
15
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
16
|
+
# ONNX backend
|
|
17
|
+
recognizer = NerRuby::Recognizer.new(
|
|
18
|
+
model_path: "path/to/model.onnx",
|
|
19
|
+
labels: [:PER, :LOC, :ORG, :MISC]
|
|
19
20
|
)
|
|
20
21
|
|
|
21
|
-
entities =
|
|
22
|
-
|
|
22
|
+
entities = recognizer.recognize("John works at Google in Mountain View")
|
|
23
|
+
entities.each do |e|
|
|
24
|
+
puts "#{e.text} (#{e.label}) [#{e.start_offset}:#{e.end_offset}] score=#{e.score}"
|
|
25
|
+
end
|
|
23
26
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
+
# API backend
|
|
28
|
+
recognizer = NerRuby::Recognizer.new(
|
|
29
|
+
backend: :api,
|
|
30
|
+
provider: :openai,
|
|
31
|
+
api_key: ENV["OPENAI_API_KEY"]
|
|
32
|
+
)
|
|
27
33
|
```
|
|
28
34
|
|
|
35
|
+
## Features
|
|
36
|
+
|
|
37
|
+
- ONNX Runtime inference with auto label map from config.json
|
|
38
|
+
- API backend support (OpenAI, etc.)
|
|
39
|
+
- IOB/BIO tag decoding with wordpiece token merging
|
|
40
|
+
- Character span offsets (start_offset, end_offset)
|
|
41
|
+
- Numerically stable softmax
|
|
42
|
+
- Empty/nil text guards
|
|
43
|
+
|
|
29
44
|
## License
|
|
30
45
|
|
|
31
46
|
MIT
|
|
@@ -3,7 +3,9 @@
|
|
|
3
3
|
module NerRuby
|
|
4
4
|
class Configuration
|
|
5
5
|
attr_accessor :default_model_path, :default_tokenizer_path,
|
|
6
|
-
:default_labels, :min_score, :batch_size
|
|
6
|
+
:default_labels, :min_score, :batch_size,
|
|
7
|
+
:min_scores_per_type, :enable_cache,
|
|
8
|
+
:max_length, :stride, :merge_adjacent
|
|
7
9
|
|
|
8
10
|
def initialize
|
|
9
11
|
@default_model_path = nil
|
|
@@ -11,6 +13,19 @@ module NerRuby
|
|
|
11
13
|
@default_labels = nil
|
|
12
14
|
@min_score = 0.5
|
|
13
15
|
@batch_size = 32
|
|
16
|
+
@min_scores_per_type = {}
|
|
17
|
+
@enable_cache = true
|
|
18
|
+
@max_length = 512
|
|
19
|
+
@stride = 128
|
|
20
|
+
@merge_adjacent = true
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def model_registry
|
|
24
|
+
@model_registry ||= ModelRegistry.new
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def register_model(name, **opts)
|
|
28
|
+
model_registry.register(name, **opts)
|
|
14
29
|
end
|
|
15
30
|
end
|
|
16
31
|
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NerRuby
  # Small in-memory key/value store whose every operation runs inside a
  # single Mutex, so individual calls are safe to make from several threads.
  #
  # Separate calls (for example +has?+ followed by +get+ or +set+) are NOT
  # atomic as a group; use +fetch+ when "look up, otherwise build and store"
  # must happen as one step.
  class ModelCache
    def initialize
      @cache = {}
      @mutex = Mutex.new
    end

    # @param key [Object] cache key
    # @return [Object, nil] the stored value, or nil when the key is absent
    def get(key)
      @mutex.synchronize { @cache[key] }
    end

    # Stores +value+ under +key+, replacing any previous value.
    #
    # @return [Object] the value that was stored
    def set(key, value)
      @mutex.synchronize { @cache[key] = value }
    end

    # @return [Boolean] true when +key+ is present (even if its value is nil)
    def has?(key)
      @mutex.synchronize { @cache.key?(key) }
    end
    alias key? has?

    # Atomic get-or-compute: returns the value stored under +key+; when the
    # key is absent, stores and returns the block's result instead.
    #
    # The block runs while the lock is held, so it is executed at most once
    # per missing key even under concurrent callers. Because Mutex is not
    # re-entrant, the block must not call back into this cache.
    #
    # @yieldparam key [Object] the missing key
    # @raise [KeyError] when the key is absent and no block is given
    def fetch(key)
      @mutex.synchronize do
        if @cache.key?(key)
          @cache[key]
        elsif block_given?
          @cache[key] = yield(key)
        else
          raise KeyError, "key not found: #{key.inspect}"
        end
      end
    end

    # Removes every entry.
    def clear
      @mutex.synchronize { @cache.clear }
    end

    # @return [Integer] number of stored entries
    def size
      @mutex.synchronize { @cache.size }
    end
  end
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NerRuby
  # Lookup table from a model name to the information needed to load it.
  #
  # Names are normalised with +to_sym+. Entries registered at runtime take
  # precedence over the built-in ones of the same name.
  #
  # Note the two entry shapes differ: built-in entries carry +:repo_id+,
  # +:model_file+, +:tokenizer+ and +:label_map+, while registered entries
  # carry +:repo_id+, +:model_path+, +:tokenizer_path+ and +:label_map+.
  class ModelRegistry
    # Built-in model descriptions. Frozen all the way down (the outer hash,
    # each entry, and each entry's values including the label map) because
    # +get+ hands these very objects to callers; a shallow freeze would let
    # one caller silently alter the definition for the whole process.
    BUILT_IN = {
      english: {
        repo_id: "dslim/bert-base-NER",
        model_file: "model.onnx",
        tokenizer: "dslim/bert-base-NER",
        label_map: { 0 => "O", 1 => "B-MISC", 2 => "I-MISC", 3 => "B-PER",
                     4 => "I-PER", 5 => "B-ORG", 6 => "I-ORG", 7 => "B-LOC", 8 => "I-LOC" }
      },
      indonesian: {
        repo_id: "cahya/bert-base-indonesian-NER",
        model_file: "model.onnx",
        tokenizer: "cahya/bert-base-indonesian-NER",
        label_map: { 0 => "O", 1 => "B-PER", 2 => "I-PER", 3 => "B-LOC",
                     4 => "I-LOC", 5 => "B-ORG", 6 => "I-ORG" }
      },
      multilingual: {
        repo_id: "Davlan/bert-base-multilingual-cased-ner-hrl",
        model_file: "model.onnx",
        tokenizer: "Davlan/bert-base-multilingual-cased-ner-hrl",
        label_map: { 0 => "O", 1 => "B-PER", 2 => "I-PER", 3 => "B-ORG",
                     4 => "I-ORG", 5 => "B-LOC", 6 => "I-LOC", 7 => "B-DATE", 8 => "I-DATE" }
      }
    }.each_value do |info|
      info[:label_map].each_value(&:freeze)
      info.each_value(&:freeze).freeze
    end.freeze

    def initialize
      @custom = {}
    end

    # Adds (or replaces) a runtime entry under +name+.
    #
    # @param name [#to_sym] model name
    # @param repo_id [String, nil]
    # @param model_path [String, nil]
    # @param tokenizer_path [String, nil]
    # @param label_map [Hash, nil]
    # @return [Hash] the entry that was stored
    def register(name, repo_id: nil, model_path: nil, tokenizer_path: nil, label_map: nil)
      @custom[name.to_sym] = {
        repo_id: repo_id,
        model_path: model_path,
        tokenizer_path: tokenizer_path,
        label_map: label_map
      }
    end

    # @param name [#to_sym, nil] model name
    # @return [Hash, nil] the registered entry, else the built-in entry,
    #   else nil. A name that cannot be converted to a symbol (such as nil)
    #   is simply unknown and yields nil rather than a NoMethodError.
    def get(name)
      return nil unless name.respond_to?(:to_sym)

      key = name.to_sym
      @custom[key] || BUILT_IN[key]
    end

    # @return [Array<Symbol>] built-in names followed by registered names,
    #   without duplicates
    def available
      BUILT_IN.keys | @custom.keys
    end
  end
end
|
data/lib/ner_ruby/recognizer.rb
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
module NerRuby
|
|
4
4
|
class Recognizer
|
|
5
|
+
@@cache = ModelCache.new
|
|
6
|
+
|
|
5
7
|
def initialize(model: nil, tokenizer: nil, label_map: nil, backend: nil, provider: nil, api_key: nil)
|
|
6
8
|
config = NerRuby.configuration
|
|
7
9
|
|
|
@@ -14,8 +16,16 @@ module NerRuby
|
|
|
14
16
|
if model_path && tokenizer_path
|
|
15
17
|
raise ModelNotFoundError, "Model not found: #{model_path}" unless File.exist?(model_path)
|
|
16
18
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
+
if config.enable_cache && @@cache.has?(model_path)
|
|
20
|
+
cached = @@cache.get(model_path)
|
|
21
|
+
@model = cached[:model]
|
|
22
|
+
@tokenizer = cached[:tokenizer]
|
|
23
|
+
else
|
|
24
|
+
@model = Models::Onnx.new(model_path: model_path)
|
|
25
|
+
@tokenizer = load_tokenizer(tokenizer_path)
|
|
26
|
+
@@cache.set(model_path, { model: @model, tokenizer: @tokenizer }) if config.enable_cache
|
|
27
|
+
end
|
|
28
|
+
|
|
19
29
|
detected_label_map = label_map || @model.label_map
|
|
20
30
|
@decoder = Decoder.new(label_map: detected_label_map)
|
|
21
31
|
@pipeline = Pipeline.new(model: @model, tokenizer: @tokenizer, decoder: @decoder)
|
|
@@ -23,6 +33,24 @@ module NerRuby
|
|
|
23
33
|
end
|
|
24
34
|
end
|
|
25
35
|
|
|
36
|
+
# Build a recognizer from a name known to the configured model registry.
#
# When the registry entry provides both a model path and a tokenizer path,
# the recognizer is constructed from those (plus the entry's label map).
# Any other entry falls back to the API backend with the :huggingface
# provider.
#
# Raises Error, listing the available names, when the name is unknown.
def self.from_pretrained(name)
  registry = NerRuby.configuration.model_registry
  entry = registry.get(name)
  unless entry
    raise Error, "Unknown model: #{name}. Available: #{registry.available.join(', ')}"
  end

  local_model, local_tokenizer = entry.values_at(:model_path, :tokenizer_path)

  # API-based fallback
  return new(backend: :api, provider: :huggingface) unless local_model && local_tokenizer

  new(model: local_model, tokenizer: local_tokenizer, label_map: entry[:label_map])
end
|
|
53
|
+
|
|
26
54
|
def recognize(text, labels: nil)
|
|
27
55
|
return [] if text.nil? || text.strip.empty?
|
|
28
56
|
validate_labels!(labels) if labels
|
|
@@ -34,19 +62,29 @@ module NerRuby
|
|
|
34
62
|
entities = @pipeline.call(text)
|
|
35
63
|
end
|
|
36
64
|
|
|
65
|
+
# Merge adjacent entities of the same type
|
|
66
|
+
if NerRuby.configuration.merge_adjacent
|
|
67
|
+
entities = merge_adjacent_entities(entities)
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
# Filter by labels
|
|
37
71
|
if labels
|
|
38
72
|
label_syms = labels.map(&:to_sym)
|
|
39
73
|
entities = entities.select { |e| label_syms.include?(e.label) }
|
|
40
74
|
end
|
|
41
75
|
|
|
42
|
-
|
|
43
|
-
entities
|
|
76
|
+
# Filter by per-type or global min_score
|
|
77
|
+
filter_by_score(entities)
|
|
44
78
|
end
|
|
45
79
|
|
|
46
80
|
# Runs +recognize+ on each text and returns the results in input order.
#
# +texts+ is coerced with Kernel#Array, so nil yields an empty result
# (in line with +recognize+ returning [] for nil text) and a single string
# is treated as a one-element batch.
#
# @param texts [Array<String>, String, nil]
# @param labels [Array<Symbol, String>, nil] forwarded to +recognize+
# @return [Array] one +recognize+ result per input text
def recognize_batch(texts, labels: nil)
  Array(texts).map { |text| recognize(text, labels: labels) }
end
|
|
49
83
|
|
|
84
|
+
def self.clear_cache
|
|
85
|
+
@@cache.clear
|
|
86
|
+
end
|
|
87
|
+
|
|
50
88
|
private
|
|
51
89
|
|
|
52
90
|
def load_tokenizer(path)
|
|
@@ -59,5 +97,43 @@ module NerRuby
|
|
|
59
97
|
raise ValidationError, "labels must be an array of symbols or strings"
|
|
60
98
|
end
|
|
61
99
|
end
|
|
100
|
+
|
|
101
|
+
# Collapses each run of consecutive entities that share a label and are
# adjacent (see +adjacent?+) into a single entity.
#
# Entities that are not part of a run are returned as the same objects.
# An empty input is returned unchanged.
#
# @param entities [Array] objects responding to text, label, start_offset,
#   end_offset and score, in document order
# @return [Array] entities with adjacent same-label runs merged
def merge_adjacent_entities(entities)
  return entities if entities.empty?

  entities
    .chunk_while { |left, right| left.label == right.label && adjacent?(left, right) }
    .map { |run| run.size == 1 ? run.first : merge_entity_run(run) }
end

# Builds one entity spanning a whole run of adjacent same-label entities.
#
# The score is the plain mean over the run, rounded once. Averaging pair by
# pair would weight later entities more heavily as soon as three or more
# are merged.
def merge_entity_run(run)
  pieces = [run.first.text]
  run.each_cons(2) do |left, right|
    # Only put a space where the spans actually leave a gap; contiguous
    # spans are joined directly so the text length keeps matching the span.
    # NOTE(review): assumes end_offset is exclusive - confirm against Entity.
    pieces << (right.start_offset > left.end_offset ? " " : "") << right.text
  end

  Entity.new(
    text: pieces.join,
    label: run.first.label,
    start_offset: run.first.start_offset,
    end_offset: run.last.end_offset,
    score: (run.sum(&:score) / run.size.to_f).round(4)
  )
end

# True when +b+ starts within one character of where +a+ ends.
# Entities lacking the relevant offset are never adjacent.
def adjacent?(a, b)
  return false unless a.end_offset && b.start_offset

  (b.start_offset - a.end_offset).abs <= 1
end
|
|
127
|
+
|
|
128
|
+
# Keeps the entities whose score reaches the applicable threshold.
#
# The threshold is the entry for the entity's label in
# +configuration.min_scores_per_type+ when there is one, otherwise
# +configuration.min_score+. An exact key match wins; failing that, keys are
# compared by their string form so that { "PER" => 0.8 } and { PER: 0.8 }
# both apply to an entity labelled :PER instead of being silently ignored.
#
# @param entities [Array] objects responding to label and score
# @return [Array] the entities that pass
def filter_by_score(entities)
  config = NerRuby.configuration
  per_type = config.min_scores_per_type || {}
  global_min = config.min_score

  # Built once, outside the loop: thresholds indexed by the key's string form.
  by_name = per_type.to_h { |label, minimum| [label.to_s, minimum] }

  entities.select do |entity|
    threshold = per_type[entity.label] || by_name[entity.label.to_s] || global_min
    entity.score >= threshold
  end
end
|
|
62
138
|
end
|
|
63
139
|
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NerRuby
  # Splits a long token sequence into overlapping windows and reconciles the
  # entities found in those windows.
  class SlidingWindow
    DEFAULT_MAX_LENGTH = 512
    DEFAULT_STRIDE = 128

    # @param max_length [Integer] maximum number of tokens per window
    # @param stride [Integer] number of tokens shared by consecutive windows;
    #   each window starts +max_length - stride+ tokens after the previous one
    # @raise [ArgumentError] when +max_length+ is not a positive integer or
    #   +stride+ is not an integer in 0...max_length. A stride of max_length
    #   or more would give a step of zero or less, and +split+ would then
    #   never reach the end of a long input.
    def initialize(max_length: DEFAULT_MAX_LENGTH, stride: DEFAULT_STRIDE)
      unless max_length.is_a?(Integer) && max_length.positive?
        raise ArgumentError, "max_length must be a positive integer (got #{max_length.inspect})"
      end
      unless stride.is_a?(Integer) && stride >= 0 && stride < max_length
        raise ArgumentError, "stride must be an integer in 0...#{max_length} (got #{stride.inspect})"
      end

      @max_length = max_length
      @stride = stride
    end

    # Split tokens into overlapping windows.
    #
    # @param tokens [Array] token sequence
    # @param ids [Array] sequence sliced with the same ranges as +tokens+
    # @return [Array<Hash>] hashes with :tokens, :ids and :offset, where
    #   :offset is the index in +tokens+ of the window's first token. Input
    #   that already fits in one window is returned as a single window
    #   holding the original arrays.
    def split(tokens, ids)
      total = tokens.length
      return [{ tokens: tokens, ids: ids, offset: 0 }] if total <= @max_length

      step = @max_length - @stride # always >= 1, enforced in initialize
      windows = []
      start = 0

      loop do
        stop = [start + @max_length, total].min
        windows << { tokens: tokens[start...stop], ids: ids[start...stop], offset: start }
        break if stop >= total

        start += step
      end

      windows
    end

    # Merge entities from overlapping windows, preferring higher scores.
    #
    # A candidate is accepted only when its score is strictly higher than
    # that of EVERY already-accepted entity it overlaps, and it then replaces
    # all of them. Comparing against just the first overlapping entity could
    # leave overlapping entities in the result. Entities without offsets
    # never overlap anything and are always kept.
    #
    # @param window_results [Array<Array>] one entity list per window;
    #   entities respond to start_offset, end_offset and score
    # @return [Array] accepted entities ordered by start_offset (nil sorts as 0)
    def merge_entities(window_results)
      kept = []

      window_results.each do |entities|
        entities.each do |candidate|
          rivals, others = kept.partition { |entity| overlaps?(entity, candidate) }
          kept = others << candidate if rivals.all? { |rival| candidate.score > rival.score }
        end
      end

      kept.sort_by { |entity| entity.start_offset || 0 }
    end

    private

    # Half-open interval intersection; false when any offset is missing.
    def overlaps?(a, b)
      return false unless a.start_offset && b.start_offset && a.end_offset && b.end_offset

      a.start_offset < b.end_offset && b.start_offset < a.end_offset
    end
  end
end
|
data/lib/ner_ruby/version.rb
CHANGED
data/lib/ner_ruby.rb
CHANGED
|
@@ -9,6 +9,9 @@ require_relative "ner_ruby/pipeline"
|
|
|
9
9
|
require_relative "ner_ruby/models/base"
|
|
10
10
|
require_relative "ner_ruby/models/onnx"
|
|
11
11
|
require_relative "ner_ruby/models/api"
|
|
12
|
+
require_relative "ner_ruby/model_registry"
|
|
13
|
+
require_relative "ner_ruby/model_cache"
|
|
14
|
+
require_relative "ner_ruby/sliding_window"
|
|
12
15
|
require_relative "ner_ruby/recognizer"
|
|
13
16
|
|
|
14
17
|
module NerRuby
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ner-ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Johannes Dwi Cahyo
|
|
@@ -96,11 +96,14 @@ files:
|
|
|
96
96
|
- lib/ner_ruby/decoder.rb
|
|
97
97
|
- lib/ner_ruby/entity.rb
|
|
98
98
|
- lib/ner_ruby/error.rb
|
|
99
|
+
- lib/ner_ruby/model_cache.rb
|
|
100
|
+
- lib/ner_ruby/model_registry.rb
|
|
99
101
|
- lib/ner_ruby/models/api.rb
|
|
100
102
|
- lib/ner_ruby/models/base.rb
|
|
101
103
|
- lib/ner_ruby/models/onnx.rb
|
|
102
104
|
- lib/ner_ruby/pipeline.rb
|
|
103
105
|
- lib/ner_ruby/recognizer.rb
|
|
106
|
+
- lib/ner_ruby/sliding_window.rb
|
|
104
107
|
- lib/ner_ruby/version.rb
|
|
105
108
|
- ner-ruby.gemspec
|
|
106
109
|
homepage: https://github.com/johannesdwicahyo/ner-ruby
|