ner-ruby 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +24 -0
- data/LICENSE +21 -0
- data/README.md +31 -0
- data/Rakefile +11 -0
- data/lib/ner_ruby/configuration.rb +16 -0
- data/lib/ner_ruby/decoder.rb +108 -0
- data/lib/ner_ruby/entity.rb +43 -0
- data/lib/ner_ruby/error.rb +10 -0
- data/lib/ner_ruby/models/api.rb +102 -0
- data/lib/ner_ruby/models/base.rb +11 -0
- data/lib/ner_ruby/models/onnx.rb +48 -0
- data/lib/ner_ruby/pipeline.rb +33 -0
- data/lib/ner_ruby/recognizer.rb +63 -0
- data/lib/ner_ruby/version.rb +5 -0
- data/lib/ner_ruby.rb +28 -0
- data/ner-ruby.gemspec +36 -0
- metadata +130 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: c1a6a8375878db28b788d78ae9c92ac2628321d1aa65a5368daf6fcd1701b569
|
|
4
|
+
data.tar.gz: 922ace5c18cf9949d5f92998b31d2c7bdf8097589ffc355c0f7b3a1b7151bf05
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 11f6b9feee7c04a79ed35c0eba290ce7d74b680d12dde7fddd403542e567180539328e1b85b80d6687984f51438ef17e5b26dfa9dcadbcf0265834ec076b006b
|
|
7
|
+
data.tar.gz: ac6dcd6c2292c65750d34cd76a379cf0597597e9733b2142529427f60ad6f9bb27beef42e32fddbdc09cf8a96ae58cc38c6b26f32d0fbd9bb5891c26acd0f38d
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.1.1 (2026-03-09)
|
|
4
|
+
|
|
5
|
+
### Fixed
|
|
6
|
+
- Wire API backend to Recognizer via `backend: :api` option
|
|
7
|
+
- Softmax numerical stability (subtract max before exp, clamp output to 0.0..1.0)
|
|
8
|
+
- Empty/nil text input returns empty array instead of crashing
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
- Model config auto-detection: reads `config.json` alongside ONNX model for label map
|
|
12
|
+
- Entity character span offsets (`start_offset`, `end_offset`) in original text
|
|
13
|
+
- Input validation for model path, API key, and labels parameter
|
|
14
|
+
- `ValidationError` and `ConfigurationError` error classes
|
|
15
|
+
|
|
16
|
+
## 0.1.0 (2026-03-09)
|
|
17
|
+
|
|
18
|
+
- Initial release
|
|
19
|
+
- Named Entity Recognition using ONNX models
|
|
20
|
+
- IOB/BIO tag decoder with sub-token merging
|
|
21
|
+
- Entity result objects with type predicates
|
|
22
|
+
- Support for PER, LOC, ORG, MISC entity types
|
|
23
|
+
- API backend for OpenAI and HuggingFace
|
|
24
|
+
- Configuration DSL
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Johannes Dwi Cahyo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# ner-ruby
|
|
2
|
+
|
|
3
|
+
Named Entity Recognition for Ruby using ONNX models.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```ruby
|
|
8
|
+
gem "ner-ruby", "~> 0.1"
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```ruby
|
|
14
|
+
require "ner_ruby"
|
|
15
|
+
|
|
16
|
+
ner = NerRuby::Recognizer.new(
|
|
17
|
+
model: "path/to/ner.onnx",
|
|
18
|
+
tokenizer: "path/to/tokenizer.json"
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
entities = ner.recognize("Jokowi visited Jakarta on Monday")
|
|
22
|
+
# => [Entity(text: "Jokowi", label: :PER), Entity(text: "Jakarta", label: :LOC)]
|
|
23
|
+
|
|
24
|
+
entities = ner.recognize(text, labels: [:PER, :ORG])
|
|
25
|
+
|
|
26
|
+
results = ner.recognize_batch(["Text one", "Text two"])
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## License
|
|
30
|
+
|
|
31
|
+
MIT
|
data/Rakefile
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NerRuby
  # Container for gem-wide settings. Every option is a plain read/write
  # attribute; a new instance starts with the defaults assigned in #initialize.
  class Configuration
    # Default model and tokenizer paths (nil until set by the user).
    attr_accessor :default_model_path
    attr_accessor :default_tokenizer_path
    # Default label list (nil until set by the user).
    attr_accessor :default_labels
    # Score threshold; starts at 0.5.
    attr_accessor :min_score
    # Batch size setting; starts at 32.
    attr_accessor :batch_size

    # Builds a configuration with no paths or labels set, a minimum score of
    # 0.5 and a batch size of 32.
    def initialize
      @default_model_path = @default_tokenizer_path = @default_labels = nil
      @min_score = 0.5
      @batch_size = 32
    end
  end
end
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NerRuby
  # Turns per-token label predictions (IOB/BIO tags) into Entity objects,
  # merging WordPiece continuation pieces ("##...") back into whole words.
  class Decoder
    # Built-in label-id => tag tables, keyed by model name.
    LABEL_MAPS = {
      "bert-base-NER" => {
        0 => "O",
        1 => "B-MISC",
        2 => "I-MISC",
        3 => "B-PER",
        4 => "I-PER",
        5 => "B-ORG",
        6 => "I-ORG",
        7 => "B-LOC",
        8 => "I-LOC"
      }
    }.freeze

    # Tokens that never contribute to an entity.
    SPECIAL_TOKENS = ["[CLS]", "[SEP]", "[PAD]", "<s>", "</s>", "<pad>"].freeze
    private_constant :SPECIAL_TOKENS

    # @param label_map [Hash, nil] label-id => tag table; falls back to the
    #   "bert-base-NER" table when nil.
    def initialize(label_map: nil)
      @label_map = label_map || LABEL_MAPS["bert-base-NER"]
    end

    # Decodes one tokenized text into entities.
    #
    # @param tokens [Array<String>] tokens, including any special tokens
    # @param predictions [Array] label ids, index-aligned with +tokens+
    # @param scores [Array<Numeric>, nil] per-token scores, index-aligned with
    #   +tokens+; every token counts as 1.0 when nil
    # @param original_text [String, nil] source text used to compute character
    #   offsets; offsets stay nil when it is nil or the entity text is not found
    # @return [Array<Entity>] entities in token order
    def decode(tokens, predictions, scores: nil, original_text: nil)
      # Entities come out in reading order, so each offset search resumes where
      # the previous entity ended. Searching from 0 every time would give a
      # repeated mention ("Jakarta ... Jakarta") the first mention's offsets.
      search_from = 0

      group_spans(tokens, predictions, scores).map do |span|
        entity, search_from = build_entity(span, original_text, search_from)
        entity
      end
    end

    private

    # Walks the tokens and groups consecutive B-/I- tagged tokens of the same
    # type into span hashes ({ raw_tokens:, label:, scores: }).
    #
    # An I- tag only extends an open span of the same type; any other tag
    # (including a mismatched or orphan I- tag) closes the open span and the
    # token itself is dropped.
    def group_spans(tokens, predictions, scores)
      spans = []
      current = nil

      tokens.each_with_index do |token, i|
        next if special_token?(token)

        label = @label_map[predictions[i]] || "O"
        score = scores ? scores[i] : 1.0

        if label.start_with?("B-")
          spans << current if current
          current = { raw_tokens: [token], label: label.delete_prefix("B-"), scores: [score] }
        elsif current && label.start_with?("I-") && label.delete_prefix("I-") == current[:label]
          current[:raw_tokens] << token
          current[:scores] << score
        else
          spans << current if current
          current = nil
        end
      end

      spans << current if current
      spans
    end

    def special_token?(token)
      SPECIAL_TOKENS.include?(token)
    end

    # Strips the WordPiece continuation marker from the start of a token.
    def clean_token(token)
      token.delete_prefix("##")
    end

    # Builds the Entity for one span.
    #
    # @return [Array(Entity, Integer)] the entity and the position at which the
    #   next offset search should start
    def build_entity(span, original_text, search_from)
      text = merge_tokens(span[:raw_tokens])
      # fdiv keeps the mean fractional even when the scores are Integers.
      avg_score = span[:scores].sum.fdiv(span[:scores].size).clamp(0.0, 1.0)
      start_offset, end_offset, next_search = locate(text, original_text, search_from)

      entity = Entity.new(
        text: text,
        label: span[:label],
        start_offset: start_offset,
        end_offset: end_offset,
        score: avg_score.round(4)
      )
      [entity, next_search]
    end

    # Finds +text+ in +original_text+, preferring the first occurrence at or
    # after +search_from+.
    #
    # @return [Array] [start_offset, end_offset, next_search_from]; both offsets
    #   are nil when there is no original text or no occurrence at all
    def locate(text, original_text, search_from)
      return [nil, nil, search_from] unless original_text

      start = original_text.index(text, search_from)
      return [start, start + text.length, start + text.length] if start

      # Nothing ahead of the cursor: fall back to a whole-text search so an
      # offset is still reported, but leave the cursor where it was.
      start = original_text.index(text)
      return [nil, nil, search_from] unless start

      [start, start + text.length, search_from]
    end

    # Joins raw tokens into display text: continuation pieces are glued on
    # directly, every other token after the first is preceded by a space.
    def merge_tokens(raw_tokens)
      merged = +""
      raw_tokens.each_with_index do |token, i|
        merged << " " unless i.zero? || wordpiece?(token)
        merged << clean_token(token)
      end
      merged
    end

    def wordpiece?(token)
      token.start_with?("##")
    end
  end
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NerRuby
  # One recognized entity: its text, label, optional character offsets in the
  # source text, and a score.
  class Entity
    # Labels the gem knows about by name.
    LABELS = %i[PER LOC ORG MISC DATE TIME MONEY PERCENT QUANTITY].freeze

    attr_reader :text, :label, :start_offset, :end_offset, :score

    # @param text [String] entity text
    # @param label [#to_sym] entity label; stored as a Symbol
    # @param start_offset [Integer, nil] start position in the source text
    # @param end_offset [Integer, nil] end position in the source text
    # @param score [Numeric] score for the entity
    def initialize(text:, label:, start_offset: nil, end_offset: nil, score: 0.0)
      @text = text
      @label = label.to_sym
      @start_offset = start_offset
      @end_offset = end_offset
      @score = score
    end

    # True when the label is :PER.
    def person?
      label.equal?(:PER)
    end

    # True when the label is :LOC.
    def location?
      label.equal?(:LOC)
    end

    # True when the label is :ORG.
    def organization?
      label.equal?(:ORG)
    end

    # @return [Hash] the five attributes keyed by Symbol
    def to_h
      %i[text label start_offset end_offset score]
        .zip([@text, @label, @start_offset, @end_offset, @score])
        .to_h
    end

    # @return [String] "<text> [<label>] (<score as a percentage, 1 decimal>%)"
    def to_s
      percent = (@score * 100).round(1)
      "#{@text} [#{@label}] (#{percent}%)"
    end
  end
end
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NerRuby
  # Root of the gem's exception hierarchy; rescue this to catch any error the
  # gem raises on purpose.
  class Error < StandardError
  end

  # A model file does not exist at the given path.
  class ModelNotFoundError < Error
  end

  # Tokenizer-related failure.
  class TokenizerError < Error
  end

  # Failure while running a model.
  class InferenceError < Error
  end

  # Invalid or incomplete configuration.
  class ConfigurationError < Error
  end

  # A caller-supplied argument failed validation.
  class ValidationError < Error
  end
end
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "net/http"
|
|
4
|
+
require "uri"
|
|
5
|
+
require "json"
|
|
6
|
+
|
|
7
|
+
module NerRuby
  module Models
    # NER backend that delegates recognition to a remote HTTP API
    # (OpenAI chat completions or the HuggingFace inference endpoint).
    class Api < Base
      # Endpoint URL and default model per supported provider.
      PROVIDERS = {
        openai: {
          url: "https://api.openai.com/v1/chat/completions",
          model: "gpt-4o"
        },
        huggingface: {
          url: "https://api-inference.huggingface.co/models/",
          model: "dslim/bert-base-NER"
        }
      }.freeze

      # @param provider [Symbol] :openai or :huggingface
      # @param api_key [String, nil] API key; falls back to the
      #   "<PROVIDER>_API_KEY" environment variable
      # @param model [String, nil] model name; falls back to the provider default
      # @raise [ConfigurationError] when no non-blank API key is available
      def initialize(provider: :openai, api_key: nil, model: nil)
        @provider = provider
        @api_key = api_key || ENV["#{provider.to_s.upcase}_API_KEY"]
        @model = model || PROVIDERS.dig(provider, :model)

        # A blank key is as unusable as a missing one; fail here rather than
        # on the first request.
        raise ConfigurationError, "API key is required for #{provider}" if @api_key.to_s.strip.empty?
      end

      # Recognizes entities in +text+ through the configured provider.
      #
      # @param text [String, nil] input text; nil or blank yields []
      # @param labels [Array<Symbol, String>, nil] labels to request
      #   (used by the OpenAI prompt only)
      # @return [Array<Entity>]
      # @raise [InferenceError] when the request fails or the response cannot be used
      # @raise [Error] when the provider is unknown
      def recognize(text, labels: nil)
        return [] if text.nil? || text.strip.empty?

        case @provider
        when :openai then recognize_openai(text, labels: labels)
        when :huggingface then recognize_huggingface(text)
        else raise Error, "Unknown provider: #{@provider}"
        end
      end

      private

      def recognize_openai(text, labels: nil)
        label_str = (labels || Entity::LABELS).map(&:to_s).join(", ")
        # The request asks for a JSON object (response_format below), so the
        # prompt must ask for an object too, not a bare array.
        prompt = <<~PROMPT
          Extract named entities from the following text. Return a JSON object with a single key "entities" whose value is an array of objects having keys: text, label, score.
          Labels: #{label_str}
          Text: #{text}
        PROMPT

        body = {
          model: @model,
          messages: [{ role: "user", content: prompt }],
          response_format: { type: "json_object" }
        }

        response = post_json(PROVIDERS[:openai][:url], body, auth_headers)
        content = response.dig("choices", 0, "message", "content") if response.is_a?(Hash)
        raise InferenceError, "OpenAI response contained no message content" unless content.is_a?(String)

        entities_from_openai(content)
      end

      # Parses the model's JSON answer into entities. Accepts either
      # {"entities": [...]} or a bare array; entries that are not objects or
      # lack text/label are skipped because the content is model-generated.
      def entities_from_openai(content)
        parsed = JSON.parse(content)
        items = parsed.is_a?(Hash) ? parsed["entities"] : parsed
        raise InferenceError, "OpenAI response did not contain an entities array" unless items.is_a?(Array)

        items.filter_map do |item|
          next unless item.is_a?(Hash) && item["text"] && item["label"]

          Entity.new(
            text: item["text"],
            label: item["label"],
            # Keep the score numeric even if the model returns it as a string.
            score: Float(item["score"], exception: false) || 0.9
          )
        end
      rescue JSON::ParserError => e
        raise InferenceError, "OpenAI returned invalid JSON content: #{e.message}"
      end

      def recognize_huggingface(text)
        url = "#{PROVIDERS[:huggingface][:url]}#{@model}"
        response = post_json(url, { inputs: text }, auth_headers)
        # Anything other than an array cannot be mapped to entities.
        raise InferenceError, "Unexpected HuggingFace response: #{response.inspect[0, 200]}" unless response.is_a?(Array)

        response.map do |item|
          Entity.new(
            text: item["word"],
            label: item["entity_group"] || item["entity"],
            score: item["score"] || 0.0
          )
        end
      end

      def auth_headers
        { "Authorization" => "Bearer #{@api_key}" }
      end

      # POSTs +body+ as JSON and returns the parsed JSON response.
      #
      # @raise [InferenceError] on a non-2xx status or an unparsable body
      def post_json(url, body, headers)
        uri = URI.parse(url)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = uri.scheme == "https"

        req = Net::HTTP::Post.new(uri.request_uri)
        req["Content-Type"] = "application/json"
        headers.each { |k, v| req[k] = v }
        req.body = JSON.generate(body)

        response = http.request(req)
        unless response.is_a?(Net::HTTPSuccess)
          raise InferenceError,
                "#{@provider} API request failed (HTTP #{response.code}): #{response.body.to_s[0, 200]}"
        end

        JSON.parse(response.body)
      rescue JSON::ParserError => e
        raise InferenceError, "#{@provider} API returned invalid JSON: #{e.message}"
      end
    end
  end
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module NerRuby
  module Models
    # Local backend that runs an ONNX model through OnnxRuby::Session.
    class Onnx < Base
      # Label-id => tag table read from a config.json next to the model,
      # or nil when none was found.
      attr_reader :label_map

      # @param model_path [String] path to the ONNX model file
      # @raise [ModelNotFoundError] when no file exists at +model_path+
      def initialize(model_path:)
        # Loaded here rather than at file load, so the library is only
        # required once an ONNX model is actually constructed.
        require "onnx_ruby"
        @model_path = model_path
        unless File.exist?(model_path)
          raise ModelNotFoundError, "Model not found: #{model_path}"
        end

        @session = OnnxRuby::Session.new(model_path)
        @label_map = load_config_label_map
      end

      # Runs the model on a single sequence of token ids.
      #
      # @param input_ids [Array<Integer>] token ids for one text
      # @return the [0][0] element of the session output (presumably the
      #   per-token logits of the single input sequence; confirm against
      #   OnnxRuby::Session#run)
      def predict(input_ids)
        length = input_ids.length
        # Each input is wrapped in a one-element array: a batch of one.
        feeds = {
          input_ids: [input_ids],
          attention_mask: [Array.new(length, 1)],
          token_type_ids: [Array.new(length, 0)]
        }

        outputs = @session.run(**feeds)
        outputs[0][0]
      end

      private

      # Reads "id2label" from the config.json in the model's directory.
      #
      # @return [Hash{Integer => String}, nil] nil when the file is missing or
      #   "id2label" is not a Hash
      def load_config_label_map
        config_file = File.join(File.dirname(@model_path), "config.json")
        return nil unless File.exist?(config_file)

        id2label = JSON.parse(File.read(config_file))["id2label"]
        return nil unless id2label.is_a?(Hash)

        # Keys are converted to Integer ids and values to String tags.
        id2label.to_h { |id, tag| [id.to_i, tag.to_s] }
      end
    end
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NerRuby
  # Chains tokenizer, model and decoder: text in, decoded entities out.
  class Pipeline
    # @param model [#predict] returns one row of logits per token
    # @param tokenizer [#encode] returns an encoding indexable by
    #   :tokens/:ids (or "tokens"/"ids")
    # @param decoder [#decode, nil] defaults to a new Decoder
    def initialize(model:, tokenizer:, decoder: nil)
      @model = model
      @tokenizer = tokenizer
      @decoder = decoder || Decoder.new
    end

    # Runs the full pipeline on one text.
    #
    # @param text [String]
    # @return whatever the decoder returns for the tokens, predicted label
    #   indices and per-token scores
    def call(text)
      encoded = @tokenizer.encode(text)
      # Symbol keys are tried first, then String keys.
      tokens, input_ids = %i[tokens ids].map { |key| encoded[key] || encoded[key.to_s] }

      logits = @model.predict(input_ids)
      label_ids = logits.map { |row| argmax(row) }
      confidences = logits.map { |row| softmax(row).max }

      @decoder.decode(tokens, label_ids, scores: confidences, original_text: text)
    end

    private

    # Index of the largest value in +row+ (the first one on ties).
    def argmax(row)
      row.each_with_index.max_by { |value, _position| value }.last
    end

    # Softmax over +logits+; the maximum is subtracted before exponentiating
    # and each probability is clamped to 0.0..1.0.
    def softmax(logits)
      peak = logits.max
      weights = logits.map { |logit| Math.exp(logit - peak) }
      total = weights.sum
      weights.map { |weight| (weight / total).clamp(0.0, 1.0) }
    end
  end
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module NerRuby
  # Public entry point: recognizes entities through either the remote API
  # backend or a local ONNX model + tokenizer pipeline.
  class Recognizer
    # @param model [String, nil] ONNX model path (falls back to the configured default)
    # @param tokenizer [String, nil] tokenizer path (falls back to the configured default)
    # @param label_map [Hash, nil] overrides the label map detected by the model
    # @param backend [Symbol, nil] :api selects the API backend; anything else
    #   selects the local pipeline
    # @param provider [Symbol, nil] API provider; :openai when nil
    # @param api_key [String, nil] API key passed to the API backend
    # @raise [ModelNotFoundError] when a model path is given but no file exists there
    def initialize(model: nil, tokenizer: nil, label_map: nil, backend: nil, provider: nil, api_key: nil)
      config = NerRuby.configuration

      if backend == :api
        @api_model = Models::Api.new(provider: provider || :openai, api_key: api_key)
        return
      end

      model_path = model || config.default_model_path
      tokenizer_path = tokenizer || config.default_tokenizer_path
      # Without both paths nothing is loaded; #recognize reports that later.
      return unless model_path && tokenizer_path

      raise ModelNotFoundError, "Model not found: #{model_path}" unless File.exist?(model_path)

      @model = Models::Onnx.new(model_path: model_path)
      @tokenizer = load_tokenizer(tokenizer_path)
      @decoder = Decoder.new(label_map: label_map || @model.label_map)
      @pipeline = Pipeline.new(model: @model, tokenizer: @tokenizer, decoder: @decoder)
    end

    # Recognizes entities in one text.
    #
    # @param text [String, nil] nil or blank text yields []
    # @param labels [Array<Symbol, String>, nil] keep only these labels
    # @return [Array<Entity>] entities whose score is at least the configured min_score
    # @raise [ValidationError] when +labels+ is not an array of symbols/strings
    # @raise [Error] when neither an API backend nor a pipeline is loaded
    def recognize(text, labels: nil)
      return [] if text.nil? || text.strip.empty?

      validate_labels!(labels) if labels

      found =
        if @api_model
          @api_model.recognize(text, labels: labels)
        else
          raise Error, "No model loaded. Provide model and tokenizer paths." unless @pipeline

          @pipeline.call(text)
        end

      if labels
        wanted = labels.map(&:to_sym)
        found = found.select { |entity| wanted.include?(entity.label) }
      end

      threshold = NerRuby.configuration.min_score
      found.select { |entity| entity.score >= threshold }
    end

    # Recognizes entities in several texts, one #recognize call per text.
    #
    # @return [Array<Array<Entity>>] results in the same order as +texts+
    def recognize_batch(texts, labels: nil)
      texts.map { |item| recognize(item, labels: labels) }
    end

    private

    # Builds the tokenizer; the library is required on first use.
    def load_tokenizer(path)
      require "tokenizer_ruby"
      TokenizerRuby::Tokenizer.new(path)
    end

    # Raises ValidationError unless +labels+ is an Array of Symbols/Strings.
    def validate_labels!(labels)
      well_formed = labels.is_a?(Array) &&
                    labels.all? { |label| label.is_a?(Symbol) || label.is_a?(String) }
      raise ValidationError, "labels must be an array of symbols or strings" unless well_formed
    end
  end
end
|
data/lib/ner_ruby.rb
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "ner_ruby/version"
|
|
4
|
+
require_relative "ner_ruby/error"
|
|
5
|
+
require_relative "ner_ruby/configuration"
|
|
6
|
+
require_relative "ner_ruby/entity"
|
|
7
|
+
require_relative "ner_ruby/decoder"
|
|
8
|
+
require_relative "ner_ruby/pipeline"
|
|
9
|
+
require_relative "ner_ruby/models/base"
|
|
10
|
+
require_relative "ner_ruby/models/onnx"
|
|
11
|
+
require_relative "ner_ruby/models/api"
|
|
12
|
+
require_relative "ner_ruby/recognizer"
|
|
13
|
+
|
|
14
|
+
module NerRuby
  # The shared Configuration instance, created on first access.
  def self.configuration
    @configuration ||= Configuration.new
  end

  # Yields the shared configuration to the given block and returns the
  # block's value.
  def self.configure
    yield configuration
  end

  # Replaces the shared configuration with a fresh Configuration and
  # returns it.
  def self.reset_configuration!
    @configuration = Configuration.new
  end
end
|
data/ner-ruby.gemspec
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "lib/ner_ruby/version"
|
|
4
|
+
|
|
5
|
+
Gem::Specification.new do |spec|
  # Identity
  spec.name = "ner-ruby"
  spec.version = NerRuby::VERSION
  spec.authors = ["Johannes Dwi Cahyo"]
  spec.email = ["johannes@example.com"]

  # Description
  spec.summary = "Named Entity Recognition for Ruby using ONNX models"
  spec.description = "NER using ONNX models via onnx-ruby and tokenizer-ruby. Extracts people, places, organizations, and other entities from text."
  spec.homepage = "https://github.com/johannesdwicahyo/ner-ruby"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 3.0.0"

  # Registry links; homepage and source both point at the repository.
  spec.metadata.update(
    "homepage_uri" => spec.homepage,
    "source_code_uri" => spec.homepage,
    "changelog_uri" => "#{spec.homepage}/blob/main/CHANGELOG.md"
  )

  # Packaged files: library sources plus the top-level project files.
  spec.files = Dir.glob(
    %w[lib/**/*.rb README.md LICENSE CHANGELOG.md Rakefile ner-ruby.gemspec]
  )
  spec.require_paths = ["lib"]

  # Runtime dependencies
  %w[onnx-ruby tokenizer-ruby].each do |gem_name|
    spec.add_dependency gem_name, "~> 0.1"
  end

  # Development-only dependencies
  { "minitest" => "~> 5.0", "rake" => "~> 13.0", "webmock" => "~> 3.0" }.each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: ner-ruby
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Johannes Dwi Cahyo
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: onnx-ruby
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '0.1'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '0.1'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: tokenizer-ruby
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '0.1'
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '0.1'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: minitest
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '5.0'
|
|
47
|
+
type: :development
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '5.0'
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: rake
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - "~>"
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: '13.0'
|
|
61
|
+
type: :development
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - "~>"
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: '13.0'
|
|
68
|
+
- !ruby/object:Gem::Dependency
|
|
69
|
+
name: webmock
|
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - "~>"
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: '3.0'
|
|
75
|
+
type: :development
|
|
76
|
+
prerelease: false
|
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - "~>"
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: '3.0'
|
|
82
|
+
description: NER using ONNX models via onnx-ruby and tokenizer-ruby. Extracts people,
|
|
83
|
+
places, organizations, and other entities from text.
|
|
84
|
+
email:
|
|
85
|
+
- johannes@example.com
|
|
86
|
+
executables: []
|
|
87
|
+
extensions: []
|
|
88
|
+
extra_rdoc_files: []
|
|
89
|
+
files:
|
|
90
|
+
- CHANGELOG.md
|
|
91
|
+
- LICENSE
|
|
92
|
+
- README.md
|
|
93
|
+
- Rakefile
|
|
94
|
+
- lib/ner_ruby.rb
|
|
95
|
+
- lib/ner_ruby/configuration.rb
|
|
96
|
+
- lib/ner_ruby/decoder.rb
|
|
97
|
+
- lib/ner_ruby/entity.rb
|
|
98
|
+
- lib/ner_ruby/error.rb
|
|
99
|
+
- lib/ner_ruby/models/api.rb
|
|
100
|
+
- lib/ner_ruby/models/base.rb
|
|
101
|
+
- lib/ner_ruby/models/onnx.rb
|
|
102
|
+
- lib/ner_ruby/pipeline.rb
|
|
103
|
+
- lib/ner_ruby/recognizer.rb
|
|
104
|
+
- lib/ner_ruby/version.rb
|
|
105
|
+
- ner-ruby.gemspec
|
|
106
|
+
homepage: https://github.com/johannesdwicahyo/ner-ruby
|
|
107
|
+
licenses:
|
|
108
|
+
- MIT
|
|
109
|
+
metadata:
|
|
110
|
+
homepage_uri: https://github.com/johannesdwicahyo/ner-ruby
|
|
111
|
+
source_code_uri: https://github.com/johannesdwicahyo/ner-ruby
|
|
112
|
+
changelog_uri: https://github.com/johannesdwicahyo/ner-ruby/blob/main/CHANGELOG.md
|
|
113
|
+
rdoc_options: []
|
|
114
|
+
require_paths:
|
|
115
|
+
- lib
|
|
116
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
117
|
+
requirements:
|
|
118
|
+
- - ">="
|
|
119
|
+
- !ruby/object:Gem::Version
|
|
120
|
+
version: 3.0.0
|
|
121
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
122
|
+
requirements:
|
|
123
|
+
- - ">="
|
|
124
|
+
- !ruby/object:Gem::Version
|
|
125
|
+
version: '0'
|
|
126
|
+
requirements: []
|
|
127
|
+
rubygems_version: 3.6.9
|
|
128
|
+
specification_version: 4
|
|
129
|
+
summary: Named Entity Recognition for Ruby using ONNX models
|
|
130
|
+
test_files: []
|