mitie 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +43 -8
- data/lib/mitie/binary_relation_detector.rb +1 -0
- data/lib/mitie/document.rb +1 -5
- data/lib/mitie/ffi.rb +18 -0
- data/lib/mitie/ner.rb +19 -5
- data/lib/mitie/ner_trainer.rb +51 -0
- data/lib/mitie/ner_training_instance.rb +49 -0
- data/lib/mitie/utils.rb +12 -0
- data/lib/mitie/version.rb +1 -1
- data/lib/mitie.rb +8 -3
- metadata +8 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1e36228752078ce915d9d6cd49f3d8087c62263b0e41bebb5bcfd5a478398fe6
|
4
|
+
data.tar.gz: 54881a7c47cd855b678cf71a5baa7486b22acdbac802e15eda3984fc2215838d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 573e0e14ecd284cb9754abe95f2c3cd2ebbd9e2dbebaa5a0f33fad847abe6ebc3d0875c916d05f95fc1fdd9208c12d906315f0f5a557dfc6b3db63b620aa3f6f
|
7
|
+
data.tar.gz: 272335c83fc924aaf696382959d6cd93dd667139f282fef71b78b29a6784bda16f221cdd36691e885db6075312a00d463772f4408ee130ea5dde6a7a49b77d55
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,18 +1,18 @@
|
|
1
|
-
# MITIE
|
1
|
+
# MITIE Ruby
|
2
2
|
|
3
3
|
[MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition and binary relation detection - for Ruby
|
4
4
|
|
5
5
|
- Finds people, organizations, and locations in text
|
6
6
|
- Detects relationships between entities, like `PERSON` was born in `LOCATION`
|
7
7
|
|
8
|
-
[](https://github.com/ankane/mitie/actions)
|
8
|
+
[](https://github.com/ankane/mitie-ruby/actions)
|
9
9
|
|
10
10
|
## Installation
|
11
11
|
|
12
12
|
Add this line to your application’s Gemfile:
|
13
13
|
|
14
14
|
```ruby
|
15
|
-
gem 'mitie'
|
15
|
+
gem "mitie"
|
16
16
|
```
|
17
17
|
|
18
18
|
And download the pre-trained model for your language:
|
@@ -69,6 +69,41 @@ Get all tags for a model
|
|
69
69
|
model.tags
|
70
70
|
```
|
71
71
|
|
72
|
+
## Training
|
73
|
+
|
74
|
+
Load an NER model into a trainer
|
75
|
+
|
76
|
+
```ruby
|
77
|
+
trainer = Mitie::NERTrainer.new("total_word_feature_extractor.dat")
|
78
|
+
```
|
79
|
+
|
80
|
+
Create training instances
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
tokens = ["You", "can", "do", "machine", "learning", "in", "Ruby", "!"]
|
84
|
+
instance = Mitie::NERTrainingInstance.new(tokens)
|
85
|
+
instance.add_entity(3..4, "topic") # machine learning
|
86
|
+
instance.add_entity(6..6, "language") # Ruby
|
87
|
+
```
|
88
|
+
|
89
|
+
Add the training instances to the trainer
|
90
|
+
|
91
|
+
```ruby
|
92
|
+
trainer.add(instance)
|
93
|
+
```
|
94
|
+
|
95
|
+
Train the model
|
96
|
+
|
97
|
+
```ruby
|
98
|
+
model = trainer.train
|
99
|
+
```
|
100
|
+
|
101
|
+
Save the model
|
102
|
+
|
103
|
+
```ruby
|
104
|
+
model.save_to_disk("ner_model.dat")
|
105
|
+
```
|
106
|
+
|
72
107
|
## Binary Relation Detection
|
73
108
|
|
74
109
|
Detect relationships between two entities, like:
|
@@ -105,22 +140,22 @@ This returns
|
|
105
140
|
|
106
141
|
## History
|
107
142
|
|
108
|
-
View the [changelog](https://github.com/ankane/mitie/blob/master/CHANGELOG.md)
|
143
|
+
View the [changelog](https://github.com/ankane/mitie-ruby/blob/master/CHANGELOG.md)
|
109
144
|
|
110
145
|
## Contributing
|
111
146
|
|
112
147
|
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
113
148
|
|
114
|
-
- [Report bugs](https://github.com/ankane/mitie/issues)
|
115
|
-
- Fix bugs and [submit pull requests](https://github.com/ankane/mitie/pulls)
|
149
|
+
- [Report bugs](https://github.com/ankane/mitie-ruby/issues)
|
150
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/mitie-ruby/pulls)
|
116
151
|
- Write, clarify, or fix documentation
|
117
152
|
- Suggest or add new features
|
118
153
|
|
119
154
|
To get started with development:
|
120
155
|
|
121
156
|
```sh
|
122
|
-
git clone https://github.com/ankane/mitie.git
|
123
|
-
cd mitie
|
157
|
+
git clone https://github.com/ankane/mitie-ruby.git
|
158
|
+
cd mitie-ruby
|
124
159
|
bundle install
|
125
160
|
bundle exec rake vendor:all
|
126
161
|
|
@@ -35,6 +35,7 @@ module Mitie
|
|
35
35
|
|
36
36
|
score_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
|
37
37
|
status = FFI.mitie_classify_binary_relation(pointer, relation, score_ptr)
|
38
|
+
# TODO make Mitie::Error
|
38
39
|
raise "Bad status: #{status}" if status != 0
|
39
40
|
score = score_ptr.to_s(Fiddle::SIZEOF_DOUBLE).unpack1("d")
|
40
41
|
if score > 0
|
data/lib/mitie/document.rb
CHANGED
@@ -84,11 +84,7 @@ module Mitie
|
|
84
84
|
def tokenize
|
85
85
|
@tokenize ||= begin
|
86
86
|
if text.is_a?(Array)
|
87
|
-
|
88
|
-
tokens_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP * (text.size + 1))
|
89
|
-
text.size.times do |i|
|
90
|
-
tokens_ptr[i * Fiddle::SIZEOF_VOIDP, Fiddle::SIZEOF_VOIDP] = Fiddle::Pointer.to_ptr(text[i]).ref
|
91
|
-
end
|
87
|
+
tokens_ptr = Utils.array_to_pointer(text)
|
92
88
|
[tokens_ptr, nil]
|
93
89
|
else
|
94
90
|
offsets_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
|
data/lib/mitie/ffi.rb
CHANGED
@@ -10,9 +10,12 @@ module Mitie
|
|
10
10
|
raise e
|
11
11
|
end
|
12
12
|
|
13
|
+
# https://github.com/mit-nlp/MITIE/blob/master/mitielib/include/mitie.h
|
14
|
+
|
13
15
|
extern "void mitie_free(void* object)"
|
14
16
|
extern "char** mitie_tokenize(const char* text)"
|
15
17
|
extern "char** mitie_tokenize_with_offsets(const char* text, unsigned long** token_offsets)"
|
18
|
+
extern "int mitie_save_named_entity_extractor(const char* filename, const mitie_named_entity_extractor* ner)"
|
16
19
|
|
17
20
|
extern "mitie_named_entity_extractor* mitie_load_named_entity_extractor(const char* filename)"
|
18
21
|
extern "unsigned long mitie_get_num_possible_ner_tags(const mitie_named_entity_extractor* ner)"
|
@@ -26,6 +29,21 @@ module Mitie
|
|
26
29
|
extern "const char* mitie_ner_get_detection_tagstr(const mitie_named_entity_detections* dets, unsigned long idx)"
|
27
30
|
extern "double mitie_ner_get_detection_score(const mitie_named_entity_detections* dets, unsigned long idx)"
|
28
31
|
|
32
|
+
extern "mitie_ner_training_instance* mitie_create_ner_training_instance(char** tokens)"
|
33
|
+
extern "unsigned long mitie_ner_training_instance_num_entities(const mitie_ner_training_instance* instance)"
|
34
|
+
extern "unsigned long mitie_ner_training_instance_num_tokens(const mitie_ner_training_instance* instance)"
|
35
|
+
extern "int mitie_overlaps_any_entity(mitie_ner_training_instance* instance, unsigned long start, unsigned long length)"
|
36
|
+
extern "int mitie_add_ner_training_entity(mitie_ner_training_instance* instance, unsigned long start, unsigned long length, const char* label)"
|
37
|
+
|
38
|
+
extern "mitie_ner_trainer* mitie_create_ner_trainer(const char* filename)"
|
39
|
+
extern "unsigned long mitie_ner_trainer_size(const mitie_ner_trainer* trainer)"
|
40
|
+
extern "int mitie_add_ner_training_instance(mitie_ner_trainer* trainer, const mitie_ner_training_instance* instance)"
|
41
|
+
extern "void mitie_ner_trainer_set_beta(mitie_ner_trainer* trainer, double beta)"
|
42
|
+
extern "double mitie_ner_trainer_get_beta(const mitie_ner_trainer* trainer)"
|
43
|
+
extern "void mitie_ner_trainer_set_num_threads(mitie_ner_trainer* trainer, unsigned long num_threads)"
|
44
|
+
extern "unsigned long mitie_ner_trainer_get_num_threads(const mitie_ner_trainer* trainer)"
|
45
|
+
extern "mitie_named_entity_extractor* mitie_train_named_entity_extractor(const mitie_ner_trainer* trainer)"
|
46
|
+
|
29
47
|
extern "mitie_binary_relation_detector* mitie_load_binary_relation_detector(const char* filename)"
|
30
48
|
extern "const char* mitie_binary_relation_detector_name_string(const mitie_binary_relation_detector* detector)"
|
31
49
|
extern "int mitie_entities_overlap(unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
data/lib/mitie/ner.rb
CHANGED
@@ -2,11 +2,18 @@ module Mitie
|
|
2
2
|
class NER
|
3
3
|
attr_reader :pointer
|
4
4
|
|
5
|
-
def initialize(path)
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
def initialize(path = nil, pointer: nil)
|
6
|
+
if path
|
7
|
+
# better error message
|
8
|
+
raise ArgumentError, "File does not exist" unless File.exist?(path)
|
9
|
+
@pointer = FFI.mitie_load_named_entity_extractor(path)
|
10
|
+
elsif pointer
|
11
|
+
@pointer = pointer
|
12
|
+
else
|
13
|
+
raise ArgumentError, "Must pass either a path or a pointer"
|
14
|
+
end
|
15
|
+
|
16
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
10
17
|
end
|
11
18
|
|
12
19
|
def tags
|
@@ -23,6 +30,13 @@ module Mitie
|
|
23
30
|
doc(text).entities
|
24
31
|
end
|
25
32
|
|
33
|
+
def save_to_disk(filename)
|
34
|
+
if FFI.mitie_save_named_entity_extractor(filename, pointer) != 0
|
35
|
+
raise Error, "Unable to save model"
|
36
|
+
end
|
37
|
+
nil
|
38
|
+
end
|
39
|
+
|
26
40
|
def tokens(text)
|
27
41
|
doc(text).tokens
|
28
42
|
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Mitie
|
2
|
+
class NERTrainer
|
3
|
+
def initialize(filename)
|
4
|
+
raise ArgumentError, "File does not exist" unless File.exist?(filename)
|
5
|
+
@pointer = FFI.mitie_create_ner_trainer(filename)
|
6
|
+
|
7
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
8
|
+
end
|
9
|
+
|
10
|
+
def add(instance)
|
11
|
+
FFI.mitie_add_ner_training_instance(@pointer, instance.pointer)
|
12
|
+
end
|
13
|
+
|
14
|
+
def beta
|
15
|
+
FFI.mitie_ner_trainer_get_beta(@pointer)
|
16
|
+
end
|
17
|
+
|
18
|
+
def beta=(value)
|
19
|
+
raise ArgumentError, "beta must be greater than or equal to zero" unless value >= 0
|
20
|
+
|
21
|
+
FFI.mitie_ner_trainer_set_beta(@pointer, value)
|
22
|
+
end
|
23
|
+
|
24
|
+
def num_threads
|
25
|
+
FFI.mitie_ner_trainer_get_num_threads(@pointer)
|
26
|
+
end
|
27
|
+
|
28
|
+
def num_threads=(value)
|
29
|
+
FFI.mitie_ner_trainer_set_num_threads(@pointer, value)
|
30
|
+
end
|
31
|
+
|
32
|
+
def size
|
33
|
+
FFI.mitie_ner_trainer_size(@pointer)
|
34
|
+
end
|
35
|
+
|
36
|
+
def train
|
37
|
+
raise Error, "You can't call train() on an empty trainer" if size.zero?
|
38
|
+
|
39
|
+
extractor = FFI.mitie_train_named_entity_extractor(@pointer)
|
40
|
+
|
41
|
+
raise Error, "Unable to create named entity extractor. Probably ran out of RAM." if extractor.null?
|
42
|
+
|
43
|
+
Mitie::NER.new(pointer: extractor)
|
44
|
+
end
|
45
|
+
|
46
|
+
def self.finalize(pointer)
|
47
|
+
# must use proc instead of stabby lambda
|
48
|
+
proc { FFI.mitie_free(pointer) }
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Mitie
|
2
|
+
class NERTrainingInstance
|
3
|
+
attr_reader :pointer
|
4
|
+
|
5
|
+
def initialize(tokens)
|
6
|
+
tokens_pointer = Utils.array_to_pointer(tokens)
|
7
|
+
|
8
|
+
@pointer = FFI.mitie_create_ner_training_instance(tokens_pointer)
|
9
|
+
raise Error, "Unable to create training instance. Probably ran out of RAM." if @pointer.null?
|
10
|
+
|
11
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
12
|
+
end
|
13
|
+
|
14
|
+
def add_entity(range, label)
|
15
|
+
if range.none? || range.end >= num_tokens || range.begin < 0
|
16
|
+
raise ArgumentError, "Invalid range"
|
17
|
+
end
|
18
|
+
|
19
|
+
raise ArgumentError, "Range overlaps existing entity" if overlaps_any_entity?(range)
|
20
|
+
|
21
|
+
unless FFI.mitie_add_ner_training_entity(@pointer, range.begin, range.size, label).zero?
|
22
|
+
raise Error, "Unable to add entity to training instance. Probably ran out of RAM."
|
23
|
+
end
|
24
|
+
|
25
|
+
nil
|
26
|
+
end
|
27
|
+
|
28
|
+
def num_entities
|
29
|
+
FFI.mitie_ner_training_instance_num_entities(@pointer)
|
30
|
+
end
|
31
|
+
|
32
|
+
def num_tokens
|
33
|
+
FFI.mitie_ner_training_instance_num_tokens(@pointer)
|
34
|
+
end
|
35
|
+
|
36
|
+
def overlaps_any_entity?(range)
|
37
|
+
if range.none? || range.max >= num_tokens
|
38
|
+
raise ArgumentError, "Invalid range"
|
39
|
+
end
|
40
|
+
|
41
|
+
FFI.mitie_overlaps_any_entity(@pointer, range.begin, range.size) == 1
|
42
|
+
end
|
43
|
+
|
44
|
+
def self.finalize(pointer)
|
45
|
+
# must use proc instead of stabby lambda
|
46
|
+
proc { FFI.mitie_free(pointer) }
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
data/lib/mitie/utils.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
module Mitie
|
2
|
+
module Utils
|
3
|
+
def self.array_to_pointer(text)
|
4
|
+
# malloc uses memset to set all bytes to 0
|
5
|
+
tokens_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP * (text.size + 1))
|
6
|
+
text.size.times do |i|
|
7
|
+
tokens_ptr[i * Fiddle::SIZEOF_VOIDP, Fiddle::SIZEOF_VOIDP] = Fiddle::Pointer.to_ptr(text[i]).ref
|
8
|
+
end
|
9
|
+
tokens_ptr
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
data/lib/mitie/version.rb
CHANGED
data/lib/mitie.rb
CHANGED
@@ -5,6 +5,9 @@ require "fiddle/import"
|
|
5
5
|
require "mitie/binary_relation_detector"
|
6
6
|
require "mitie/document"
|
7
7
|
require "mitie/ner"
|
8
|
+
require "mitie/ner_training_instance"
|
9
|
+
require "mitie/ner_trainer"
|
10
|
+
require "mitie/utils"
|
8
11
|
require "mitie/version"
|
9
12
|
|
10
13
|
module Mitie
|
@@ -16,10 +19,12 @@ module Mitie
|
|
16
19
|
lib_name =
|
17
20
|
if Gem.win_platform?
|
18
21
|
"mitie.dll"
|
19
|
-
elsif RbConfig::CONFIG["arch"] =~ /arm64-darwin/i
|
20
|
-
"libmitie.arm64.dylib"
|
21
22
|
elsif RbConfig::CONFIG["host_os"] =~ /darwin/i
|
22
|
-
"libmitie.dylib"
|
23
|
+
if RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i
|
24
|
+
"libmitie.arm64.dylib"
|
25
|
+
else
|
26
|
+
"libmitie.dylib"
|
27
|
+
end
|
23
28
|
else
|
24
29
|
"libmitie.so"
|
25
30
|
end
|
metadata
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mitie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-03-21 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
|
-
email: andrew@
|
14
|
+
email: andrew@ankane.org
|
15
15
|
executables: []
|
16
16
|
extensions: []
|
17
17
|
extra_rdoc_files: []
|
@@ -24,13 +24,16 @@ files:
|
|
24
24
|
- lib/mitie/document.rb
|
25
25
|
- lib/mitie/ffi.rb
|
26
26
|
- lib/mitie/ner.rb
|
27
|
+
- lib/mitie/ner_trainer.rb
|
28
|
+
- lib/mitie/ner_training_instance.rb
|
29
|
+
- lib/mitie/utils.rb
|
27
30
|
- lib/mitie/version.rb
|
28
31
|
- vendor/LICENSE.txt
|
29
32
|
- vendor/libmitie.arm64.dylib
|
30
33
|
- vendor/libmitie.dylib
|
31
34
|
- vendor/libmitie.so
|
32
35
|
- vendor/mitie.dll
|
33
|
-
homepage: https://github.com/ankane/mitie
|
36
|
+
homepage: https://github.com/ankane/mitie-ruby
|
34
37
|
licenses:
|
35
38
|
- BSL-1.0
|
36
39
|
metadata: {}
|
@@ -49,7 +52,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
49
52
|
- !ruby/object:Gem::Version
|
50
53
|
version: '0'
|
51
54
|
requirements: []
|
52
|
-
rubygems_version: 3.
|
55
|
+
rubygems_version: 3.3.7
|
53
56
|
signing_key:
|
54
57
|
specification_version: 4
|
55
58
|
summary: Named-entity recognition for Ruby
|