mitie 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +43 -8
- data/lib/mitie/binary_relation_detector.rb +1 -0
- data/lib/mitie/document.rb +1 -5
- data/lib/mitie/ffi.rb +18 -0
- data/lib/mitie/ner.rb +19 -5
- data/lib/mitie/ner_trainer.rb +51 -0
- data/lib/mitie/ner_training_instance.rb +49 -0
- data/lib/mitie/utils.rb +12 -0
- data/lib/mitie/version.rb +1 -1
- data/lib/mitie.rb +8 -3
- metadata +8 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1e36228752078ce915d9d6cd49f3d8087c62263b0e41bebb5bcfd5a478398fe6
|
4
|
+
data.tar.gz: 54881a7c47cd855b678cf71a5baa7486b22acdbac802e15eda3984fc2215838d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 573e0e14ecd284cb9754abe95f2c3cd2ebbd9e2dbebaa5a0f33fad847abe6ebc3d0875c916d05f95fc1fdd9208c12d906315f0f5a557dfc6b3db63b620aa3f6f
|
7
|
+
data.tar.gz: 272335c83fc924aaf696382959d6cd93dd667139f282fef71b78b29a6784bda16f221cdd36691e885db6075312a00d463772f4408ee130ea5dde6a7a49b77d55
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,18 +1,18 @@
|
|
1
|
-
# MITIE
|
1
|
+
# MITIE Ruby
|
2
2
|
|
3
3
|
[MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition and binary relation detection - for Ruby
|
4
4
|
|
5
5
|
- Finds people, organizations, and locations in text
|
6
6
|
- Detects relationships between entities, like `PERSON` was born in `LOCATION`
|
7
7
|
|
8
|
-
[![Build Status](https://github.com/ankane/mitie/workflows/build/badge.svg?branch=master)](https://github.com/ankane/mitie/actions)
|
8
|
+
[![Build Status](https://github.com/ankane/mitie-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/mitie-ruby/actions)
|
9
9
|
|
10
10
|
## Installation
|
11
11
|
|
12
12
|
Add this line to your application’s Gemfile:
|
13
13
|
|
14
14
|
```ruby
|
15
|
-
gem 'mitie'
|
15
|
+
gem "mitie"
|
16
16
|
```
|
17
17
|
|
18
18
|
And download the pre-trained model for your language:
|
@@ -69,6 +69,41 @@ Get all tags for a model
|
|
69
69
|
model.tags
|
70
70
|
```
|
71
71
|
|
72
|
+
## Training
|
73
|
+
|
74
|
+
Load an NER model into a trainer
|
75
|
+
|
76
|
+
```ruby
|
77
|
+
trainer = Mitie::NERTrainer.new("total_word_feature_extractor.dat")
|
78
|
+
```
|
79
|
+
|
80
|
+
Create training instances
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
tokens = ["You", "can", "do", "machine", "learning", "in", "Ruby", "!"]
|
84
|
+
instance = Mitie::NERTrainingInstance.new(tokens)
|
85
|
+
instance.add_entity(3..4, "topic") # machine learning
|
86
|
+
instance.add_entity(6..6, "language") # Ruby
|
87
|
+
```
|
88
|
+
|
89
|
+
Add the training instances to the trainer
|
90
|
+
|
91
|
+
```ruby
|
92
|
+
trainer.add(instance)
|
93
|
+
```
|
94
|
+
|
95
|
+
Train the model
|
96
|
+
|
97
|
+
```ruby
|
98
|
+
model = trainer.train
|
99
|
+
```
|
100
|
+
|
101
|
+
Save the model
|
102
|
+
|
103
|
+
```ruby
|
104
|
+
model.save_to_disk("ner_model.dat")
|
105
|
+
```
|
106
|
+
|
72
107
|
## Binary Relation Detection
|
73
108
|
|
74
109
|
Detect relationships between two entities, like:
|
@@ -105,22 +140,22 @@ This returns
|
|
105
140
|
|
106
141
|
## History
|
107
142
|
|
108
|
-
View the [changelog](https://github.com/ankane/mitie/blob/master/CHANGELOG.md)
|
143
|
+
View the [changelog](https://github.com/ankane/mitie-ruby/blob/master/CHANGELOG.md)
|
109
144
|
|
110
145
|
## Contributing
|
111
146
|
|
112
147
|
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
113
148
|
|
114
|
-
- [Report bugs](https://github.com/ankane/mitie/issues)
|
115
|
-
- Fix bugs and [submit pull requests](https://github.com/ankane/mitie/pulls)
|
149
|
+
- [Report bugs](https://github.com/ankane/mitie-ruby/issues)
|
150
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/mitie-ruby/pulls)
|
116
151
|
- Write, clarify, or fix documentation
|
117
152
|
- Suggest or add new features
|
118
153
|
|
119
154
|
To get started with development:
|
120
155
|
|
121
156
|
```sh
|
122
|
-
git clone https://github.com/ankane/mitie.git
|
123
|
-
cd mitie
|
157
|
+
git clone https://github.com/ankane/mitie-ruby.git
|
158
|
+
cd mitie-ruby
|
124
159
|
bundle install
|
125
160
|
bundle exec rake vendor:all
|
126
161
|
|
@@ -35,6 +35,7 @@ module Mitie
|
|
35
35
|
|
36
36
|
score_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
|
37
37
|
status = FFI.mitie_classify_binary_relation(pointer, relation, score_ptr)
|
38
|
+
# TODO make Mitie::Error
|
38
39
|
raise "Bad status: #{status}" if status != 0
|
39
40
|
score = score_ptr.to_s(Fiddle::SIZEOF_DOUBLE).unpack1("d")
|
40
41
|
if score > 0
|
data/lib/mitie/document.rb
CHANGED
@@ -84,11 +84,7 @@ module Mitie
|
|
84
84
|
def tokenize
|
85
85
|
@tokenize ||= begin
|
86
86
|
if text.is_a?(Array)
|
87
|
-
|
88
|
-
tokens_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP * (text.size + 1))
|
89
|
-
text.size.times do |i|
|
90
|
-
tokens_ptr[i * Fiddle::SIZEOF_VOIDP, Fiddle::SIZEOF_VOIDP] = Fiddle::Pointer.to_ptr(text[i]).ref
|
91
|
-
end
|
87
|
+
tokens_ptr = Utils.array_to_pointer(text)
|
92
88
|
[tokens_ptr, nil]
|
93
89
|
else
|
94
90
|
offsets_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
|
data/lib/mitie/ffi.rb
CHANGED
@@ -10,9 +10,12 @@ module Mitie
|
|
10
10
|
raise e
|
11
11
|
end
|
12
12
|
|
13
|
+
# https://github.com/mit-nlp/MITIE/blob/master/mitielib/include/mitie.h
|
14
|
+
|
13
15
|
extern "void mitie_free(void* object)"
|
14
16
|
extern "char** mitie_tokenize(const char* text)"
|
15
17
|
extern "char** mitie_tokenize_with_offsets(const char* text, unsigned long** token_offsets)"
|
18
|
+
extern "int mitie_save_named_entity_extractor(const char* filename, const mitie_named_entity_extractor* ner)"
|
16
19
|
|
17
20
|
extern "mitie_named_entity_extractor* mitie_load_named_entity_extractor(const char* filename)"
|
18
21
|
extern "unsigned long mitie_get_num_possible_ner_tags(const mitie_named_entity_extractor* ner)"
|
@@ -26,6 +29,21 @@ module Mitie
|
|
26
29
|
extern "const char* mitie_ner_get_detection_tagstr(const mitie_named_entity_detections* dets, unsigned long idx)"
|
27
30
|
extern "double mitie_ner_get_detection_score(const mitie_named_entity_detections* dets, unsigned long idx)"
|
28
31
|
|
32
|
+
extern "mitie_ner_training_instance* mitie_create_ner_training_instance(char** tokens)"
|
33
|
+
extern "unsigned long mitie_ner_training_instance_num_entities(const mitie_ner_training_instance* instance)"
|
34
|
+
extern "unsigned long mitie_ner_training_instance_num_tokens(const mitie_ner_training_instance* instance)"
|
35
|
+
extern "int mitie_overlaps_any_entity(mitie_ner_training_instance* instance, unsigned long start, unsigned long length)"
|
36
|
+
extern "int mitie_add_ner_training_entity(mitie_ner_training_instance* instance, unsigned long start, unsigned long length, const char* label)"
|
37
|
+
|
38
|
+
extern "mitie_ner_trainer* mitie_create_ner_trainer(const char* filename)"
|
39
|
+
extern "unsigned long mitie_ner_trainer_size(const mitie_ner_trainer* trainer)"
|
40
|
+
extern "int mitie_add_ner_training_instance(mitie_ner_trainer* trainer, const mitie_ner_training_instance* instance)"
|
41
|
+
extern "void mitie_ner_trainer_set_beta(mitie_ner_trainer* trainer, double beta)"
|
42
|
+
extern "double mitie_ner_trainer_get_beta(const mitie_ner_trainer* trainer)"
|
43
|
+
extern "void mitie_ner_trainer_set_num_threads(mitie_ner_trainer* trainer, unsigned long num_threads)"
|
44
|
+
extern "unsigned long mitie_ner_trainer_get_num_threads(const mitie_ner_trainer* trainer)"
|
45
|
+
extern "mitie_named_entity_extractor* mitie_train_named_entity_extractor(const mitie_ner_trainer* trainer)"
|
46
|
+
|
29
47
|
extern "mitie_binary_relation_detector* mitie_load_binary_relation_detector(const char* filename)"
|
30
48
|
extern "const char* mitie_binary_relation_detector_name_string(const mitie_binary_relation_detector* detector)"
|
31
49
|
extern "int mitie_entities_overlap(unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
data/lib/mitie/ner.rb
CHANGED
@@ -2,11 +2,18 @@ module Mitie
|
|
2
2
|
class NER
|
3
3
|
attr_reader :pointer
|
4
4
|
|
5
|
-
def initialize(path)
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
def initialize(path = nil, pointer: nil)
|
6
|
+
if path
|
7
|
+
# better error message
|
8
|
+
raise ArgumentError, "File does not exist" unless File.exist?(path)
|
9
|
+
@pointer = FFI.mitie_load_named_entity_extractor(path)
|
10
|
+
elsif pointer
|
11
|
+
@pointer = pointer
|
12
|
+
else
|
13
|
+
raise ArgumentError, "Must pass either a path or a pointer"
|
14
|
+
end
|
15
|
+
|
16
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
10
17
|
end
|
11
18
|
|
12
19
|
def tags
|
@@ -23,6 +30,13 @@ module Mitie
|
|
23
30
|
doc(text).entities
|
24
31
|
end
|
25
32
|
|
33
|
+
def save_to_disk(filename)
|
34
|
+
if FFI.mitie_save_named_entity_extractor(filename, pointer) != 0
|
35
|
+
raise Error, "Unable to save model"
|
36
|
+
end
|
37
|
+
nil
|
38
|
+
end
|
39
|
+
|
26
40
|
def tokens(text)
|
27
41
|
doc(text).tokens
|
28
42
|
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Mitie
|
2
|
+
class NERTrainer
|
3
|
+
def initialize(filename)
|
4
|
+
raise ArgumentError, "File does not exist" unless File.exist?(filename)
|
5
|
+
@pointer = FFI.mitie_create_ner_trainer(filename)
|
6
|
+
|
7
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
8
|
+
end
|
9
|
+
|
10
|
+
def add(instance)
|
11
|
+
FFI.mitie_add_ner_training_instance(@pointer, instance.pointer)
|
12
|
+
end
|
13
|
+
|
14
|
+
def beta
|
15
|
+
FFI.mitie_ner_trainer_get_beta(@pointer)
|
16
|
+
end
|
17
|
+
|
18
|
+
def beta=(value)
|
19
|
+
raise ArgumentError, "beta must be greater than or equal to zero" unless value >= 0
|
20
|
+
|
21
|
+
FFI.mitie_ner_trainer_set_beta(@pointer, value)
|
22
|
+
end
|
23
|
+
|
24
|
+
def num_threads
|
25
|
+
FFI.mitie_ner_trainer_get_num_threads(@pointer)
|
26
|
+
end
|
27
|
+
|
28
|
+
def num_threads=(value)
|
29
|
+
FFI.mitie_ner_trainer_set_num_threads(@pointer, value)
|
30
|
+
end
|
31
|
+
|
32
|
+
def size
|
33
|
+
FFI.mitie_ner_trainer_size(@pointer)
|
34
|
+
end
|
35
|
+
|
36
|
+
def train
|
37
|
+
raise Error, "You can't call train() on an empty trainer" if size.zero?
|
38
|
+
|
39
|
+
extractor = FFI.mitie_train_named_entity_extractor(@pointer)
|
40
|
+
|
41
|
+
raise Error, "Unable to create named entity extractor. Probably ran out of RAM." if extractor.null?
|
42
|
+
|
43
|
+
Mitie::NER.new(pointer: extractor)
|
44
|
+
end
|
45
|
+
|
46
|
+
def self.finalize(pointer)
|
47
|
+
# must use proc instead of stabby lambda
|
48
|
+
proc { FFI.mitie_free(pointer) }
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Mitie
|
2
|
+
class NERTrainingInstance
|
3
|
+
attr_reader :pointer
|
4
|
+
|
5
|
+
def initialize(tokens)
|
6
|
+
tokens_pointer = Utils.array_to_pointer(tokens)
|
7
|
+
|
8
|
+
@pointer = FFI.mitie_create_ner_training_instance(tokens_pointer)
|
9
|
+
raise Error, "Unable to create training instance. Probably ran out of RAM." if @pointer.null?
|
10
|
+
|
11
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
12
|
+
end
|
13
|
+
|
14
|
+
def add_entity(range, label)
|
15
|
+
if range.none? || range.end >= num_tokens || range.begin < 0
|
16
|
+
raise ArgumentError, "Invalid range"
|
17
|
+
end
|
18
|
+
|
19
|
+
raise ArgumentError, "Range overlaps existing entity" if overlaps_any_entity?(range)
|
20
|
+
|
21
|
+
unless FFI.mitie_add_ner_training_entity(@pointer, range.begin, range.size, label).zero?
|
22
|
+
raise Error, "Unable to add entity to training instance. Probably ran out of RAM."
|
23
|
+
end
|
24
|
+
|
25
|
+
nil
|
26
|
+
end
|
27
|
+
|
28
|
+
def num_entities
|
29
|
+
FFI.mitie_ner_training_instance_num_entities(@pointer)
|
30
|
+
end
|
31
|
+
|
32
|
+
def num_tokens
|
33
|
+
FFI.mitie_ner_training_instance_num_tokens(@pointer)
|
34
|
+
end
|
35
|
+
|
36
|
+
def overlaps_any_entity?(range)
|
37
|
+
if range.none? || range.max >= num_tokens
|
38
|
+
raise ArgumentError, "Invalid range"
|
39
|
+
end
|
40
|
+
|
41
|
+
FFI.mitie_overlaps_any_entity(@pointer, range.begin, range.size) == 1
|
42
|
+
end
|
43
|
+
|
44
|
+
def self.finalize(pointer)
|
45
|
+
# must use proc instead of stabby lambda
|
46
|
+
proc { FFI.mitie_free(pointer) }
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
data/lib/mitie/utils.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
module Mitie
|
2
|
+
module Utils
|
3
|
+
def self.array_to_pointer(text)
|
4
|
+
# malloc uses memset to set all bytes to 0
|
5
|
+
tokens_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP * (text.size + 1))
|
6
|
+
text.size.times do |i|
|
7
|
+
tokens_ptr[i * Fiddle::SIZEOF_VOIDP, Fiddle::SIZEOF_VOIDP] = Fiddle::Pointer.to_ptr(text[i]).ref
|
8
|
+
end
|
9
|
+
tokens_ptr
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
data/lib/mitie/version.rb
CHANGED
data/lib/mitie.rb
CHANGED
@@ -5,6 +5,9 @@ require "fiddle/import"
|
|
5
5
|
require "mitie/binary_relation_detector"
|
6
6
|
require "mitie/document"
|
7
7
|
require "mitie/ner"
|
8
|
+
require "mitie/ner_training_instance"
|
9
|
+
require "mitie/ner_trainer"
|
10
|
+
require "mitie/utils"
|
8
11
|
require "mitie/version"
|
9
12
|
|
10
13
|
module Mitie
|
@@ -16,10 +19,12 @@ module Mitie
|
|
16
19
|
lib_name =
|
17
20
|
if Gem.win_platform?
|
18
21
|
"mitie.dll"
|
19
|
-
elsif RbConfig::CONFIG["arch"] =~ /arm64-darwin/i
|
20
|
-
"libmitie.arm64.dylib"
|
21
22
|
elsif RbConfig::CONFIG["host_os"] =~ /darwin/i
|
22
|
-
"libmitie.dylib"
|
23
|
+
if RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i
|
24
|
+
"libmitie.arm64.dylib"
|
25
|
+
else
|
26
|
+
"libmitie.dylib"
|
27
|
+
end
|
23
28
|
else
|
24
29
|
"libmitie.so"
|
25
30
|
end
|
metadata
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mitie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-03-21 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
|
-
email: andrew@
|
14
|
+
email: andrew@ankane.org
|
15
15
|
executables: []
|
16
16
|
extensions: []
|
17
17
|
extra_rdoc_files: []
|
@@ -24,13 +24,16 @@ files:
|
|
24
24
|
- lib/mitie/document.rb
|
25
25
|
- lib/mitie/ffi.rb
|
26
26
|
- lib/mitie/ner.rb
|
27
|
+
- lib/mitie/ner_trainer.rb
|
28
|
+
- lib/mitie/ner_training_instance.rb
|
29
|
+
- lib/mitie/utils.rb
|
27
30
|
- lib/mitie/version.rb
|
28
31
|
- vendor/LICENSE.txt
|
29
32
|
- vendor/libmitie.arm64.dylib
|
30
33
|
- vendor/libmitie.dylib
|
31
34
|
- vendor/libmitie.so
|
32
35
|
- vendor/mitie.dll
|
33
|
-
homepage: https://github.com/ankane/mitie
|
36
|
+
homepage: https://github.com/ankane/mitie-ruby
|
34
37
|
licenses:
|
35
38
|
- BSL-1.0
|
36
39
|
metadata: {}
|
@@ -49,7 +52,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
49
52
|
- !ruby/object:Gem::Version
|
50
53
|
version: '0'
|
51
54
|
requirements: []
|
52
|
-
rubygems_version: 3.
|
55
|
+
rubygems_version: 3.3.7
|
53
56
|
signing_key:
|
54
57
|
specification_version: 4
|
55
58
|
summary: Named-entity recognition for Ruby
|