mitie 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +125 -10
- data/lib/mitie/binary_relation_detector.rb +47 -25
- data/lib/mitie/binary_relation_trainer.rb +87 -0
- data/lib/mitie/document.rb +27 -33
- data/lib/mitie/ffi.rb +50 -1
- data/lib/mitie/ner.rb +19 -5
- data/lib/mitie/ner_trainer.rb +51 -0
- data/lib/mitie/ner_training_instance.rb +45 -0
- data/lib/mitie/text_categorizer.rb +47 -0
- data/lib/mitie/text_categorizer_trainer.rb +52 -0
- data/lib/mitie/utils.rb +22 -0
- data/lib/mitie/version.rb +1 -1
- data/lib/mitie.rb +11 -3
- metadata +12 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 50fb12adcd0042b3c09968108ec382694f8f0df20750b88566bc64c8d85d9e8d
|
4
|
+
data.tar.gz: 9d3d34f9839f71fc17e6651a17069f1f0fdb4b5956f9e57c4b66c9567d908967
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 189d5a17f94fff9abbc1d8d961c8f8407ae85bd1284f727e363b9aa7384d748eb1d80b6ffa1b1a44c8c6fadbbb6e91e6d064ea58fa88af972e6fd6e90e3ef71d
|
7
|
+
data.tar.gz: 202ba418ee98636736185a8f4783601acabecc59dde7e3a5c31a12b56827497b6b2b7e7e26d264b61848281b94cb635249b17a9e8fa009c496dacbe24e6466ca
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,18 @@
|
|
1
|
+
## 0.2.0 (2022-06-01)
|
2
|
+
|
3
|
+
- Added support for text categorization
|
4
|
+
- Added support for training binary relation detectors
|
5
|
+
- Dropped support for Ruby < 2.7
|
6
|
+
|
7
|
+
## 0.1.6 (2022-03-20)
|
8
|
+
|
9
|
+
- Added support for training NER models
|
10
|
+
- Improved ARM detection
|
11
|
+
|
12
|
+
## 0.1.5 (2021-01-29)
|
13
|
+
|
14
|
+
- Fixed issue with multibyte characters
|
15
|
+
|
1
16
|
## 0.1.4 (2020-12-28)
|
2
17
|
|
3
18
|
- Added ARM shared library for Mac
|
data/README.md
CHANGED
@@ -1,21 +1,21 @@
|
|
1
|
-
# MITIE
|
1
|
+
# MITIE Ruby
|
2
2
|
|
3
|
-
[MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition
|
3
|
+
[MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition, binary relation detection, and text categorization - for Ruby
|
4
4
|
|
5
5
|
- Finds people, organizations, and locations in text
|
6
6
|
- Detects relationships between entities, like `PERSON` was born in `LOCATION`
|
7
7
|
|
8
|
-
[](https://github.com/ankane/mitie/actions)
|
8
|
+
[](https://github.com/ankane/mitie-ruby/actions)
|
9
9
|
|
10
10
|
## Installation
|
11
11
|
|
12
12
|
Add this line to your application’s Gemfile:
|
13
13
|
|
14
14
|
```ruby
|
15
|
-
gem 'mitie'
|
15
|
+
gem "mitie"
|
16
16
|
```
|
17
17
|
|
18
|
-
And download the pre-trained model for your language:
|
18
|
+
And download the pre-trained models for your language:
|
19
19
|
|
20
20
|
- [English](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2.tar.bz2)
|
21
21
|
- [Spanish](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2-Spanish.zip)
|
@@ -23,6 +23,12 @@ And download the pre-trained model for your language:
|
|
23
23
|
|
24
24
|
## Getting Started
|
25
25
|
|
26
|
+
- [Named Entity Recognition](#named-entity-recognition)
|
27
|
+
- [Binary Relation Detection](#binary-relation-detection)
|
28
|
+
- [Text Categorization](#text-categorization)
|
29
|
+
|
30
|
+
## Named Entity Recognition
|
31
|
+
|
26
32
|
Load an NER model
|
27
33
|
|
28
34
|
```ruby
|
@@ -69,6 +75,41 @@ Get all tags for a model
|
|
69
75
|
model.tags
|
70
76
|
```
|
71
77
|
|
78
|
+
### Training
|
79
|
+
|
80
|
+
Load an NER model into a trainer
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
trainer = Mitie::NERTrainer.new("total_word_feature_extractor.dat")
|
84
|
+
```
|
85
|
+
|
86
|
+
Create training instances
|
87
|
+
|
88
|
+
```ruby
|
89
|
+
tokens = ["You", "can", "do", "machine", "learning", "in", "Ruby", "!"]
|
90
|
+
instance = Mitie::NERTrainingInstance.new(tokens)
|
91
|
+
instance.add_entity(3..4, "topic") # machine learning
|
92
|
+
instance.add_entity(6..6, "language") # Ruby
|
93
|
+
```
|
94
|
+
|
95
|
+
Add the training instances to the trainer
|
96
|
+
|
97
|
+
```ruby
|
98
|
+
trainer.add(instance)
|
99
|
+
```
|
100
|
+
|
101
|
+
Train the model
|
102
|
+
|
103
|
+
```ruby
|
104
|
+
model = trainer.train
|
105
|
+
```
|
106
|
+
|
107
|
+
Save the model
|
108
|
+
|
109
|
+
```ruby
|
110
|
+
model.save_to_disk("ner_model.dat")
|
111
|
+
```
|
112
|
+
|
72
113
|
## Binary Relation Detection
|
73
114
|
|
74
115
|
Detect relationships betweens two entities, like:
|
@@ -103,24 +144,98 @@ This returns
|
|
103
144
|
[{first: "Shopify", second: "Ottawa", score: 0.17649169745814464}]
|
104
145
|
```
|
105
146
|
|
147
|
+
### Training
|
148
|
+
|
149
|
+
Load an NER model into a trainer
|
150
|
+
|
151
|
+
```ruby
|
152
|
+
trainer = Mitie::BinaryRelationTrainer.new(model)
|
153
|
+
```
|
154
|
+
|
155
|
+
Add positive and negative examples to the trainer
|
156
|
+
|
157
|
+
```ruby
|
158
|
+
tokens = ["Shopify", "was", "founded", "in", "Ottawa"]
|
159
|
+
trainer.add_positive_binary_relation(tokens, 0..0, 4..4)
|
160
|
+
trainer.add_negative_binary_relation(tokens, 4..4, 0..0)
|
161
|
+
```
|
162
|
+
|
163
|
+
Train the detector
|
164
|
+
|
165
|
+
```ruby
|
166
|
+
detector = trainer.train
|
167
|
+
```
|
168
|
+
|
169
|
+
Save the detector
|
170
|
+
|
171
|
+
```ruby
|
172
|
+
detector.save_to_disk("binary_relation_detector.svm")
|
173
|
+
```
|
174
|
+
|
175
|
+
## Text Categorization
|
176
|
+
|
177
|
+
Load a model into a trainer
|
178
|
+
|
179
|
+
```ruby
|
180
|
+
trainer = Mitie::TextCategorizerTrainer.new("total_word_feature_extractor.dat")
|
181
|
+
```
|
182
|
+
|
183
|
+
Add labeled text to the trainer
|
184
|
+
|
185
|
+
```ruby
|
186
|
+
trainer.add(["This", "is", "super", "cool"], "positive")
|
187
|
+
```
|
188
|
+
|
189
|
+
Train the model
|
190
|
+
|
191
|
+
```ruby
|
192
|
+
model = trainer.train
|
193
|
+
```
|
194
|
+
|
195
|
+
Save the model
|
196
|
+
|
197
|
+
```ruby
|
198
|
+
model.save_to_disk("text_categorization_model.dat")
|
199
|
+
```
|
200
|
+
|
201
|
+
Load a saved model
|
202
|
+
|
203
|
+
```ruby
|
204
|
+
model = Mitie::TextCategorizer.new("text_categorization_model.dat")
|
205
|
+
```
|
206
|
+
|
207
|
+
Categorize text
|
208
|
+
|
209
|
+
```ruby
|
210
|
+
model.categorize(["What", "a", "super", "nice", "day"])
|
211
|
+
```
|
212
|
+
|
213
|
+
## Deployment
|
214
|
+
|
215
|
+
Check out [Trove](https://github.com/ankane/trove) for deploying models.
|
216
|
+
|
217
|
+
```sh
|
218
|
+
trove push ner_model.dat
|
219
|
+
```
|
220
|
+
|
106
221
|
## History
|
107
222
|
|
108
|
-
View the [changelog](https://github.com/ankane/mitie/blob/master/CHANGELOG.md)
|
223
|
+
View the [changelog](https://github.com/ankane/mitie-ruby/blob/master/CHANGELOG.md)
|
109
224
|
|
110
225
|
## Contributing
|
111
226
|
|
112
227
|
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
113
228
|
|
114
|
-
- [Report bugs](https://github.com/ankane/mitie/issues)
|
115
|
-
- Fix bugs and [submit pull requests](https://github.com/ankane/mitie/pulls)
|
229
|
+
- [Report bugs](https://github.com/ankane/mitie-ruby/issues)
|
230
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/mitie-ruby/pulls)
|
116
231
|
- Write, clarify, or fix documentation
|
117
232
|
- Suggest or add new features
|
118
233
|
|
119
234
|
To get started with development:
|
120
235
|
|
121
236
|
```sh
|
122
|
-
git clone https://github.com/ankane/mitie.git
|
123
|
-
cd mitie
|
237
|
+
git clone https://github.com/ankane/mitie-ruby.git
|
238
|
+
cd mitie-ruby
|
124
239
|
bundle install
|
125
240
|
bundle exec rake vendor:all
|
126
241
|
|
data/lib/mitie/binary_relation_detector.rb
CHANGED
@@ -1,9 +1,16 @@
|
|
1
1
|
module Mitie
|
2
2
|
class BinaryRelationDetector
|
3
|
-
def initialize(path)
|
4
|
-
|
5
|
-
|
6
|
-
|
3
|
+
def initialize(path = nil, pointer: nil)
|
4
|
+
if path
|
5
|
+
# better error message
|
6
|
+
raise ArgumentError, "File does not exist" unless File.exist?(path)
|
7
|
+
@pointer = FFI.mitie_load_binary_relation_detector(path)
|
8
|
+
elsif pointer
|
9
|
+
@pointer = pointer
|
10
|
+
else
|
11
|
+
raise ArgumentError, "Must pass either a path or a pointer"
|
12
|
+
end
|
13
|
+
|
7
14
|
ObjectSpace.define_finalizer(self, self.class.finalize(pointer))
|
8
15
|
end
|
9
16
|
|
@@ -23,37 +30,52 @@ module Mitie
|
|
23
30
|
|
24
31
|
relations = []
|
25
32
|
combinations.each do |entity1, entity2|
|
26
|
-
relation =
|
27
|
-
FFI.mitie_extract_binary_relation(
28
|
-
doc.model.pointer,
|
29
|
-
doc.send(:tokens_ptr),
|
30
|
-
entity1[:token_index],
|
31
|
-
entity1[:token_length],
|
32
|
-
entity2[:token_index],
|
33
|
-
entity2[:token_length]
|
34
|
-
)
|
35
|
-
|
36
|
-
score_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
|
37
|
-
status = FFI.mitie_classify_binary_relation(pointer, relation, score_ptr)
|
38
|
-
raise "Bad status: #{status}" if status != 0
|
39
|
-
score = score_ptr.to_s(Fiddle::SIZEOF_DOUBLE).unpack1("d")
|
40
|
-
if score > 0
|
41
|
-
relations << {
|
42
|
-
first: entity1[:text],
|
43
|
-
second: entity2[:text],
|
44
|
-
score: score
|
45
|
-
}
|
46
|
-
end
|
33
|
+
relation = extract_relation(doc, entity1, entity2)
|
34
|
+
relations << relation if relation
|
47
35
|
end
|
48
36
|
relations
|
49
37
|
end
|
50
38
|
|
39
|
+
def save_to_disk(filename)
|
40
|
+
if FFI.mitie_save_binary_relation_detector(filename, pointer) != 0
|
41
|
+
raise Error, "Unable to save detector"
|
42
|
+
end
|
43
|
+
nil
|
44
|
+
end
|
45
|
+
|
51
46
|
private
|
52
47
|
|
53
48
|
def pointer
|
54
49
|
@pointer
|
55
50
|
end
|
56
51
|
|
52
|
+
def extract_relation(doc, entity1, entity2)
|
53
|
+
relation =
|
54
|
+
FFI.mitie_extract_binary_relation(
|
55
|
+
doc.model.pointer,
|
56
|
+
doc.send(:tokens_ptr),
|
57
|
+
entity1[:token_index],
|
58
|
+
entity1[:token_length],
|
59
|
+
entity2[:token_index],
|
60
|
+
entity2[:token_length]
|
61
|
+
)
|
62
|
+
|
63
|
+
score_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
|
64
|
+
status = FFI.mitie_classify_binary_relation(pointer, relation, score_ptr)
|
65
|
+
raise Error, "Bad status: #{status}" if status != 0
|
66
|
+
|
67
|
+
score = Utils.read_double(score_ptr)
|
68
|
+
if score > 0
|
69
|
+
{
|
70
|
+
first: entity1[:text],
|
71
|
+
second: entity2[:text],
|
72
|
+
score: score
|
73
|
+
}
|
74
|
+
end
|
75
|
+
ensure
|
76
|
+
FFI.mitie_free(relation) if relation
|
77
|
+
end
|
78
|
+
|
57
79
|
def self.finalize(pointer)
|
58
80
|
# must use proc instead of stabby lambda
|
59
81
|
proc { FFI.mitie_free(pointer) }
|
data/lib/mitie/binary_relation_trainer.rb
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
module Mitie
|
2
|
+
class BinaryRelationTrainer
|
3
|
+
def initialize(ner, name: "")
|
4
|
+
@pointer = FFI.mitie_create_binary_relation_trainer(name, ner.pointer)
|
5
|
+
|
6
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
7
|
+
end
|
8
|
+
|
9
|
+
def add_positive_binary_relation(tokens, range1, range2)
|
10
|
+
check_add(tokens, range1, range2)
|
11
|
+
|
12
|
+
tokens_pointer = Utils.array_to_pointer(tokens)
|
13
|
+
status = FFI.mitie_add_positive_binary_relation(@pointer, tokens_pointer, range1.begin, range1.size, range2.begin, range2.size)
|
14
|
+
if status != 0
|
15
|
+
raise Error, "Unable to add binary relation"
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def add_negative_binary_relation(tokens, range1, range2)
|
20
|
+
check_add(tokens, range1, range2)
|
21
|
+
|
22
|
+
tokens_pointer = Utils.array_to_pointer(tokens)
|
23
|
+
status = FFI.mitie_add_negative_binary_relation(@pointer, tokens_pointer, range1.begin, range1.size, range2.begin, range2.size)
|
24
|
+
if status != 0
|
25
|
+
raise Error, "Unable to add binary relation"
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
def beta
|
30
|
+
FFI.mitie_binary_relation_trainer_get_beta(@pointer)
|
31
|
+
end
|
32
|
+
|
33
|
+
def beta=(value)
|
34
|
+
raise ArgumentError, "beta must be greater than or equal to zero" unless value >= 0
|
35
|
+
|
36
|
+
FFI.mitie_binary_relation_trainer_set_beta(@pointer, value)
|
37
|
+
end
|
38
|
+
|
39
|
+
def num_threads
|
40
|
+
FFI.mitie_binary_relation_trainer_get_num_threads(@pointer)
|
41
|
+
end
|
42
|
+
|
43
|
+
def num_threads=(value)
|
44
|
+
FFI.mitie_binary_relation_trainer_set_num_threads(@pointer, value)
|
45
|
+
end
|
46
|
+
|
47
|
+
def num_positive_examples
|
48
|
+
FFI.mitie_binary_relation_trainer_num_positive_examples(@pointer)
|
49
|
+
end
|
50
|
+
|
51
|
+
def num_negative_examples
|
52
|
+
FFI.mitie_binary_relation_trainer_num_negative_examples(@pointer)
|
53
|
+
end
|
54
|
+
|
55
|
+
def train
|
56
|
+
if num_positive_examples + num_negative_examples == 0
|
57
|
+
raise Error, "You can't call train() on an empty trainer"
|
58
|
+
end
|
59
|
+
|
60
|
+
detector = FFI.mitie_train_binary_relation_detector(@pointer)
|
61
|
+
|
62
|
+
raise Error, "Unable to create binary relation detector. Probably ran out of RAM." if detector.null?
|
63
|
+
|
64
|
+
Mitie::BinaryRelationDetector.new(pointer: detector)
|
65
|
+
end
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
def check_add(tokens, range1, range2)
|
70
|
+
Utils.check_range(range1, tokens.size)
|
71
|
+
Utils.check_range(range2, tokens.size)
|
72
|
+
|
73
|
+
if entities_overlap?(range1, range2)
|
74
|
+
raise ArgumentError, "Entities overlap"
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
def entities_overlap?(range1, range2)
|
79
|
+
FFI.mitie_entities_overlap(range1.begin, range1.size, range2.begin, range2.size) == 1
|
80
|
+
end
|
81
|
+
|
82
|
+
def self.finalize(pointer)
|
83
|
+
# must use proc instead of stabby lambda
|
84
|
+
proc { FFI.mitie_free(pointer) }
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
data/lib/mitie/document.rb
CHANGED
@@ -33,37 +33,35 @@ module Mitie
|
|
33
33
|
|
34
34
|
def entities
|
35
35
|
@entities ||= begin
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
offset = tok[0][1]
|
36
|
+
entities = []
|
37
|
+
tokens = tokens_with_offset
|
38
|
+
detections = FFI.mitie_extract_entities(pointer, tokens_ptr)
|
39
|
+
num_detections = FFI.mitie_ner_get_num_detections(detections)
|
40
|
+
num_detections.times do |i|
|
41
|
+
pos = FFI.mitie_ner_get_detection_position(detections, i)
|
42
|
+
len = FFI.mitie_ner_get_detection_length(detections, i)
|
43
|
+
tag = FFI.mitie_ner_get_detection_tagstr(detections, i).to_s
|
44
|
+
score = FFI.mitie_ner_get_detection_score(detections, i)
|
45
|
+
tok = tokens[pos, len]
|
46
|
+
offset = tok[0][1]
|
48
47
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
end
|
56
|
-
entity[:tag] = tag
|
57
|
-
entity[:score] = score
|
58
|
-
entity[:offset] = offset if offset
|
59
|
-
entity[:token_index] = pos
|
60
|
-
entity[:token_length] = len
|
61
|
-
entities << entity
|
48
|
+
entity = {}
|
49
|
+
if offset
|
50
|
+
finish = tok[-1][1] + tok[-1][0].bytesize
|
51
|
+
entity[:text] = text.byteslice(offset...finish)
|
52
|
+
else
|
53
|
+
entity[:text] = tok.map(&:first)
|
62
54
|
end
|
63
|
-
|
64
|
-
|
65
|
-
|
55
|
+
entity[:tag] = tag
|
56
|
+
entity[:score] = score
|
57
|
+
entity[:offset] = offset if offset
|
58
|
+
entity[:token_index] = pos
|
59
|
+
entity[:token_length] = len
|
60
|
+
entities << entity
|
66
61
|
end
|
62
|
+
entities
|
63
|
+
ensure
|
64
|
+
FFI.mitie_free(detections) if detections
|
67
65
|
end
|
68
66
|
end
|
69
67
|
|
@@ -84,11 +82,7 @@ module Mitie
|
|
84
82
|
def tokenize
|
85
83
|
@tokenize ||= begin
|
86
84
|
if text.is_a?(Array)
|
87
|
-
|
88
|
-
tokens_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP * (text.size + 1))
|
89
|
-
text.size.times do |i|
|
90
|
-
tokens_ptr[i * Fiddle::SIZEOF_VOIDP, Fiddle::SIZEOF_VOIDP] = Fiddle::Pointer.to_ptr(text[i]).ref
|
91
|
-
end
|
85
|
+
tokens_ptr = Utils.array_to_pointer(text)
|
92
86
|
[tokens_ptr, nil]
|
93
87
|
else
|
94
88
|
offsets_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
|
data/lib/mitie/ffi.rb
CHANGED
@@ -10,14 +10,16 @@ module Mitie
|
|
10
10
|
raise e
|
11
11
|
end
|
12
12
|
|
13
|
+
# https://github.com/mit-nlp/MITIE/blob/master/mitielib/include/mitie.h
|
14
|
+
|
13
15
|
extern "void mitie_free(void* object)"
|
14
16
|
extern "char** mitie_tokenize(const char* text)"
|
15
17
|
extern "char** mitie_tokenize_with_offsets(const char* text, unsigned long** token_offsets)"
|
16
18
|
|
19
|
+
# ner
|
17
20
|
extern "mitie_named_entity_extractor* mitie_load_named_entity_extractor(const char* filename)"
|
18
21
|
extern "unsigned long mitie_get_num_possible_ner_tags(const mitie_named_entity_extractor* ner)"
|
19
22
|
extern "const char* mitie_get_named_entity_tagstr(const mitie_named_entity_extractor* ner, unsigned long idx)"
|
20
|
-
|
21
23
|
extern "mitie_named_entity_detections* mitie_extract_entities(const mitie_named_entity_extractor* ner, char** tokens)"
|
22
24
|
extern "unsigned long mitie_ner_get_num_detections(const mitie_named_entity_detections* dets)"
|
23
25
|
extern "unsigned long mitie_ner_get_detection_position(const mitie_named_entity_detections* dets, unsigned long idx)"
|
@@ -26,10 +28,57 @@ module Mitie
|
|
26
28
|
extern "const char* mitie_ner_get_detection_tagstr(const mitie_named_entity_detections* dets, unsigned long idx)"
|
27
29
|
extern "double mitie_ner_get_detection_score(const mitie_named_entity_detections* dets, unsigned long idx)"
|
28
30
|
|
31
|
+
# binary relation detector
|
29
32
|
extern "mitie_binary_relation_detector* mitie_load_binary_relation_detector(const char* filename)"
|
30
33
|
extern "const char* mitie_binary_relation_detector_name_string(const mitie_binary_relation_detector* detector)"
|
31
34
|
extern "int mitie_entities_overlap(unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
32
35
|
extern "mitie_binary_relation* mitie_extract_binary_relation(const mitie_named_entity_extractor* ner, char** tokens, unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
33
36
|
extern "int mitie_classify_binary_relation(const mitie_binary_relation_detector* detector, const mitie_binary_relation* relation, double* score)"
|
37
|
+
|
38
|
+
# text categorizer
|
39
|
+
extern "mitie_text_categorizer* mitie_load_text_categorizer(const char* filename)"
|
40
|
+
extern "int mitie_categorize_text(const mitie_text_categorizer* tcat, const char** tokens, char** text_tag, double* text_score)"
|
41
|
+
|
42
|
+
# save
|
43
|
+
extern "int mitie_save_named_entity_extractor(const char* filename, const mitie_named_entity_extractor* ner)"
|
44
|
+
extern "int mitie_save_binary_relation_detector(const char* filename, const mitie_binary_relation_detector* detector)"
|
45
|
+
extern "int mitie_save_text_categorizer(const char* filename, const mitie_text_categorizer* tcat)"
|
46
|
+
|
47
|
+
# ner trainer
|
48
|
+
extern "mitie_ner_training_instance* mitie_create_ner_training_instance(char** tokens)"
|
49
|
+
extern "unsigned long mitie_ner_training_instance_num_entities(const mitie_ner_training_instance* instance)"
|
50
|
+
extern "unsigned long mitie_ner_training_instance_num_tokens(const mitie_ner_training_instance* instance)"
|
51
|
+
extern "int mitie_overlaps_any_entity(mitie_ner_training_instance* instance, unsigned long start, unsigned long length)"
|
52
|
+
extern "int mitie_add_ner_training_entity(mitie_ner_training_instance* instance, unsigned long start, unsigned long length, const char* label)"
|
53
|
+
extern "mitie_ner_trainer* mitie_create_ner_trainer(const char* filename)"
|
54
|
+
extern "unsigned long mitie_ner_trainer_size(const mitie_ner_trainer* trainer)"
|
55
|
+
extern "int mitie_add_ner_training_instance(mitie_ner_trainer* trainer, const mitie_ner_training_instance* instance)"
|
56
|
+
extern "void mitie_ner_trainer_set_beta(mitie_ner_trainer* trainer, double beta)"
|
57
|
+
extern "double mitie_ner_trainer_get_beta(const mitie_ner_trainer* trainer)"
|
58
|
+
extern "void mitie_ner_trainer_set_num_threads(mitie_ner_trainer* trainer, unsigned long num_threads)"
|
59
|
+
extern "unsigned long mitie_ner_trainer_get_num_threads(const mitie_ner_trainer* trainer)"
|
60
|
+
extern "mitie_named_entity_extractor* mitie_train_named_entity_extractor(const mitie_ner_trainer* trainer)"
|
61
|
+
|
62
|
+
# binary relation trainer
|
63
|
+
extern "mitie_binary_relation_trainer* mitie_create_binary_relation_trainer(const char* relation_name, const mitie_named_entity_extractor* ner)"
|
64
|
+
extern "unsigned long mitie_binary_relation_trainer_num_positive_examples(const mitie_binary_relation_trainer* trainer)"
|
65
|
+
extern "unsigned long mitie_binary_relation_trainer_num_negative_examples(const mitie_binary_relation_trainer* trainer)"
|
66
|
+
extern "int mitie_add_positive_binary_relation(mitie_binary_relation_trainer* trainer, char** tokens, unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
67
|
+
extern "int mitie_add_negative_binary_relation(mitie_binary_relation_trainer* trainer, char** tokens, unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
68
|
+
extern "void mitie_binary_relation_trainer_set_beta(mitie_binary_relation_trainer* trainer, double beta)"
|
69
|
+
extern "double mitie_binary_relation_trainer_get_beta(const mitie_binary_relation_trainer* trainer)"
|
70
|
+
extern "void mitie_binary_relation_trainer_set_num_threads(mitie_binary_relation_trainer* trainer, unsigned long num_threads)"
|
71
|
+
extern "unsigned long mitie_binary_relation_trainer_get_num_threads(const mitie_binary_relation_trainer* trainer)"
|
72
|
+
extern "mitie_binary_relation_detector* mitie_train_binary_relation_detector(const mitie_binary_relation_trainer* trainer)"
|
73
|
+
|
74
|
+
# text categorizer trainer
|
75
|
+
extern "mitie_text_categorizer_trainer* mitie_create_text_categorizer_trainer(const char* filename)"
|
76
|
+
extern "unsigned long mitie_text_categorizer_trainer_size(const mitie_text_categorizer_trainer* trainer)"
|
77
|
+
extern "void mitie_text_categorizer_trainer_set_beta(mitie_text_categorizer_trainer* trainer, double beta)"
|
78
|
+
extern "double mitie_text_categorizer_trainer_get_beta(const mitie_text_categorizer_trainer* trainer)"
|
79
|
+
extern "void mitie_text_categorizer_trainer_set_num_threads(mitie_text_categorizer_trainer* trainer, unsigned long num_threads)"
|
80
|
+
extern "unsigned long mitie_text_categorizer_trainer_get_num_threads(const mitie_text_categorizer_trainer* trainer)"
|
81
|
+
extern "int mitie_add_text_categorizer_labeled_text(mitie_text_categorizer_trainer* trainer, const char** tokens, const char* label)"
|
82
|
+
extern "mitie_text_categorizer* mitie_train_text_categorizer(const mitie_text_categorizer_trainer* trainer)"
|
34
83
|
end
|
35
84
|
end
|
data/lib/mitie/ner.rb
CHANGED
@@ -2,11 +2,18 @@ module Mitie
|
|
2
2
|
class NER
|
3
3
|
attr_reader :pointer
|
4
4
|
|
5
|
-
def initialize(path)
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
def initialize(path = nil, pointer: nil)
|
6
|
+
if path
|
7
|
+
# better error message
|
8
|
+
raise ArgumentError, "File does not exist" unless File.exist?(path)
|
9
|
+
@pointer = FFI.mitie_load_named_entity_extractor(path)
|
10
|
+
elsif pointer
|
11
|
+
@pointer = pointer
|
12
|
+
else
|
13
|
+
raise ArgumentError, "Must pass either a path or a pointer"
|
14
|
+
end
|
15
|
+
|
16
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
10
17
|
end
|
11
18
|
|
12
19
|
def tags
|
@@ -23,6 +30,13 @@ module Mitie
|
|
23
30
|
doc(text).entities
|
24
31
|
end
|
25
32
|
|
33
|
+
def save_to_disk(filename)
|
34
|
+
if FFI.mitie_save_named_entity_extractor(filename, pointer) != 0
|
35
|
+
raise Error, "Unable to save model"
|
36
|
+
end
|
37
|
+
nil
|
38
|
+
end
|
39
|
+
|
26
40
|
def tokens(text)
|
27
41
|
doc(text).tokens
|
28
42
|
end
|
data/lib/mitie/ner_trainer.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
module Mitie
|
2
|
+
class NERTrainer
|
3
|
+
def initialize(filename)
|
4
|
+
raise ArgumentError, "File does not exist" unless File.exist?(filename)
|
5
|
+
@pointer = FFI.mitie_create_ner_trainer(filename)
|
6
|
+
|
7
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
8
|
+
end
|
9
|
+
|
10
|
+
def add(instance)
|
11
|
+
FFI.mitie_add_ner_training_instance(@pointer, instance.pointer)
|
12
|
+
end
|
13
|
+
|
14
|
+
def beta
|
15
|
+
FFI.mitie_ner_trainer_get_beta(@pointer)
|
16
|
+
end
|
17
|
+
|
18
|
+
def beta=(value)
|
19
|
+
raise ArgumentError, "beta must be greater than or equal to zero" unless value >= 0
|
20
|
+
|
21
|
+
FFI.mitie_ner_trainer_set_beta(@pointer, value)
|
22
|
+
end
|
23
|
+
|
24
|
+
def num_threads
|
25
|
+
FFI.mitie_ner_trainer_get_num_threads(@pointer)
|
26
|
+
end
|
27
|
+
|
28
|
+
def num_threads=(value)
|
29
|
+
FFI.mitie_ner_trainer_set_num_threads(@pointer, value)
|
30
|
+
end
|
31
|
+
|
32
|
+
def size
|
33
|
+
FFI.mitie_ner_trainer_size(@pointer)
|
34
|
+
end
|
35
|
+
|
36
|
+
def train
|
37
|
+
raise Error, "You can't call train() on an empty trainer" if size.zero?
|
38
|
+
|
39
|
+
extractor = FFI.mitie_train_named_entity_extractor(@pointer)
|
40
|
+
|
41
|
+
raise Error, "Unable to create named entity extractor. Probably ran out of RAM." if extractor.null?
|
42
|
+
|
43
|
+
Mitie::NER.new(pointer: extractor)
|
44
|
+
end
|
45
|
+
|
46
|
+
def self.finalize(pointer)
|
47
|
+
# must use proc instead of stabby lambda
|
48
|
+
proc { FFI.mitie_free(pointer) }
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
data/lib/mitie/ner_training_instance.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
module Mitie
|
2
|
+
class NERTrainingInstance
|
3
|
+
attr_reader :pointer
|
4
|
+
|
5
|
+
def initialize(tokens)
|
6
|
+
tokens_pointer = Utils.array_to_pointer(tokens)
|
7
|
+
|
8
|
+
@pointer = FFI.mitie_create_ner_training_instance(tokens_pointer)
|
9
|
+
raise Error, "Unable to create training instance. Probably ran out of RAM." if @pointer.null?
|
10
|
+
|
11
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
12
|
+
end
|
13
|
+
|
14
|
+
def add_entity(range, label)
|
15
|
+
Utils.check_range(range, num_tokens)
|
16
|
+
|
17
|
+
raise ArgumentError, "Range overlaps existing entity" if overlaps_any_entity?(range)
|
18
|
+
|
19
|
+
unless FFI.mitie_add_ner_training_entity(@pointer, range.begin, range.size, label).zero?
|
20
|
+
raise Error, "Unable to add entity to training instance. Probably ran out of RAM."
|
21
|
+
end
|
22
|
+
|
23
|
+
nil
|
24
|
+
end
|
25
|
+
|
26
|
+
def num_entities
|
27
|
+
FFI.mitie_ner_training_instance_num_entities(@pointer)
|
28
|
+
end
|
29
|
+
|
30
|
+
def num_tokens
|
31
|
+
FFI.mitie_ner_training_instance_num_tokens(@pointer)
|
32
|
+
end
|
33
|
+
|
34
|
+
def overlaps_any_entity?(range)
|
35
|
+
Utils.check_range(range, num_tokens)
|
36
|
+
|
37
|
+
FFI.mitie_overlaps_any_entity(@pointer, range.begin, range.size) == 1
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.finalize(pointer)
|
41
|
+
# must use proc instead of stabby lambda
|
42
|
+
proc { FFI.mitie_free(pointer) }
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
data/lib/mitie/text_categorizer.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
module Mitie
|
2
|
+
class TextCategorizer
|
3
|
+
def initialize(path = nil, pointer: nil)
|
4
|
+
if path
|
5
|
+
# better error message
|
6
|
+
raise ArgumentError, "File does not exist" unless File.exist?(path)
|
7
|
+
@pointer = FFI.mitie_load_text_categorizer(path)
|
8
|
+
elsif pointer
|
9
|
+
@pointer = pointer
|
10
|
+
else
|
11
|
+
raise ArgumentError, "Must pass either a path or a pointer"
|
12
|
+
end
|
13
|
+
|
14
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
15
|
+
end
|
16
|
+
|
17
|
+
def categorize(tokens)
|
18
|
+
tokens_pointer = Utils.array_to_pointer(tokens)
|
19
|
+
text_tag = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
|
20
|
+
text_score = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
|
21
|
+
|
22
|
+
if FFI.mitie_categorize_text(@pointer, tokens_pointer, text_tag, text_score) != 0
|
23
|
+
raise Error, "Unable to categorize"
|
24
|
+
end
|
25
|
+
|
26
|
+
{
|
27
|
+
tag: text_tag.ptr.to_s,
|
28
|
+
score: Utils.read_double(text_score)
|
29
|
+
}
|
30
|
+
ensure
|
31
|
+
# text_tag must be freed
|
32
|
+
FFI.mitie_free(text_tag.ptr) if text_tag
|
33
|
+
end
|
34
|
+
|
35
|
+
def save_to_disk(filename)
|
36
|
+
if FFI.mitie_save_text_categorizer(filename, @pointer) != 0
|
37
|
+
raise Error, "Unable to save model"
|
38
|
+
end
|
39
|
+
nil
|
40
|
+
end
|
41
|
+
|
42
|
+
def self.finalize(pointer)
|
43
|
+
# must use proc instead of stabby lambda
|
44
|
+
proc { FFI.mitie_free(pointer) }
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
data/lib/mitie/text_categorizer_trainer.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
module Mitie
|
2
|
+
class TextCategorizerTrainer
|
3
|
+
def initialize(filename)
|
4
|
+
raise ArgumentError, "File does not exist" unless File.exist?(filename)
|
5
|
+
@pointer = FFI.mitie_create_text_categorizer_trainer(filename)
|
6
|
+
|
7
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
8
|
+
end
|
9
|
+
|
10
|
+
def add(tokens, label)
|
11
|
+
tokens_pointer = Utils.array_to_pointer(tokens)
|
12
|
+
FFI.mitie_add_text_categorizer_labeled_text(@pointer, tokens_pointer, label)
|
13
|
+
end
|
14
|
+
|
15
|
+
def beta
|
16
|
+
FFI.mitie_text_categorizer_trainer_get_beta(@pointer)
|
17
|
+
end
|
18
|
+
|
19
|
+
def beta=(value)
|
20
|
+
raise ArgumentError, "beta must be greater than or equal to zero" unless value >= 0
|
21
|
+
|
22
|
+
FFI.mitie_text_categorizer_trainer_set_beta(@pointer, value)
|
23
|
+
end
|
24
|
+
|
25
|
+
def num_threads
|
26
|
+
FFI.mitie_text_categorizer_trainer_get_num_threads(@pointer)
|
27
|
+
end
|
28
|
+
|
29
|
+
def num_threads=(value)
|
30
|
+
FFI.mitie_text_categorizer_trainer_set_num_threads(@pointer, value)
|
31
|
+
end
|
32
|
+
|
33
|
+
def size
|
34
|
+
FFI.mitie_text_categorizer_trainer_size(@pointer)
|
35
|
+
end
|
36
|
+
|
37
|
+
def train
|
38
|
+
raise Error, "You can't call train() on an empty trainer" if size.zero?
|
39
|
+
|
40
|
+
categorizer = FFI.mitie_train_text_categorizer(@pointer)
|
41
|
+
|
42
|
+
raise Error, "Unable to create text categorizer. Probably ran out of RAM." if categorizer.null?
|
43
|
+
|
44
|
+
Mitie::TextCategorizer.new(pointer: categorizer)
|
45
|
+
end
|
46
|
+
|
47
|
+
def self.finalize(pointer)
|
48
|
+
# must use proc instead of stabby lambda
|
49
|
+
proc { FFI.mitie_free(pointer) }
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
data/lib/mitie/utils.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module Mitie
|
2
|
+
module Utils
|
3
|
+
def self.array_to_pointer(text)
|
4
|
+
# malloc uses memset to set all bytes to 0
|
5
|
+
tokens_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP * (text.size + 1))
|
6
|
+
text.size.times do |i|
|
7
|
+
tokens_ptr[i * Fiddle::SIZEOF_VOIDP, Fiddle::SIZEOF_VOIDP] = Fiddle::Pointer.to_ptr(text[i]).ref
|
8
|
+
end
|
9
|
+
tokens_ptr
|
10
|
+
end
|
11
|
+
|
12
|
+
def self.check_range(range, num_tokens)
|
13
|
+
if range.none? || !(0..(num_tokens - 1)).cover?(range)
|
14
|
+
raise ArgumentError, "Invalid range"
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.read_double(ptr)
|
19
|
+
ptr.to_s(Fiddle::SIZEOF_DOUBLE).unpack1("d")
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
data/lib/mitie/version.rb
CHANGED
data/lib/mitie.rb
CHANGED
@@ -3,8 +3,14 @@ require "fiddle/import"
|
|
3
3
|
|
4
4
|
# modules
|
5
5
|
require "mitie/binary_relation_detector"
|
6
|
+
require "mitie/binary_relation_trainer"
|
6
7
|
require "mitie/document"
|
7
8
|
require "mitie/ner"
|
9
|
+
require "mitie/ner_training_instance"
|
10
|
+
require "mitie/ner_trainer"
|
11
|
+
require "mitie/text_categorizer"
|
12
|
+
require "mitie/text_categorizer_trainer"
|
13
|
+
require "mitie/utils"
|
8
14
|
require "mitie/version"
|
9
15
|
|
10
16
|
module Mitie
|
@@ -16,10 +22,12 @@ module Mitie
|
|
16
22
|
lib_name =
|
17
23
|
if Gem.win_platform?
|
18
24
|
"mitie.dll"
|
19
|
-
elsif RbConfig::CONFIG["arch"] =~ /arm64-darwin/i
|
20
|
-
"libmitie.arm64.dylib"
|
21
25
|
elsif RbConfig::CONFIG["host_os"] =~ /darwin/i
|
22
|
-
"libmitie.dylib"
|
26
|
+
if RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i
|
27
|
+
"libmitie.arm64.dylib"
|
28
|
+
else
|
29
|
+
"libmitie.dylib"
|
30
|
+
end
|
23
31
|
else
|
24
32
|
"libmitie.so"
|
25
33
|
end
|
metadata
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mitie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-06-01 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
|
-
email: andrew@
|
14
|
+
email: andrew@ankane.org
|
15
15
|
executables: []
|
16
16
|
extensions: []
|
17
17
|
extra_rdoc_files: []
|
@@ -21,16 +21,22 @@ files:
|
|
21
21
|
- README.md
|
22
22
|
- lib/mitie.rb
|
23
23
|
- lib/mitie/binary_relation_detector.rb
|
24
|
+
- lib/mitie/binary_relation_trainer.rb
|
24
25
|
- lib/mitie/document.rb
|
25
26
|
- lib/mitie/ffi.rb
|
26
27
|
- lib/mitie/ner.rb
|
28
|
+
- lib/mitie/ner_trainer.rb
|
29
|
+
- lib/mitie/ner_training_instance.rb
|
30
|
+
- lib/mitie/text_categorizer.rb
|
31
|
+
- lib/mitie/text_categorizer_trainer.rb
|
32
|
+
- lib/mitie/utils.rb
|
27
33
|
- lib/mitie/version.rb
|
28
34
|
- vendor/LICENSE.txt
|
29
35
|
- vendor/libmitie.arm64.dylib
|
30
36
|
- vendor/libmitie.dylib
|
31
37
|
- vendor/libmitie.so
|
32
38
|
- vendor/mitie.dll
|
33
|
-
homepage: https://github.com/ankane/mitie
|
39
|
+
homepage: https://github.com/ankane/mitie-ruby
|
34
40
|
licenses:
|
35
41
|
- BSL-1.0
|
36
42
|
metadata: {}
|
@@ -42,14 +48,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
42
48
|
requirements:
|
43
49
|
- - ">="
|
44
50
|
- !ruby/object:Gem::Version
|
45
|
-
version: '2.
|
51
|
+
version: '2.7'
|
46
52
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
47
53
|
requirements:
|
48
54
|
- - ">="
|
49
55
|
- !ruby/object:Gem::Version
|
50
56
|
version: '0'
|
51
57
|
requirements: []
|
52
|
-
rubygems_version: 3.
|
58
|
+
rubygems_version: 3.3.7
|
53
59
|
signing_key:
|
54
60
|
specification_version: 4
|
55
61
|
summary: Named-entity recognition for Ruby
|