mitie 0.1.6 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +83 -3
- data/lib/mitie/binary_relation_detector.rb +47 -26
- data/lib/mitie/binary_relation_trainer.rb +87 -0
- data/lib/mitie/document.rb +26 -28
- data/lib/mitie/ffi.rb +39 -8
- data/lib/mitie/ner_training_instance.rb +2 -6
- data/lib/mitie/text_categorizer.rb +47 -0
- data/lib/mitie/text_categorizer_trainer.rb +52 -0
- data/lib/mitie/utils.rb +10 -0
- data/lib/mitie/version.rb +1 -1
- data/lib/mitie.rb +3 -0
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 50fb12adcd0042b3c09968108ec382694f8f0df20750b88566bc64c8d85d9e8d
|
4
|
+
data.tar.gz: 9d3d34f9839f71fc17e6651a17069f1f0fdb4b5956f9e57c4b66c9567d908967
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 189d5a17f94fff9abbc1d8d961c8f8407ae85bd1284f727e363b9aa7384d748eb1d80b6ffa1b1a44c8c6fadbbb6e91e6d064ea58fa88af972e6fd6e90e3ef71d
|
7
|
+
data.tar.gz: 202ba418ee98636736185a8f4783601acabecc59dde7e3a5c31a12b56827497b6b2b7e7e26d264b61848281b94cb635249b17a9e8fa009c496dacbe24e6466ca
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# MITIE Ruby
|
2
2
|
|
3
|
-
[MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition
|
3
|
+
[MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition, binary relation detection, and text categorization - for Ruby
|
4
4
|
|
5
5
|
- Finds people, organizations, and locations in text
|
6
6
|
- Detects relationships between entities, like `PERSON` was born in `LOCATION`
|
@@ -15,7 +15,7 @@ Add this line to your application’s Gemfile:
|
|
15
15
|
gem "mitie"
|
16
16
|
```
|
17
17
|
|
18
|
-
And download the pre-trained
|
18
|
+
And download the pre-trained models for your language:
|
19
19
|
|
20
20
|
- [English](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2.tar.bz2)
|
21
21
|
- [Spanish](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2-Spanish.zip)
|
@@ -23,6 +23,12 @@ And download the pre-trained model for your language:
|
|
23
23
|
|
24
24
|
## Getting Started
|
25
25
|
|
26
|
+
- [Named Entity Recognition](#named-entity-recognition)
|
27
|
+
- [Binary Relation Detection](#binary-relation-detection)
|
28
|
+
- [Text Categorization](#text-categorization)
|
29
|
+
|
30
|
+
## Named Entity Recognition
|
31
|
+
|
26
32
|
Load an NER model
|
27
33
|
|
28
34
|
```ruby
|
@@ -69,7 +75,7 @@ Get all tags for a model
|
|
69
75
|
model.tags
|
70
76
|
```
|
71
77
|
|
72
|
-
|
78
|
+
### Training
|
73
79
|
|
74
80
|
Load an NER model into a trainer
|
75
81
|
|
@@ -138,6 +144,80 @@ This returns
|
|
138
144
|
[{first: "Shopify", second: "Ottawa", score: 0.17649169745814464}]
|
139
145
|
```
|
140
146
|
|
147
|
+
### Training
|
148
|
+
|
149
|
+
Load an NER model into a trainer
|
150
|
+
|
151
|
+
```ruby
|
152
|
+
trainer = Mitie::BinaryRelationTrainer.new(model)
|
153
|
+
```
|
154
|
+
|
155
|
+
Add positive and negative examples to the trainer
|
156
|
+
|
157
|
+
```ruby
|
158
|
+
tokens = ["Shopify", "was", "founded", "in", "Ottawa"]
|
159
|
+
trainer.add_positive_binary_relation(tokens, 0..0, 4..4)
|
160
|
+
trainer.add_negative_binary_relation(tokens, 4..4, 0..0)
|
161
|
+
```
|
162
|
+
|
163
|
+
Train the detector
|
164
|
+
|
165
|
+
```ruby
|
166
|
+
detector = trainer.train
|
167
|
+
```
|
168
|
+
|
169
|
+
Save the detector
|
170
|
+
|
171
|
+
```ruby
|
172
|
+
detector.save_to_disk("binary_relation_detector.svm")
|
173
|
+
```
|
174
|
+
|
175
|
+
## Text Categorization
|
176
|
+
|
177
|
+
Load a model into a trainer
|
178
|
+
|
179
|
+
```ruby
|
180
|
+
trainer = Mitie::TextCategorizerTrainer.new("total_word_feature_extractor.dat")
|
181
|
+
```
|
182
|
+
|
183
|
+
Add labeled text to the trainer
|
184
|
+
|
185
|
+
```ruby
|
186
|
+
trainer.add(["This", "is", "super", "cool"], "positive")
|
187
|
+
```
|
188
|
+
|
189
|
+
Train the model
|
190
|
+
|
191
|
+
```ruby
|
192
|
+
model = trainer.train
|
193
|
+
```
|
194
|
+
|
195
|
+
Save the model
|
196
|
+
|
197
|
+
```ruby
|
198
|
+
model.save_to_disk("text_categorization_model.dat")
|
199
|
+
```
|
200
|
+
|
201
|
+
Load a saved model
|
202
|
+
|
203
|
+
```ruby
|
204
|
+
model = Mitie::TextCategorizer.new("text_categorization_model.dat")
|
205
|
+
```
|
206
|
+
|
207
|
+
Categorize text
|
208
|
+
|
209
|
+
```ruby
|
210
|
+
model.categorize(["What", "a", "super", "nice", "day"])
|
211
|
+
```
|
212
|
+
|
213
|
+
## Deployment
|
214
|
+
|
215
|
+
Check out [Trove](https://github.com/ankane/trove) for deploying models.
|
216
|
+
|
217
|
+
```sh
|
218
|
+
trove push ner_model.dat
|
219
|
+
```
|
220
|
+
|
141
221
|
## History
|
142
222
|
|
143
223
|
View the [changelog](https://github.com/ankane/mitie-ruby/blob/master/CHANGELOG.md)
|
@@ -1,9 +1,16 @@
|
|
1
1
|
module Mitie
|
2
2
|
class BinaryRelationDetector
|
3
|
-
def initialize(path)
|
4
|
-
|
5
|
-
|
6
|
-
|
3
|
+
# Builds a detector either by loading a saved detector file or by adopting
# an existing native handle (BinaryRelationTrainer#train passes one via
# +pointer:+).
#
# @param path [String, nil] file to load; takes precedence over +pointer+
# @param pointer [Object, nil] native detector handle to wrap
# @raise [ArgumentError] if +path+ does not exist, or if neither a path
#   nor a pointer is given
def initialize(path = nil, pointer: nil)
  if path
    # better error message
    raise ArgumentError, "File does not exist" unless File.exist?(path)
    @pointer = FFI.mitie_load_binary_relation_detector(path)
  elsif pointer
    @pointer = pointer
  else
    raise ArgumentError, "Must pass either a path or a pointer"
  end

  # Use @pointer here, not the `pointer` keyword argument: the argument
  # shadows the reader method and is nil whenever a path was given, so the
  # finalizer would be built around nil and the loaded detector would
  # never be freed.
  ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
end
|
9
16
|
|
@@ -23,38 +30,52 @@ module Mitie
|
|
23
30
|
|
24
31
|
relations = []
|
25
32
|
combinations.each do |entity1, entity2|
|
26
|
-
relation =
|
27
|
-
|
28
|
-
doc.model.pointer,
|
29
|
-
doc.send(:tokens_ptr),
|
30
|
-
entity1[:token_index],
|
31
|
-
entity1[:token_length],
|
32
|
-
entity2[:token_index],
|
33
|
-
entity2[:token_length]
|
34
|
-
)
|
35
|
-
|
36
|
-
score_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
|
37
|
-
status = FFI.mitie_classify_binary_relation(pointer, relation, score_ptr)
|
38
|
-
# TODO make Mitie::Error
|
39
|
-
raise "Bad status: #{status}" if status != 0
|
40
|
-
score = score_ptr.to_s(Fiddle::SIZEOF_DOUBLE).unpack1("d")
|
41
|
-
if score > 0
|
42
|
-
relations << {
|
43
|
-
first: entity1[:text],
|
44
|
-
second: entity2[:text],
|
45
|
-
score: score
|
46
|
-
}
|
47
|
-
end
|
33
|
+
relation = extract_relation(doc, entity1, entity2)
|
34
|
+
relations << relation if relation
|
48
35
|
end
|
49
36
|
relations
|
50
37
|
end
|
51
38
|
|
39
|
+
# Writes this detector to +filename+ through the native save call.
#
# @param filename [String] destination path
# @return [nil]
# @raise [Error] if the native call reports a non-zero status
def save_to_disk(filename)
  status = FFI.mitie_save_binary_relation_detector(filename, pointer)
  raise Error, "Unable to save detector" unless status == 0

  nil
end
|
45
|
+
|
52
46
|
private
|
53
47
|
|
54
48
|
# Native detector handle handed to the FFI calls made by this class.
def pointer
  @pointer
end
|
57
51
|
|
52
|
+
# Scores the relation between two entities of +doc+ with this detector.
#
# @param doc [Object] document providing +model.pointer+ and +tokens_ptr+
# @param entity1 [Hash] entity with :token_index, :token_length and :text
# @param entity2 [Hash] entity with :token_index, :token_length and :text
# @return [Hash, nil] +{first:, second:, score:}+ when the score is
#   positive, otherwise nil
# @raise [Error] if classification reports a non-zero status
def extract_relation(doc, entity1, entity2)
  relation = FFI.mitie_extract_binary_relation(
    doc.model.pointer,
    doc.send(:tokens_ptr),
    entity1[:token_index],
    entity1[:token_length],
    entity2[:token_index],
    entity2[:token_length]
  )

  # out-parameter the native classifier writes the score into
  score_out = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
  status = FFI.mitie_classify_binary_relation(pointer, relation, score_out)
  raise Error, "Bad status: #{status}" unless status == 0

  score = Utils.read_double(score_out)
  return unless score > 0

  {first: entity1[:text], second: entity2[:text], score: score}
ensure
  # the extracted relation is released on every path, including errors
  FFI.mitie_free(relation) if relation
end
|
78
|
+
|
58
79
|
def self.finalize(pointer)
|
59
80
|
# must use proc instead of stabby lambda
|
60
81
|
proc { FFI.mitie_free(pointer) }
|
@@ -0,0 +1,87 @@
|
|
1
|
+
module Mitie
  # Collects positive and negative examples of a binary relation and trains
  # a BinaryRelationDetector from them through the MITIE native library.
  class BinaryRelationTrainer
    # @param ner [Object] object exposing +pointer+ (the README passes a
    #   loaded NER model)
    # @param name [String] relation name handed to the native trainer
    def initialize(ner, name: "")
      @pointer = FFI.mitie_create_binary_relation_trainer(name, ner.pointer)

      ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
    end

    # Registers +range1+ -> +range2+ within +tokens+ as a positive example.
    #
    # @raise [ArgumentError] if a range is invalid or the entities overlap
    # @raise [Error] if the native call reports a non-zero status
    def add_positive_binary_relation(tokens, range1, range2)
      check_add(tokens, range1, range2)

      native_tokens = Utils.array_to_pointer(tokens)
      status = FFI.mitie_add_positive_binary_relation(@pointer, native_tokens, range1.begin, range1.size, range2.begin, range2.size)
      raise Error, "Unable to add binary relation" unless status == 0
    end

    # Registers +range1+ -> +range2+ within +tokens+ as a negative example.
    #
    # @raise [ArgumentError] if a range is invalid or the entities overlap
    # @raise [Error] if the native call reports a non-zero status
    def add_negative_binary_relation(tokens, range1, range2)
      check_add(tokens, range1, range2)

      native_tokens = Utils.array_to_pointer(tokens)
      status = FFI.mitie_add_negative_binary_relation(@pointer, native_tokens, range1.begin, range1.size, range2.begin, range2.size)
      raise Error, "Unable to add binary relation" unless status == 0
    end

    # Current beta value as reported by the native trainer.
    def beta
      FFI.mitie_binary_relation_trainer_get_beta(@pointer)
    end

    # @raise [ArgumentError] unless +value+ is greater than or equal to zero
    def beta=(value)
      raise ArgumentError, "beta must be greater than or equal to zero" unless value >= 0

      FFI.mitie_binary_relation_trainer_set_beta(@pointer, value)
    end

    # Thread count as reported by the native trainer.
    def num_threads
      FFI.mitie_binary_relation_trainer_get_num_threads(@pointer)
    end

    def num_threads=(value)
      FFI.mitie_binary_relation_trainer_set_num_threads(@pointer, value)
    end

    # Number of positive examples added so far.
    def num_positive_examples
      FFI.mitie_binary_relation_trainer_num_positive_examples(@pointer)
    end

    # Number of negative examples added so far.
    def num_negative_examples
      FFI.mitie_binary_relation_trainer_num_negative_examples(@pointer)
    end

    # Trains a detector from the examples added so far.
    #
    # @return [Mitie::BinaryRelationDetector]
    # @raise [Error] if no examples were added or the native trainer
    #   returns a null detector
    def train
      total_examples = num_positive_examples + num_negative_examples
      raise Error, "You can't call train() on an empty trainer" if total_examples.zero?

      detector = FFI.mitie_train_binary_relation_detector(@pointer)
      raise Error, "Unable to create binary relation detector. Probably ran out of RAM." if detector.null?

      Mitie::BinaryRelationDetector.new(pointer: detector)
    end

    private

    # Validates both ranges against the token count, then rejects
    # overlapping entities.
    def check_add(tokens, range1, range2)
      Utils.check_range(range1, tokens.size)
      Utils.check_range(range2, tokens.size)

      raise ArgumentError, "Entities overlap" if entities_overlap?(range1, range2)
    end

    # True when the native overlap check returns 1 for the two spans.
    def entities_overlap?(range1, range2)
      overlap = FFI.mitie_entities_overlap(range1.begin, range1.size, range2.begin, range2.size)
      overlap == 1
    end

    # Builds the finalizer that frees the native trainer.
    def self.finalize(pointer)
      # must use proc instead of stabby lambda
      proc { FFI.mitie_free(pointer) }
    end
  end
end
|
data/lib/mitie/document.rb
CHANGED
@@ -33,37 +33,35 @@ module Mitie
|
|
33
33
|
|
34
34
|
def entities
|
35
35
|
@entities ||= begin
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
offset = tok[0][1]
|
36
|
+
entities = []
|
37
|
+
tokens = tokens_with_offset
|
38
|
+
detections = FFI.mitie_extract_entities(pointer, tokens_ptr)
|
39
|
+
num_detections = FFI.mitie_ner_get_num_detections(detections)
|
40
|
+
num_detections.times do |i|
|
41
|
+
pos = FFI.mitie_ner_get_detection_position(detections, i)
|
42
|
+
len = FFI.mitie_ner_get_detection_length(detections, i)
|
43
|
+
tag = FFI.mitie_ner_get_detection_tagstr(detections, i).to_s
|
44
|
+
score = FFI.mitie_ner_get_detection_score(detections, i)
|
45
|
+
tok = tokens[pos, len]
|
46
|
+
offset = tok[0][1]
|
48
47
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
end
|
56
|
-
entity[:tag] = tag
|
57
|
-
entity[:score] = score
|
58
|
-
entity[:offset] = offset if offset
|
59
|
-
entity[:token_index] = pos
|
60
|
-
entity[:token_length] = len
|
61
|
-
entities << entity
|
48
|
+
entity = {}
|
49
|
+
if offset
|
50
|
+
finish = tok[-1][1] + tok[-1][0].bytesize
|
51
|
+
entity[:text] = text.byteslice(offset...finish)
|
52
|
+
else
|
53
|
+
entity[:text] = tok.map(&:first)
|
62
54
|
end
|
63
|
-
|
64
|
-
|
65
|
-
|
55
|
+
entity[:tag] = tag
|
56
|
+
entity[:score] = score
|
57
|
+
entity[:offset] = offset if offset
|
58
|
+
entity[:token_index] = pos
|
59
|
+
entity[:token_length] = len
|
60
|
+
entities << entity
|
66
61
|
end
|
62
|
+
entities
|
63
|
+
ensure
|
64
|
+
FFI.mitie_free(detections) if detections
|
67
65
|
end
|
68
66
|
end
|
69
67
|
|
data/lib/mitie/ffi.rb
CHANGED
@@ -15,12 +15,11 @@ module Mitie
|
|
15
15
|
extern "void mitie_free(void* object)"
|
16
16
|
extern "char** mitie_tokenize(const char* text)"
|
17
17
|
extern "char** mitie_tokenize_with_offsets(const char* text, unsigned long** token_offsets)"
|
18
|
-
extern "int mitie_save_named_entity_extractor(const char* filename, const mitie_named_entity_extractor* ner)"
|
19
18
|
|
19
|
+
# ner
|
20
20
|
extern "mitie_named_entity_extractor* mitie_load_named_entity_extractor(const char* filename)"
|
21
21
|
extern "unsigned long mitie_get_num_possible_ner_tags(const mitie_named_entity_extractor* ner)"
|
22
22
|
extern "const char* mitie_get_named_entity_tagstr(const mitie_named_entity_extractor* ner, unsigned long idx)"
|
23
|
-
|
24
23
|
extern "mitie_named_entity_detections* mitie_extract_entities(const mitie_named_entity_extractor* ner, char** tokens)"
|
25
24
|
extern "unsigned long mitie_ner_get_num_detections(const mitie_named_entity_detections* dets)"
|
26
25
|
extern "unsigned long mitie_ner_get_detection_position(const mitie_named_entity_detections* dets, unsigned long idx)"
|
@@ -29,12 +28,28 @@ module Mitie
|
|
29
28
|
extern "const char* mitie_ner_get_detection_tagstr(const mitie_named_entity_detections* dets, unsigned long idx)"
|
30
29
|
extern "double mitie_ner_get_detection_score(const mitie_named_entity_detections* dets, unsigned long idx)"
|
31
30
|
|
31
|
+
# binary relation detector
|
32
|
+
extern "mitie_binary_relation_detector* mitie_load_binary_relation_detector(const char* filename)"
|
33
|
+
extern "const char* mitie_binary_relation_detector_name_string(const mitie_binary_relation_detector* detector)"
|
34
|
+
extern "int mitie_entities_overlap(unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
35
|
+
extern "mitie_binary_relation* mitie_extract_binary_relation(const mitie_named_entity_extractor* ner, char** tokens, unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
36
|
+
extern "int mitie_classify_binary_relation(const mitie_binary_relation_detector* detector, const mitie_binary_relation* relation, double* score)"
|
37
|
+
|
38
|
+
# text categorizer
|
39
|
+
extern "mitie_text_categorizer* mitie_load_text_categorizer(const char* filename)"
|
40
|
+
extern "int mitie_categorize_text(const mitie_text_categorizer* tcat, const char** tokens, char** text_tag, double* text_score)"
|
41
|
+
|
42
|
+
# save
|
43
|
+
extern "int mitie_save_named_entity_extractor(const char* filename, const mitie_named_entity_extractor* ner)"
|
44
|
+
extern "int mitie_save_binary_relation_detector(const char* filename, const mitie_binary_relation_detector* detector)"
|
45
|
+
extern "int mitie_save_text_categorizer(const char* filename, const mitie_text_categorizer* tcat)"
|
46
|
+
|
47
|
+
# ner trainer
|
32
48
|
extern "mitie_ner_training_instance* mitie_create_ner_training_instance(char** tokens)"
|
33
49
|
extern "unsigned long mitie_ner_training_instance_num_entities(const mitie_ner_training_instance* instance)"
|
34
50
|
extern "unsigned long mitie_ner_training_instance_num_tokens(const mitie_ner_training_instance* instance)"
|
35
51
|
extern "int mitie_overlaps_any_entity(mitie_ner_training_instance* instance, unsigned long start, unsigned long length)"
|
36
52
|
extern "int mitie_add_ner_training_entity(mitie_ner_training_instance* instance, unsigned long start, unsigned long length, const char* label)"
|
37
|
-
|
38
53
|
extern "mitie_ner_trainer* mitie_create_ner_trainer(const char* filename)"
|
39
54
|
extern "unsigned long mitie_ner_trainer_size(const mitie_ner_trainer* trainer)"
|
40
55
|
extern "int mitie_add_ner_training_instance(mitie_ner_trainer* trainer, const mitie_ner_training_instance* instance)"
|
@@ -44,10 +59,26 @@ module Mitie
|
|
44
59
|
extern "unsigned long mitie_ner_trainer_get_num_threads(const mitie_ner_trainer* trainer)"
|
45
60
|
extern "mitie_named_entity_extractor* mitie_train_named_entity_extractor(const mitie_ner_trainer* trainer)"
|
46
61
|
|
47
|
-
|
48
|
-
extern "const char*
|
49
|
-
extern "
|
50
|
-
extern "
|
51
|
-
extern "int
|
62
|
+
# binary relation trainer
|
63
|
+
extern "mitie_binary_relation_trainer* mitie_create_binary_relation_trainer(const char* relation_name, const mitie_named_entity_extractor* ner)"
|
64
|
+
extern "unsigned long mitie_binary_relation_trainer_num_positive_examples(const mitie_binary_relation_trainer* trainer)"
|
65
|
+
extern "unsigned long mitie_binary_relation_trainer_num_negative_examples(const mitie_binary_relation_trainer* trainer)"
|
66
|
+
extern "int mitie_add_positive_binary_relation(mitie_binary_relation_trainer* trainer, char** tokens, unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
67
|
+
extern "int mitie_add_negative_binary_relation(mitie_binary_relation_trainer* trainer, char** tokens, unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
68
|
+
extern "void mitie_binary_relation_trainer_set_beta(mitie_binary_relation_trainer* trainer, double beta)"
|
69
|
+
extern "double mitie_binary_relation_trainer_get_beta(const mitie_binary_relation_trainer* trainer)"
|
70
|
+
extern "void mitie_binary_relation_trainer_set_num_threads(mitie_binary_relation_trainer* trainer, unsigned long num_threads)"
|
71
|
+
extern "unsigned long mitie_binary_relation_trainer_get_num_threads(const mitie_binary_relation_trainer* trainer)"
|
72
|
+
extern "mitie_binary_relation_detector* mitie_train_binary_relation_detector(const mitie_binary_relation_trainer* trainer)"
|
73
|
+
|
74
|
+
# text categorizer trainer
|
75
|
+
extern "mitie_text_categorizer_trainer* mitie_create_text_categorizer_trainer(const char* filename)"
|
76
|
+
extern "unsigned long mitie_text_categorizer_trainer_size(const mitie_text_categorizer_trainer* trainer)"
|
77
|
+
extern "void mitie_text_categorizer_trainer_set_beta(mitie_text_categorizer_trainer* trainer, double beta)"
|
78
|
+
extern "double mitie_text_categorizer_trainer_get_beta(const mitie_text_categorizer_trainer* trainer)"
|
79
|
+
extern "void mitie_text_categorizer_trainer_set_num_threads(mitie_text_categorizer_trainer* trainer, unsigned long num_threads)"
|
80
|
+
extern "unsigned long mitie_text_categorizer_trainer_get_num_threads(const mitie_text_categorizer_trainer* trainer)"
|
81
|
+
extern "int mitie_add_text_categorizer_labeled_text(mitie_text_categorizer_trainer* trainer, const char** tokens, const char* label)"
|
82
|
+
extern "mitie_text_categorizer* mitie_train_text_categorizer(const mitie_text_categorizer_trainer* trainer)"
|
52
83
|
end
|
53
84
|
end
|
@@ -12,9 +12,7 @@ module Mitie
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def add_entity(range, label)
|
15
|
-
|
16
|
-
raise ArgumentError, "Invalid range"
|
17
|
-
end
|
15
|
+
Utils.check_range(range, num_tokens)
|
18
16
|
|
19
17
|
raise ArgumentError, "Range overlaps existing entity" if overlaps_any_entity?(range)
|
20
18
|
|
@@ -34,9 +32,7 @@ module Mitie
|
|
34
32
|
end
|
35
33
|
|
36
34
|
def overlaps_any_entity?(range)
|
37
|
-
|
38
|
-
raise ArgumentError, "Invalid range"
|
39
|
-
end
|
35
|
+
Utils.check_range(range, num_tokens)
|
40
36
|
|
41
37
|
FFI.mitie_overlaps_any_entity(@pointer, range.begin, range.size) == 1
|
42
38
|
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Mitie
  # Text categorization model backed by a native MITIE text categorizer.
  class TextCategorizer
    # @param path [String, nil] saved model to load; takes precedence over
    #   +pointer+
    # @param pointer [Object, nil] native categorizer handle to wrap
    # @raise [ArgumentError] if +path+ does not exist, or if neither a path
    #   nor a pointer is given
    def initialize(path = nil, pointer: nil)
      @pointer =
        if path
          # better error message
          raise ArgumentError, "File does not exist" unless File.exist?(path)

          FFI.mitie_load_text_categorizer(path)
        elsif pointer
          pointer
        else
          raise ArgumentError, "Must pass either a path or a pointer"
        end

      ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
    end

    # Categorizes a tokenized text.
    #
    # @param tokens [Array<String>] tokens of the text
    # @return [Hash] +{tag:, score:}+ read from the native out-parameters
    # @raise [Error] if the native call reports a non-zero status
    def categorize(tokens)
      native_tokens = Utils.array_to_pointer(tokens)
      tag_out = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
      score_out = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)

      status = FFI.mitie_categorize_text(@pointer, native_tokens, tag_out, score_out)
      raise Error, "Unable to categorize" unless status == 0

      {tag: tag_out.ptr.to_s, score: Utils.read_double(score_out)}
    ensure
      # text_tag must be freed
      FFI.mitie_free(tag_out.ptr) if tag_out
    end

    # Writes the model to +filename+.
    #
    # @return [nil]
    # @raise [Error] if the native call reports a non-zero status
    def save_to_disk(filename)
      status = FFI.mitie_save_text_categorizer(filename, @pointer)
      raise Error, "Unable to save model" unless status == 0

      nil
    end

    # Builds the finalizer that frees the native categorizer.
    def self.finalize(pointer)
      # must use proc instead of stabby lambda
      proc { FFI.mitie_free(pointer) }
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module Mitie
  # Collects labeled, tokenized texts and trains a TextCategorizer from them
  # through the MITIE native library.
  class TextCategorizerTrainer
    # @param filename [String] file handed to the native trainer constructor
    #   (the README passes "total_word_feature_extractor.dat")
    # @raise [ArgumentError] if +filename+ does not exist
    def initialize(filename)
      raise ArgumentError, "File does not exist" unless File.exist?(filename)
      @pointer = FFI.mitie_create_text_categorizer_trainer(filename)

      ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
    end

    # Adds one labeled text to the trainer.
    #
    # @param tokens [Array<String>] tokens of the text
    # @param label [String] category label
    # @return [Integer] the native status code (0), as before
    # @raise [Error] if the native call reports a non-zero status; the
    #   status used to be returned unchecked, so a failed add went unnoticed
    def add(tokens, label)
      tokens_pointer = Utils.array_to_pointer(tokens)
      status = FFI.mitie_add_text_categorizer_labeled_text(@pointer, tokens_pointer, label)
      raise Error, "Unable to add labeled text" if status != 0

      status
    end

    # Current beta value as reported by the native trainer.
    def beta
      FFI.mitie_text_categorizer_trainer_get_beta(@pointer)
    end

    # @raise [ArgumentError] unless +value+ is greater than or equal to zero
    def beta=(value)
      raise ArgumentError, "beta must be greater than or equal to zero" unless value >= 0

      FFI.mitie_text_categorizer_trainer_set_beta(@pointer, value)
    end

    # Thread count as reported by the native trainer.
    def num_threads
      FFI.mitie_text_categorizer_trainer_get_num_threads(@pointer)
    end

    def num_threads=(value)
      FFI.mitie_text_categorizer_trainer_set_num_threads(@pointer, value)
    end

    # Size reported by the native trainer; #train refuses to run while it
    # is zero.
    def size
      FFI.mitie_text_categorizer_trainer_size(@pointer)
    end

    # Trains a categorizer from the texts added so far.
    #
    # @return [Mitie::TextCategorizer]
    # @raise [Error] if the trainer is empty or the native trainer returns
    #   a null categorizer
    def train
      raise Error, "You can't call train() on an empty trainer" if size.zero?

      categorizer = FFI.mitie_train_text_categorizer(@pointer)

      raise Error, "Unable to create text categorizer. Probably ran out of RAM." if categorizer.null?

      Mitie::TextCategorizer.new(pointer: categorizer)
    end

    # Builds the finalizer that frees the native trainer.
    def self.finalize(pointer)
      # must use proc instead of stabby lambda
      proc { FFI.mitie_free(pointer) }
    end
  end
end
|
data/lib/mitie/utils.rb
CHANGED
@@ -8,5 +8,15 @@ module Mitie
|
|
8
8
|
end
|
9
9
|
tokens_ptr
|
10
10
|
end
|
11
|
+
|
12
|
+
# Validates that +range+ selects at least one token and lies entirely
# within the token indexes 0..(num_tokens - 1).
#
# @param range [Range] token index range
# @param num_tokens [Integer] number of tokens available
# @return [nil]
# @raise [ArgumentError] if the range is empty, unbounded on either side,
#   or reaches outside the tokens
def self.check_range(range, num_tokens)
  # Reject beginless/endless ranges up front: a beginless range cannot be
  # iterated, so `none?` would otherwise escape with a TypeError instead
  # of the ArgumentError raised for every other invalid range.
  unbounded = range.begin.nil? || range.end.nil?

  if unbounded || range.none? || !(0..(num_tokens - 1)).cover?(range)
    raise ArgumentError, "Invalid range"
  end
end
|
17
|
+
|
18
|
+
# Reads the first SIZEOF_DOUBLE bytes at +ptr+ and decodes them as a
# native-format double.
#
# @param ptr [Fiddle::Pointer] pointer to at least SIZEOF_DOUBLE bytes
# @return [Float]
def self.read_double(ptr)
  raw_bytes = ptr.to_s(Fiddle::SIZEOF_DOUBLE)
  raw_bytes.unpack1("d")
end
|
11
21
|
end
|
12
22
|
end
|
data/lib/mitie/version.rb
CHANGED
data/lib/mitie.rb
CHANGED
@@ -3,10 +3,13 @@ require "fiddle/import"
|
|
3
3
|
|
4
4
|
# modules
|
5
5
|
require "mitie/binary_relation_detector"
|
6
|
+
require "mitie/binary_relation_trainer"
|
6
7
|
require "mitie/document"
|
7
8
|
require "mitie/ner"
|
8
9
|
require "mitie/ner_training_instance"
|
9
10
|
require "mitie/ner_trainer"
|
11
|
+
require "mitie/text_categorizer"
|
12
|
+
require "mitie/text_categorizer_trainer"
|
10
13
|
require "mitie/utils"
|
11
14
|
require "mitie/version"
|
12
15
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mitie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-06-01 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: andrew@ankane.org
|
@@ -21,11 +21,14 @@ files:
|
|
21
21
|
- README.md
|
22
22
|
- lib/mitie.rb
|
23
23
|
- lib/mitie/binary_relation_detector.rb
|
24
|
+
- lib/mitie/binary_relation_trainer.rb
|
24
25
|
- lib/mitie/document.rb
|
25
26
|
- lib/mitie/ffi.rb
|
26
27
|
- lib/mitie/ner.rb
|
27
28
|
- lib/mitie/ner_trainer.rb
|
28
29
|
- lib/mitie/ner_training_instance.rb
|
30
|
+
- lib/mitie/text_categorizer.rb
|
31
|
+
- lib/mitie/text_categorizer_trainer.rb
|
29
32
|
- lib/mitie/utils.rb
|
30
33
|
- lib/mitie/version.rb
|
31
34
|
- vendor/LICENSE.txt
|
@@ -45,7 +48,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
45
48
|
requirements:
|
46
49
|
- - ">="
|
47
50
|
- !ruby/object:Gem::Version
|
48
|
-
version: '2.
|
51
|
+
version: '2.7'
|
49
52
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
53
|
requirements:
|
51
54
|
- - ">="
|