mitie 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +83 -3
- data/lib/mitie/binary_relation_detector.rb +47 -26
- data/lib/mitie/binary_relation_trainer.rb +87 -0
- data/lib/mitie/document.rb +26 -28
- data/lib/mitie/ffi.rb +39 -8
- data/lib/mitie/ner_training_instance.rb +2 -6
- data/lib/mitie/text_categorizer.rb +47 -0
- data/lib/mitie/text_categorizer_trainer.rb +52 -0
- data/lib/mitie/utils.rb +10 -0
- data/lib/mitie/version.rb +1 -1
- data/lib/mitie.rb +3 -0
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 50fb12adcd0042b3c09968108ec382694f8f0df20750b88566bc64c8d85d9e8d
|
4
|
+
data.tar.gz: 9d3d34f9839f71fc17e6651a17069f1f0fdb4b5956f9e57c4b66c9567d908967
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 189d5a17f94fff9abbc1d8d961c8f8407ae85bd1284f727e363b9aa7384d748eb1d80b6ffa1b1a44c8c6fadbbb6e91e6d064ea58fa88af972e6fd6e90e3ef71d
|
7
|
+
data.tar.gz: 202ba418ee98636736185a8f4783601acabecc59dde7e3a5c31a12b56827497b6b2b7e7e26d264b61848281b94cb635249b17a9e8fa009c496dacbe24e6466ca
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# MITIE Ruby
|
2
2
|
|
3
|
-
[MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition
|
3
|
+
[MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition, binary relation detection, and text categorization - for Ruby
|
4
4
|
|
5
5
|
- Finds people, organizations, and locations in text
|
6
6
|
- Detects relationships between entities, like `PERSON` was born in `LOCATION`
|
@@ -15,7 +15,7 @@ Add this line to your application’s Gemfile:
|
|
15
15
|
gem "mitie"
|
16
16
|
```
|
17
17
|
|
18
|
-
And download the pre-trained model for your language:
|
18
|
+
And download the pre-trained models for your language:
|
19
19
|
|
20
20
|
- [English](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2.tar.bz2)
|
21
21
|
- [Spanish](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2-Spanish.zip)
|
@@ -23,6 +23,12 @@ And download the pre-trained model for your language:
|
|
23
23
|
|
24
24
|
## Getting Started
|
25
25
|
|
26
|
+
- [Named Entity Recognition](#named-entity-recognition)
|
27
|
+
- [Binary Relation Detection](#binary-relation-detection)
|
28
|
+
- [Text Categorization](#text-categorization)
|
29
|
+
|
30
|
+
## Named Entity Recognition
|
31
|
+
|
26
32
|
Load an NER model
|
27
33
|
|
28
34
|
```ruby
|
@@ -69,7 +75,7 @@ Get all tags for a model
|
|
69
75
|
model.tags
|
70
76
|
```
|
71
77
|
|
72
|
-
|
78
|
+
### Training
|
73
79
|
|
74
80
|
Load an NER model into a trainer
|
75
81
|
|
@@ -138,6 +144,80 @@ This returns
|
|
138
144
|
[{first: "Shopify", second: "Ottawa", score: 0.17649169745814464}]
|
139
145
|
```
|
140
146
|
|
147
|
+
### Training
|
148
|
+
|
149
|
+
Load an NER model into a trainer
|
150
|
+
|
151
|
+
```ruby
|
152
|
+
trainer = Mitie::BinaryRelationTrainer.new(model)
|
153
|
+
```
|
154
|
+
|
155
|
+
Add positive and negative examples to the trainer
|
156
|
+
|
157
|
+
```ruby
|
158
|
+
tokens = ["Shopify", "was", "founded", "in", "Ottawa"]
|
159
|
+
trainer.add_positive_binary_relation(tokens, 0..0, 4..4)
|
160
|
+
trainer.add_negative_binary_relation(tokens, 4..4, 0..0)
|
161
|
+
```
|
162
|
+
|
163
|
+
Train the detector
|
164
|
+
|
165
|
+
```ruby
|
166
|
+
detector = trainer.train
|
167
|
+
```
|
168
|
+
|
169
|
+
Save the detector
|
170
|
+
|
171
|
+
```ruby
|
172
|
+
detector.save_to_disk("binary_relation_detector.svm")
|
173
|
+
```
|
174
|
+
|
175
|
+
## Text Categorization
|
176
|
+
|
177
|
+
Load a model into a trainer
|
178
|
+
|
179
|
+
```ruby
|
180
|
+
trainer = Mitie::TextCategorizerTrainer.new("total_word_feature_extractor.dat")
|
181
|
+
```
|
182
|
+
|
183
|
+
Add labeled text to the trainer
|
184
|
+
|
185
|
+
```ruby
|
186
|
+
trainer.add(["This", "is", "super", "cool"], "positive")
|
187
|
+
```
|
188
|
+
|
189
|
+
Train the model
|
190
|
+
|
191
|
+
```ruby
|
192
|
+
model = trainer.train
|
193
|
+
```
|
194
|
+
|
195
|
+
Save the model
|
196
|
+
|
197
|
+
```ruby
|
198
|
+
model.save_to_disk("text_categorization_model.dat")
|
199
|
+
```
|
200
|
+
|
201
|
+
Load a saved model
|
202
|
+
|
203
|
+
```ruby
|
204
|
+
model = Mitie::TextCategorizer.new("text_categorization_model.dat")
|
205
|
+
```
|
206
|
+
|
207
|
+
Categorize text
|
208
|
+
|
209
|
+
```ruby
|
210
|
+
model.categorize(["What", "a", "super", "nice", "day"])
|
211
|
+
```
|
212
|
+
|
213
|
+
## Deployment
|
214
|
+
|
215
|
+
Check out [Trove](https://github.com/ankane/trove) for deploying models.
|
216
|
+
|
217
|
+
```sh
|
218
|
+
trove push ner_model.dat
|
219
|
+
```
|
220
|
+
|
141
221
|
## History
|
142
222
|
|
143
223
|
View the [changelog](https://github.com/ankane/mitie-ruby/blob/master/CHANGELOG.md)
|
data/lib/mitie/binary_relation_detector.rb
CHANGED
@@ -1,9 +1,16 @@
|
|
1
1
|
module Mitie
|
2
2
|
class BinaryRelationDetector
|
3
|
-
def initialize(path)
|
4
|
-
|
5
|
-
|
6
|
-
|
3
|
+
def initialize(path = nil, pointer: nil)
|
4
|
+
if path
|
5
|
+
# better error message
|
6
|
+
raise ArgumentError, "File does not exist" unless File.exist?(path)
|
7
|
+
@pointer = FFI.mitie_load_binary_relation_detector(path)
|
8
|
+
elsif pointer
|
9
|
+
@pointer = pointer
|
10
|
+
else
|
11
|
+
raise ArgumentError, "Must pass either a path or a pointer"
|
12
|
+
end
|
13
|
+
|
7
14
|
ObjectSpace.define_finalizer(self, self.class.finalize(pointer))
|
8
15
|
end
|
9
16
|
|
@@ -23,38 +30,52 @@ module Mitie
|
|
23
30
|
|
24
31
|
relations = []
|
25
32
|
combinations.each do |entity1, entity2|
|
26
|
-
relation =
|
27
|
-
|
28
|
-
doc.model.pointer,
|
29
|
-
doc.send(:tokens_ptr),
|
30
|
-
entity1[:token_index],
|
31
|
-
entity1[:token_length],
|
32
|
-
entity2[:token_index],
|
33
|
-
entity2[:token_length]
|
34
|
-
)
|
35
|
-
|
36
|
-
score_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
|
37
|
-
status = FFI.mitie_classify_binary_relation(pointer, relation, score_ptr)
|
38
|
-
# TODO make Mitie::Error
|
39
|
-
raise "Bad status: #{status}" if status != 0
|
40
|
-
score = score_ptr.to_s(Fiddle::SIZEOF_DOUBLE).unpack1("d")
|
41
|
-
if score > 0
|
42
|
-
relations << {
|
43
|
-
first: entity1[:text],
|
44
|
-
second: entity2[:text],
|
45
|
-
score: score
|
46
|
-
}
|
47
|
-
end
|
33
|
+
relation = extract_relation(doc, entity1, entity2)
|
34
|
+
relations << relation if relation
|
48
35
|
end
|
49
36
|
relations
|
50
37
|
end
|
51
38
|
|
39
|
+
def save_to_disk(filename)
|
40
|
+
if FFI.mitie_save_binary_relation_detector(filename, pointer) != 0
|
41
|
+
raise Error, "Unable to save detector"
|
42
|
+
end
|
43
|
+
nil
|
44
|
+
end
|
45
|
+
|
52
46
|
private
|
53
47
|
|
54
48
|
def pointer
|
55
49
|
@pointer
|
56
50
|
end
|
57
51
|
|
52
|
+
def extract_relation(doc, entity1, entity2)
|
53
|
+
relation =
|
54
|
+
FFI.mitie_extract_binary_relation(
|
55
|
+
doc.model.pointer,
|
56
|
+
doc.send(:tokens_ptr),
|
57
|
+
entity1[:token_index],
|
58
|
+
entity1[:token_length],
|
59
|
+
entity2[:token_index],
|
60
|
+
entity2[:token_length]
|
61
|
+
)
|
62
|
+
|
63
|
+
score_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
|
64
|
+
status = FFI.mitie_classify_binary_relation(pointer, relation, score_ptr)
|
65
|
+
raise Error, "Bad status: #{status}" if status != 0
|
66
|
+
|
67
|
+
score = Utils.read_double(score_ptr)
|
68
|
+
if score > 0
|
69
|
+
{
|
70
|
+
first: entity1[:text],
|
71
|
+
second: entity2[:text],
|
72
|
+
score: score
|
73
|
+
}
|
74
|
+
end
|
75
|
+
ensure
|
76
|
+
FFI.mitie_free(relation) if relation
|
77
|
+
end
|
78
|
+
|
58
79
|
def self.finalize(pointer)
|
59
80
|
# must use proc instead of stabby lambda
|
60
81
|
proc { FFI.mitie_free(pointer) }
|
data/lib/mitie/binary_relation_trainer.rb
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
module Mitie
|
2
|
+
class BinaryRelationTrainer
|
3
|
+
def initialize(ner, name: "")
|
4
|
+
@pointer = FFI.mitie_create_binary_relation_trainer(name, ner.pointer)
|
5
|
+
|
6
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
7
|
+
end
|
8
|
+
|
9
|
+
def add_positive_binary_relation(tokens, range1, range2)
|
10
|
+
check_add(tokens, range1, range2)
|
11
|
+
|
12
|
+
tokens_pointer = Utils.array_to_pointer(tokens)
|
13
|
+
status = FFI.mitie_add_positive_binary_relation(@pointer, tokens_pointer, range1.begin, range1.size, range2.begin, range2.size)
|
14
|
+
if status != 0
|
15
|
+
raise Error, "Unable to add binary relation"
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def add_negative_binary_relation(tokens, range1, range2)
|
20
|
+
check_add(tokens, range1, range2)
|
21
|
+
|
22
|
+
tokens_pointer = Utils.array_to_pointer(tokens)
|
23
|
+
status = FFI.mitie_add_negative_binary_relation(@pointer, tokens_pointer, range1.begin, range1.size, range2.begin, range2.size)
|
24
|
+
if status != 0
|
25
|
+
raise Error, "Unable to add binary relation"
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
def beta
|
30
|
+
FFI.mitie_binary_relation_trainer_get_beta(@pointer)
|
31
|
+
end
|
32
|
+
|
33
|
+
def beta=(value)
|
34
|
+
raise ArgumentError, "beta must be greater than or equal to zero" unless value >= 0
|
35
|
+
|
36
|
+
FFI.mitie_binary_relation_trainer_set_beta(@pointer, value)
|
37
|
+
end
|
38
|
+
|
39
|
+
def num_threads
|
40
|
+
FFI.mitie_binary_relation_trainer_get_num_threads(@pointer)
|
41
|
+
end
|
42
|
+
|
43
|
+
def num_threads=(value)
|
44
|
+
FFI.mitie_binary_relation_trainer_set_num_threads(@pointer, value)
|
45
|
+
end
|
46
|
+
|
47
|
+
def num_positive_examples
|
48
|
+
FFI.mitie_binary_relation_trainer_num_positive_examples(@pointer)
|
49
|
+
end
|
50
|
+
|
51
|
+
def num_negative_examples
|
52
|
+
FFI.mitie_binary_relation_trainer_num_negative_examples(@pointer)
|
53
|
+
end
|
54
|
+
|
55
|
+
def train
|
56
|
+
if num_positive_examples + num_negative_examples == 0
|
57
|
+
raise Error, "You can't call train() on an empty trainer"
|
58
|
+
end
|
59
|
+
|
60
|
+
detector = FFI.mitie_train_binary_relation_detector(@pointer)
|
61
|
+
|
62
|
+
raise Error, "Unable to create binary relation detector. Probably ran out of RAM." if detector.null?
|
63
|
+
|
64
|
+
Mitie::BinaryRelationDetector.new(pointer: detector)
|
65
|
+
end
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
def check_add(tokens, range1, range2)
|
70
|
+
Utils.check_range(range1, tokens.size)
|
71
|
+
Utils.check_range(range2, tokens.size)
|
72
|
+
|
73
|
+
if entities_overlap?(range1, range2)
|
74
|
+
raise ArgumentError, "Entities overlap"
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
def entities_overlap?(range1, range2)
|
79
|
+
FFI.mitie_entities_overlap(range1.begin, range1.size, range2.begin, range2.size) == 1
|
80
|
+
end
|
81
|
+
|
82
|
+
def self.finalize(pointer)
|
83
|
+
# must use proc instead of stabby lambda
|
84
|
+
proc { FFI.mitie_free(pointer) }
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
data/lib/mitie/document.rb
CHANGED
@@ -33,37 +33,35 @@ module Mitie
|
|
33
33
|
|
34
34
|
def entities
|
35
35
|
@entities ||= begin
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
offset = tok[0][1]
|
36
|
+
entities = []
|
37
|
+
tokens = tokens_with_offset
|
38
|
+
detections = FFI.mitie_extract_entities(pointer, tokens_ptr)
|
39
|
+
num_detections = FFI.mitie_ner_get_num_detections(detections)
|
40
|
+
num_detections.times do |i|
|
41
|
+
pos = FFI.mitie_ner_get_detection_position(detections, i)
|
42
|
+
len = FFI.mitie_ner_get_detection_length(detections, i)
|
43
|
+
tag = FFI.mitie_ner_get_detection_tagstr(detections, i).to_s
|
44
|
+
score = FFI.mitie_ner_get_detection_score(detections, i)
|
45
|
+
tok = tokens[pos, len]
|
46
|
+
offset = tok[0][1]
|
48
47
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
end
|
56
|
-
entity[:tag] = tag
|
57
|
-
entity[:score] = score
|
58
|
-
entity[:offset] = offset if offset
|
59
|
-
entity[:token_index] = pos
|
60
|
-
entity[:token_length] = len
|
61
|
-
entities << entity
|
48
|
+
entity = {}
|
49
|
+
if offset
|
50
|
+
finish = tok[-1][1] + tok[-1][0].bytesize
|
51
|
+
entity[:text] = text.byteslice(offset...finish)
|
52
|
+
else
|
53
|
+
entity[:text] = tok.map(&:first)
|
62
54
|
end
|
63
|
-
|
64
|
-
|
65
|
-
|
55
|
+
entity[:tag] = tag
|
56
|
+
entity[:score] = score
|
57
|
+
entity[:offset] = offset if offset
|
58
|
+
entity[:token_index] = pos
|
59
|
+
entity[:token_length] = len
|
60
|
+
entities << entity
|
66
61
|
end
|
62
|
+
entities
|
63
|
+
ensure
|
64
|
+
FFI.mitie_free(detections) if detections
|
67
65
|
end
|
68
66
|
end
|
69
67
|
|
data/lib/mitie/ffi.rb
CHANGED
@@ -15,12 +15,11 @@ module Mitie
|
|
15
15
|
extern "void mitie_free(void* object)"
|
16
16
|
extern "char** mitie_tokenize(const char* text)"
|
17
17
|
extern "char** mitie_tokenize_with_offsets(const char* text, unsigned long** token_offsets)"
|
18
|
-
extern "int mitie_save_named_entity_extractor(const char* filename, const mitie_named_entity_extractor* ner)"
|
19
18
|
|
19
|
+
# ner
|
20
20
|
extern "mitie_named_entity_extractor* mitie_load_named_entity_extractor(const char* filename)"
|
21
21
|
extern "unsigned long mitie_get_num_possible_ner_tags(const mitie_named_entity_extractor* ner)"
|
22
22
|
extern "const char* mitie_get_named_entity_tagstr(const mitie_named_entity_extractor* ner, unsigned long idx)"
|
23
|
-
|
24
23
|
extern "mitie_named_entity_detections* mitie_extract_entities(const mitie_named_entity_extractor* ner, char** tokens)"
|
25
24
|
extern "unsigned long mitie_ner_get_num_detections(const mitie_named_entity_detections* dets)"
|
26
25
|
extern "unsigned long mitie_ner_get_detection_position(const mitie_named_entity_detections* dets, unsigned long idx)"
|
@@ -29,12 +28,28 @@ module Mitie
|
|
29
28
|
extern "const char* mitie_ner_get_detection_tagstr(const mitie_named_entity_detections* dets, unsigned long idx)"
|
30
29
|
extern "double mitie_ner_get_detection_score(const mitie_named_entity_detections* dets, unsigned long idx)"
|
31
30
|
|
31
|
+
# binary relation detector
|
32
|
+
extern "mitie_binary_relation_detector* mitie_load_binary_relation_detector(const char* filename)"
|
33
|
+
extern "const char* mitie_binary_relation_detector_name_string(const mitie_binary_relation_detector* detector)"
|
34
|
+
extern "int mitie_entities_overlap(unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
35
|
+
extern "mitie_binary_relation* mitie_extract_binary_relation(const mitie_named_entity_extractor* ner, char** tokens, unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
36
|
+
extern "int mitie_classify_binary_relation(const mitie_binary_relation_detector* detector, const mitie_binary_relation* relation, double* score)"
|
37
|
+
|
38
|
+
# text categorizer
|
39
|
+
extern "mitie_text_categorizer* mitie_load_text_categorizer(const char* filename)"
|
40
|
+
extern "int mitie_categorize_text(const mitie_text_categorizer* tcat, const char** tokens, char** text_tag, double* text_score)"
|
41
|
+
|
42
|
+
# save
|
43
|
+
extern "int mitie_save_named_entity_extractor(const char* filename, const mitie_named_entity_extractor* ner)"
|
44
|
+
extern "int mitie_save_binary_relation_detector(const char* filename, const mitie_binary_relation_detector* detector)"
|
45
|
+
extern "int mitie_save_text_categorizer(const char* filename, const mitie_text_categorizer* tcat)"
|
46
|
+
|
47
|
+
# ner trainer
|
32
48
|
extern "mitie_ner_training_instance* mitie_create_ner_training_instance(char** tokens)"
|
33
49
|
extern "unsigned long mitie_ner_training_instance_num_entities(const mitie_ner_training_instance* instance)"
|
34
50
|
extern "unsigned long mitie_ner_training_instance_num_tokens(const mitie_ner_training_instance* instance)"
|
35
51
|
extern "int mitie_overlaps_any_entity(mitie_ner_training_instance* instance, unsigned long start, unsigned long length)"
|
36
52
|
extern "int mitie_add_ner_training_entity(mitie_ner_training_instance* instance, unsigned long start, unsigned long length, const char* label)"
|
37
|
-
|
38
53
|
extern "mitie_ner_trainer* mitie_create_ner_trainer(const char* filename)"
|
39
54
|
extern "unsigned long mitie_ner_trainer_size(const mitie_ner_trainer* trainer)"
|
40
55
|
extern "int mitie_add_ner_training_instance(mitie_ner_trainer* trainer, const mitie_ner_training_instance* instance)"
|
@@ -44,10 +59,26 @@ module Mitie
|
|
44
59
|
extern "unsigned long mitie_ner_trainer_get_num_threads(const mitie_ner_trainer* trainer)"
|
45
60
|
extern "mitie_named_entity_extractor* mitie_train_named_entity_extractor(const mitie_ner_trainer* trainer)"
|
46
61
|
|
47
|
-
|
48
|
-
extern "const char*
|
49
|
-
extern "
|
50
|
-
extern "
|
51
|
-
extern "int
|
62
|
+
# binary relation trainer
|
63
|
+
extern "mitie_binary_relation_trainer* mitie_create_binary_relation_trainer(const char* relation_name, const mitie_named_entity_extractor* ner)"
|
64
|
+
extern "unsigned long mitie_binary_relation_trainer_num_positive_examples(const mitie_binary_relation_trainer* trainer)"
|
65
|
+
extern "unsigned long mitie_binary_relation_trainer_num_negative_examples(const mitie_binary_relation_trainer* trainer)"
|
66
|
+
extern "int mitie_add_positive_binary_relation(mitie_binary_relation_trainer* trainer, char** tokens, unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
67
|
+
extern "int mitie_add_negative_binary_relation(mitie_binary_relation_trainer* trainer, char** tokens, unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
68
|
+
extern "void mitie_binary_relation_trainer_set_beta(mitie_binary_relation_trainer* trainer, double beta)"
|
69
|
+
extern "double mitie_binary_relation_trainer_get_beta(const mitie_binary_relation_trainer* trainer)"
|
70
|
+
extern "void mitie_binary_relation_trainer_set_num_threads(mitie_binary_relation_trainer* trainer, unsigned long num_threads)"
|
71
|
+
extern "unsigned long mitie_binary_relation_trainer_get_num_threads(const mitie_binary_relation_trainer* trainer)"
|
72
|
+
extern "mitie_binary_relation_detector* mitie_train_binary_relation_detector(const mitie_binary_relation_trainer* trainer)"
|
73
|
+
|
74
|
+
# text categorizer trainer
|
75
|
+
extern "mitie_text_categorizer_trainer* mitie_create_text_categorizer_trainer(const char* filename)"
|
76
|
+
extern "unsigned long mitie_text_categorizer_trainer_size(const mitie_text_categorizer_trainer* trainer)"
|
77
|
+
extern "void mitie_text_categorizer_trainer_set_beta(mitie_text_categorizer_trainer* trainer, double beta)"
|
78
|
+
extern "double mitie_text_categorizer_trainer_get_beta(const mitie_text_categorizer_trainer* trainer)"
|
79
|
+
extern "void mitie_text_categorizer_trainer_set_num_threads(mitie_text_categorizer_trainer* trainer, unsigned long num_threads)"
|
80
|
+
extern "unsigned long mitie_text_categorizer_trainer_get_num_threads(const mitie_text_categorizer_trainer* trainer)"
|
81
|
+
extern "int mitie_add_text_categorizer_labeled_text(mitie_text_categorizer_trainer* trainer, const char** tokens, const char* label)"
|
82
|
+
extern "mitie_text_categorizer* mitie_train_text_categorizer(const mitie_text_categorizer_trainer* trainer)"
|
52
83
|
end
|
53
84
|
end
|
data/lib/mitie/ner_training_instance.rb
CHANGED
@@ -12,9 +12,7 @@ module Mitie
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def add_entity(range, label)
|
15
|
-
|
16
|
-
raise ArgumentError, "Invalid range"
|
17
|
-
end
|
15
|
+
Utils.check_range(range, num_tokens)
|
18
16
|
|
19
17
|
raise ArgumentError, "Range overlaps existing entity" if overlaps_any_entity?(range)
|
20
18
|
|
@@ -34,9 +32,7 @@ module Mitie
|
|
34
32
|
end
|
35
33
|
|
36
34
|
def overlaps_any_entity?(range)
|
37
|
-
|
38
|
-
raise ArgumentError, "Invalid range"
|
39
|
-
end
|
35
|
+
Utils.check_range(range, num_tokens)
|
40
36
|
|
41
37
|
FFI.mitie_overlaps_any_entity(@pointer, range.begin, range.size) == 1
|
42
38
|
end
|
data/lib/mitie/text_categorizer.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
module Mitie
|
2
|
+
class TextCategorizer
|
3
|
+
def initialize(path = nil, pointer: nil)
|
4
|
+
if path
|
5
|
+
# better error message
|
6
|
+
raise ArgumentError, "File does not exist" unless File.exist?(path)
|
7
|
+
@pointer = FFI.mitie_load_text_categorizer(path)
|
8
|
+
elsif pointer
|
9
|
+
@pointer = pointer
|
10
|
+
else
|
11
|
+
raise ArgumentError, "Must pass either a path or a pointer"
|
12
|
+
end
|
13
|
+
|
14
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
15
|
+
end
|
16
|
+
|
17
|
+
def categorize(tokens)
|
18
|
+
tokens_pointer = Utils.array_to_pointer(tokens)
|
19
|
+
text_tag = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
|
20
|
+
text_score = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
|
21
|
+
|
22
|
+
if FFI.mitie_categorize_text(@pointer, tokens_pointer, text_tag, text_score) != 0
|
23
|
+
raise Error, "Unable to categorize"
|
24
|
+
end
|
25
|
+
|
26
|
+
{
|
27
|
+
tag: text_tag.ptr.to_s,
|
28
|
+
score: Utils.read_double(text_score)
|
29
|
+
}
|
30
|
+
ensure
|
31
|
+
# text_tag must be freed
|
32
|
+
FFI.mitie_free(text_tag.ptr) if text_tag
|
33
|
+
end
|
34
|
+
|
35
|
+
def save_to_disk(filename)
|
36
|
+
if FFI.mitie_save_text_categorizer(filename, @pointer) != 0
|
37
|
+
raise Error, "Unable to save model"
|
38
|
+
end
|
39
|
+
nil
|
40
|
+
end
|
41
|
+
|
42
|
+
def self.finalize(pointer)
|
43
|
+
# must use proc instead of stabby lambda
|
44
|
+
proc { FFI.mitie_free(pointer) }
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
data/lib/mitie/text_categorizer_trainer.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
module Mitie
|
2
|
+
class TextCategorizerTrainer
|
3
|
+
def initialize(filename)
|
4
|
+
raise ArgumentError, "File does not exist" unless File.exist?(filename)
|
5
|
+
@pointer = FFI.mitie_create_text_categorizer_trainer(filename)
|
6
|
+
|
7
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
8
|
+
end
|
9
|
+
|
10
|
+
def add(tokens, label)
|
11
|
+
tokens_pointer = Utils.array_to_pointer(tokens)
|
12
|
+
FFI.mitie_add_text_categorizer_labeled_text(@pointer, tokens_pointer, label)
|
13
|
+
end
|
14
|
+
|
15
|
+
def beta
|
16
|
+
FFI.mitie_text_categorizer_trainer_get_beta(@pointer)
|
17
|
+
end
|
18
|
+
|
19
|
+
def beta=(value)
|
20
|
+
raise ArgumentError, "beta must be greater than or equal to zero" unless value >= 0
|
21
|
+
|
22
|
+
FFI.mitie_text_categorizer_trainer_set_beta(@pointer, value)
|
23
|
+
end
|
24
|
+
|
25
|
+
def num_threads
|
26
|
+
FFI.mitie_text_categorizer_trainer_get_num_threads(@pointer)
|
27
|
+
end
|
28
|
+
|
29
|
+
def num_threads=(value)
|
30
|
+
FFI.mitie_text_categorizer_trainer_set_num_threads(@pointer, value)
|
31
|
+
end
|
32
|
+
|
33
|
+
def size
|
34
|
+
FFI.mitie_text_categorizer_trainer_size(@pointer)
|
35
|
+
end
|
36
|
+
|
37
|
+
def train
|
38
|
+
raise Error, "You can't call train() on an empty trainer" if size.zero?
|
39
|
+
|
40
|
+
categorizer = FFI.mitie_train_text_categorizer(@pointer)
|
41
|
+
|
42
|
+
raise Error, "Unable to create text categorizer. Probably ran out of RAM." if categorizer.null?
|
43
|
+
|
44
|
+
Mitie::TextCategorizer.new(pointer: categorizer)
|
45
|
+
end
|
46
|
+
|
47
|
+
def self.finalize(pointer)
|
48
|
+
# must use proc instead of stabby lambda
|
49
|
+
proc { FFI.mitie_free(pointer) }
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
data/lib/mitie/utils.rb
CHANGED
@@ -8,5 +8,15 @@ module Mitie
|
|
8
8
|
end
|
9
9
|
tokens_ptr
|
10
10
|
end
|
11
|
+
|
12
|
+
def self.check_range(range, num_tokens)
|
13
|
+
if range.none? || !(0..(num_tokens - 1)).cover?(range)
|
14
|
+
raise ArgumentError, "Invalid range"
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.read_double(ptr)
|
19
|
+
ptr.to_s(Fiddle::SIZEOF_DOUBLE).unpack1("d")
|
20
|
+
end
|
11
21
|
end
|
12
22
|
end
|
data/lib/mitie/version.rb
CHANGED
data/lib/mitie.rb
CHANGED
@@ -3,10 +3,13 @@ require "fiddle/import"
|
|
3
3
|
|
4
4
|
# modules
|
5
5
|
require "mitie/binary_relation_detector"
|
6
|
+
require "mitie/binary_relation_trainer"
|
6
7
|
require "mitie/document"
|
7
8
|
require "mitie/ner"
|
8
9
|
require "mitie/ner_training_instance"
|
9
10
|
require "mitie/ner_trainer"
|
11
|
+
require "mitie/text_categorizer"
|
12
|
+
require "mitie/text_categorizer_trainer"
|
10
13
|
require "mitie/utils"
|
11
14
|
require "mitie/version"
|
12
15
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mitie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-06-01 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: andrew@ankane.org
|
@@ -21,11 +21,14 @@ files:
|
|
21
21
|
- README.md
|
22
22
|
- lib/mitie.rb
|
23
23
|
- lib/mitie/binary_relation_detector.rb
|
24
|
+
- lib/mitie/binary_relation_trainer.rb
|
24
25
|
- lib/mitie/document.rb
|
25
26
|
- lib/mitie/ffi.rb
|
26
27
|
- lib/mitie/ner.rb
|
27
28
|
- lib/mitie/ner_trainer.rb
|
28
29
|
- lib/mitie/ner_training_instance.rb
|
30
|
+
- lib/mitie/text_categorizer.rb
|
31
|
+
- lib/mitie/text_categorizer_trainer.rb
|
29
32
|
- lib/mitie/utils.rb
|
30
33
|
- lib/mitie/version.rb
|
31
34
|
- vendor/LICENSE.txt
|
@@ -45,7 +48,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
45
48
|
requirements:
|
46
49
|
- - ">="
|
47
50
|
- !ruby/object:Gem::Version
|
48
|
-
version: '2.
|
51
|
+
version: '2.7'
|
49
52
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
53
|
requirements:
|
51
54
|
- - ">="
|