mitie 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +48 -9
- data/lib/mitie.rb +2 -0
- data/lib/mitie/binary_relation_detector.rb +62 -0
- data/lib/mitie/document.rb +96 -0
- data/lib/mitie/ffi.rb +6 -0
- data/lib/mitie/ner.rb +10 -64
- data/lib/mitie/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 519a88b20911f72d0a66ceaca52e1af3171bab683e27392763f52ac4c494d185
|
4
|
+
data.tar.gz: 6c955aa66776ef1ec92ccc151ba6fce1eeef8d968fad2ac8fd47cb0a7ea4f3bf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1d8d373478c4ae69844959a349598c35f4187ff91e52e5a6be457b5e61769b0c109bb143b4604999665d6a1fce532ad027add26fc3ea03360764bd2025357c91
|
7
|
+
data.tar.gz: b4c13d770bfb8b03108d6a93c757f286d7dc3e0f157ea5b2db94b7028994df632af40bdbd19f1fe7a40a96d4e3b5fa32d28f724df407c6a901f1bb79635cd046
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
# MITIE
|
2
2
|
|
3
|
-
[MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition - for Ruby
|
3
|
+
[MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition and binary relation detection - for Ruby
|
4
|
+
|
5
|
+
- Finds people, organizations, and locations in text
|
6
|
+
- Detects relationships between entities, like `PERSON` was born in `LOCATION`
|
4
7
|
|
5
8
|
[](https://travis-ci.org/ankane/mitie) [](https://ci.appveyor.com/project/ankane/mitie/branch/master)
|
6
9
|
|
@@ -20,22 +23,22 @@ And download the pre-trained model for your language:
|
|
20
23
|
|
21
24
|
## Getting Started
|
22
25
|
|
23
|
-
|
26
|
+
Load an NER model
|
24
27
|
|
25
28
|
```ruby
|
26
|
-
|
29
|
+
model = Mitie::NER.new("ner_model.dat")
|
27
30
|
```
|
28
31
|
|
29
|
-
|
32
|
+
Create a document
|
30
33
|
|
31
34
|
```ruby
|
32
|
-
|
35
|
+
doc = model.doc("Nat Friedman is the CEO of GitHub, which is headquartered in San Francisco")
|
33
36
|
```
|
34
37
|
|
35
38
|
Get entities
|
36
39
|
|
37
40
|
```ruby
|
38
|
-
|
41
|
+
doc.entities
|
39
42
|
```
|
40
43
|
|
41
44
|
This returns
|
@@ -51,13 +54,13 @@ This returns
|
|
51
54
|
Get tokens
|
52
55
|
|
53
56
|
```ruby
|
54
|
-
|
57
|
+
doc.tokens
|
55
58
|
```
|
56
59
|
|
57
60
|
Get tokens and their offset
|
58
61
|
|
59
62
|
```ruby
|
60
|
-
|
63
|
+
doc.tokens_with_offset
|
61
64
|
```
|
62
65
|
|
63
66
|
Get all tags for a model
|
@@ -66,6 +69,40 @@ Get all tags for a model
|
|
66
69
|
model.tags
|
67
70
|
```
|
68
71
|
|
72
|
+
## Binary Relation Detection
|
73
|
+
|
74
|
+
Detect relationships between two entities, like:
|
75
|
+
|
76
|
+
- `PERSON` was born in `LOCATION`
|
77
|
+
- `ORGANIZATION` was founded in `LOCATION`
|
78
|
+
- `FILM` was directed by `PERSON`
|
79
|
+
|
80
|
+
There are 21 detectors for English. You can find them in the `binary_relations` directory in the model download.
|
81
|
+
|
82
|
+
Load a detector
|
83
|
+
|
84
|
+
```ruby
|
85
|
+
detector = Mitie::BinaryRelationDetector.new("rel_classifier_film.film.directed_by.svm")
|
86
|
+
```
|
87
|
+
|
88
|
+
And create a document
|
89
|
+
|
90
|
+
```ruby
|
91
|
+
doc = model.doc("The Shawshank Redemption was directed by Frank Darabont")
|
92
|
+
```
|
93
|
+
|
94
|
+
Get relations
|
95
|
+
|
96
|
+
```ruby
|
97
|
+
detector.relations(doc)
|
98
|
+
```
|
99
|
+
|
100
|
+
This returns
|
101
|
+
|
102
|
+
```ruby
|
103
|
+
[{first: "Shawshank Redemption", second: "Frank Darabont", score: 1.124211742912441}]
|
104
|
+
```
|
105
|
+
|
69
106
|
## History
|
70
107
|
|
71
108
|
View the [changelog](https://github.com/ankane/mitie/blob/master/CHANGELOG.md)
|
@@ -86,5 +123,7 @@ git clone https://github.com/ankane/mitie.git
|
|
86
123
|
cd mitie
|
87
124
|
bundle install
|
88
125
|
bundle exec rake vendor:all
|
89
|
-
|
126
|
+
|
127
|
+
export MITIE_MODELS_PATH=path/to/MITIE-models/english
|
128
|
+
bundle exec rake test
|
90
129
|
```
|
data/lib/mitie.rb
CHANGED
data/lib/mitie/binary_relation_detector.rb
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
module Mitie
|
2
|
+
class BinaryRelationDetector
|
3
|
+
def initialize(path)
|
4
|
+
# better error message
|
5
|
+
raise ArgumentError, "File does not exist" unless File.exist?(path)
|
6
|
+
@pointer = FFI.mitie_load_binary_relation_detector(path)
|
7
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(pointer))
|
8
|
+
end
|
9
|
+
|
10
|
+
def name
|
11
|
+
FFI.mitie_binary_relation_detector_name_string(pointer).to_s
|
12
|
+
end
|
13
|
+
|
14
|
+
def relations(doc)
|
15
|
+
raise ArgumentError, "Expected Mitie::Document, not #{doc.class.name}" unless doc.is_a?(Document)
|
16
|
+
|
17
|
+
entities = doc.entities
|
18
|
+
combinations = []
|
19
|
+
(entities.size - 1).times do |i|
|
20
|
+
combinations << [entities[i], entities[i + 1]]
|
21
|
+
combinations << [entities[i + 1], entities[i]]
|
22
|
+
end
|
23
|
+
|
24
|
+
relations = []
|
25
|
+
combinations.each do |entity1, entity2|
|
26
|
+
relation =
|
27
|
+
FFI.mitie_extract_binary_relation(
|
28
|
+
doc.model.pointer,
|
29
|
+
doc.send(:tokens_ptr),
|
30
|
+
entity1[:token_index],
|
31
|
+
entity1[:token_length],
|
32
|
+
entity2[:token_index],
|
33
|
+
entity2[:token_length]
|
34
|
+
)
|
35
|
+
|
36
|
+
score_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
|
37
|
+
status = FFI.mitie_classify_binary_relation(pointer, relation, score_ptr)
|
38
|
+
raise "Bad status: #{status}" if status != 0
|
39
|
+
score = score_ptr.to_s(Fiddle::SIZEOF_DOUBLE).unpack1("d")
|
40
|
+
if score > 0
|
41
|
+
relations << {
|
42
|
+
first: entity1[:text],
|
43
|
+
second: entity2[:text],
|
44
|
+
score: score
|
45
|
+
}
|
46
|
+
end
|
47
|
+
end
|
48
|
+
relations
|
49
|
+
end
|
50
|
+
|
51
|
+
private
|
52
|
+
|
53
|
+
def pointer
|
54
|
+
@pointer
|
55
|
+
end
|
56
|
+
|
57
|
+
def self.finalize(pointer)
|
58
|
+
# must use proc instead of stabby lambda
|
59
|
+
proc { FFI.mitie_free(pointer) }
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
data/lib/mitie/document.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
module Mitie
|
2
|
+
class Document
|
3
|
+
attr_reader :model, :text
|
4
|
+
|
5
|
+
def initialize(model, text)
|
6
|
+
@model = model
|
7
|
+
@text = text.to_s
|
8
|
+
end
|
9
|
+
|
10
|
+
def tokens
|
11
|
+
@tokens ||= tokens_with_offset.map(&:first)
|
12
|
+
end
|
13
|
+
|
14
|
+
def tokens_with_offset
|
15
|
+
@tokens_with_offset ||= begin
|
16
|
+
i = 0
|
17
|
+
tokens = []
|
18
|
+
loop do
|
19
|
+
token = (tokens_ptr + i * Fiddle::SIZEOF_VOIDP).ptr
|
20
|
+
break if token.null?
|
21
|
+
offset = (offsets_ptr.ptr + i * Fiddle::SIZEOF_LONG).to_s(Fiddle::SIZEOF_LONG).unpack1("L!")
|
22
|
+
tokens << [token.to_s.force_encoding(text.encoding), offset]
|
23
|
+
i += 1
|
24
|
+
end
|
25
|
+
tokens
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
def entities
|
30
|
+
@entities ||= begin
|
31
|
+
begin
|
32
|
+
entities = []
|
33
|
+
tokens = tokens_with_offset
|
34
|
+
detections = FFI.mitie_extract_entities(pointer, tokens_ptr)
|
35
|
+
num_detections = FFI.mitie_ner_get_num_detections(detections)
|
36
|
+
num_detections.times do |i|
|
37
|
+
pos = FFI.mitie_ner_get_detection_position(detections, i)
|
38
|
+
len = FFI.mitie_ner_get_detection_length(detections, i)
|
39
|
+
tag = FFI.mitie_ner_get_detection_tagstr(detections, i).to_s
|
40
|
+
score = FFI.mitie_ner_get_detection_score(detections, i)
|
41
|
+
tok = tokens[pos, len]
|
42
|
+
offset = tok[0][1]
|
43
|
+
finish = tok[-1][1] + tok[-1][0].size
|
44
|
+
entities << {
|
45
|
+
text: text[offset...finish],
|
46
|
+
tag: tag,
|
47
|
+
score: score,
|
48
|
+
offset: offset,
|
49
|
+
token_index: pos,
|
50
|
+
token_length: len
|
51
|
+
}
|
52
|
+
end
|
53
|
+
entities
|
54
|
+
ensure
|
55
|
+
FFI.mitie_free(detections) if detections
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
private
|
61
|
+
|
62
|
+
def pointer
|
63
|
+
model.pointer
|
64
|
+
end
|
65
|
+
|
66
|
+
def tokens_ptr
|
67
|
+
tokenize[0]
|
68
|
+
end
|
69
|
+
|
70
|
+
def offsets_ptr
|
71
|
+
tokenize[1]
|
72
|
+
end
|
73
|
+
|
74
|
+
def tokenize
|
75
|
+
@tokenize ||= begin
|
76
|
+
offsets_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
|
77
|
+
tokens_ptr = FFI.mitie_tokenize_with_offsets(text, offsets_ptr)
|
78
|
+
|
79
|
+
ObjectSpace.define_finalizer(tokens_ptr, self.class.finalize(tokens_ptr))
|
80
|
+
ObjectSpace.define_finalizer(offsets_ptr, self.class.finalize_ptr(offsets_ptr))
|
81
|
+
|
82
|
+
[tokens_ptr, offsets_ptr]
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
def self.finalize(pointer)
|
87
|
+
# must use proc instead of stabby lambda
|
88
|
+
proc { FFI.mitie_free(pointer) }
|
89
|
+
end
|
90
|
+
|
91
|
+
def self.finalize_ptr(pointer)
|
92
|
+
# must use proc instead of stabby lambda
|
93
|
+
proc { FFI.mitie_free(pointer.ptr) }
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
data/lib/mitie/ffi.rb
CHANGED
@@ -25,5 +25,11 @@ module Mitie
|
|
25
25
|
extern "unsigned long mitie_ner_get_detection_tag(const mitie_named_entity_detections* dets, unsigned long idx)"
|
26
26
|
extern "const char* mitie_ner_get_detection_tagstr(const mitie_named_entity_detections* dets, unsigned long idx)"
|
27
27
|
extern "double mitie_ner_get_detection_score(const mitie_named_entity_detections* dets, unsigned long idx)"
|
28
|
+
|
29
|
+
extern "mitie_binary_relation_detector* mitie_load_binary_relation_detector(const char* filename)"
|
30
|
+
extern "const char* mitie_binary_relation_detector_name_string(const mitie_binary_relation_detector* detector)"
|
31
|
+
extern "int mitie_entities_overlap(unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
32
|
+
extern "mitie_binary_relation* mitie_extract_binary_relation(const mitie_named_entity_extractor* ner, char** tokens, unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
33
|
+
extern "int mitie_classify_binary_relation(const mitie_binary_relation_detector* detector, const mitie_binary_relation* relation, double* score)"
|
28
34
|
end
|
29
35
|
end
|
data/lib/mitie/ner.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
module Mitie
|
2
2
|
class NER
|
3
|
+
attr_reader :pointer
|
4
|
+
|
3
5
|
def initialize(path)
|
4
6
|
# better error message
|
5
|
-
raise ArgumentError, "
|
7
|
+
raise ArgumentError, "File does not exist" unless File.exist?(path)
|
6
8
|
@pointer = FFI.mitie_load_named_entity_extractor(path)
|
7
9
|
ObjectSpace.define_finalizer(self, self.class.finalize(pointer))
|
8
10
|
end
|
@@ -13,76 +15,20 @@ module Mitie
|
|
13
15
|
end
|
14
16
|
end
|
15
17
|
|
16
|
-
def
|
17
|
-
|
18
|
-
ptr = FFI.mitie_tokenize(text)
|
19
|
-
i = 0
|
20
|
-
loop do
|
21
|
-
token = (ptr + i * Fiddle::SIZEOF_VOIDP).ptr
|
22
|
-
break if token.null?
|
23
|
-
tokens << token.to_s.force_encoding(text.encoding)
|
24
|
-
i += 1
|
25
|
-
end
|
26
|
-
tokens
|
27
|
-
ensure
|
28
|
-
FFI.mitie_free(ptr) if ptr
|
29
|
-
end
|
30
|
-
|
31
|
-
def tokens_with_offset(text)
|
32
|
-
tokens, ptr = tokens_with_offset_with_ptr(text)
|
33
|
-
tokens
|
34
|
-
ensure
|
35
|
-
FFI.mitie_free(ptr) if ptr
|
18
|
+
def doc(text)
|
19
|
+
Document.new(self, text)
|
36
20
|
end
|
37
21
|
|
38
22
|
def entities(text)
|
39
|
-
entities
|
40
|
-
tokens, tokens_ptr = tokens_with_offset_with_ptr(text)
|
41
|
-
detections = FFI.mitie_extract_entities(pointer, tokens_ptr)
|
42
|
-
num_detections = FFI.mitie_ner_get_num_detections(detections)
|
43
|
-
num_detections.times do |i|
|
44
|
-
pos = FFI.mitie_ner_get_detection_position(detections, i)
|
45
|
-
len = FFI.mitie_ner_get_detection_length(detections, i)
|
46
|
-
tag = FFI.mitie_ner_get_detection_tagstr(detections, i).to_s
|
47
|
-
score = FFI.mitie_ner_get_detection_score(detections, i)
|
48
|
-
tok = tokens[pos, len]
|
49
|
-
offset = tok[0][1]
|
50
|
-
finish = tok[-1][1] + tok[-1][0].size
|
51
|
-
entities << {
|
52
|
-
text: text[offset...finish],
|
53
|
-
tag: tag,
|
54
|
-
score: score,
|
55
|
-
offset: offset
|
56
|
-
}
|
57
|
-
end
|
58
|
-
entities
|
59
|
-
ensure
|
60
|
-
FFI.mitie_free(tokens_ptr) if tokens_ptr
|
61
|
-
FFI.mitie_free(detections) if detections
|
23
|
+
doc(text).entities
|
62
24
|
end
|
63
25
|
|
64
|
-
|
65
|
-
|
66
|
-
def pointer
|
67
|
-
@pointer
|
26
|
+
def tokens(text)
|
27
|
+
doc(text).tokens
|
68
28
|
end
|
69
29
|
|
70
|
-
def
|
71
|
-
|
72
|
-
ptr = FFI.mitie_tokenize_with_offsets(text, token_offsets)
|
73
|
-
i = 0
|
74
|
-
tokens = []
|
75
|
-
loop do
|
76
|
-
token = (ptr + i * Fiddle::SIZEOF_VOIDP).ptr
|
77
|
-
break if token.null?
|
78
|
-
offset = (token_offsets.ptr + i * Fiddle::SIZEOF_LONG).to_s(Fiddle::SIZEOF_LONG).unpack1("L!")
|
79
|
-
tokens << [token.to_s.force_encoding(text.encoding), offset]
|
80
|
-
i += 1
|
81
|
-
end
|
82
|
-
[tokens, ptr]
|
83
|
-
ensure
|
84
|
-
# use ptr, not token_offsets.ptr
|
85
|
-
FFI.mitie_free(token_offsets.ptr) if ptr
|
30
|
+
def tokens_with_offset(text)
|
31
|
+
doc(text).tokens_with_offset
|
86
32
|
end
|
87
33
|
|
88
34
|
def self.finalize(pointer)
|
data/lib/mitie/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mitie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -62,6 +62,8 @@ files:
|
|
62
62
|
- LICENSE.txt
|
63
63
|
- README.md
|
64
64
|
- lib/mitie.rb
|
65
|
+
- lib/mitie/binary_relation_detector.rb
|
66
|
+
- lib/mitie/document.rb
|
65
67
|
- lib/mitie/ffi.rb
|
66
68
|
- lib/mitie/ner.rb
|
67
69
|
- lib/mitie/version.rb
|