mitie 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +48 -9
- data/lib/mitie.rb +2 -0
- data/lib/mitie/binary_relation_detector.rb +62 -0
- data/lib/mitie/document.rb +96 -0
- data/lib/mitie/ffi.rb +6 -0
- data/lib/mitie/ner.rb +10 -64
- data/lib/mitie/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 519a88b20911f72d0a66ceaca52e1af3171bab683e27392763f52ac4c494d185
|
4
|
+
data.tar.gz: 6c955aa66776ef1ec92ccc151ba6fce1eeef8d968fad2ac8fd47cb0a7ea4f3bf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1d8d373478c4ae69844959a349598c35f4187ff91e52e5a6be457b5e61769b0c109bb143b4604999665d6a1fce532ad027add26fc3ea03360764bd2025357c91
|
7
|
+
data.tar.gz: b4c13d770bfb8b03108d6a93c757f286d7dc3e0f157ea5b2db94b7028994df632af40bdbd19f1fe7a40a96d4e3b5fa32d28f724df407c6a901f1bb79635cd046
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
# MITIE
|
2
2
|
|
3
|
-
[MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition - for Ruby
|
3
|
+
[MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition and binary relation detection - for Ruby
|
4
|
+
|
5
|
+
- Finds people, organizations, and locations in text
|
6
|
+
- Detects relationships between entities, like `PERSON` was born in `LOCATION`
|
4
7
|
|
5
8
|
[![Build Status](https://travis-ci.org/ankane/mitie.svg?branch=master)](https://travis-ci.org/ankane/mitie) [![Build status](https://ci.appveyor.com/api/projects/status/stc89tc57xfva451/branch/master?svg=true)](https://ci.appveyor.com/project/ankane/mitie/branch/master)
|
6
9
|
|
@@ -20,22 +23,22 @@ And download the pre-trained model for your language:
|
|
20
23
|
|
21
24
|
## Getting Started
|
22
25
|
|
23
|
-
|
26
|
+
Load an NER model
|
24
27
|
|
25
28
|
```ruby
|
26
|
-
|
29
|
+
model = Mitie::NER.new("ner_model.dat")
|
27
30
|
```
|
28
31
|
|
29
|
-
|
32
|
+
Create a document
|
30
33
|
|
31
34
|
```ruby
|
32
|
-
|
35
|
+
doc = model.doc("Nat Friedman is the CEO of GitHub, which is headquartered in San Francisco")
|
33
36
|
```
|
34
37
|
|
35
38
|
Get entities
|
36
39
|
|
37
40
|
```ruby
|
38
|
-
|
41
|
+
doc.entities
|
39
42
|
```
|
40
43
|
|
41
44
|
This returns
|
@@ -51,13 +54,13 @@ This returns
|
|
51
54
|
Get tokens
|
52
55
|
|
53
56
|
```ruby
|
54
|
-
|
57
|
+
doc.tokens
|
55
58
|
```
|
56
59
|
|
57
60
|
Get tokens and their offset
|
58
61
|
|
59
62
|
```ruby
|
60
|
-
|
63
|
+
doc.tokens_with_offset
|
61
64
|
```
|
62
65
|
|
63
66
|
Get all tags for a model
|
@@ -66,6 +69,40 @@ Get all tags for a model
|
|
66
69
|
model.tags
|
67
70
|
```
|
68
71
|
|
72
|
+
## Binary Relation Detection
|
73
|
+
|
74
|
+
Detect relationships between two entities, like:
|
75
|
+
|
76
|
+
- `PERSON` was born in `LOCATION`
|
77
|
+
- `ORGANIZATION` was founded in `LOCATION`
|
78
|
+
- `FILM` was directed by `PERSON`
|
79
|
+
|
80
|
+
There are 21 detectors for English. You can find them in the `binary_relations` directory in the model download.
|
81
|
+
|
82
|
+
Load a detector
|
83
|
+
|
84
|
+
```ruby
|
85
|
+
detector = Mitie::BinaryRelationDetector.new("rel_classifier_film.film.directed_by.svm")
|
86
|
+
```
|
87
|
+
|
88
|
+
And create a document
|
89
|
+
|
90
|
+
```ruby
|
91
|
+
doc = model.doc("The Shawshank Redemption was directed by Frank Darabont")
|
92
|
+
```
|
93
|
+
|
94
|
+
Get relations
|
95
|
+
|
96
|
+
```ruby
|
97
|
+
detector.relations(doc)
|
98
|
+
```
|
99
|
+
|
100
|
+
This returns
|
101
|
+
|
102
|
+
```ruby
|
103
|
+
[{first: "Shawshank Redemption", second: "Frank Darabont", score: 1.124211742912441}]
|
104
|
+
```
|
105
|
+
|
69
106
|
## History
|
70
107
|
|
71
108
|
View the [changelog](https://github.com/ankane/mitie/blob/master/CHANGELOG.md)
|
@@ -86,5 +123,7 @@ git clone https://github.com/ankane/mitie.git
|
|
86
123
|
cd mitie
|
87
124
|
bundle install
|
88
125
|
bundle exec rake vendor:all
|
89
|
-
|
126
|
+
|
127
|
+
export MITIE_MODELS_PATH=path/to/MITIE-models/english
|
128
|
+
bundle exec rake test
|
90
129
|
```
|
data/lib/mitie.rb
CHANGED
data/lib/mitie/binary_relation_detector.rb
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
module Mitie
|
2
|
+
class BinaryRelationDetector
|
3
|
+
def initialize(path)
|
4
|
+
# better error message
|
5
|
+
raise ArgumentError, "File does not exist" unless File.exist?(path)
|
6
|
+
@pointer = FFI.mitie_load_binary_relation_detector(path)
|
7
|
+
ObjectSpace.define_finalizer(self, self.class.finalize(pointer))
|
8
|
+
end
|
9
|
+
|
10
|
+
def name
|
11
|
+
FFI.mitie_binary_relation_detector_name_string(pointer).to_s
|
12
|
+
end
|
13
|
+
|
14
|
+
def relations(doc)
|
15
|
+
raise ArgumentError, "Expected Mitie::Document, not #{doc.class.name}" unless doc.is_a?(Document)
|
16
|
+
|
17
|
+
entities = doc.entities
|
18
|
+
combinations = []
|
19
|
+
(entities.size - 1).times do |i|
|
20
|
+
combinations << [entities[i], entities[i + 1]]
|
21
|
+
combinations << [entities[i + 1], entities[i]]
|
22
|
+
end
|
23
|
+
|
24
|
+
relations = []
|
25
|
+
combinations.each do |entity1, entity2|
|
26
|
+
relation =
|
27
|
+
FFI.mitie_extract_binary_relation(
|
28
|
+
doc.model.pointer,
|
29
|
+
doc.send(:tokens_ptr),
|
30
|
+
entity1[:token_index],
|
31
|
+
entity1[:token_length],
|
32
|
+
entity2[:token_index],
|
33
|
+
entity2[:token_length]
|
34
|
+
)
|
35
|
+
|
36
|
+
score_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
|
37
|
+
status = FFI.mitie_classify_binary_relation(pointer, relation, score_ptr)
|
38
|
+
raise "Bad status: #{status}" if status != 0
|
39
|
+
score = score_ptr.to_s(Fiddle::SIZEOF_DOUBLE).unpack1("d")
|
40
|
+
if score > 0
|
41
|
+
relations << {
|
42
|
+
first: entity1[:text],
|
43
|
+
second: entity2[:text],
|
44
|
+
score: score
|
45
|
+
}
|
46
|
+
end
|
47
|
+
end
|
48
|
+
relations
|
49
|
+
end
|
50
|
+
|
51
|
+
private
|
52
|
+
|
53
|
+
def pointer
|
54
|
+
@pointer
|
55
|
+
end
|
56
|
+
|
57
|
+
def self.finalize(pointer)
|
58
|
+
# must use proc instead of stabby lambda
|
59
|
+
proc { FFI.mitie_free(pointer) }
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
data/lib/mitie/document.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
module Mitie
|
2
|
+
class Document
|
3
|
+
attr_reader :model, :text
|
4
|
+
|
5
|
+
def initialize(model, text)
|
6
|
+
@model = model
|
7
|
+
@text = text.to_s
|
8
|
+
end
|
9
|
+
|
10
|
+
def tokens
|
11
|
+
@tokens ||= tokens_with_offset.map(&:first)
|
12
|
+
end
|
13
|
+
|
14
|
+
def tokens_with_offset
|
15
|
+
@tokens_with_offset ||= begin
|
16
|
+
i = 0
|
17
|
+
tokens = []
|
18
|
+
loop do
|
19
|
+
token = (tokens_ptr + i * Fiddle::SIZEOF_VOIDP).ptr
|
20
|
+
break if token.null?
|
21
|
+
offset = (offsets_ptr.ptr + i * Fiddle::SIZEOF_LONG).to_s(Fiddle::SIZEOF_LONG).unpack1("L!")
|
22
|
+
tokens << [token.to_s.force_encoding(text.encoding), offset]
|
23
|
+
i += 1
|
24
|
+
end
|
25
|
+
tokens
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
def entities
|
30
|
+
@entities ||= begin
|
31
|
+
begin
|
32
|
+
entities = []
|
33
|
+
tokens = tokens_with_offset
|
34
|
+
detections = FFI.mitie_extract_entities(pointer, tokens_ptr)
|
35
|
+
num_detections = FFI.mitie_ner_get_num_detections(detections)
|
36
|
+
num_detections.times do |i|
|
37
|
+
pos = FFI.mitie_ner_get_detection_position(detections, i)
|
38
|
+
len = FFI.mitie_ner_get_detection_length(detections, i)
|
39
|
+
tag = FFI.mitie_ner_get_detection_tagstr(detections, i).to_s
|
40
|
+
score = FFI.mitie_ner_get_detection_score(detections, i)
|
41
|
+
tok = tokens[pos, len]
|
42
|
+
offset = tok[0][1]
|
43
|
+
finish = tok[-1][1] + tok[-1][0].size
|
44
|
+
entities << {
|
45
|
+
text: text[offset...finish],
|
46
|
+
tag: tag,
|
47
|
+
score: score,
|
48
|
+
offset: offset,
|
49
|
+
token_index: pos,
|
50
|
+
token_length: len
|
51
|
+
}
|
52
|
+
end
|
53
|
+
entities
|
54
|
+
ensure
|
55
|
+
FFI.mitie_free(detections) if detections
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
private
|
61
|
+
|
62
|
+
def pointer
|
63
|
+
model.pointer
|
64
|
+
end
|
65
|
+
|
66
|
+
def tokens_ptr
|
67
|
+
tokenize[0]
|
68
|
+
end
|
69
|
+
|
70
|
+
def offsets_ptr
|
71
|
+
tokenize[1]
|
72
|
+
end
|
73
|
+
|
74
|
+
def tokenize
|
75
|
+
@tokenize ||= begin
|
76
|
+
offsets_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
|
77
|
+
tokens_ptr = FFI.mitie_tokenize_with_offsets(text, offsets_ptr)
|
78
|
+
|
79
|
+
ObjectSpace.define_finalizer(tokens_ptr, self.class.finalize(tokens_ptr))
|
80
|
+
ObjectSpace.define_finalizer(offsets_ptr, self.class.finalize_ptr(offsets_ptr))
|
81
|
+
|
82
|
+
[tokens_ptr, offsets_ptr]
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
def self.finalize(pointer)
|
87
|
+
# must use proc instead of stabby lambda
|
88
|
+
proc { FFI.mitie_free(pointer) }
|
89
|
+
end
|
90
|
+
|
91
|
+
def self.finalize_ptr(pointer)
|
92
|
+
# must use proc instead of stabby lambda
|
93
|
+
proc { FFI.mitie_free(pointer.ptr) }
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
data/lib/mitie/ffi.rb
CHANGED
@@ -25,5 +25,11 @@ module Mitie
|
|
25
25
|
extern "unsigned long mitie_ner_get_detection_tag(const mitie_named_entity_detections* dets, unsigned long idx)"
|
26
26
|
extern "const char* mitie_ner_get_detection_tagstr(const mitie_named_entity_detections* dets, unsigned long idx)"
|
27
27
|
extern "double mitie_ner_get_detection_score(const mitie_named_entity_detections* dets, unsigned long idx)"
|
28
|
+
|
29
|
+
extern "mitie_binary_relation_detector* mitie_load_binary_relation_detector(const char* filename)"
|
30
|
+
extern "const char* mitie_binary_relation_detector_name_string(const mitie_binary_relation_detector* detector)"
|
31
|
+
extern "int mitie_entities_overlap(unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
32
|
+
extern "mitie_binary_relation* mitie_extract_binary_relation(const mitie_named_entity_extractor* ner, char** tokens, unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
|
33
|
+
extern "int mitie_classify_binary_relation(const mitie_binary_relation_detector* detector, const mitie_binary_relation* relation, double* score)"
|
28
34
|
end
|
29
35
|
end
|
data/lib/mitie/ner.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
module Mitie
|
2
2
|
class NER
|
3
|
+
attr_reader :pointer
|
4
|
+
|
3
5
|
def initialize(path)
|
4
6
|
# better error message
|
5
|
-
raise ArgumentError, "
|
7
|
+
raise ArgumentError, "File does not exist" unless File.exist?(path)
|
6
8
|
@pointer = FFI.mitie_load_named_entity_extractor(path)
|
7
9
|
ObjectSpace.define_finalizer(self, self.class.finalize(pointer))
|
8
10
|
end
|
@@ -13,76 +15,20 @@ module Mitie
|
|
13
15
|
end
|
14
16
|
end
|
15
17
|
|
16
|
-
def
|
17
|
-
|
18
|
-
ptr = FFI.mitie_tokenize(text)
|
19
|
-
i = 0
|
20
|
-
loop do
|
21
|
-
token = (ptr + i * Fiddle::SIZEOF_VOIDP).ptr
|
22
|
-
break if token.null?
|
23
|
-
tokens << token.to_s.force_encoding(text.encoding)
|
24
|
-
i += 1
|
25
|
-
end
|
26
|
-
tokens
|
27
|
-
ensure
|
28
|
-
FFI.mitie_free(ptr) if ptr
|
29
|
-
end
|
30
|
-
|
31
|
-
def tokens_with_offset(text)
|
32
|
-
tokens, ptr = tokens_with_offset_with_ptr(text)
|
33
|
-
tokens
|
34
|
-
ensure
|
35
|
-
FFI.mitie_free(ptr) if ptr
|
18
|
+
def doc(text)
|
19
|
+
Document.new(self, text)
|
36
20
|
end
|
37
21
|
|
38
22
|
def entities(text)
|
39
|
-
entities
|
40
|
-
tokens, tokens_ptr = tokens_with_offset_with_ptr(text)
|
41
|
-
detections = FFI.mitie_extract_entities(pointer, tokens_ptr)
|
42
|
-
num_detections = FFI.mitie_ner_get_num_detections(detections)
|
43
|
-
num_detections.times do |i|
|
44
|
-
pos = FFI.mitie_ner_get_detection_position(detections, i)
|
45
|
-
len = FFI.mitie_ner_get_detection_length(detections, i)
|
46
|
-
tag = FFI.mitie_ner_get_detection_tagstr(detections, i).to_s
|
47
|
-
score = FFI.mitie_ner_get_detection_score(detections, i)
|
48
|
-
tok = tokens[pos, len]
|
49
|
-
offset = tok[0][1]
|
50
|
-
finish = tok[-1][1] + tok[-1][0].size
|
51
|
-
entities << {
|
52
|
-
text: text[offset...finish],
|
53
|
-
tag: tag,
|
54
|
-
score: score,
|
55
|
-
offset: offset
|
56
|
-
}
|
57
|
-
end
|
58
|
-
entities
|
59
|
-
ensure
|
60
|
-
FFI.mitie_free(tokens_ptr) if tokens_ptr
|
61
|
-
FFI.mitie_free(detections) if detections
|
23
|
+
doc(text).entities
|
62
24
|
end
|
63
25
|
|
64
|
-
|
65
|
-
|
66
|
-
def pointer
|
67
|
-
@pointer
|
26
|
+
def tokens(text)
|
27
|
+
doc(text).tokens
|
68
28
|
end
|
69
29
|
|
70
|
-
def
|
71
|
-
|
72
|
-
ptr = FFI.mitie_tokenize_with_offsets(text, token_offsets)
|
73
|
-
i = 0
|
74
|
-
tokens = []
|
75
|
-
loop do
|
76
|
-
token = (ptr + i * Fiddle::SIZEOF_VOIDP).ptr
|
77
|
-
break if token.null?
|
78
|
-
offset = (token_offsets.ptr + i * Fiddle::SIZEOF_LONG).to_s(Fiddle::SIZEOF_LONG).unpack1("L!")
|
79
|
-
tokens << [token.to_s.force_encoding(text.encoding), offset]
|
80
|
-
i += 1
|
81
|
-
end
|
82
|
-
[tokens, ptr]
|
83
|
-
ensure
|
84
|
-
# use ptr, not token_offsets.ptr
|
85
|
-
FFI.mitie_free(token_offsets.ptr) if ptr
|
30
|
+
def tokens_with_offset(text)
|
31
|
+
doc(text).tokens_with_offset
|
86
32
|
end
|
87
33
|
|
88
34
|
def self.finalize(pointer)
|
data/lib/mitie/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mitie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -62,6 +62,8 @@ files:
|
|
62
62
|
- LICENSE.txt
|
63
63
|
- README.md
|
64
64
|
- lib/mitie.rb
|
65
|
+
- lib/mitie/binary_relation_detector.rb
|
66
|
+
- lib/mitie/document.rb
|
65
67
|
- lib/mitie/ffi.rb
|
66
68
|
- lib/mitie/ner.rb
|
67
69
|
- lib/mitie/version.rb
|