mitie 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +23 -0
- data/README.md +93 -0
- data/lib/mitie.rb +25 -0
- data/lib/mitie/ffi.rb +29 -0
- data/lib/mitie/ner.rb +91 -0
- data/lib/mitie/version.rb +3 -0
- metadata +91 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: ea3ef115016c59ecb496ffbbe13c4ac3a2ffda6acf9392ac423103b9c3cfe634
|
4
|
+
data.tar.gz: 6eb77dd514ba3c08c30e1216921cd83619f206a658018c1d4522c598e175e8b2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 682fb3ea1c0be1889f2e1e177204309ea7b6d2989834a4d2bae49ddf567e309b9fef9ea872746626590ec06b9aabb455067d88e696ea3a4f59a9b785b43819c9
|
7
|
+
data.tar.gz: 4260a6dff4eb613278468d9fff2ef19f99f49bed81571781b4d2ac886f39bed9356a4a20a9220fa2b31e8c12965bffac5e86b40318b28aa450ebd60a3c53c3cd
|
data/CHANGELOG.md
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
Boost Software License - Version 1.0 - August 17th, 2003
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person or organization
|
4
|
+
obtaining a copy of the software and accompanying documentation covered by
|
5
|
+
this license (the "Software") to use, reproduce, display, distribute,
|
6
|
+
execute, and transmit the Software, and to prepare derivative works of the
|
7
|
+
Software, and to permit third-parties to whom the Software is furnished to
|
8
|
+
do so, all subject to the following:
|
9
|
+
|
10
|
+
The copyright notices in the Software and this entire statement, including
|
11
|
+
the above license grant, this restriction and the following disclaimer,
|
12
|
+
must be included in all copies of the Software, in whole or in part, and
|
13
|
+
all derivative works of the Software, unless such copies or derivative
|
14
|
+
works are solely in the form of machine-executable object code generated by
|
15
|
+
a source language processor.
|
16
|
+
|
17
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
18
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
19
|
+
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
20
|
+
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
21
|
+
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
22
|
+
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
23
|
+
DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
# MITIE
|
2
|
+
|
3
|
+
[MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition - for Ruby
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
First, install MITIE. For Homebrew, use:
|
8
|
+
|
9
|
+
```sh
|
10
|
+
brew install mitie
|
11
|
+
```
|
12
|
+
|
13
|
+
Add this line to your application’s Gemfile:
|
14
|
+
|
15
|
+
```ruby
|
16
|
+
gem 'mitie'
|
17
|
+
```
|
18
|
+
|
19
|
+
And download the pre-trained model for your language:
|
20
|
+
|
21
|
+
- [English](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2.tar.bz2)
|
22
|
+
- [Spanish](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2-Spanish.zip)
|
23
|
+
- [German](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2-German.tar.bz2)
|
24
|
+
|
25
|
+
## Getting Started
|
26
|
+
|
27
|
+
Get your text
|
28
|
+
|
29
|
+
```ruby
|
30
|
+
text = "Nat Friedman is the CEO of GitHub, which is headquartered in San Francisco"
|
31
|
+
```
|
32
|
+
|
33
|
+
Load an NER model
|
34
|
+
|
35
|
+
```ruby
|
36
|
+
model = Mitie::NER.new("ner_model.dat")
|
37
|
+
```
|
38
|
+
|
39
|
+
Get entities
|
40
|
+
|
41
|
+
```ruby
|
42
|
+
model.entities(text)
|
43
|
+
```
|
44
|
+
|
45
|
+
This returns
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
[
|
49
|
+
{text: "Nat Friedman", tag: "PERSON", score: 1.099661347535191, offset: 0},
|
50
|
+
{text: "GitHub", tag: "ORGANIZATION", score: 0.344641651251650, offset: 27},
|
51
|
+
{text: "San Francisco", tag: "LOCATION", score: 1.428241888939011, offset: 61}
|
52
|
+
]
|
53
|
+
```
|
54
|
+
|
55
|
+
Get tokens
|
56
|
+
|
57
|
+
```ruby
|
58
|
+
model.tokens(text)
|
59
|
+
```
|
60
|
+
|
61
|
+
Get tokens and their offset
|
62
|
+
|
63
|
+
```ruby
|
64
|
+
model.tokens_with_offset(text)
|
65
|
+
```
|
66
|
+
|
67
|
+
Get all tags for a model
|
68
|
+
|
69
|
+
```ruby
|
70
|
+
model.tags
|
71
|
+
```
|
72
|
+
|
73
|
+
## History
|
74
|
+
|
75
|
+
View the [changelog](https://github.com/ankane/mitie/blob/master/CHANGELOG.md)
|
76
|
+
|
77
|
+
## Contributing
|
78
|
+
|
79
|
+
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
80
|
+
|
81
|
+
- [Report bugs](https://github.com/ankane/mitie/issues)
|
82
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/mitie/pulls)
|
83
|
+
- Write, clarify, or fix documentation
|
84
|
+
- Suggest or add new features
|
85
|
+
|
86
|
+
To get started with development:
|
87
|
+
|
88
|
+
```sh
|
89
|
+
git clone https://github.com/ankane/mitie.git
|
90
|
+
cd mitie
|
91
|
+
bundle install
|
92
|
+
MITIE_NER_PATH=path/to/ner_model.dat bundle exec rake test
|
93
|
+
```
|
data/lib/mitie.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# stdlib
|
2
|
+
require "fiddle/import"
|
3
|
+
|
4
|
+
# modules
|
5
|
+
require "mitie/ner"
|
6
|
+
require "mitie/version"
|
7
|
+
|
8
|
+
# Top-level namespace for the MITIE bindings.
module Mitie
  # Base class for errors raised by this library.
  class Error < StandardError; end

  class << self
    # Candidate shared-library names or paths, tried in order when the native
    # bindings are loaded. Reassign it before the bindings are first used to
    # point at a custom build of the library.
    attr_accessor :ffi_lib
  end

  self.ffi_lib =
    if Gem.win_platform?
      ["mitie.dll"]
    elsif /darwin/i.match?(RbConfig::CONFIG["host_os"])
      if /arm|aarch64/i.match?(RbConfig::CONFIG["host_cpu"])
        # Homebrew on Apple Silicon installs under /opt/homebrew, which the
        # dynamic loader does not search by default, so add it as a fallback.
        ["libmitie.dylib", "/opt/homebrew/lib/libmitie.dylib"]
      else
        ["libmitie.dylib"]
      end
    else
      ["libmitie.so"]
    end

  # Load the native bindings lazily: requiring the gem never opens the shared
  # library, so a missing library is reported on first use of Mitie::FFI
  # (after ffi_lib can be customized) rather than at require time.
  autoload :FFI, "mitie/ffi"
end
|
data/lib/mitie/ffi.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
module Mitie
  # Fiddle declarations for the functions this gem calls in the MITIE shared
  # library.
  module FFI
    extend Fiddle::Importer

    # Work through the configured library candidates in order; the load error
    # only propagates once no candidate is left to try.
    candidates = Mitie.ffi_lib.dup
    begin
      dlload Fiddle.dlopen(candidates.shift)
    rescue Fiddle::DLError
      retry if candidates.any?
      raise
    end

    # C signatures, declared in the same order as they are grouped below.
    [
      # memory management and tokenization
      "void mitie_free(void* object)",
      "char** mitie_tokenize(const char* text)",
      "char** mitie_tokenize_with_offsets(const char* text, unsigned long** token_offsets)",

      # named-entity extractor (model) functions
      "mitie_named_entity_extractor* mitie_load_named_entity_extractor(const char* filename)",
      "unsigned long mitie_get_num_possible_ner_tags(const mitie_named_entity_extractor* ner)",
      "const char* mitie_get_named_entity_tagstr(const mitie_named_entity_extractor* ner, unsigned long idx)",

      # detection result accessors
      "mitie_named_entity_detections* mitie_extract_entities(const mitie_named_entity_extractor* ner, char** tokens)",
      "unsigned long mitie_ner_get_num_detections(const mitie_named_entity_detections* dets)",
      "unsigned long mitie_ner_get_detection_position(const mitie_named_entity_detections* dets, unsigned long idx)",
      "unsigned long mitie_ner_get_detection_length(const mitie_named_entity_detections* dets, unsigned long idx)",
      "unsigned long mitie_ner_get_detection_tag(const mitie_named_entity_detections* dets, unsigned long idx)",
      "const char* mitie_ner_get_detection_tagstr(const mitie_named_entity_detections* dets, unsigned long idx)",
      "double mitie_ner_get_detection_score(const mitie_named_entity_detections* dets, unsigned long idx)"
    ].each { |signature| extern signature }
  end
end
|
data/lib/mitie/ner.rb
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
module Mitie
  # Named-entity recognizer backed by a MITIE model that is loaded and queried
  # through the native bindings in Mitie::FFI.
  class NER
    # Loads the model stored at +path+.
    #
    # Raises ArgumentError when the file does not exist or when the native
    # loader hands back a null pointer. Failing here matters because every
    # other method passes the pointer straight to native code.
    def initialize(path)
      raise ArgumentError, "File does not exist" unless File.exist?(path)

      @pointer = FFI.mitie_load_named_entity_extractor(path)
      raise ArgumentError, "Unable to load model" if @pointer.null?

      # release the native model once this object is garbage collected
      ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
    end

    # Returns the tag names the loaded model can produce, as an array of
    # strings.
    def tags
      FFI.mitie_get_num_possible_ner_tags(pointer).times.map do |i|
        FFI.mitie_get_named_entity_tagstr(pointer, i).to_s
      end
    end

    # Splits +text+ into tokens and returns them as an array of strings
    # carrying the encoding of +text+.
    def tokens(text)
      tokens = []
      ptr = FFI.mitie_tokenize(text)
      i = 0
      loop do
        # the native result is an array of C strings ended by a null pointer
        token = (ptr + i * Fiddle::SIZEOF_VOIDP).ptr
        break if token.null?

        tokens << token.to_s.force_encoding(text.encoding)
        i += 1
      end
      tokens
    ensure
      FFI.mitie_free(ptr) if ptr
    end

    # Returns an array of [token, offset] pairs, where offset is the character
    # index in +text+ at which the token starts.
    def tokens_with_offset(text)
      tokens, ptr = tokens_with_offset_with_ptr(text)
      tokens
    ensure
      FFI.mitie_free(ptr) if ptr
    end

    # Runs the model over +text+ and returns one hash per detection with the
    # keys :text, :tag, :score and :offset. :offset is the character index of
    # the entity in +text+.
    def entities(text)
      entities = []
      tokens, tokens_ptr = tokens_with_offset_with_ptr(text)
      detections = FFI.mitie_extract_entities(pointer, tokens_ptr)
      num_detections = FFI.mitie_ner_get_num_detections(detections)
      num_detections.times do |i|
        pos = FFI.mitie_ner_get_detection_position(detections, i)
        len = FFI.mitie_ner_get_detection_length(detections, i)
        tag = FFI.mitie_ner_get_detection_tagstr(detections, i).to_s
        score = FFI.mitie_ner_get_detection_score(detections, i)

        # a detection covers +len+ consecutive tokens starting at +pos+; the
        # entity runs from the start of the first to the end of the last one
        tok = tokens[pos, len]
        offset = tok.first[1]
        finish = tok.last[1] + tok.last[0].size
        entities << {
          text: text[offset...finish],
          tag: tag,
          score: score,
          offset: offset
        }
      end
      entities
    ensure
      FFI.mitie_free(tokens_ptr) if tokens_ptr
      FFI.mitie_free(detections) if detections
    end

    private

    def pointer
      @pointer
    end

    # Tokenizes +text+ and returns [tokens, ptr]: tokens is an array of
    # [token, character offset] pairs and ptr is the native token array, which
    # the caller must release with FFI.mitie_free.
    def tokens_with_offset_with_ptr(text)
      # out-parameter receiving the address of the native offsets array;
      # RUBY_FREE makes Fiddle release this holder instead of leaking it
      token_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP, Fiddle::RUBY_FREE)
      ptr = FFI.mitie_tokenize_with_offsets(text, token_offsets)

      tokens = []
      # The native offsets count bytes, while Ruby strings are indexed by
      # character. Convert incrementally; this relies on tokens being reported
      # in text order, so each offset is at or after the previous one.
      char_offset = 0
      last_byte_offset = 0
      i = 0
      loop do
        token = (ptr + i * Fiddle::SIZEOF_VOIDP).ptr
        break if token.null?

        byte_offset = (token_offsets.ptr + i * Fiddle::SIZEOF_LONG).to_s(Fiddle::SIZEOF_LONG).unpack1("L!")
        char_offset += text.byteslice(last_byte_offset, byte_offset - last_byte_offset).size
        last_byte_offset = byte_offset

        tokens << [token.to_s.force_encoding(text.encoding), char_offset]
        i += 1
      end
      [tokens, ptr]
    ensure
      # the offsets array is only populated when tokenization returned, so the
      # guard is on ptr; the array itself lives at token_offsets.ptr
      FFI.mitie_free(token_offsets.ptr) if ptr
    end

    # Builds the finalizer that frees the native model behind +pointer+.
    def self.finalize(pointer)
      # must use proc instead of stabby lambda: finalizers are called with the
      # object id, which a zero-argument lambda would reject
      proc { FFI.mitie_free(pointer) }
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mitie
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Andrew Kane
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-09-14 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: minitest
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '5'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '5'
|
55
|
+
description:
|
56
|
+
email: andrew@chartkick.com
|
57
|
+
executables: []
|
58
|
+
extensions: []
|
59
|
+
extra_rdoc_files: []
|
60
|
+
files:
|
61
|
+
- CHANGELOG.md
|
62
|
+
- LICENSE.txt
|
63
|
+
- README.md
|
64
|
+
- lib/mitie.rb
|
65
|
+
- lib/mitie/ffi.rb
|
66
|
+
- lib/mitie/ner.rb
|
67
|
+
- lib/mitie/version.rb
|
68
|
+
homepage: https://github.com/ankane/mitie
|
69
|
+
licenses:
|
70
|
+
- BSL-1.0
|
71
|
+
metadata: {}
|
72
|
+
post_install_message:
|
73
|
+
rdoc_options: []
|
74
|
+
require_paths:
|
75
|
+
- lib
|
76
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - ">="
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: '2.5'
|
81
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
requirements: []
|
87
|
+
rubygems_version: 3.1.2
|
88
|
+
signing_key:
|
89
|
+
specification_version: 4
|
90
|
+
summary: Named-entity recognition for Ruby
|
91
|
+
test_files: []
|