mitie 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: ea3ef115016c59ecb496ffbbe13c4ac3a2ffda6acf9392ac423103b9c3cfe634
4
+ data.tar.gz: 6eb77dd514ba3c08c30e1216921cd83619f206a658018c1d4522c598e175e8b2
5
+ SHA512:
6
+ metadata.gz: 682fb3ea1c0be1889f2e1e177204309ea7b6d2989834a4d2bae49ddf567e309b9fef9ea872746626590ec06b9aabb455067d88e696ea3a4f59a9b785b43819c9
7
+ data.tar.gz: 4260a6dff4eb613278468d9fff2ef19f99f49bed81571781b4d2ac886f39bed9356a4a20a9220fa2b31e8c12965bffac5e86b40318b28aa450ebd60a3c53c3cd
@@ -0,0 +1,3 @@
1
+ ## 0.1.0 (2020-09-14)
2
+
3
+ - First release
@@ -0,0 +1,23 @@
1
+ Boost Software License - Version 1.0 - August 17th, 2003
2
+
3
+ Permission is hereby granted, free of charge, to any person or organization
4
+ obtaining a copy of the software and accompanying documentation covered by
5
+ this license (the "Software") to use, reproduce, display, distribute,
6
+ execute, and transmit the Software, and to prepare derivative works of the
7
+ Software, and to permit third-parties to whom the Software is furnished to
8
+ do so, all subject to the following:
9
+
10
+ The copyright notices in the Software and this entire statement, including
11
+ the above license grant, this restriction and the following disclaimer,
12
+ must be included in all copies of the Software, in whole or in part, and
13
+ all derivative works of the Software, unless such copies or derivative
14
+ works are solely in the form of machine-executable object code generated by
15
+ a source language processor.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
20
+ SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
21
+ FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
22
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23
+ DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,93 @@
1
+ # MITIE
2
+
3
+ [MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition - for Ruby
4
+
5
+ ## Installation
6
+
7
+ First, install MITIE. For Homebrew, use:
8
+
9
+ ```sh
10
+ brew install mitie
11
+ ```
12
+
13
+ Add this line to your application’s Gemfile:
14
+
15
+ ```ruby
16
+ gem 'mitie'
17
+ ```
18
+
19
+ And download the pre-trained model for your language:
20
+
21
+ - [English](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2.tar.bz2)
22
+ - [Spanish](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2-Spanish.zip)
23
+ - [German](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2-German.tar.bz2)
24
+
25
+ ## Getting Started
26
+
27
+ Get your text
28
+
29
+ ```ruby
30
+ text = "Nat Friedman is the CEO of GitHub, which is headquartered in San Francisco"
31
+ ```
32
+
33
+ Load an NER model
34
+
35
+ ```ruby
36
+ model = Mitie::NER.new("ner_model.dat")
37
+ ```
38
+
39
+ Get entities
40
+
41
+ ```ruby
42
+ model.entities(text)
43
+ ```
44
+
45
+ This returns
46
+
47
+ ```ruby
48
+ [
49
+ {text: "Nat Friedman", tag: "PERSON", score: 1.099661347535191, offset: 0},
50
+ {text: "GitHub", tag: "ORGANIZATION", score: 0.344641651251650, offset: 27},
51
+ {text: "San Francisco", tag: "LOCATION", score: 1.428241888939011, offset: 61}
52
+ ]
53
+ ```
54
+
55
+ Get tokens
56
+
57
+ ```ruby
58
+ model.tokens(text)
59
+ ```
60
+
61
+ Get tokens and their offsets
62
+
63
+ ```ruby
64
+ model.tokens_with_offset(text)
65
+ ```
66
+
67
+ Get all tags for a model
68
+
69
+ ```ruby
70
+ model.tags
71
+ ```
72
+
73
+ ## History
74
+
75
+ View the [changelog](https://github.com/ankane/mitie/blob/master/CHANGELOG.md)
76
+
77
+ ## Contributing
78
+
79
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
80
+
81
+ - [Report bugs](https://github.com/ankane/mitie/issues)
82
+ - Fix bugs and [submit pull requests](https://github.com/ankane/mitie/pulls)
83
+ - Write, clarify, or fix documentation
84
+ - Suggest or add new features
85
+
86
+ To get started with development:
87
+
88
+ ```sh
89
+ git clone https://github.com/ankane/mitie.git
90
+ cd mitie
91
+ bundle install
92
+ MITIE_NER_PATH=path/to/ner_model.dat bundle exec rake test
93
+ ```
@@ -0,0 +1,25 @@
1
+ # stdlib
2
+ require "fiddle/import"
3
+
4
+ # modules
5
+ require "mitie/ner"
6
+ require "mitie/version"
7
+
8
# Top-level namespace for the MITIE named-entity recognition bindings.
module Mitie
  # Base class for errors raised by this gem.
  class Error < StandardError; end

  class << self
    # Candidate shared-library names/paths that Mitie::FFI tries to open.
    # Assign a different value before Mitie::FFI is first referenced to
    # point at a library in a non-standard location.
    attr_accessor :ffi_lib
  end

  # Default to the conventional library file name for the host platform.
  self.ffi_lib =
    if Gem.win_platform?
      ["mitie.dll"]
    elsif RbConfig::CONFIG["host_os"].match?(/darwin/i)
      # match? is enough here: only a boolean is needed, no MatchData
      ["libmitie.dylib"]
    else
      ["libmitie.so"]
    end

  # friendlier error message
  # (autoload defers loading "mitie/ffi" until Mitie::FFI is first
  # referenced, so requiring the gem alone does not load that file)
  autoload :FFI, "mitie/ffi"
end
@@ -0,0 +1,29 @@
1
module Mitie
  # Fiddle bindings for the MITIE C library. The shared library is opened
  # when this module body is evaluated, using the candidates configured in
  # Mitie.ffi_lib.
  module FFI
    extend Fiddle::Importer

    # Mitie.ffi_lib is a plain accessor, so accept either a single path or a
    # list of candidates. Array() normalizes both; dup keeps the shift below
    # from mutating the configured value.
    libs = Array(Mitie.ffi_lib).dup
    begin
      dlload Fiddle.dlopen(libs.shift)
    rescue Fiddle::DLError
      # Try the next candidate; once none are left, surface the last failure.
      retry if libs.any?
      raise
    end

    # memory management and tokenization
    extern "void mitie_free(void* object)"
    extern "char** mitie_tokenize(const char* text)"
    extern "char** mitie_tokenize_with_offsets(const char* text, unsigned long** token_offsets)"

    # named-entity extractor loading and tag lookup
    extern "mitie_named_entity_extractor* mitie_load_named_entity_extractor(const char* filename)"
    extern "unsigned long mitie_get_num_possible_ner_tags(const mitie_named_entity_extractor* ner)"
    extern "const char* mitie_get_named_entity_tagstr(const mitie_named_entity_extractor* ner, unsigned long idx)"

    # entity extraction and per-detection accessors
    extern "mitie_named_entity_detections* mitie_extract_entities(const mitie_named_entity_extractor* ner, char** tokens)"
    extern "unsigned long mitie_ner_get_num_detections(const mitie_named_entity_detections* dets)"
    extern "unsigned long mitie_ner_get_detection_position(const mitie_named_entity_detections* dets, unsigned long idx)"
    extern "unsigned long mitie_ner_get_detection_length(const mitie_named_entity_detections* dets, unsigned long idx)"
    extern "unsigned long mitie_ner_get_detection_tag(const mitie_named_entity_detections* dets, unsigned long idx)"
    extern "const char* mitie_ner_get_detection_tagstr(const mitie_named_entity_detections* dets, unsigned long idx)"
    extern "double mitie_ner_get_detection_score(const mitie_named_entity_detections* dets, unsigned long idx)"
  end
end
@@ -0,0 +1,91 @@
1
module Mitie
  # Named-entity recognizer backed by a MITIE model loaded through Mitie::FFI.
  #
  # Offsets returned by #tokens_with_offset and #entities are the raw values
  # reported by the native tokenizer.
  # NOTE(review): the MITIE C header describes these as indexes into the C
  # string, i.e. byte offsets rather than character offsets - confirm.
  class NER
    # Loads a trained NER model.
    #
    # @param path [String] path to the model file (for example "ner_model.dat")
    # @raise [ArgumentError] if nothing exists at +path+
    # @raise [Mitie::Error] if the native loader returns a null pointer
    def initialize(path)
      # better error message than handing a bad path to native code
      raise ArgumentError, "File does not exist" unless File.exist?(path)

      @pointer = FFI.mitie_load_named_entity_extractor(path)
      # NOTE(review): assumes the C API signals a failed load with NULL -
      # confirm against mitie.h. Using a null extractor later would pass a
      # null pointer into every native call.
      raise Error, "Unable to load model" if @pointer.null?

      ObjectSpace.define_finalizer(self, self.class.finalize(pointer))
    end

    # Builds the finalizer that releases the native extractor.
    #
    # @param pointer [Fiddle::Pointer] pointer returned by the native loader
    # @return [Proc]
    def self.finalize(pointer)
      # must use proc instead of stabby lambda
      proc { FFI.mitie_free(pointer) }
    end

    # @return [Array<String>] every tag name the loaded model reports
    def tags
      count = FFI.mitie_get_num_possible_ner_tags(pointer)
      Array.new(count) { |i| FFI.mitie_get_named_entity_tagstr(pointer, i).to_s }
    end

    # Splits +text+ with the native tokenizer.
    #
    # @param text [String]
    # @return [Array<String>] tokens, tagged with the encoding of +text+
    def tokens(text)
      ptr = FFI.mitie_tokenize(text)
      result = []
      each_c_string(ptr) do |token, _index|
        result << token.to_s.force_encoding(text.encoding)
      end
      result
    ensure
      FFI.mitie_free(ptr) if ptr
    end

    # Splits +text+ and reports where each token starts.
    #
    # @param text [String]
    # @return [Array<Array(String, Integer)>] [token, offset] pairs
    def tokens_with_offset(text)
      tokens, ptr = tokens_with_offset_with_ptr(text)
      tokens
    ensure
      FFI.mitie_free(ptr) if ptr
    end

    # Runs entity extraction over +text+.
    #
    # @param text [String]
    # @return [Array<Hash>] one hash per detection with the keys
    #   :text, :tag, :score and :offset
    def entities(text)
      entities = []
      tokens, tokens_ptr = tokens_with_offset_with_ptr(text)
      detections = FFI.mitie_extract_entities(pointer, tokens_ptr)
      num_detections = FFI.mitie_ner_get_num_detections(detections)
      num_detections.times do |i|
        pos = FFI.mitie_ner_get_detection_position(detections, i)
        len = FFI.mitie_ner_get_detection_length(detections, i)
        tag = FFI.mitie_ner_get_detection_tagstr(detections, i).to_s
        score = FFI.mitie_ner_get_detection_score(detections, i)

        matched = tokens[pos, len]
        _first_token, offset = matched.first
        last_token, last_offset = matched.last
        # The offsets come straight from the C tokenizer, so both the end
        # position and the slice are computed in bytes. Indexing by character
        # (String#[] / #size) returns the wrong span as soon as a multibyte
        # character precedes or sits inside the entity. For ASCII-only text
        # the result is identical to character indexing.
        finish = last_offset + last_token.bytesize

        entities << {
          text: text.byteslice(offset...finish),
          tag: tag,
          score: score,
          offset: offset
        }
      end
      entities
    ensure
      FFI.mitie_free(tokens_ptr) if tokens_ptr
      FFI.mitie_free(detections) if detections
    end

    private

    # Native extractor handle set in #initialize.
    attr_reader :pointer

    # Walks a NULL-terminated array of C string pointers.
    #
    # @param array_ptr [Fiddle::Pointer] start of the pointer array
    # @yield [token, index] each element pointer and its position
    def each_c_string(array_ptr)
      index = 0
      loop do
        token = (array_ptr + index * Fiddle::SIZEOF_VOIDP).ptr
        break if token.null?

        yield token, index
        index += 1
      end
    end

    # Tokenizes +text+ and returns the tokens together with the native token
    # array, which #entities passes on to the extractor.
    #
    # @param text [String]
    # @return [Array(Array<Array(String, Integer)>, Fiddle::Pointer)]
    #   the [token, offset] pairs and the native array; the caller must
    #   release the array with FFI.mitie_free
    def tokens_with_offset_with_ptr(text)
      # Out-parameter slot the native call fills with the offsets array.
      # RUBY_FREE lets Fiddle release the slot itself once it is collected;
      # without a free function the allocation is never returned.
      token_offsets = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP, Fiddle::RUBY_FREE)
      ptr = FFI.mitie_tokenize_with_offsets(text, token_offsets)
      offsets = token_offsets.ptr
      tokens = []
      each_c_string(ptr) do |token, index|
        raw = (offsets + index * Fiddle::SIZEOF_LONG).to_s(Fiddle::SIZEOF_LONG)
        # "L!" reads a native-width unsigned long, matching SIZEOF_LONG
        tokens << [token.to_s.force_encoding(text.encoding), raw.unpack1("L!")]
      end
      [tokens, ptr]
    rescue StandardError
      # The caller never receives ptr when this method raises, so the token
      # array has to be released here.
      FFI.mitie_free(ptr) if ptr
      raise
    ensure
      # use ptr, not token_offsets.ptr
      # (ptr is only set once the native call has returned)
      FFI.mitie_free(token_offsets.ptr) if ptr
    end
  end
end
@@ -0,0 +1,3 @@
1
module Mitie
  # Gem version string. Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
metadata ADDED
@@ -0,0 +1,91 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mitie
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Andrew Kane
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-09-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '5'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '5'
55
+ description:
56
+ email: andrew@chartkick.com
57
+ executables: []
58
+ extensions: []
59
+ extra_rdoc_files: []
60
+ files:
61
+ - CHANGELOG.md
62
+ - LICENSE.txt
63
+ - README.md
64
+ - lib/mitie.rb
65
+ - lib/mitie/ffi.rb
66
+ - lib/mitie/ner.rb
67
+ - lib/mitie/version.rb
68
+ homepage: https://github.com/ankane/mitie
69
+ licenses:
70
+ - BSL-1.0
71
+ metadata: {}
72
+ post_install_message:
73
+ rdoc_options: []
74
+ require_paths:
75
+ - lib
76
+ required_ruby_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: '2.5'
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ requirements: []
87
+ rubygems_version: 3.1.2
88
+ signing_key:
89
+ specification_version: 4
90
+ summary: Named-entity recognition for Ruby
91
+ test_files: []