mitie 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e0ceaa2d4609a2a1b3b4056d67d88a1b0f55616ac8fd1a0509f070c352a96ea1
4
- data.tar.gz: 0b90eba1027ca5a46a405a97411d2b5fe193d20666ee8c8bad347bb2c79c225b
3
+ metadata.gz: 1e36228752078ce915d9d6cd49f3d8087c62263b0e41bebb5bcfd5a478398fe6
4
+ data.tar.gz: 54881a7c47cd855b678cf71a5baa7486b22acdbac802e15eda3984fc2215838d
5
5
  SHA512:
6
- metadata.gz: 8281e51659e08157d305535f3cd242082d4173368c36fa167716771b008a82233d02e43d1015c39e27c4c704ae0ec53935834eb0fefb71dac3b515962987d7eb
7
- data.tar.gz: 96d3564684f8197651f93238876f2bbeb69f860a8a373f36dcafb578db698555153b320f2bd0c799cd8d5d939ad05c3746c59e907d4ad62cfa4917fbb7ec2313
6
+ metadata.gz: 573e0e14ecd284cb9754abe95f2c3cd2ebbd9e2dbebaa5a0f33fad847abe6ebc3d0875c916d05f95fc1fdd9208c12d906315f0f5a557dfc6b3db63b620aa3f6f
7
+ data.tar.gz: 272335c83fc924aaf696382959d6cd93dd667139f282fef71b78b29a6784bda16f221cdd36691e885db6075312a00d463772f4408ee130ea5dde6a7a49b77d55
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.1.6 (2022-03-20)
2
+
3
+ - Added support for training NER models
4
+ - Improved ARM detection
5
+
1
6
  ## 0.1.5 (2021-01-29)
2
7
 
3
8
  - Fixed issue with multibyte characters
data/README.md CHANGED
@@ -1,18 +1,18 @@
1
- # MITIE
1
+ # MITIE Ruby
2
2
 
3
3
  [MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition and binary relation detection - for Ruby
4
4
 
5
5
  - Finds people, organizations, and locations in text
6
6
  - Detects relationships between entities, like `PERSON` was born in `LOCATION`
7
7
 
8
- [![Build Status](https://github.com/ankane/mitie/workflows/build/badge.svg?branch=master)](https://github.com/ankane/mitie/actions)
8
+ [![Build Status](https://github.com/ankane/mitie-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/mitie-ruby/actions)
9
9
 
10
10
  ## Installation
11
11
 
12
12
  Add this line to your application’s Gemfile:
13
13
 
14
14
  ```ruby
15
- gem 'mitie'
15
+ gem "mitie"
16
16
  ```
17
17
 
18
18
  And download the pre-trained model for your language:
@@ -69,6 +69,41 @@ Get all tags for a model
69
69
  model.tags
70
70
  ```
71
71
 
72
+ ## Training
73
+
74
+ Load an NER model into a trainer
75
+
76
+ ```ruby
77
+ trainer = Mitie::NERTrainer.new("total_word_feature_extractor.dat")
78
+ ```
79
+
80
+ Create training instances
81
+
82
+ ```ruby
83
+ tokens = ["You", "can", "do", "machine", "learning", "in", "Ruby", "!"]
84
+ instance = Mitie::NERTrainingInstance.new(tokens)
85
+ instance.add_entity(3..4, "topic") # machine learning
86
+ instance.add_entity(6..6, "language") # Ruby
87
+ ```
88
+
89
+ Add the training instances to the trainer
90
+
91
+ ```ruby
92
+ trainer.add(instance)
93
+ ```
94
+
95
+ Train the model
96
+
97
+ ```ruby
98
+ model = trainer.train
99
+ ```
100
+
101
+ Save the model
102
+
103
+ ```ruby
104
+ model.save_to_disk("ner_model.dat")
105
+ ```
106
+
72
107
  ## Binary Relation Detection
73
108
 
74
109
  Detect relationships between two entities, like:
@@ -105,22 +140,22 @@ This returns
105
140
 
106
141
  ## History
107
142
 
108
- View the [changelog](https://github.com/ankane/mitie/blob/master/CHANGELOG.md)
143
+ View the [changelog](https://github.com/ankane/mitie-ruby/blob/master/CHANGELOG.md)
109
144
 
110
145
  ## Contributing
111
146
 
112
147
  Everyone is encouraged to help improve this project. Here are a few ways you can help:
113
148
 
114
- - [Report bugs](https://github.com/ankane/mitie/issues)
115
- - Fix bugs and [submit pull requests](https://github.com/ankane/mitie/pulls)
149
+ - [Report bugs](https://github.com/ankane/mitie-ruby/issues)
150
+ - Fix bugs and [submit pull requests](https://github.com/ankane/mitie-ruby/pulls)
116
151
  - Write, clarify, or fix documentation
117
152
  - Suggest or add new features
118
153
 
119
154
  To get started with development:
120
155
 
121
156
  ```sh
122
- git clone https://github.com/ankane/mitie.git
123
- cd mitie
157
+ git clone https://github.com/ankane/mitie-ruby.git
158
+ cd mitie-ruby
124
159
  bundle install
125
160
  bundle exec rake vendor:all
126
161
 
@@ -35,6 +35,7 @@ module Mitie
35
35
 
36
36
  score_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
37
37
  status = FFI.mitie_classify_binary_relation(pointer, relation, score_ptr)
38
+ # TODO make Mitie::Error
38
39
  raise "Bad status: #{status}" if status != 0
39
40
  score = score_ptr.to_s(Fiddle::SIZEOF_DOUBLE).unpack1("d")
40
41
  if score > 0
@@ -84,11 +84,7 @@ module Mitie
84
84
  def tokenize
85
85
  @tokenize ||= begin
86
86
  if text.is_a?(Array)
87
- # malloc uses memset to set all bytes to 0
88
- tokens_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP * (text.size + 1))
89
- text.size.times do |i|
90
- tokens_ptr[i * Fiddle::SIZEOF_VOIDP, Fiddle::SIZEOF_VOIDP] = Fiddle::Pointer.to_ptr(text[i]).ref
91
- end
87
+ tokens_ptr = Utils.array_to_pointer(text)
92
88
  [tokens_ptr, nil]
93
89
  else
94
90
  offsets_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
data/lib/mitie/ffi.rb CHANGED
@@ -10,9 +10,12 @@ module Mitie
10
10
  raise e
11
11
  end
12
12
 
13
+ # https://github.com/mit-nlp/MITIE/blob/master/mitielib/include/mitie.h
14
+
13
15
  extern "void mitie_free(void* object)"
14
16
  extern "char** mitie_tokenize(const char* text)"
15
17
  extern "char** mitie_tokenize_with_offsets(const char* text, unsigned long** token_offsets)"
18
+ extern "int mitie_save_named_entity_extractor(const char* filename, const mitie_named_entity_extractor* ner)"
16
19
 
17
20
  extern "mitie_named_entity_extractor* mitie_load_named_entity_extractor(const char* filename)"
18
21
  extern "unsigned long mitie_get_num_possible_ner_tags(const mitie_named_entity_extractor* ner)"
@@ -26,6 +29,21 @@ module Mitie
26
29
  extern "const char* mitie_ner_get_detection_tagstr(const mitie_named_entity_detections* dets, unsigned long idx)"
27
30
  extern "double mitie_ner_get_detection_score(const mitie_named_entity_detections* dets, unsigned long idx)"
28
31
 
32
+ extern "mitie_ner_training_instance* mitie_create_ner_training_instance(char** tokens)"
33
+ extern "unsigned long mitie_ner_training_instance_num_entities(const mitie_ner_training_instance* instance)"
34
+ extern "unsigned long mitie_ner_training_instance_num_tokens(const mitie_ner_training_instance* instance)"
35
+ extern "int mitie_overlaps_any_entity(mitie_ner_training_instance* instance, unsigned long start, unsigned long length)"
36
+ extern "int mitie_add_ner_training_entity(mitie_ner_training_instance* instance, unsigned long start, unsigned long length, const char* label)"
37
+
38
+ extern "mitie_ner_trainer* mitie_create_ner_trainer(const char* filename)"
39
+ extern "unsigned long mitie_ner_trainer_size(const mitie_ner_trainer* trainer)"
40
+ extern "int mitie_add_ner_training_instance(mitie_ner_trainer* trainer, const mitie_ner_training_instance* instance)"
41
+ extern "void mitie_ner_trainer_set_beta(mitie_ner_trainer* trainer, double beta)"
42
+ extern "double mitie_ner_trainer_get_beta(const mitie_ner_trainer* trainer)"
43
+ extern "void mitie_ner_trainer_set_num_threads(mitie_ner_trainer* trainer, unsigned long num_threads)"
44
+ extern "unsigned long mitie_ner_trainer_get_num_threads(const mitie_ner_trainer* trainer)"
45
+ extern "mitie_named_entity_extractor* mitie_train_named_entity_extractor(const mitie_ner_trainer* trainer)"
46
+
29
47
  extern "mitie_binary_relation_detector* mitie_load_binary_relation_detector(const char* filename)"
30
48
  extern "const char* mitie_binary_relation_detector_name_string(const mitie_binary_relation_detector* detector)"
31
49
  extern "int mitie_entities_overlap(unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)"
data/lib/mitie/ner.rb CHANGED
@@ -2,11 +2,18 @@ module Mitie
2
2
  class NER
3
3
  attr_reader :pointer
4
4
 
5
- def initialize(path)
6
- # better error message
7
- raise ArgumentError, "File does not exist" unless File.exist?(path)
8
- @pointer = FFI.mitie_load_named_entity_extractor(path)
9
- ObjectSpace.define_finalizer(self, self.class.finalize(pointer))
5
+ def initialize(path = nil, pointer: nil)
6
+ if path
7
+ # better error message
8
+ raise ArgumentError, "File does not exist" unless File.exist?(path)
9
+ @pointer = FFI.mitie_load_named_entity_extractor(path)
10
+ elsif pointer
11
+ @pointer = pointer
12
+ else
13
+ raise ArgumentError, "Must pass either a path or a pointer"
14
+ end
15
+
16
+ ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
10
17
  end
11
18
 
12
19
  def tags
@@ -23,6 +30,13 @@ module Mitie
23
30
  doc(text).entities
24
31
  end
25
32
 
33
+ def save_to_disk(filename)
34
+ if FFI.mitie_save_named_entity_extractor(filename, pointer) != 0
35
+ raise Error, "Unable to save model"
36
+ end
37
+ nil
38
+ end
39
+
26
40
  def tokens(text)
27
41
  doc(text).tokens
28
42
  end
@@ -0,0 +1,51 @@
1
+ module Mitie
2
+ class NERTrainer
3
+ def initialize(filename)
4
+ raise ArgumentError, "File does not exist" unless File.exist?(filename)
5
+ @pointer = FFI.mitie_create_ner_trainer(filename)
6
+
7
+ ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
8
+ end
9
+
10
+ def add(instance)
11
+ FFI.mitie_add_ner_training_instance(@pointer, instance.pointer)
12
+ end
13
+
14
+ def beta
15
+ FFI.mitie_ner_trainer_get_beta(@pointer)
16
+ end
17
+
18
+ def beta=(value)
19
+ raise ArgumentError, "beta must be greater than or equal to zero" unless value >= 0
20
+
21
+ FFI.mitie_ner_trainer_set_beta(@pointer, value)
22
+ end
23
+
24
+ def num_threads
25
+ FFI.mitie_ner_trainer_get_num_threads(@pointer)
26
+ end
27
+
28
+ def num_threads=(value)
29
+ FFI.mitie_ner_trainer_set_num_threads(@pointer, value)
30
+ end
31
+
32
+ def size
33
+ FFI.mitie_ner_trainer_size(@pointer)
34
+ end
35
+
36
+ def train
37
+ raise Error, "You can't call train() on an empty trainer" if size.zero?
38
+
39
+ extractor = FFI.mitie_train_named_entity_extractor(@pointer)
40
+
41
+ raise Error, "Unable to create named entity extractor. Probably ran out of RAM." if extractor.null?
42
+
43
+ Mitie::NER.new(pointer: extractor)
44
+ end
45
+
46
+ def self.finalize(pointer)
47
+ # must use proc instead of stabby lambda
48
+ proc { FFI.mitie_free(pointer) }
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,49 @@
1
+ module Mitie
2
+ class NERTrainingInstance
3
+ attr_reader :pointer
4
+
5
+ def initialize(tokens)
6
+ tokens_pointer = Utils.array_to_pointer(tokens)
7
+
8
+ @pointer = FFI.mitie_create_ner_training_instance(tokens_pointer)
9
+ raise Error, "Unable to create training instance. Probably ran out of RAM." if @pointer.null?
10
+
11
+ ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
12
+ end
13
+
14
+ def add_entity(range, label)
15
+ if range.none? || range.end >= num_tokens || range.begin < 0
16
+ raise ArgumentError, "Invalid range"
17
+ end
18
+
19
+ raise ArgumentError, "Range overlaps existing entity" if overlaps_any_entity?(range)
20
+
21
+ unless FFI.mitie_add_ner_training_entity(@pointer, range.begin, range.size, label).zero?
22
+ raise Error, "Unable to add entity to training instance. Probably ran out of RAM."
23
+ end
24
+
25
+ nil
26
+ end
27
+
28
+ def num_entities
29
+ FFI.mitie_ner_training_instance_num_entities(@pointer)
30
+ end
31
+
32
+ def num_tokens
33
+ FFI.mitie_ner_training_instance_num_tokens(@pointer)
34
+ end
35
+
36
+ def overlaps_any_entity?(range)
37
+ if range.none? || range.max >= num_tokens
38
+ raise ArgumentError, "Invalid range"
39
+ end
40
+
41
+ FFI.mitie_overlaps_any_entity(@pointer, range.begin, range.size) == 1
42
+ end
43
+
44
+ def self.finalize(pointer)
45
+ # must use proc instead of stabby lambda
46
+ proc { FFI.mitie_free(pointer) }
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,12 @@
1
+ module Mitie
2
+ module Utils
3
+ def self.array_to_pointer(text)
4
+ # malloc uses memset to set all bytes to 0
5
+ tokens_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP * (text.size + 1))
6
+ text.size.times do |i|
7
+ tokens_ptr[i * Fiddle::SIZEOF_VOIDP, Fiddle::SIZEOF_VOIDP] = Fiddle::Pointer.to_ptr(text[i]).ref
8
+ end
9
+ tokens_ptr
10
+ end
11
+ end
12
+ end
data/lib/mitie/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Mitie
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.6"
3
3
  end
data/lib/mitie.rb CHANGED
@@ -5,6 +5,9 @@ require "fiddle/import"
5
5
  require "mitie/binary_relation_detector"
6
6
  require "mitie/document"
7
7
  require "mitie/ner"
8
+ require "mitie/ner_training_instance"
9
+ require "mitie/ner_trainer"
10
+ require "mitie/utils"
8
11
  require "mitie/version"
9
12
 
10
13
  module Mitie
@@ -16,10 +19,12 @@ module Mitie
16
19
  lib_name =
17
20
  if Gem.win_platform?
18
21
  "mitie.dll"
19
- elsif RbConfig::CONFIG["arch"] =~ /arm64-darwin/i
20
- "libmitie.arm64.dylib"
21
22
  elsif RbConfig::CONFIG["host_os"] =~ /darwin/i
22
- "libmitie.dylib"
23
+ if RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i
24
+ "libmitie.arm64.dylib"
25
+ else
26
+ "libmitie.dylib"
27
+ end
23
28
  else
24
29
  "libmitie.so"
25
30
  end
metadata CHANGED
@@ -1,17 +1,17 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mitie
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-30 00:00:00.000000000 Z
11
+ date: 2022-03-21 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
- email: andrew@chartkick.com
14
+ email: andrew@ankane.org
15
15
  executables: []
16
16
  extensions: []
17
17
  extra_rdoc_files: []
@@ -24,13 +24,16 @@ files:
24
24
  - lib/mitie/document.rb
25
25
  - lib/mitie/ffi.rb
26
26
  - lib/mitie/ner.rb
27
+ - lib/mitie/ner_trainer.rb
28
+ - lib/mitie/ner_training_instance.rb
29
+ - lib/mitie/utils.rb
27
30
  - lib/mitie/version.rb
28
31
  - vendor/LICENSE.txt
29
32
  - vendor/libmitie.arm64.dylib
30
33
  - vendor/libmitie.dylib
31
34
  - vendor/libmitie.so
32
35
  - vendor/mitie.dll
33
- homepage: https://github.com/ankane/mitie
36
+ homepage: https://github.com/ankane/mitie-ruby
34
37
  licenses:
35
38
  - BSL-1.0
36
39
  metadata: {}
@@ -49,7 +52,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
49
52
  - !ruby/object:Gem::Version
50
53
  version: '0'
51
54
  requirements: []
52
- rubygems_version: 3.2.3
55
+ rubygems_version: 3.3.7
53
56
  signing_key:
54
57
  specification_version: 4
55
58
  summary: Named-entity recognition for Ruby