mitie 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +2 -2
- data/lib/mitie/binary_relation_detector.rb +1 -7
- data/lib/mitie/binary_relation_trainer.rb +1 -7
- data/lib/mitie/document.rb +1 -6
- data/lib/mitie/ffi.rb +1 -0
- data/lib/mitie/ner.rb +1 -7
- data/lib/mitie/ner_trainer.rb +1 -7
- data/lib/mitie/ner_training_instance.rb +1 -7
- data/lib/mitie/text_categorizer.rb +3 -8
- data/lib/mitie/text_categorizer_trainer.rb +3 -8
- data/lib/mitie/version.rb +1 -1
- data/lib/mitie.rb +43 -10
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4106166da7718e20cce4c31c7f3974b12b06e1d2d5c7f30327981c0d0d31652a
|
4
|
+
data.tar.gz: 9345cf0bedb85634dd43c2aaef3438870649479add03a3ca48bc0f8892822c4f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dc6581ed4865bff3a21724a056946d90526be0fc1d32803e625a918247075331adf7a5e721cfddbdc366c4df47d2e91c739953bccbd7e451cdb344c12fdf27de
|
7
|
+
data.tar.gz: d96cab2092c6cca5f25f51e1fae4977874d1d85e881d1686110c99d2b975de288e57b725035a766ea80c1304013d91e13953c5caa41d6572e7ab9bacbdad7ae8
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
## 0.2.2 (2023-06-07)
|
2
|
+
|
3
|
+
- Fixed error with `dup` and `clone`
|
4
|
+
|
5
|
+
## 0.2.1 (2022-06-12)
|
6
|
+
|
7
|
+
- Added `tokenize` and `tokenize_file` methods
|
8
|
+
- Added support for untokenized text to text categorization
|
9
|
+
|
1
10
|
## 0.2.0 (2022-06-01)
|
2
11
|
|
3
12
|
- Added support for text categorization
|
data/README.md
CHANGED
@@ -183,7 +183,7 @@ trainer = Mitie::TextCategorizerTrainer.new("total_word_feature_extractor.dat")
|
|
183
183
|
Add labeled text to the trainer
|
184
184
|
|
185
185
|
```ruby
|
186
|
-
trainer.add(["This", "is", "super", "cool"], "positive")
|
186
|
+
trainer.add("This is super cool", "positive")
|
187
187
|
```
|
188
188
|
|
189
189
|
Train the model
|
@@ -207,7 +207,7 @@ model = Mitie::TextCategorizer.new("text_categorization_model.dat")
|
|
207
207
|
Categorize text
|
208
208
|
|
209
209
|
```ruby
|
210
|
-
model.categorize(["What", "a", "super", "nice", "day"])
|
210
|
+
model.categorize("What a super nice day")
|
211
211
|
```
|
212
212
|
|
213
213
|
## Deployment
|
data/lib/mitie/binary_relation_detector.rb
CHANGED
@@ -5,13 +5,12 @@ module Mitie
|
|
5
5
|
# better error message
|
6
6
|
raise ArgumentError, "File does not exist" unless File.exist?(path)
|
7
7
|
@pointer = FFI.mitie_load_binary_relation_detector(path)
|
8
|
+
@pointer.free = FFI["mitie_free"]
|
8
9
|
elsif pointer
|
9
10
|
@pointer = pointer
|
10
11
|
else
|
11
12
|
raise ArgumentError, "Must pass either a path or a pointer"
|
12
13
|
end
|
13
|
-
|
14
|
-
ObjectSpace.define_finalizer(self, self.class.finalize(pointer))
|
15
14
|
end
|
16
15
|
|
17
16
|
def name
|
@@ -75,10 +74,5 @@ module Mitie
|
|
75
74
|
ensure
|
76
75
|
FFI.mitie_free(relation) if relation
|
77
76
|
end
|
78
|
-
|
79
|
-
def self.finalize(pointer)
|
80
|
-
# must use proc instead of stabby lambda
|
81
|
-
proc { FFI.mitie_free(pointer) }
|
82
|
-
end
|
83
77
|
end
|
84
78
|
end
|
data/lib/mitie/binary_relation_trainer.rb
CHANGED
@@ -2,8 +2,7 @@ module Mitie
|
|
2
2
|
class BinaryRelationTrainer
|
3
3
|
def initialize(ner, name: "")
|
4
4
|
@pointer = FFI.mitie_create_binary_relation_trainer(name, ner.pointer)
|
5
|
-
|
6
|
-
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
5
|
+
@pointer.free = FFI["mitie_free"]
|
7
6
|
end
|
8
7
|
|
9
8
|
def add_positive_binary_relation(tokens, range1, range2)
|
@@ -78,10 +77,5 @@ module Mitie
|
|
78
77
|
def entities_overlap?(range1, range2)
|
79
78
|
FFI.mitie_entities_overlap(range1.begin, range1.size, range2.begin, range2.size) == 1
|
80
79
|
end
|
81
|
-
|
82
|
-
def self.finalize(pointer)
|
83
|
-
# must use proc instead of stabby lambda
|
84
|
-
proc { FFI.mitie_free(pointer) }
|
85
|
-
end
|
86
80
|
end
|
87
81
|
end
|
data/lib/mitie/document.rb
CHANGED
@@ -87,8 +87,8 @@ module Mitie
|
|
87
87
|
else
|
88
88
|
offsets_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
|
89
89
|
tokens_ptr = FFI.mitie_tokenize_with_offsets(text, offsets_ptr)
|
90
|
+
tokens_ptr.free = FFI["mitie_free"]
|
90
91
|
|
91
|
-
ObjectSpace.define_finalizer(tokens_ptr, self.class.finalize(tokens_ptr))
|
92
92
|
ObjectSpace.define_finalizer(offsets_ptr, self.class.finalize_ptr(offsets_ptr))
|
93
93
|
|
94
94
|
[tokens_ptr, offsets_ptr]
|
@@ -96,11 +96,6 @@ module Mitie
|
|
96
96
|
end
|
97
97
|
end
|
98
98
|
|
99
|
-
def self.finalize(pointer)
|
100
|
-
# must use proc instead of stabby lambda
|
101
|
-
proc { FFI.mitie_free(pointer) }
|
102
|
-
end
|
103
|
-
|
104
99
|
def self.finalize_ptr(pointer)
|
105
100
|
# must use proc instead of stabby lambda
|
106
101
|
proc { FFI.mitie_free(pointer.ptr) }
|
data/lib/mitie/ffi.rb
CHANGED
@@ -14,6 +14,7 @@ module Mitie
|
|
14
14
|
|
15
15
|
extern "void mitie_free(void* object)"
|
16
16
|
extern "char** mitie_tokenize(const char* text)"
|
17
|
+
extern "char** mitie_tokenize_file(const char* filename)"
|
17
18
|
extern "char** mitie_tokenize_with_offsets(const char* text, unsigned long** token_offsets)"
|
18
19
|
|
19
20
|
# ner
|
data/lib/mitie/ner.rb
CHANGED
@@ -7,13 +7,12 @@ module Mitie
|
|
7
7
|
# better error message
|
8
8
|
raise ArgumentError, "File does not exist" unless File.exist?(path)
|
9
9
|
@pointer = FFI.mitie_load_named_entity_extractor(path)
|
10
|
+
@pointer.free = FFI["mitie_free"]
|
10
11
|
elsif pointer
|
11
12
|
@pointer = pointer
|
12
13
|
else
|
13
14
|
raise ArgumentError, "Must pass either a path or a pointer"
|
14
15
|
end
|
15
|
-
|
16
|
-
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
17
16
|
end
|
18
17
|
|
19
18
|
def tags
|
@@ -44,10 +43,5 @@ module Mitie
|
|
44
43
|
def tokens_with_offset(text)
|
45
44
|
doc(text).tokens_with_offset
|
46
45
|
end
|
47
|
-
|
48
|
-
def self.finalize(pointer)
|
49
|
-
# must use proc instead of stabby lambda
|
50
|
-
proc { FFI.mitie_free(pointer) }
|
51
|
-
end
|
52
46
|
end
|
53
47
|
end
|
data/lib/mitie/ner_trainer.rb
CHANGED
@@ -3,8 +3,7 @@ module Mitie
|
|
3
3
|
def initialize(filename)
|
4
4
|
raise ArgumentError, "File does not exist" unless File.exist?(filename)
|
5
5
|
@pointer = FFI.mitie_create_ner_trainer(filename)
|
6
|
-
|
7
|
-
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
6
|
+
@pointer.free = FFI["mitie_free"]
|
8
7
|
end
|
9
8
|
|
10
9
|
def add(instance)
|
@@ -42,10 +41,5 @@ module Mitie
|
|
42
41
|
|
43
42
|
Mitie::NER.new(pointer: extractor)
|
44
43
|
end
|
45
|
-
|
46
|
-
def self.finalize(pointer)
|
47
|
-
# must use proc instead of stabby lambda
|
48
|
-
proc { FFI.mitie_free(pointer) }
|
49
|
-
end
|
50
44
|
end
|
51
45
|
end
|
data/lib/mitie/ner_training_instance.rb
CHANGED
@@ -7,8 +7,7 @@ module Mitie
|
|
7
7
|
|
8
8
|
@pointer = FFI.mitie_create_ner_training_instance(tokens_pointer)
|
9
9
|
raise Error, "Unable to create training instance. Probably ran out of RAM." if @pointer.null?
|
10
|
-
|
11
|
-
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
10
|
+
@pointer.free = FFI["mitie_free"]
|
12
11
|
end
|
13
12
|
|
14
13
|
def add_entity(range, label)
|
@@ -36,10 +35,5 @@ module Mitie
|
|
36
35
|
|
37
36
|
FFI.mitie_overlaps_any_entity(@pointer, range.begin, range.size) == 1
|
38
37
|
end
|
39
|
-
|
40
|
-
def self.finalize(pointer)
|
41
|
-
# must use proc instead of stabby lambda
|
42
|
-
proc { FFI.mitie_free(pointer) }
|
43
|
-
end
|
44
38
|
end
|
45
39
|
end
|
data/lib/mitie/text_categorizer.rb
CHANGED
@@ -5,16 +5,16 @@ module Mitie
|
|
5
5
|
# better error message
|
6
6
|
raise ArgumentError, "File does not exist" unless File.exist?(path)
|
7
7
|
@pointer = FFI.mitie_load_text_categorizer(path)
|
8
|
+
@pointer.free = FFI["mitie_free"]
|
8
9
|
elsif pointer
|
9
10
|
@pointer = pointer
|
10
11
|
else
|
11
12
|
raise ArgumentError, "Must pass either a path or a pointer"
|
12
13
|
end
|
13
|
-
|
14
|
-
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
15
14
|
end
|
16
15
|
|
17
|
-
def categorize(tokens)
|
16
|
+
def categorize(text)
|
17
|
+
tokens = text.is_a?(Array) ? text : Mitie.tokenize(text)
|
18
18
|
tokens_pointer = Utils.array_to_pointer(tokens)
|
19
19
|
text_tag = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
|
20
20
|
text_score = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
|
@@ -38,10 +38,5 @@ module Mitie
|
|
38
38
|
end
|
39
39
|
nil
|
40
40
|
end
|
41
|
-
|
42
|
-
def self.finalize(pointer)
|
43
|
-
# must use proc instead of stabby lambda
|
44
|
-
proc { FFI.mitie_free(pointer) }
|
45
|
-
end
|
46
41
|
end
|
47
42
|
end
|
data/lib/mitie/text_categorizer_trainer.rb
CHANGED
@@ -3,11 +3,11 @@ module Mitie
|
|
3
3
|
def initialize(filename)
|
4
4
|
raise ArgumentError, "File does not exist" unless File.exist?(filename)
|
5
5
|
@pointer = FFI.mitie_create_text_categorizer_trainer(filename)
|
6
|
-
|
7
|
-
ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
|
6
|
+
@pointer.free = FFI["mitie_free"]
|
8
7
|
end
|
9
8
|
|
10
|
-
def add(tokens, label)
|
9
|
+
def add(text, label)
|
10
|
+
tokens = text.is_a?(Array) ? text : Mitie.tokenize(text)
|
11
11
|
tokens_pointer = Utils.array_to_pointer(tokens)
|
12
12
|
FFI.mitie_add_text_categorizer_labeled_text(@pointer, tokens_pointer, label)
|
13
13
|
end
|
@@ -43,10 +43,5 @@ module Mitie
|
|
43
43
|
|
44
44
|
Mitie::TextCategorizer.new(pointer: categorizer)
|
45
45
|
end
|
46
|
-
|
47
|
-
def self.finalize(pointer)
|
48
|
-
# must use proc instead of stabby lambda
|
49
|
-
proc { FFI.mitie_free(pointer) }
|
50
|
-
end
|
51
46
|
end
|
52
47
|
end
|
data/lib/mitie/version.rb
CHANGED
data/lib/mitie.rb
CHANGED
@@ -2,16 +2,16 @@
|
|
2
2
|
require "fiddle/import"
|
3
3
|
|
4
4
|
# modules
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
5
|
+
require_relative "mitie/binary_relation_detector"
|
6
|
+
require_relative "mitie/binary_relation_trainer"
|
7
|
+
require_relative "mitie/document"
|
8
|
+
require_relative "mitie/ner"
|
9
|
+
require_relative "mitie/ner_training_instance"
|
10
|
+
require_relative "mitie/ner_trainer"
|
11
|
+
require_relative "mitie/text_categorizer"
|
12
|
+
require_relative "mitie/text_categorizer_trainer"
|
13
|
+
require_relative "mitie/utils"
|
14
|
+
require_relative "mitie/version"
|
15
15
|
|
16
16
|
module Mitie
|
17
17
|
class Error < StandardError; end
|
@@ -36,4 +36,37 @@ module Mitie
|
|
36
36
|
|
37
37
|
# friendlier error message
|
38
38
|
autoload :FFI, "mitie/ffi"
|
39
|
+
|
40
|
+
class << self
|
41
|
+
def tokenize(text)
|
42
|
+
tokens_ptr = FFI.mitie_tokenize(text.to_s)
|
43
|
+
tokens = read_tokens(tokens_ptr)
|
44
|
+
tokens.each { |t| t.force_encoding(text.encoding) }
|
45
|
+
tokens
|
46
|
+
ensure
|
47
|
+
FFI.mitie_free(tokens_ptr) if tokens_ptr
|
48
|
+
end
|
49
|
+
|
50
|
+
def tokenize_file(filename)
|
51
|
+
raise ArgumentError, "File does not exist" unless File.exist?(filename)
|
52
|
+
tokens_ptr = FFI.mitie_tokenize_file(filename)
|
53
|
+
read_tokens(tokens_ptr)
|
54
|
+
ensure
|
55
|
+
FFI.mitie_free(tokens_ptr) if tokens_ptr
|
56
|
+
end
|
57
|
+
|
58
|
+
private
|
59
|
+
|
60
|
+
def read_tokens(tokens_ptr)
|
61
|
+
i = 0
|
62
|
+
tokens = []
|
63
|
+
loop do
|
64
|
+
token = (tokens_ptr + i * Fiddle::SIZEOF_VOIDP).ptr
|
65
|
+
break if token.null?
|
66
|
+
tokens << token.to_s
|
67
|
+
i += 1
|
68
|
+
end
|
69
|
+
tokens
|
70
|
+
end
|
71
|
+
end
|
39
72
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mitie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-06-08 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: andrew@ankane.org
|
@@ -55,7 +55,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
55
55
|
- !ruby/object:Gem::Version
|
56
56
|
version: '0'
|
57
57
|
requirements: []
|
58
|
-
rubygems_version: 3.
|
58
|
+
rubygems_version: 3.4.10
|
59
59
|
signing_key:
|
60
60
|
specification_version: 4
|
61
61
|
summary: Named-entity recognition for Ruby
|