mitie 0.2.0 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 50fb12adcd0042b3c09968108ec382694f8f0df20750b88566bc64c8d85d9e8d
4
- data.tar.gz: 9d3d34f9839f71fc17e6651a17069f1f0fdb4b5956f9e57c4b66c9567d908967
3
+ metadata.gz: 4106166da7718e20cce4c31c7f3974b12b06e1d2d5c7f30327981c0d0d31652a
4
+ data.tar.gz: 9345cf0bedb85634dd43c2aaef3438870649479add03a3ca48bc0f8892822c4f
5
5
  SHA512:
6
- metadata.gz: 189d5a17f94fff9abbc1d8d961c8f8407ae85bd1284f727e363b9aa7384d748eb1d80b6ffa1b1a44c8c6fadbbb6e91e6d064ea58fa88af972e6fd6e90e3ef71d
7
- data.tar.gz: 202ba418ee98636736185a8f4783601acabecc59dde7e3a5c31a12b56827497b6b2b7e7e26d264b61848281b94cb635249b17a9e8fa009c496dacbe24e6466ca
6
+ metadata.gz: dc6581ed4865bff3a21724a056946d90526be0fc1d32803e625a918247075331adf7a5e721cfddbdc366c4df47d2e91c739953bccbd7e451cdb344c12fdf27de
7
+ data.tar.gz: d96cab2092c6cca5f25f51e1fae4977874d1d85e881d1686110c99d2b975de288e57b725035a766ea80c1304013d91e13953c5caa41d6572e7ab9bacbdad7ae8
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
1
+ ## 0.2.2 (2023-06-07)
2
+
3
+ - Fixed error with `dup` and `clone`
4
+
5
+ ## 0.2.1 (2022-06-12)
6
+
7
+ - Added `tokenize` and `tokenize_file` methods
8
+ - Added support for untokenized text to text categorization
9
+
1
10
  ## 0.2.0 (2022-06-01)
2
11
 
3
12
  - Added support for text categorization
data/README.md CHANGED
@@ -183,7 +183,7 @@ trainer = Mitie::TextCategorizerTrainer.new("total_word_feature_extractor.dat")
183
183
  Add labeled text to the trainer
184
184
 
185
185
  ```ruby
186
- trainer.add(["This", "is", "super", "cool"], "positive")
186
+ trainer.add("This is super cool", "positive")
187
187
  ```
188
188
 
189
189
  Train the model
@@ -207,7 +207,7 @@ model = Mitie::TextCategorizer.new("text_categorization_model.dat")
207
207
  Categorize text
208
208
 
209
209
  ```ruby
210
- model.categorize(["What", "a", "super", "nice", "day"])
210
+ model.categorize("What a super nice day")
211
211
  ```
212
212
 
213
213
  ## Deployment
data/lib/mitie/binary_relation_detector.rb CHANGED
@@ -5,13 +5,12 @@ module Mitie
5
5
  # better error message
6
6
  raise ArgumentError, "File does not exist" unless File.exist?(path)
7
7
  @pointer = FFI.mitie_load_binary_relation_detector(path)
8
+ @pointer.free = FFI["mitie_free"]
8
9
  elsif pointer
9
10
  @pointer = pointer
10
11
  else
11
12
  raise ArgumentError, "Must pass either a path or a pointer"
12
13
  end
13
-
14
- ObjectSpace.define_finalizer(self, self.class.finalize(pointer))
15
14
  end
16
15
 
17
16
  def name
@@ -75,10 +74,5 @@ module Mitie
75
74
  ensure
76
75
  FFI.mitie_free(relation) if relation
77
76
  end
78
-
79
- def self.finalize(pointer)
80
- # must use proc instead of stabby lambda
81
- proc { FFI.mitie_free(pointer) }
82
- end
83
77
  end
84
78
  end
data/lib/mitie/binary_relation_trainer.rb CHANGED
@@ -2,8 +2,7 @@ module Mitie
2
2
  class BinaryRelationTrainer
3
3
  def initialize(ner, name: "")
4
4
  @pointer = FFI.mitie_create_binary_relation_trainer(name, ner.pointer)
5
-
6
- ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
5
+ @pointer.free = FFI["mitie_free"]
7
6
  end
8
7
 
9
8
  def add_positive_binary_relation(tokens, range1, range2)
@@ -78,10 +77,5 @@ module Mitie
78
77
  def entities_overlap?(range1, range2)
79
78
  FFI.mitie_entities_overlap(range1.begin, range1.size, range2.begin, range2.size) == 1
80
79
  end
81
-
82
- def self.finalize(pointer)
83
- # must use proc instead of stabby lambda
84
- proc { FFI.mitie_free(pointer) }
85
- end
86
80
  end
87
81
  end
data/lib/mitie/document.rb CHANGED
@@ -87,8 +87,8 @@ module Mitie
87
87
  else
88
88
  offsets_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
89
89
  tokens_ptr = FFI.mitie_tokenize_with_offsets(text, offsets_ptr)
90
+ tokens_ptr.free = FFI["mitie_free"]
90
91
 
91
- ObjectSpace.define_finalizer(tokens_ptr, self.class.finalize(tokens_ptr))
92
92
  ObjectSpace.define_finalizer(offsets_ptr, self.class.finalize_ptr(offsets_ptr))
93
93
 
94
94
  [tokens_ptr, offsets_ptr]
@@ -96,11 +96,6 @@ module Mitie
96
96
  end
97
97
  end
98
98
 
99
- def self.finalize(pointer)
100
- # must use proc instead of stabby lambda
101
- proc { FFI.mitie_free(pointer) }
102
- end
103
-
104
99
  def self.finalize_ptr(pointer)
105
100
  # must use proc instead of stabby lambda
106
101
  proc { FFI.mitie_free(pointer.ptr) }
data/lib/mitie/ffi.rb CHANGED
@@ -14,6 +14,7 @@ module Mitie
14
14
 
15
15
  extern "void mitie_free(void* object)"
16
16
  extern "char** mitie_tokenize(const char* text)"
17
+ extern "char** mitie_tokenize_file(const char* filename)"
17
18
  extern "char** mitie_tokenize_with_offsets(const char* text, unsigned long** token_offsets)"
18
19
 
19
20
  # ner
data/lib/mitie/ner.rb CHANGED
@@ -7,13 +7,12 @@ module Mitie
7
7
  # better error message
8
8
  raise ArgumentError, "File does not exist" unless File.exist?(path)
9
9
  @pointer = FFI.mitie_load_named_entity_extractor(path)
10
+ @pointer.free = FFI["mitie_free"]
10
11
  elsif pointer
11
12
  @pointer = pointer
12
13
  else
13
14
  raise ArgumentError, "Must pass either a path or a pointer"
14
15
  end
15
-
16
- ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
17
16
  end
18
17
 
19
18
  def tags
@@ -44,10 +43,5 @@ module Mitie
44
43
  def tokens_with_offset(text)
45
44
  doc(text).tokens_with_offset
46
45
  end
47
-
48
- def self.finalize(pointer)
49
- # must use proc instead of stabby lambda
50
- proc { FFI.mitie_free(pointer) }
51
- end
52
46
  end
53
47
  end
data/lib/mitie/ner_trainer.rb CHANGED
@@ -3,8 +3,7 @@ module Mitie
3
3
  def initialize(filename)
4
4
  raise ArgumentError, "File does not exist" unless File.exist?(filename)
5
5
  @pointer = FFI.mitie_create_ner_trainer(filename)
6
-
7
- ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
6
+ @pointer.free = FFI["mitie_free"]
8
7
  end
9
8
 
10
9
  def add(instance)
@@ -42,10 +41,5 @@ module Mitie
42
41
 
43
42
  Mitie::NER.new(pointer: extractor)
44
43
  end
45
-
46
- def self.finalize(pointer)
47
- # must use proc instead of stabby lambda
48
- proc { FFI.mitie_free(pointer) }
49
- end
50
44
  end
51
45
  end
data/lib/mitie/ner_training_instance.rb CHANGED
@@ -7,8 +7,7 @@ module Mitie
7
7
 
8
8
  @pointer = FFI.mitie_create_ner_training_instance(tokens_pointer)
9
9
  raise Error, "Unable to create training instance. Probably ran out of RAM." if @pointer.null?
10
-
11
- ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
10
+ @pointer.free = FFI["mitie_free"]
12
11
  end
13
12
 
14
13
  def add_entity(range, label)
@@ -36,10 +35,5 @@ module Mitie
36
35
 
37
36
  FFI.mitie_overlaps_any_entity(@pointer, range.begin, range.size) == 1
38
37
  end
39
-
40
- def self.finalize(pointer)
41
- # must use proc instead of stabby lambda
42
- proc { FFI.mitie_free(pointer) }
43
- end
44
38
  end
45
39
  end
data/lib/mitie/text_categorizer.rb CHANGED
@@ -5,16 +5,16 @@ module Mitie
5
5
  # better error message
6
6
  raise ArgumentError, "File does not exist" unless File.exist?(path)
7
7
  @pointer = FFI.mitie_load_text_categorizer(path)
8
+ @pointer.free = FFI["mitie_free"]
8
9
  elsif pointer
9
10
  @pointer = pointer
10
11
  else
11
12
  raise ArgumentError, "Must pass either a path or a pointer"
12
13
  end
13
-
14
- ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
15
14
  end
16
15
 
17
- def categorize(tokens)
16
+ def categorize(text)
17
+ tokens = text.is_a?(Array) ? text : Mitie.tokenize(text)
18
18
  tokens_pointer = Utils.array_to_pointer(tokens)
19
19
  text_tag = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
20
20
  text_score = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
@@ -38,10 +38,5 @@ module Mitie
38
38
  end
39
39
  nil
40
40
  end
41
-
42
- def self.finalize(pointer)
43
- # must use proc instead of stabby lambda
44
- proc { FFI.mitie_free(pointer) }
45
- end
46
41
  end
47
42
  end
data/lib/mitie/text_categorizer_trainer.rb CHANGED
@@ -3,11 +3,11 @@ module Mitie
3
3
  def initialize(filename)
4
4
  raise ArgumentError, "File does not exist" unless File.exist?(filename)
5
5
  @pointer = FFI.mitie_create_text_categorizer_trainer(filename)
6
-
7
- ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
6
+ @pointer.free = FFI["mitie_free"]
8
7
  end
9
8
 
10
- def add(tokens, label)
9
+ def add(text, label)
10
+ tokens = text.is_a?(Array) ? text : Mitie.tokenize(text)
11
11
  tokens_pointer = Utils.array_to_pointer(tokens)
12
12
  FFI.mitie_add_text_categorizer_labeled_text(@pointer, tokens_pointer, label)
13
13
  end
@@ -43,10 +43,5 @@ module Mitie
43
43
 
44
44
  Mitie::TextCategorizer.new(pointer: categorizer)
45
45
  end
46
-
47
- def self.finalize(pointer)
48
- # must use proc instead of stabby lambda
49
- proc { FFI.mitie_free(pointer) }
50
- end
51
46
  end
52
47
  end
data/lib/mitie/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Mitie
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.2"
3
3
  end
data/lib/mitie.rb CHANGED
@@ -2,16 +2,16 @@
2
2
  require "fiddle/import"
3
3
 
4
4
  # modules
5
- require "mitie/binary_relation_detector"
6
- require "mitie/binary_relation_trainer"
7
- require "mitie/document"
8
- require "mitie/ner"
9
- require "mitie/ner_training_instance"
10
- require "mitie/ner_trainer"
11
- require "mitie/text_categorizer"
12
- require "mitie/text_categorizer_trainer"
13
- require "mitie/utils"
14
- require "mitie/version"
5
+ require_relative "mitie/binary_relation_detector"
6
+ require_relative "mitie/binary_relation_trainer"
7
+ require_relative "mitie/document"
8
+ require_relative "mitie/ner"
9
+ require_relative "mitie/ner_training_instance"
10
+ require_relative "mitie/ner_trainer"
11
+ require_relative "mitie/text_categorizer"
12
+ require_relative "mitie/text_categorizer_trainer"
13
+ require_relative "mitie/utils"
14
+ require_relative "mitie/version"
15
15
 
16
16
  module Mitie
17
17
  class Error < StandardError; end
@@ -36,4 +36,37 @@ module Mitie
36
36
 
37
37
  # friendlier error message
38
38
  autoload :FFI, "mitie/ffi"
39
+
40
+ class << self
41
+ def tokenize(text)
42
+ tokens_ptr = FFI.mitie_tokenize(text.to_s)
43
+ tokens = read_tokens(tokens_ptr)
44
+ tokens.each { |t| t.force_encoding(text.encoding) }
45
+ tokens
46
+ ensure
47
+ FFI.mitie_free(tokens_ptr) if tokens_ptr
48
+ end
49
+
50
+ def tokenize_file(filename)
51
+ raise ArgumentError, "File does not exist" unless File.exist?(filename)
52
+ tokens_ptr = FFI.mitie_tokenize_file(filename)
53
+ read_tokens(tokens_ptr)
54
+ ensure
55
+ FFI.mitie_free(tokens_ptr) if tokens_ptr
56
+ end
57
+
58
+ private
59
+
60
+ def read_tokens(tokens_ptr)
61
+ i = 0
62
+ tokens = []
63
+ loop do
64
+ token = (tokens_ptr + i * Fiddle::SIZEOF_VOIDP).ptr
65
+ break if token.null?
66
+ tokens << token.to_s
67
+ i += 1
68
+ end
69
+ tokens
70
+ end
71
+ end
39
72
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mitie
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-01 00:00:00.000000000 Z
11
+ date: 2023-06-08 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -55,7 +55,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
55
55
  - !ruby/object:Gem::Version
56
56
  version: '0'
57
57
  requirements: []
58
- rubygems_version: 3.3.7
58
+ rubygems_version: 3.4.10
59
59
  signing_key:
60
60
  specification_version: 4
61
61
  summary: Named-entity recognition for Ruby