mitie 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 50fb12adcd0042b3c09968108ec382694f8f0df20750b88566bc64c8d85d9e8d
4
- data.tar.gz: 9d3d34f9839f71fc17e6651a17069f1f0fdb4b5956f9e57c4b66c9567d908967
3
+ metadata.gz: 4106166da7718e20cce4c31c7f3974b12b06e1d2d5c7f30327981c0d0d31652a
4
+ data.tar.gz: 9345cf0bedb85634dd43c2aaef3438870649479add03a3ca48bc0f8892822c4f
5
5
  SHA512:
6
- metadata.gz: 189d5a17f94fff9abbc1d8d961c8f8407ae85bd1284f727e363b9aa7384d748eb1d80b6ffa1b1a44c8c6fadbbb6e91e6d064ea58fa88af972e6fd6e90e3ef71d
7
- data.tar.gz: 202ba418ee98636736185a8f4783601acabecc59dde7e3a5c31a12b56827497b6b2b7e7e26d264b61848281b94cb635249b17a9e8fa009c496dacbe24e6466ca
6
+ metadata.gz: dc6581ed4865bff3a21724a056946d90526be0fc1d32803e625a918247075331adf7a5e721cfddbdc366c4df47d2e91c739953bccbd7e451cdb344c12fdf27de
7
+ data.tar.gz: d96cab2092c6cca5f25f51e1fae4977874d1d85e881d1686110c99d2b975de288e57b725035a766ea80c1304013d91e13953c5caa41d6572e7ab9bacbdad7ae8
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
1
+ ## 0.2.2 (2023-06-07)
2
+
3
+ - Fixed error with `dup` and `clone`
4
+
5
+ ## 0.2.1 (2022-06-12)
6
+
7
+ - Added `tokenize` and `tokenize_file` methods
8
+ - Added support for untokenized text to text categorization
9
+
1
10
  ## 0.2.0 (2022-06-01)
2
11
 
3
12
  - Added support for text categorization
data/README.md CHANGED
@@ -183,7 +183,7 @@ trainer = Mitie::TextCategorizerTrainer.new("total_word_feature_extractor.dat")
183
183
  Add labeled text to the trainer
184
184
 
185
185
  ```ruby
186
- trainer.add(["This", "is", "super", "cool"], "positive")
186
+ trainer.add("This is super cool", "positive")
187
187
  ```
188
188
 
189
189
  Train the model
@@ -207,7 +207,7 @@ model = Mitie::TextCategorizer.new("text_categorization_model.dat")
207
207
  Categorize text
208
208
 
209
209
  ```ruby
210
- model.categorize(["What", "a", "super", "nice", "day"])
210
+ model.categorize("What a super nice day")
211
211
  ```
212
212
 
213
213
  ## Deployment
data/lib/mitie/binary_relation_detector.rb CHANGED
@@ -5,13 +5,12 @@ module Mitie
5
5
  # better error message
6
6
  raise ArgumentError, "File does not exist" unless File.exist?(path)
7
7
  @pointer = FFI.mitie_load_binary_relation_detector(path)
8
+ @pointer.free = FFI["mitie_free"]
8
9
  elsif pointer
9
10
  @pointer = pointer
10
11
  else
11
12
  raise ArgumentError, "Must pass either a path or a pointer"
12
13
  end
13
-
14
- ObjectSpace.define_finalizer(self, self.class.finalize(pointer))
15
14
  end
16
15
 
17
16
  def name
@@ -75,10 +74,5 @@ module Mitie
75
74
  ensure
76
75
  FFI.mitie_free(relation) if relation
77
76
  end
78
-
79
- def self.finalize(pointer)
80
- # must use proc instead of stabby lambda
81
- proc { FFI.mitie_free(pointer) }
82
- end
83
77
  end
84
78
  end
data/lib/mitie/binary_relation_trainer.rb CHANGED
@@ -2,8 +2,7 @@ module Mitie
2
2
  class BinaryRelationTrainer
3
3
  def initialize(ner, name: "")
4
4
  @pointer = FFI.mitie_create_binary_relation_trainer(name, ner.pointer)
5
-
6
- ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
5
+ @pointer.free = FFI["mitie_free"]
7
6
  end
8
7
 
9
8
  def add_positive_binary_relation(tokens, range1, range2)
@@ -78,10 +77,5 @@ module Mitie
78
77
  def entities_overlap?(range1, range2)
79
78
  FFI.mitie_entities_overlap(range1.begin, range1.size, range2.begin, range2.size) == 1
80
79
  end
81
-
82
- def self.finalize(pointer)
83
- # must use proc instead of stabby lambda
84
- proc { FFI.mitie_free(pointer) }
85
- end
86
80
  end
87
81
  end
data/lib/mitie/document.rb CHANGED
@@ -87,8 +87,8 @@ module Mitie
87
87
  else
88
88
  offsets_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
89
89
  tokens_ptr = FFI.mitie_tokenize_with_offsets(text, offsets_ptr)
90
+ tokens_ptr.free = FFI["mitie_free"]
90
91
 
91
- ObjectSpace.define_finalizer(tokens_ptr, self.class.finalize(tokens_ptr))
92
92
  ObjectSpace.define_finalizer(offsets_ptr, self.class.finalize_ptr(offsets_ptr))
93
93
 
94
94
  [tokens_ptr, offsets_ptr]
@@ -96,11 +96,6 @@ module Mitie
96
96
  end
97
97
  end
98
98
 
99
- def self.finalize(pointer)
100
- # must use proc instead of stabby lambda
101
- proc { FFI.mitie_free(pointer) }
102
- end
103
-
104
99
  def self.finalize_ptr(pointer)
105
100
  # must use proc instead of stabby lambda
106
101
  proc { FFI.mitie_free(pointer.ptr) }
data/lib/mitie/ffi.rb CHANGED
@@ -14,6 +14,7 @@ module Mitie
14
14
 
15
15
  extern "void mitie_free(void* object)"
16
16
  extern "char** mitie_tokenize(const char* text)"
17
+ extern "char** mitie_tokenize_file(const char* filename)"
17
18
  extern "char** mitie_tokenize_with_offsets(const char* text, unsigned long** token_offsets)"
18
19
 
19
20
  # ner
data/lib/mitie/ner.rb CHANGED
@@ -7,13 +7,12 @@ module Mitie
7
7
  # better error message
8
8
  raise ArgumentError, "File does not exist" unless File.exist?(path)
9
9
  @pointer = FFI.mitie_load_named_entity_extractor(path)
10
+ @pointer.free = FFI["mitie_free"]
10
11
  elsif pointer
11
12
  @pointer = pointer
12
13
  else
13
14
  raise ArgumentError, "Must pass either a path or a pointer"
14
15
  end
15
-
16
- ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
17
16
  end
18
17
 
19
18
  def tags
@@ -44,10 +43,5 @@ module Mitie
44
43
  def tokens_with_offset(text)
45
44
  doc(text).tokens_with_offset
46
45
  end
47
-
48
- def self.finalize(pointer)
49
- # must use proc instead of stabby lambda
50
- proc { FFI.mitie_free(pointer) }
51
- end
52
46
  end
53
47
  end
data/lib/mitie/ner_trainer.rb CHANGED
@@ -3,8 +3,7 @@ module Mitie
3
3
  def initialize(filename)
4
4
  raise ArgumentError, "File does not exist" unless File.exist?(filename)
5
5
  @pointer = FFI.mitie_create_ner_trainer(filename)
6
-
7
- ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
6
+ @pointer.free = FFI["mitie_free"]
8
7
  end
9
8
 
10
9
  def add(instance)
@@ -42,10 +41,5 @@ module Mitie
42
41
 
43
42
  Mitie::NER.new(pointer: extractor)
44
43
  end
45
-
46
- def self.finalize(pointer)
47
- # must use proc instead of stabby lambda
48
- proc { FFI.mitie_free(pointer) }
49
- end
50
44
  end
51
45
  end
data/lib/mitie/ner_training_instance.rb CHANGED
@@ -7,8 +7,7 @@ module Mitie
7
7
 
8
8
  @pointer = FFI.mitie_create_ner_training_instance(tokens_pointer)
9
9
  raise Error, "Unable to create training instance. Probably ran out of RAM." if @pointer.null?
10
-
11
- ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
10
+ @pointer.free = FFI["mitie_free"]
12
11
  end
13
12
 
14
13
  def add_entity(range, label)
@@ -36,10 +35,5 @@ module Mitie
36
35
 
37
36
  FFI.mitie_overlaps_any_entity(@pointer, range.begin, range.size) == 1
38
37
  end
39
-
40
- def self.finalize(pointer)
41
- # must use proc instead of stabby lambda
42
- proc { FFI.mitie_free(pointer) }
43
- end
44
38
  end
45
39
  end
data/lib/mitie/text_categorizer.rb CHANGED
@@ -5,16 +5,16 @@ module Mitie
5
5
  # better error message
6
6
  raise ArgumentError, "File does not exist" unless File.exist?(path)
7
7
  @pointer = FFI.mitie_load_text_categorizer(path)
8
+ @pointer.free = FFI["mitie_free"]
8
9
  elsif pointer
9
10
  @pointer = pointer
10
11
  else
11
12
  raise ArgumentError, "Must pass either a path or a pointer"
12
13
  end
13
-
14
- ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
15
14
  end
16
15
 
17
- def categorize(tokens)
16
+ def categorize(text)
17
+ tokens = text.is_a?(Array) ? text : Mitie.tokenize(text)
18
18
  tokens_pointer = Utils.array_to_pointer(tokens)
19
19
  text_tag = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
20
20
  text_score = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
@@ -38,10 +38,5 @@ module Mitie
38
38
  end
39
39
  nil
40
40
  end
41
-
42
- def self.finalize(pointer)
43
- # must use proc instead of stabby lambda
44
- proc { FFI.mitie_free(pointer) }
45
- end
46
41
  end
47
42
  end
data/lib/mitie/text_categorizer_trainer.rb CHANGED
@@ -3,11 +3,11 @@ module Mitie
3
3
  def initialize(filename)
4
4
  raise ArgumentError, "File does not exist" unless File.exist?(filename)
5
5
  @pointer = FFI.mitie_create_text_categorizer_trainer(filename)
6
-
7
- ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
6
+ @pointer.free = FFI["mitie_free"]
8
7
  end
9
8
 
10
- def add(tokens, label)
9
+ def add(text, label)
10
+ tokens = text.is_a?(Array) ? text : Mitie.tokenize(text)
11
11
  tokens_pointer = Utils.array_to_pointer(tokens)
12
12
  FFI.mitie_add_text_categorizer_labeled_text(@pointer, tokens_pointer, label)
13
13
  end
@@ -43,10 +43,5 @@ module Mitie
43
43
 
44
44
  Mitie::TextCategorizer.new(pointer: categorizer)
45
45
  end
46
-
47
- def self.finalize(pointer)
48
- # must use proc instead of stabby lambda
49
- proc { FFI.mitie_free(pointer) }
50
- end
51
46
  end
52
47
  end
data/lib/mitie/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Mitie
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.2"
3
3
  end
data/lib/mitie.rb CHANGED
@@ -2,16 +2,16 @@
2
2
  require "fiddle/import"
3
3
 
4
4
  # modules
5
- require "mitie/binary_relation_detector"
6
- require "mitie/binary_relation_trainer"
7
- require "mitie/document"
8
- require "mitie/ner"
9
- require "mitie/ner_training_instance"
10
- require "mitie/ner_trainer"
11
- require "mitie/text_categorizer"
12
- require "mitie/text_categorizer_trainer"
13
- require "mitie/utils"
14
- require "mitie/version"
5
+ require_relative "mitie/binary_relation_detector"
6
+ require_relative "mitie/binary_relation_trainer"
7
+ require_relative "mitie/document"
8
+ require_relative "mitie/ner"
9
+ require_relative "mitie/ner_training_instance"
10
+ require_relative "mitie/ner_trainer"
11
+ require_relative "mitie/text_categorizer"
12
+ require_relative "mitie/text_categorizer_trainer"
13
+ require_relative "mitie/utils"
14
+ require_relative "mitie/version"
15
15
 
16
16
  module Mitie
17
17
  class Error < StandardError; end
@@ -36,4 +36,37 @@ module Mitie
36
36
 
37
37
  # friendlier error message
38
38
  autoload :FFI, "mitie/ffi"
39
+
40
+ class << self
41
+ def tokenize(text)
42
+ tokens_ptr = FFI.mitie_tokenize(text.to_s)
43
+ tokens = read_tokens(tokens_ptr)
44
+ tokens.each { |t| t.force_encoding(text.encoding) }
45
+ tokens
46
+ ensure
47
+ FFI.mitie_free(tokens_ptr) if tokens_ptr
48
+ end
49
+
50
+ def tokenize_file(filename)
51
+ raise ArgumentError, "File does not exist" unless File.exist?(filename)
52
+ tokens_ptr = FFI.mitie_tokenize_file(filename)
53
+ read_tokens(tokens_ptr)
54
+ ensure
55
+ FFI.mitie_free(tokens_ptr) if tokens_ptr
56
+ end
57
+
58
+ private
59
+
60
+ def read_tokens(tokens_ptr)
61
+ i = 0
62
+ tokens = []
63
+ loop do
64
+ token = (tokens_ptr + i * Fiddle::SIZEOF_VOIDP).ptr
65
+ break if token.null?
66
+ tokens << token.to_s
67
+ i += 1
68
+ end
69
+ tokens
70
+ end
71
+ end
39
72
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mitie
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-01 00:00:00.000000000 Z
11
+ date: 2023-06-08 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -55,7 +55,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
55
55
  - !ruby/object:Gem::Version
56
56
  version: '0'
57
57
  requirements: []
58
- rubygems_version: 3.3.7
58
+ rubygems_version: 3.4.10
59
59
  signing_key:
60
60
  specification_version: 4
61
61
  summary: Named-entity recognition for Ruby