mitie 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 50fb12adcd0042b3c09968108ec382694f8f0df20750b88566bc64c8d85d9e8d
4
- data.tar.gz: 9d3d34f9839f71fc17e6651a17069f1f0fdb4b5956f9e57c4b66c9567d908967
3
+ metadata.gz: dc2b7bdba2fba6b335ab9750efab8766190e618c4b8e2542ff6409f6727ec8b2
4
+ data.tar.gz: 03d85b928082a04b46209694c8e1b294e5c4205057a26b5cc65e695bbe3564a9
5
5
  SHA512:
6
- metadata.gz: 189d5a17f94fff9abbc1d8d961c8f8407ae85bd1284f727e363b9aa7384d748eb1d80b6ffa1b1a44c8c6fadbbb6e91e6d064ea58fa88af972e6fd6e90e3ef71d
7
- data.tar.gz: 202ba418ee98636736185a8f4783601acabecc59dde7e3a5c31a12b56827497b6b2b7e7e26d264b61848281b94cb635249b17a9e8fa009c496dacbe24e6466ca
6
+ metadata.gz: 4092a2dc005bb76527429454c301179c63f0eeee4913a3a8f56190e13d7f7551ee3bbfcd1098a6c335cf3b886ec0dee4e52f5c062975a4a058d117229f45b340
7
+ data.tar.gz: 066e5800a520b16002388088fc367939a26738bd638631d3b86ff9074d508cf5cd051154fbbf7be129f07f82c97a851caf7a6486e6fff2fb7f6a4b252d426ec0
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.2.1 (2022-06-12)
2
+
3
+ - Added `tokenize` and `tokenize_file` methods
4
+ - Added support for untokenized text to text categorization
5
+
1
6
  ## 0.2.0 (2022-06-01)
2
7
 
3
8
  - Added support for text categorization
data/README.md CHANGED
@@ -183,7 +183,7 @@ trainer = Mitie::TextCategorizerTrainer.new("total_word_feature_extractor.dat")
183
183
  Add labeled text to the trainer
184
184
 
185
185
  ```ruby
186
- trainer.add(["This", "is", "super", "cool"], "positive")
186
+ trainer.add("This is super cool", "positive")
187
187
  ```
188
188
 
189
189
  Train the model
@@ -207,7 +207,7 @@ model = Mitie::TextCategorizer.new("text_categorization_model.dat")
207
207
  Categorize text
208
208
 
209
209
  ```ruby
210
- model.categorize(["What", "a", "super", "nice", "day"])
210
+ model.categorize("What a super nice day")
211
211
  ```
212
212
 
213
213
  ## Deployment
data/lib/mitie/ffi.rb CHANGED
@@ -14,6 +14,7 @@ module Mitie
14
14
 
15
15
  extern "void mitie_free(void* object)"
16
16
  extern "char** mitie_tokenize(const char* text)"
17
+ extern "char** mitie_tokenize_file(const char* filename)"
17
18
  extern "char** mitie_tokenize_with_offsets(const char* text, unsigned long** token_offsets)"
18
19
 
19
20
  # ner
@@ -14,7 +14,8 @@ module Mitie
14
14
  ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
15
15
  end
16
16
 
17
- def categorize(tokens)
17
+ def categorize(text)
18
+ tokens = text.is_a?(Array) ? text : Mitie.tokenize(text)
18
19
  tokens_pointer = Utils.array_to_pointer(tokens)
19
20
  text_tag = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
20
21
  text_score = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
@@ -7,7 +7,8 @@ module Mitie
7
7
  ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
8
8
  end
9
9
 
10
- def add(tokens, label)
10
+ def add(text, label)
11
+ tokens = text.is_a?(Array) ? text : Mitie.tokenize(text)
11
12
  tokens_pointer = Utils.array_to_pointer(tokens)
12
13
  FFI.mitie_add_text_categorizer_labeled_text(@pointer, tokens_pointer, label)
13
14
  end
data/lib/mitie/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Mitie
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.1"
3
3
  end
data/lib/mitie.rb CHANGED
@@ -36,4 +36,36 @@ module Mitie
36
36
 
37
37
  # friendlier error message
38
38
  autoload :FFI, "mitie/ffi"
39
+
40
+ class << self
41
+ def tokenize(text)
42
+ tokens_ptr = FFI.mitie_tokenize(text)
43
+ tokens = read_tokens(tokens_ptr)
44
+ tokens.each { |t| t.force_encoding(text.encoding) }
45
+ tokens
46
+ ensure
47
+ FFI.mitie_free(tokens_ptr) if tokens_ptr
48
+ end
49
+
50
+ def tokenize_file(filename)
51
+ tokens_ptr = FFI.mitie_tokenize_file(filename)
52
+ read_tokens(tokens_ptr)
53
+ ensure
54
+ FFI.mitie_free(tokens_ptr) if tokens_ptr
55
+ end
56
+
57
+ private
58
+
59
+ def read_tokens(tokens_ptr)
60
+ i = 0
61
+ tokens = []
62
+ loop do
63
+ token = (tokens_ptr + i * Fiddle::SIZEOF_VOIDP).ptr
64
+ break if token.null?
65
+ tokens << token.to_s
66
+ i += 1
67
+ end
68
+ tokens
69
+ end
70
+ end
39
71
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mitie
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-01 00:00:00.000000000 Z
11
+ date: 2022-06-12 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org