mitie 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 50fb12adcd0042b3c09968108ec382694f8f0df20750b88566bc64c8d85d9e8d
4
- data.tar.gz: 9d3d34f9839f71fc17e6651a17069f1f0fdb4b5956f9e57c4b66c9567d908967
3
+ metadata.gz: dc2b7bdba2fba6b335ab9750efab8766190e618c4b8e2542ff6409f6727ec8b2
4
+ data.tar.gz: 03d85b928082a04b46209694c8e1b294e5c4205057a26b5cc65e695bbe3564a9
5
5
  SHA512:
6
- metadata.gz: 189d5a17f94fff9abbc1d8d961c8f8407ae85bd1284f727e363b9aa7384d748eb1d80b6ffa1b1a44c8c6fadbbb6e91e6d064ea58fa88af972e6fd6e90e3ef71d
7
- data.tar.gz: 202ba418ee98636736185a8f4783601acabecc59dde7e3a5c31a12b56827497b6b2b7e7e26d264b61848281b94cb635249b17a9e8fa009c496dacbe24e6466ca
6
+ metadata.gz: 4092a2dc005bb76527429454c301179c63f0eeee4913a3a8f56190e13d7f7551ee3bbfcd1098a6c335cf3b886ec0dee4e52f5c062975a4a058d117229f45b340
7
+ data.tar.gz: 066e5800a520b16002388088fc367939a26738bd638631d3b86ff9074d508cf5cd051154fbbf7be129f07f82c97a851caf7a6486e6fff2fb7f6a4b252d426ec0
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.2.1 (2022-06-12)
2
+
3
+ - Added `tokenize` and `tokenize_file` methods
4
+ - Added support for untokenized text to text categorization
5
+
1
6
  ## 0.2.0 (2022-06-01)
2
7
 
3
8
  - Added support for text categorization
data/README.md CHANGED
@@ -183,7 +183,7 @@ trainer = Mitie::TextCategorizerTrainer.new("total_word_feature_extractor.dat")
183
183
  Add labeled text to the trainer
184
184
 
185
185
  ```ruby
186
- trainer.add(["This", "is", "super", "cool"], "positive")
186
+ trainer.add("This is super cool", "positive")
187
187
  ```
188
188
 
189
189
  Train the model
@@ -207,7 +207,7 @@ model = Mitie::TextCategorizer.new("text_categorization_model.dat")
207
207
  Categorize text
208
208
 
209
209
  ```ruby
210
- model.categorize(["What", "a", "super", "nice", "day"])
210
+ model.categorize("What a super nice day")
211
211
  ```
212
212
 
213
213
  ## Deployment
data/lib/mitie/ffi.rb CHANGED
@@ -14,6 +14,7 @@ module Mitie
14
14
 
15
15
  extern "void mitie_free(void* object)"
16
16
  extern "char** mitie_tokenize(const char* text)"
17
+ extern "char** mitie_tokenize_file(const char* filename)"
17
18
  extern "char** mitie_tokenize_with_offsets(const char* text, unsigned long** token_offsets)"
18
19
 
19
20
  # ner
@@ -14,7 +14,8 @@ module Mitie
14
14
  ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
15
15
  end
16
16
 
17
- def categorize(tokens)
17
+ def categorize(text)
18
+ tokens = text.is_a?(Array) ? text : Mitie.tokenize(text)
18
19
  tokens_pointer = Utils.array_to_pointer(tokens)
19
20
  text_tag = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
20
21
  text_score = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE)
@@ -7,7 +7,8 @@ module Mitie
7
7
  ObjectSpace.define_finalizer(self, self.class.finalize(@pointer))
8
8
  end
9
9
 
10
- def add(tokens, label)
10
+ def add(text, label)
11
+ tokens = text.is_a?(Array) ? text : Mitie.tokenize(text)
11
12
  tokens_pointer = Utils.array_to_pointer(tokens)
12
13
  FFI.mitie_add_text_categorizer_labeled_text(@pointer, tokens_pointer, label)
13
14
  end
data/lib/mitie/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Mitie
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.1"
3
3
  end
data/lib/mitie.rb CHANGED
@@ -36,4 +36,36 @@ module Mitie
36
36
 
37
37
  # friendlier error message
38
38
  autoload :FFI, "mitie/ffi"
39
+
40
+ class << self
41
+ def tokenize(text)
42
+ tokens_ptr = FFI.mitie_tokenize(text)
43
+ tokens = read_tokens(tokens_ptr)
44
+ tokens.each { |t| t.force_encoding(text.encoding) }
45
+ tokens
46
+ ensure
47
+ FFI.mitie_free(tokens_ptr) if tokens_ptr
48
+ end
49
+
50
+ def tokenize_file(filename)
51
+ tokens_ptr = FFI.mitie_tokenize_file(filename)
52
+ read_tokens(tokens_ptr)
53
+ ensure
54
+ FFI.mitie_free(tokens_ptr) if tokens_ptr
55
+ end
56
+
57
+ private
58
+
59
+ def read_tokens(tokens_ptr)
60
+ i = 0
61
+ tokens = []
62
+ loop do
63
+ token = (tokens_ptr + i * Fiddle::SIZEOF_VOIDP).ptr
64
+ break if token.null?
65
+ tokens << token.to_s
66
+ i += 1
67
+ end
68
+ tokens
69
+ end
70
+ end
39
71
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mitie
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-01 00:00:00.000000000 Z
11
+ date: 2022-06-12 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org