tokkens 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.travis.yml +2 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +2 -0
- data/LICENSE.md +21 -0
- data/README.md +201 -0
- data/Rakefile +6 -0
- data/examples/classify.rb +61 -0
- data/lib/tokkens.rb +3 -0
- data/lib/tokkens/tokenizer.rb +57 -0
- data/lib/tokkens/tokens.rb +141 -0
- data/lib/tokkens/version.rb +3 -0
- data/spec/spec_helper.rb +8 -0
- data/spec/tokenizer_spec.rb +40 -0
- data/spec/tokens_spec.rb +133 -0
- data/tokkens.gemspec +29 -0
- metadata +108 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: de30d6649f845763e36fd45ed9770090021a44b2
  data.tar.gz: 34c389c2c767df2f9661b8c3a7a341da9294c20c
SHA512:
  metadata.gz: cd819fc173e3fbc889fc99c269f44033b74a514a9dd173485617f9f78d9455adc17cdb3c5b1ccb00730786bb46460c5254769cb9975bd2d5b8aeb70c6238a145
  data.tar.gz: 4ab7c1a5a804c33b4fdd64bd798f2e2ce0837eb0ad9ed2da781573193c212e9c0a8f1a7e175db5d0492e0099f756ec9afb7b402814bc2af4dd3d80b9d0b3526e
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/CHANGELOG.md
ADDED
data/Gemfile
ADDED
data/LICENSE.md
ADDED
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2017 wvengen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,201 @@
# Tokkens

[Build Status](https://travis-ci.org/q-m/tokkens-ruby)
[Documentation](http://www.rubydoc.info/github/q-m/tokkens-ruby/master)

`Tokkens` makes it easy to apply a [vector space model](https://en.wikipedia.org/wiki/Vector_space_model)
to text documents, targeted towards use with machine learning. It provides a mapping
between numbers and tokens (strings).

Read more about [installation](#installation), [usage](#usage) or skip to an [example](#example).

## Installation

Add this line to your application's Gemfile:

```ruby
gem 'tokkens'
```

And then execute:

    $ bundle

Or install it yourself as:

    $ gem install tokkens

Note that you'll need [Ruby](http://ruby-lang.org/) 2+.

## Usage

### Tokens

#### `get` and `find`

`Tokens` is a store for mapping strings (tokens) to numbers. Each string gets
its own unique number. Start by creating an instance.

```ruby
require 'tokkens'
@tokens = Tokkens::Tokens.new
```

Then `get` a number for some tokens. You'll notice that each distinct token
gets its own number.

```ruby
puts @tokens.get('foo')
# => 1
puts @tokens.get('bar')
# => 2
puts @tokens.get('foo')
# => 1
```

The reverse operation is `find` (the code is optimized for `get`).

```ruby
puts @tokens.find(2)
# => "bar"
```

The `prefix` option can be used to add a prefix to the token.

```ruby
puts @tokens.get('blup', prefix: 'DESC:')
# => 3
puts @tokens.find(3)
# => "DESC:blup"
puts @tokens.find(3, prefix: 'DESC:')
# => "blup"
```

#### `load` and `save`

To persist tokens across runs, you can load and save the list of tokens. At the
moment, this is a plain text file with one line per token, containing its number,
occurrence count and the token itself.

```ruby
@tokens.save('foo.tokens')
# ---- some time later
@tokens = Tokkens::Tokens.new
@tokens.load('foo.tokens')
```
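
The saved file is human-readable. Continuing the example above (`foo` requested
twice, `bar` and `DESC:blup` once each), `foo.tokens` would contain something like:

```
1 2 foo
2 1 bar
3 1 DESC:blup
```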

#### `limit!`

One common operation is reducing the number of words, to retain only those that are
most relevant. This is called feature selection or
[dimensionality reduction](https://en.wikipedia.org/wiki/Dimensionality_reduction).
You can limit the store to a maximum size with `max_size` (the most frequently
occurring words are kept).

```ruby
@tokens = Tokkens::Tokens.new
@tokens.get('foo')
@tokens.get('bar')
@tokens.get('baz')
@tokens.indexes
# => [1, 2, 3]
@tokens.limit!(max_size: 2)
@tokens.indexes
# => [1, 2]
```

Or you can drop rare tokens with `min_occurence`.

```ruby
@tokens.get('zab')
# => 4
@tokens.get('bar')
# => 2
@tokens.indexes
# => [1, 2, 4]
@tokens.limit!(min_occurence: 2)
@tokens.indexes
# => [2]
```

Note that this only limits the token store; if you reference the removed tokens
elsewhere, you may still need to remove them there as well (see the sketch below).
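
For instance, feature vectors built before the call to `limit!` may still contain
numbers of removed tokens. A minimal cleanup sketch, assuming the vectors are hashes
keyed by token number as in [examples/classify.rb](examples/classify.rb) (the
`samples` variable is hypothetical):

```ruby
kept = @tokens.indexes
# drop entries whose token number was removed by limit!
samples = samples.map do |vector|
  vector.select {|number, _value| kept.include?(number) }
end
```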

#### `freeze!` and `thaw!`

`Tokens` may be used to train a model on a training dataset, and then used to
predict based on that model. In this case, new tokens need to be added during the
training stage, but it doesn't make sense to generate new tokens during prediction.

By default, `Tokens` assigns a new number when an unrecognized token is passed to
`get`. But when the store has been frozen with `freeze!` (see `frozen?`), `get`
returns `nil` for unseen tokens instead. If for some reason you'd like to add new
tokens again, use `thaw!`.

```ruby
@tokens.freeze!
@tokens.get('hithere')
# => 4
@tokens.get('blahblah')
# => nil
@tokens.thaw!
@tokens.get('blahblah')
# => 5
```

Note that after `load`ing, the tokens are frozen; `thaw!` them if you need to add new ones.
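
A small sketch of the resulting behaviour, continuing the `foo.tokens` example above:

```ruby
@tokens = Tokkens::Tokens.new
@tokens.load('foo.tokens')
@tokens.frozen?
# => true
@tokens.thaw! # only if you really want to add new tokens afterwards
```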

### Tokenizer

When processing sentences or other text bodies, `Tokenizer` provides a way to map
these to an array of numbers (using `Tokens`).

```ruby
@tokenizer = Tokkens::Tokenizer.new
@tokenizer.get('hi from example')
# => [1, 2, 3]
@tokenizer.tokens.find(3)
# => "example"
```

The `prefix` keyword argument also works here.

```ruby
@tokenizer.get('from example', prefix: 'X:')
# => [4, 5]
@tokenizer.tokens.find(5)
# => "X:example"
```

One can specify a minimum token length (default 2) and stop words for tokenizing.

```ruby
@tokenizer = Tokkens::Tokenizer.new(min_length: 3, stop_words: %w(and the))
@tokenizer.get('the cat and a bird').map {|i| @tokenizer.tokens.find(i)}
# => ["cat", "bird"]
```

### Example

A basic text classification example using [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/)
can be found in [examples/classify.rb](examples/classify.rb). Run it as follows:

```
$ gem install liblinear-ruby
$ ruby examples/classify.rb
How many students are in for the exams today? -> students exams -> school
The forest has large trees, while the field has its flowers. -> trees field flowers -> nature
Can we park our cars inside that building to go shopping? -> cars building shopping -> city
```

The classifier was trained on three training sentences per class. The output shows
a prediction for each of three test sentences: the sentence is printed, followed by
its tokens, followed by the predicted class.
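
The heart of that example distills to the following lines (condensed from
[examples/classify.rb](examples/classify.rb), shown in full below; `preprocess`
lowercases the sentence and strips non-letters there):

```ruby
# training: each sentence becomes a sparse feature hash {token number => 1}
tokens = @tokenizer.get(preprocess(sentence)).uniq
features = Hash[tokens.zip([1] * tokens.length)]

# prediction: freeze the store first, so unseen words are ignored
@tokenizer.tokens.freeze!
label_number = Liblinear.predict(@model, features)
```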

## [MIT license](LICENSE.md)

## Contributing

1. Fork it ( https://github.com/[my-github-username]/tokkens/fork )
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Commit your changes (`git commit -am 'Add some feature'`)
4. Make sure the tests are green (`rspec`)
5. Push to the branch (`git push origin my-new-feature`)
6. Create a new Pull Request
data/Rakefile
ADDED
data/examples/classify.rb
ADDED
@@ -0,0 +1,61 @@
# Document classification example using tokkens and a linear SVM
require 'tokkens'   # `rake install` or `gem install tokkens`
require 'liblinear' # `gem install liblinear-ruby`

# define the training data
TRAINING_DATA = [
  ['school', 'The teacher writes a formula on the blackboard, while students are studying for their exams.'],
  ['school', 'Students play soccer during the break after class, while a teacher watches over them.'],
  ['school', 'All the students are studying hard for the final exams.'],
  ['nature', 'The fox is running around the trees, while flowers bloom in the field.'],
  ['nature', 'Where are the rabbits hiding today? Their holes below the trees are empty.'],
  ['nature', 'The dark sky is bringing rain. The fox hides, rabbits find their holes, but the flowers surrender.'],
  ['city',   'Cars are passing by swiftly, until the traffic lights become red.'],
  ['city',   'Look at the high building, with so many windows. Who would live there?'],
  ['city',   'The shopping centre building is over there, you will find everything you need to buy.'],
]

# after training, these test sentences will receive a predicted classification
TEST_DATA = [
  'How many students are in for the exams today?',
  'The forest has large trees, while the field has its flowers.',
  'Can we park our cars inside that building to go shopping?',
]

# stop words don't carry meaning, so we ignore them
STOP_WORDS = %w(
  the a on to at so today all many some
  are is will would their you them their our everyone everything who there
  while during over for below by with after in around until where
)

def preprocess(s)
  s.downcase.gsub(/[^a-z\s]/, '')
end

@labels = Tokkens::Tokens.new
@tokenizer = Tokkens::Tokenizer.new(stop_words: STOP_WORDS)

# train
training_labels = []
training_samples = []
TRAINING_DATA.each do |(label, sentence)|
  training_labels << @labels.get(label)
  tokens = @tokenizer.get(preprocess(sentence)).uniq
  training_samples << Hash[tokens.zip([1] * tokens.length)]
end
#@tokenizer.tokens.limit!(min_occurence: 2) # limit number of tokens - doesn't affect training though!
@model = Liblinear.train({}, training_labels, training_samples)

# predict
@tokenizer.tokens.freeze!
TEST_DATA.each do |sentence|
  tokens = @tokenizer.get(preprocess(sentence))
  label_number = Liblinear.predict(@model, Hash[tokens.zip([1] * tokens.length)])
  puts "#{sentence} -> #{tokens.map{|i| @tokenizer.tokens.find(i)}.join(' ')} -> #{@labels.find(label_number)}"
end

# you might want to persist data for prediction at a later time
#@model.save('test.model')
#@labels.save('test.labels')
#@tokenizer.tokens.save('test.tokens')
data/lib/tokkens.rb
ADDED
data/lib/tokkens/tokenizer.rb
ADDED
@@ -0,0 +1,57 @@
require_relative 'tokens'

module Tokkens
  # Converts a string to a list of token numbers.
  #
  # Useful for computing with text, like machine learning.
  # Before using the tokenizer, you're expected to have pre-processed
  # the text, depending on the application. For example, converting to
  # lowercase, removing non-word characters, or transliterating accented
  # characters.
  #
  # This class then splits the string into tokens by whitespace, and
  # removes tokens not passing the selection criteria.
  #
  class Tokenizer

    # default minimum token length
    MIN_LENGTH = 2

    # no default stop words to ignore
    STOP_WORDS = []

    # @!attribute [r] tokens
    #   @return [Tokens] object to use for obtaining tokens
    # @!attribute [r] stop_words
    #   @return [Array<String>] stop words to ignore
    # @!attribute [r] min_length
    #   @return [Fixnum] minimum length for tokens
    attr_reader :tokens, :stop_words, :min_length

    # Create a new tokenizer.
    #
    # @param tokens [Tokens] object to use for obtaining token numbers
    # @param min_length [Fixnum] minimum length for tokens
    # @param stop_words [Array<String>] stop words to ignore
    def initialize(tokens = nil, min_length: MIN_LENGTH, stop_words: STOP_WORDS)
      @tokens = tokens || Tokens.new
      @stop_words = stop_words
      @min_length = min_length
    end

    # @param s [String] string to tokenize
    # @return [Array<Fixnum>] array of token numbers
    def get(s, **kwargs)
      return [] if !s || s.strip == ''
      tokenize(s).map {|token| @tokens.get(token, **kwargs) }.compact
    end

    private

    # split on whitespace, keep only tokens passing the selection criteria
    def tokenize(s)
      s.split.select(&method(:include?))
    end

    # a token is kept when it is long enough and not a stop word
    def include?(s)
      s.length >= @min_length && !@stop_words.include?(s)
    end
  end
end
data/lib/tokkens/tokens.rb
ADDED
@@ -0,0 +1,141 @@
module Tokkens
  # Converts a string token to a uniquely identifying sequential number.
  #
  # Useful for working with a {https://en.wikipedia.org/wiki/Vector_space_model vector space model}
  # for text.
  class Tokens

    # @!attribute [rw] offset
    #   @return [Fixnum] Number of the first token.
    attr_accessor :offset

    def initialize(offset: 1)
      # liblinear can't use offset 0, libsvm doesn't mind starting at one
      @tokens = {}
      @offset = offset
      @next_number = offset
      @frozen = false
    end

    # Stop assigning new numbers to tokens.
    # @see #frozen?
    # @see #thaw!
    def freeze!
      @frozen = true
    end

    # Allow new tokens to be created.
    # @see #freeze!
    # @see #frozen?
    def thaw!
      @frozen = false
    end

    # @return [Boolean] Whether the tokens are frozen or not.
    # @see #freeze!
    # @see #thaw!
    def frozen?
      @frozen
    end

    # Limit the number of tokens.
    #
    # @param max_size [Fixnum] Maximum number of tokens to retain
    # @param min_occurence [Fixnum] Keep only tokens seen at least this many times
    # @return [Fixnum] Number of tokens left
    def limit!(max_size: nil, min_occurence: nil)
      # @todo raise if frozen
      if min_occurence
        @tokens.delete_if {|name, data| data[1] < min_occurence }
      end
      if max_size
        # keep the most frequently seen tokens
        @tokens = Hash[@tokens.to_a.sort_by {|a| -a[1][1] }[0..(max_size-1)]]
      end
      @tokens.length
    end

    # Return a number for a new or existing token.
    #
    # When the token was seen before, the same number is returned. If the token
    # is first seen and this class isn't {#frozen?}, a new number is returned;
    # else +nil+ is returned.
    #
    # @param s [String] token to return number for
    # @option kwargs [String] :prefix optional string to prepend to the token
    # @return [Fixnum, NilClass] number for given token
    def get(s, **kwargs)
      return if !s || s.strip == ''
      @frozen ? retrieve(s, **kwargs) : upsert(s, **kwargs)
    end

    # Return a token by number.
    #
    # This class is optimized for retrieving by token, not by number.
    #
    # @param i [Fixnum] number to return token for
    # @param prefix [String] optional string to remove from beginning of token
    # @return [String, NilClass] given token, or +nil+ when not found
    def find(i, prefix: nil)
      @tokens.each do |s, data|
        if data[0] == i
          return (prefix && s.start_with?(prefix)) ? s[prefix.length..-1] : s
        end
      end
      nil
    end

    # Return numbers for all of the current tokens.
    #
    # @return [Array<Fixnum>] All current token numbers.
    # @see #limit!
    def indexes
      @tokens.values.map(&:first)
    end

    # Load tokens from file.
    #
    # The tokens are frozen by default.
    # All previously existing tokens are removed.
    #
    # @param filename [String] Filename
    def load(filename)
      File.open(filename) do |f|
        @tokens = {}
        f.each_line do |line|
          id, count, name = line.rstrip.split(/\s+/, 3)
          @tokens[name.strip] = [id.to_i, count.to_i]
        end
      end
      # freezing after load is safer: prediction code won't silently grow the store
      freeze!
    end

    # Save tokens to file.
    #
    # @param filename [String] Filename
    def save(filename)
      File.open(filename, 'w') do |f|
        @tokens.each do |token, (index, count)|
          f.puts "#{index} #{count} #{token}"
        end
      end
    end

    private

    # return the token's number, or +nil+ when unknown (used when frozen)
    def retrieve(s, prefix: '')
      data = @tokens[prefix + s]
      data[0] if data
    end

    # return token number, update next_number; always returns a number
    def upsert(s, prefix: '')
      unless data = @tokens[prefix + s]
        @tokens[prefix + s] = data = [@next_number, 0]
        @next_number += 1
      end
      data[1] += 1
      data[0]
    end
  end
end
data/spec/spec_helper.rb
ADDED
data/spec/tokenizer_spec.rb
ADDED
@@ -0,0 +1,40 @@
require_relative 'spec_helper'

describe Tokenizer do
  let(:tokenizer) { described_class.new }
  let(:offset) { 1 } # default token offset

  describe '#get' do
    it 'does tokenization' do
      expect(tokenizer.get('foo bar')).to eq [offset, offset + 1]
    end

    it 'ignores too short tokens' do
      t = described_class.new(min_length: 2)
      expect(t.get('x')).to eq []
    end

    it 'ignores stop words' do
      t = described_class.new(stop_words: ['xyz'])
      expect(t.get('xyz foo')).to eq [offset]
    end

    it 'does not return nil tokens' do
      tokenizer.tokens.get('foo')
      tokenizer.tokens.freeze!
      expect(tokenizer.get('foo bar')).to eq [offset]
    end
  end

  describe '#tokens' do
    it 'returns a tokens object by default' do
      expect(tokenizer.tokens).to be_a Tokens
    end

    it 'can be overridden' do
      tokens = Tokens.new
      t = described_class.new(tokens)
      expect(t.tokens).to be tokens
    end
  end
end
data/spec/tokens_spec.rb
ADDED
@@ -0,0 +1,133 @@
require_relative 'spec_helper'
require 'tempfile'

describe Tokens do
  let(:tokens) { described_class.new }
  let(:offset) { 1 } # default offset

  describe '#get' do
    it 'can create new tokens' do
      expect(tokens.get('bar')).to eq offset
      expect(tokens.get('foo')).to eq offset + 1
    end

    it 'can get an existing token' do
      tokens.get('bar')
      expect(tokens.get('bar')).to eq offset
    end

    it 'can include a prefix' do
      tokens.get('bar', prefix: 'XyZ$')
      expect(tokens.get('XyZ$bar')).to eq offset
    end

    it 'can get an existing token when frozen' do
      tokens.get('blup')
      tokens.freeze!
      expect(tokens.get('blup')).to eq offset
    end

    it 'cannot get a new token when frozen' do
      tokens.get('blup')
      tokens.freeze!
      expect(tokens.get('blabla')).to be_nil
    end
  end

  describe '#find' do
    it 'can find an existing token' do
      tokens.get('blup')
      i = tokens.get('blah')
      expect(tokens.find(i)).to eq 'blah'
    end

    it 'returns nil for a non-existing token' do
      tokens.get('blup')
      expect(tokens.find(offset + 1)).to eq nil
    end

    it 'removes the prefix' do
      i = tokens.get('blup', prefix: 'FOO$')
      expect(tokens.find(i, prefix: 'FOO$')).to eq 'blup'
    end
  end

  describe '#indexes' do
    it 'is empty without tokens' do
      expect(tokens.indexes).to eq []
    end

    it 'returns the expected indexes' do
      tokens.get('foo')
      tokens.get('blup')
      expect(tokens.indexes).to eq [offset, offset + 1]
    end
  end

  describe '#offset' do
    it 'has a default' do
      expect(described_class.new.offset).to eq offset
    end

    it 'can override the default' do
      expect(described_class.new(offset: 5).offset).to eq 5
    end

    it 'affects the first number' do
      tokens = described_class.new(offset: 12)
      expect(tokens.get('hi')).to eq 12
    end
  end

  describe '#frozen?' do
    it 'is not frozen by default' do
      expect(tokens.frozen?).to be false
    end

    it 'can be frozen' do
      tokens.freeze!
      expect(tokens.frozen?).to be true
    end

    it 'can be thawed' do
      tokens.freeze!
      tokens.thaw!
      expect(tokens.frozen?).to be false
    end
  end

  describe '#limit!' do
    it 'limits to most frequent tokens by max_size' do
      tokens.get('foo')
      tokens.get('blup')
      tokens.get('blup')
      tokens.limit!(max_size: 1)
      expect(tokens.indexes).to eq [offset + 1]
    end

    it 'limits by min_occurence' do
      tokens.get('foo')
      tokens.get('blup')
      tokens.get('foo')
      tokens.limit!(min_occurence: 2)
      expect(tokens.indexes).to eq [offset]
    end
  end

  describe '#load' do
    let(:file) { Tempfile.new('tokens') }
    after { file.unlink }

    it 'saves and loads tokens' do
      tokens.get('foo')
      tokens.get('bar')
      tokens.save(file.path)
      expect(File.exists?(file.path)).to be true
      expect(File.zero?(file.path)).to be false

      ntokens = described_class.new
      ntokens.load(file.path)
      expect(ntokens.get('bar')).to eq offset + 1
    end
  end
end
data/tokkens.gemspec
ADDED
@@ -0,0 +1,29 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'tokkens/version'

Gem::Specification.new do |spec|
  spec.name = "tokkens"
  spec.version = Tokkens::VERSION
  spec.authors = ["wvengen"]
  spec.email = ["dev-rails@willem.engen.nl"]
  spec.summary = %q{Simple text to numbers tokenizer}
  spec.homepage = "https://github.com/q-m/ruby-tokkens"
  spec.license = "MIT"
  spec.description = <<-EOD
    Tokkens makes it easy to apply a vector space model to text documents,
    targeted towards use with machine learning. It provides a mapping between
    numbers and tokens (strings)
  EOD

  spec.files = `git ls-files -z`.split("\x0")
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.required_ruby_version = '>= 2.0'
  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.5.0"
end
metadata
ADDED
@@ -0,0 +1,108 @@
--- !ruby/object:Gem::Specification
name: tokkens
version: !ruby/object:Gem::Version
  version: 0.1.0
platform: ruby
authors:
- wvengen
autorequire:
bindir: bin
cert_chain: []
date: 2017-02-08 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.7'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.7'
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
- !ruby/object:Gem::Dependency
  name: rspec
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: 3.5.0
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: 3.5.0
description: |2
  Tokkens makes it easy to apply a vector space model to text documents,
  targeted towards use with machine learning. It provides a mapping between
  numbers and tokens (strings)
email:
- dev-rails@willem.engen.nl
executables: []
extensions: []
extra_rdoc_files: []
files:
- ".gitignore"
- ".travis.yml"
- CHANGELOG.md
- Gemfile
- LICENSE.md
- README.md
- Rakefile
- examples/classify.rb
- lib/tokkens.rb
- lib/tokkens/tokenizer.rb
- lib/tokkens/tokens.rb
- lib/tokkens/version.rb
- spec/spec_helper.rb
- spec/tokenizer_spec.rb
- spec/tokens_spec.rb
- tokkens.gemspec
homepage: https://github.com/q-m/ruby-tokkens
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '2.0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.4.3
signing_key:
specification_version: 4
summary: Simple text to numbers tokenizer
test_files:
- spec/spec_helper.rb
- spec/tokenizer_spec.rb
- spec/tokens_spec.rb