tokkens 0.1.0
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.travis.yml +2 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +2 -0
- data/LICENSE.md +21 -0
- data/README.md +201 -0
- data/Rakefile +6 -0
- data/examples/classify.rb +61 -0
- data/lib/tokkens.rb +3 -0
- data/lib/tokkens/tokenizer.rb +57 -0
- data/lib/tokkens/tokens.rb +141 -0
- data/lib/tokkens/version.rb +3 -0
- data/spec/spec_helper.rb +8 -0
- data/spec/tokenizer_spec.rb +40 -0
- data/spec/tokens_spec.rb +133 -0
- data/tokkens.gemspec +29 -0
- metadata +108 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: de30d6649f845763e36fd45ed9770090021a44b2
  data.tar.gz: 34c389c2c767df2f9661b8c3a7a341da9294c20c
SHA512:
  metadata.gz: cd819fc173e3fbc889fc99c269f44033b74a514a9dd173485617f9f78d9455adc17cdb3c5b1ccb00730786bb46460c5254769cb9975bd2d5b8aeb70c6238a145
  data.tar.gz: 4ab7c1a5a804c33b4fdd64bd798f2e2ce0837eb0ad9ed2da781573193c212e9c0a8f1a7e175db5d0492e0099f756ec9afb7b402814bc2af4dd3d80b9d0b3526e
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/CHANGELOG.md
ADDED
data/Gemfile
ADDED
data/LICENSE.md
ADDED
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2017 wvengen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,201 @@
# Tokkens

[![Build Status](https://travis-ci.org/q-m/tokkens-ruby.svg?branch=master)](https://travis-ci.org/q-m/tokkens-ruby)
[![Documentation](https://img.shields.io/badge/yard-docs-blue.svg)](http://www.rubydoc.info/github/q-m/tokkens-ruby/master)

`Tokkens` makes it easy to apply a [vector space model](https://en.wikipedia.org/wiki/Vector_space_model)
to text documents, targeted towards use with machine learning. It provides a mapping
between numbers and tokens (strings).

Read more about [installation](#installation), [usage](#usage) or skip to an [example](#example).

## Installation

Add this line to your application's Gemfile:

```ruby
gem 'tokkens'
```

And then execute:

    $ bundle

Or install it yourself as:

    $ gem install tokkens

Note that you'll need [Ruby](http://ruby-lang.org/) 2+.

## Usage

### Tokens

#### `get` and `find`

`Tokens` is a store for mapping strings (tokens) to numbers. Each string gets
its own unique number. First create a new instance.

```ruby
require 'tokkens'
@tokens = Tokkens::Tokens.new
```

Then `get` a number for some tokens. You'll notice that each distinct token
gets its own number.

```ruby
puts @tokens.get('foo')
# => 1
puts @tokens.get('bar')
# => 2
puts @tokens.get('foo')
# => 1
```

The reverse operation is `find` (code is optimized for `get`).

```ruby
puts @tokens.find(2)
# => "bar"
```

The `prefix` option can be used to add a prefix to the token.

```ruby
puts @tokens.get('blup', prefix: 'DESC:')
# => 3
puts @tokens.find(3)
# => "DESC:blup"
puts @tokens.find(3, prefix: 'DESC:')
# => "blup"
```

#### `load` and `save`

To persist tokens across runs, one can load and save the list of tokens. At the
moment, this is a plain text file with one line per token, containing its number,
occurrence count and the token itself.

```ruby
@tokens.save('foo.tokens')
# ---- some time later
@tokens = Tokkens::Tokens.new
@tokens.load('foo.tokens')
```
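
For a concrete picture of the format (a hypothetical store where `foo` was seen
twice and `bar` once, matching what `save` writes), `foo.tokens` would contain:

```
1 2 foo
2 1 bar
```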

#### `limit!`

One common operation is reducing the number of words, to retain only those that are
most relevant. This is called feature selection or
[dimensionality reduction](https://en.wikipedia.org/wiki/Dimensionality_reduction).
You can select by maximum size with `max_size` (the most frequently occurring words are kept).

```ruby
@tokens = Tokkens::Tokens.new
@tokens.get('foo')
@tokens.get('bar')
@tokens.get('baz')
@tokens.indexes
# => [1, 2, 3]
@tokens.limit!(max_size: 2)
@tokens.indexes
# => [1, 2]
```

Or you can reduce by minimum occurrence with `min_occurence`.

```ruby
@tokens.get('zab')
# => 4
@tokens.get('bar')
# => 2
@tokens.indexes
# => [1, 2, 4]
@tokens.limit!(min_occurence: 2)
@tokens.indexes
# => [2]
```

Note that this limits only the token store; if you reference the removed tokens
elsewhere, you may still need to remove them there as well.
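
As a minimal sketch of that caveat, a number obtained earlier no longer resolves
after the call:

```ruby
@tokens = Tokkens::Tokens.new
i = @tokens.get('rare')           # => 1, seen once
@tokens.get('common')             # => 2
@tokens.get('common')
@tokens.limit!(min_occurence: 2)  # drops 'rare'
@tokens.find(i)                   # => nil, the stored number now dangles
```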

#### `freeze!` and `thaw!`

`Tokens` may be used to train a model from a training dataset, and then later to
predict based on that model. In this case, new tokens need to be added during the
training stage, but it doesn't make sense to generate new tokens during prediction.

By default, `Tokens` makes a new token when an unrecognized string is passed to `get`.
But once it has been frozen with `freeze!` (check with `frozen?`), `get` returns `nil`
for new tokens instead. If for some reason you'd like to add new tokens again, use `thaw!`.

```ruby
@tokens.freeze!
@tokens.get('hithere')
# => 4
@tokens.get('blahblah')
# => nil
@tokens.thaw!
@tokens.get('blahblah')
# => 5
```

Note that after `load`ing, the tokens are frozen; use `thaw!` if you need to add new ones.

### Tokenizer

When processing sentences or other text bodies, `Tokenizer` provides a way to map
these to arrays of numbers (using `Tokens`).

```ruby
@tokenizer = Tokkens::Tokenizer.new
@tokenizer.get('hi from example')
# => [1, 2, 3]
@tokenizer.tokens.find(3)
# => "example"
```

The `prefix` keyword argument also works here.

```ruby
@tokenizer.get('from example', prefix: 'X:')
# => [4, 5]
@tokenizer.tokens.find(5)
# => "X:example"
```

One can specify a minimum token length (default 2) and stop words for tokenizing.

```ruby
@tokenizer = Tokkens::Tokenizer.new(min_length: 3, stop_words: %w(and the))
@tokenizer.get('the cat and a bird').map {|i| @tokenizer.tokens.find(i)}
# => ["cat", "bird"]
```
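
A `Tokenizer` can also wrap an existing `Tokens` store (the first argument of
`Tokenizer#initialize`), for instance to share one numbering between several
tokenizers or to use a non-default offset; a small sketch:

```ruby
@tokens = Tokkens::Tokens.new(offset: 10)
@tokenizer = Tokkens::Tokenizer.new(@tokens)
@tokenizer.get('hello world')
# => [10, 11]
```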

### Example

A basic text classification example using [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/)
can be found in [examples/classify.rb](examples/classify.rb). Run it as follows:

```
$ gem install liblinear-ruby
$ ruby examples/classify.rb
How many students are in for the exams today? -> students exams -> school
The forest has large trees, while the field has its flowers. -> trees field flowers -> nature
Can we park our cars inside that building to go shopping? -> cars building shopping -> city
```

The classifier was trained using three training sentences for each class.
The output shows a prediction for three test sentences. Each test sentence is
printed, followed by its tokens, followed by the predicted class.

## [MIT license](LICENSE.md)

## Contributing

1. Fork it ( https://github.com/[my-github-username]/tokkens/fork )
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Commit your changes (`git commit -am 'Add some feature'`)
4. Make sure the tests are green (`rspec`)
5. Push to the branch (`git push origin my-new-feature`)
6. Create a new Pull Request
data/Rakefile
ADDED
data/examples/classify.rb
ADDED
@@ -0,0 +1,61 @@
# Document classification example using tokkens and linear SVM
require 'tokkens'   # `rake install` or `gem install tokkens`
require 'liblinear' # `gem install liblinear-ruby`

# define the training data
TRAINING_DATA = [
  ['school', 'The teacher writes a formula on the blackboard, while students are studying for their exams.'],
  ['school', 'Students play soccer during the break after class, while a teacher watches over them.'],
  ['school', 'All the students are studying hard for the final exams.'],
  ['nature', 'The fox is running around the trees, while flowers bloom in the field.'],
  ['nature', 'Where are the rabbits hiding today? Their holes below the trees are empty.'],
  ['nature', 'The dark sky is bringing rain. The fox hides, rabbits find their holes, but the flowers surrender.'],
  ['city', 'Cars are passing by swiftly, until the traffic lights become red.'],
  ['city', 'Look at the high building, with so many windows. Who would live there?'],
  ['city', 'The shopping centre building is over there, you will find everything you need to buy.'],
]

# after training, these test sentences will receive a predicted classification
TEST_DATA = [
  'How many students are in for the exams today?',
  'The forest has large trees, while the field has its flowers.',
  'Can we park our cars inside that building to go shopping?',
]

# stop words don't carry meaning, so we'd better ignore them
STOP_WORDS = %w(
  the a on to at so today all many some
  are is will would their you them their our everyone everything who there
  while during over for below by with after in around until where
)

def preprocess(s)
  s.downcase.gsub(/[^a-z\s]/, '')
end

@labels = Tokkens::Tokens.new
@tokenizer = Tokkens::Tokenizer.new(stop_words: STOP_WORDS)

# train
training_labels = []
training_samples = []
TRAINING_DATA.each do |(label, sentence)|
  training_labels << @labels.get(label)
  tokens = @tokenizer.get(preprocess(sentence)).uniq
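  # binary bag-of-words features: each token number maps to weight 1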
  training_samples << Hash[tokens.zip([1] * tokens.length)]
end
#@tokenizer.tokens.limit!(min_occurence: 2) # limit number of tokens - doesn't affect training though!
@model = Liblinear.train({}, training_labels, training_samples)

# predict
@tokenizer.tokens.freeze!
TEST_DATA.each do |sentence|
  tokens = @tokenizer.get(preprocess(sentence))
  label_number = Liblinear.predict(@model, Hash[tokens.zip([1] * tokens.length)])
  puts "#{sentence} -> #{tokens.map{|i| @tokenizer.tokens.find(i)}.join(' ')} -> #{@labels.find(label_number)}"
end

# you might want to persist data for prediction at a later time
#@model.save('test.model')
#@labels.save('test.labels')
#@tokenizer.tokens.save('test.tokens')
data/lib/tokkens.rb
ADDED
data/lib/tokkens/tokenizer.rb
ADDED
@@ -0,0 +1,57 @@
require_relative 'tokens'

module Tokkens
  # Converts a string to a list of token numbers.
  #
  # Useful for computing with text, like machine learning.
  # Before using the tokenizer, you're expected to have pre-processed
  # the text depending on the application. For example: converting to lowercase,
  # removing non-word characters, transliterating accented characters.
  #
  # This class then splits the string into tokens by whitespace, and
  # removes tokens not passing the selection criteria.
  #
  class Tokenizer

    # default minimum token length
    MIN_LENGTH = 2

    # no default stop words to ignore
    STOP_WORDS = []

    # @!attribute [r] tokens
    #   @return [Tokens] object to use for obtaining tokens
    # @!attribute [r] stop_words
    #   @return [Array<String>] stop words to ignore
    # @!attribute [r] min_length
    #   @return [Fixnum] minimum length for tokens
    attr_reader :tokens, :stop_words, :min_length

    # Create a new tokenizer.
    #
    # @param tokens [Tokens] object to use for obtaining token numbers
    # @param min_length [Fixnum] minimum length for tokens
    # @param stop_words [Array<String>] stop words to ignore
    def initialize(tokens = nil, min_length: MIN_LENGTH, stop_words: STOP_WORDS)
      @tokens = tokens || Tokens.new
      @stop_words = stop_words
      @min_length = min_length
    end

    # @return [Array<Fixnum>] array of token numbers
    def get(s, **kwargs)
      return [] if !s || s.strip == ''
      tokenize(s).map {|token| @tokens.get(token, **kwargs) }.compact
    end

    private

    def tokenize(s)
      s.split.select(&method(:include?))
    end

    def include?(s)
      s.length >= @min_length && !@stop_words.include?(s)
    end
  end
end
data/lib/tokkens/tokens.rb
ADDED
@@ -0,0 +1,141 @@
module Tokkens
  # Converts a string token to a uniquely identifying sequential number.
  #
  # Useful for working with a {https://en.wikipedia.org/wiki/Vector_space_model vector space model}
  # for text.
  class Tokens

    # @!attribute [r] offset
    #   @return [Fixnum] Number of the first token.
    attr_accessor :offset

    def initialize(offset: 1)
      # liblinear can't use offset 0, libsvm doesn't mind starting at one
      @tokens = {}
      @offset = offset
      @next_number = offset
      @frozen = false
    end

    # Stop assigning new numbers to tokens.
    # @see #frozen?
    # @see #thaw!
    def freeze!
      @frozen = true
    end

    # Allow new tokens to be created.
    # @see #freeze!
    # @see #frozen?
    def thaw!
      @frozen = false
    end

    # @return [Boolean] Whether the tokens are frozen or not.
    # @see #freeze!
    # @see #thaw!
    def frozen?
      @frozen
    end

    # Limit the number of tokens.
    #
    # @param max_size [Fixnum] Maximum number of tokens to retain
    # @param min_occurence [Fixnum] Keep only tokens seen at least this many times
    # @return [Fixnum] Number of tokens left
    def limit!(max_size: nil, min_occurence: nil)
      # @todo raise if frozen
      if min_occurence
        @tokens.delete_if {|name, data| data[1] < min_occurence }
      end
      if max_size
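        # keep the max_size most frequent tokens: sort by descending occurrence count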
        @tokens = Hash[@tokens.to_a.sort_by {|a| -a[1][1] }[0..(max_size-1)]]
      end
      @tokens.length
    end

    # Return a number for a new or existing token.
    #
    # When the token was seen before, the same number is returned. If the token
    # is first seen and this class isn't {#frozen?}, a new number is returned;
    # else +nil+ is returned.
    #
    # @param s [String] token to return number for
    # @option kwargs [String] :prefix optional string to prepend to the token
    # @return [Fixnum, NilClass] number for given token
    def get(s, **kwargs)
      return if !s || s.strip == ''
      @frozen ? retrieve(s, **kwargs) : upsert(s, **kwargs)
    end

    # Return a token by number.
    #
    # This class is optimized for retrieving by token, not by number.
    #
    # @param i [Fixnum] number to return token for
    # @param prefix [String] optional string to remove from beginning of token
    # @return [String, NilClass] given token, or +nil+ when not found
    def find(i, prefix: nil)
      @tokens.each do |s, data|
        if data[0] == i
          return (prefix && s.start_with?(prefix)) ? s[prefix.length..-1] : s
        end
      end
      nil
    end

    # Return indexes for all of the current tokens.
    #
    # @return [Array<Fixnum>] All current token numbers.
    # @see #limit!
    def indexes
      @tokens.values.map(&:first)
    end

    # Load tokens from file.
    #
    # The tokens are frozen by default.
    # All previously existing tokens are removed.
    #
    # @param filename [String] Filename
    def load(filename)
      File.open(filename) do |f|
        @tokens = {}
        f.each_line do |line|
          id, count, name = line.rstrip.split(/\s+/, 3)
          @tokens[name.strip] = [id.to_i, count.to_i]
        end
      end
      # safer: don't silently create new tokens after loading
      freeze!
    end

    # Save tokens to file.
    #
    # @param filename [String] Filename
    def save(filename)
      File.open(filename, 'w') do |f|
        @tokens.each do |token, (index, count)|
          f.puts "#{index} #{count} #{token}"
        end
      end
    end

    private

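    # look up the number of an existing token; nil when unknown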
    def retrieve(s, prefix: '')
      data = @tokens[prefix + s]
      data[0] if data
    end

    # return token number, update next_number; always returns a number
    def upsert(s, prefix: '')
      unless data = @tokens[prefix + s]
        @tokens[prefix + s] = data = [@next_number, 0]
        @next_number += 1
      end
      data[1] += 1
      data[0]
    end
  end
end
data/lib/tokkens/version.rb
ADDED
data/spec/spec_helper.rb
ADDED
data/spec/tokenizer_spec.rb
ADDED
@@ -0,0 +1,40 @@
require_relative 'spec_helper'

describe Tokenizer do
  let(:tokenizer) { described_class.new }
  let(:offset) { 1 } # default token offset

  describe '#get' do
    it 'does tokenization' do
      expect(tokenizer.get('foo bar')).to eq [offset, offset + 1]
    end

    it 'ignores too short tokens' do
      t = described_class.new(min_length: 2)
      expect(t.get('x')).to eq []
    end

    it 'ignores stop words' do
      t = described_class.new(stop_words: ['xyz'])
      expect(t.get('xyz foo')).to eq [offset]
    end

    it 'does not return nil tokens' do
      tokenizer.tokens.get('foo')
      tokenizer.tokens.freeze!
      expect(tokenizer.get('foo bar')).to eq [offset]
    end
  end

  describe '#tokens' do
    it 'returns a tokens object by default' do
      expect(tokenizer.tokens).to be_a Tokens
    end

    it 'can be overridden' do
      tokens = Tokens.new
      t = described_class.new(tokens)
      expect(t.tokens).to be tokens
    end
  end
end
data/spec/tokens_spec.rb
ADDED
@@ -0,0 +1,133 @@
require_relative 'spec_helper'
require 'tempfile'

describe Tokens do
  let(:tokens) { described_class.new }
  let(:offset) { 1 } # default offset

  describe '#get' do
    it 'can get new tokens' do
      expect(tokens.get('bar')).to eq offset
      expect(tokens.get('foo')).to eq (offset + 1)
    end

    it 'can get an existing token' do
      tokens.get('bar')
      expect(tokens.get('bar')).to eq offset
    end

    it 'can include a prefix' do
      tokens.get('bar', prefix: 'XyZ$')
      expect(tokens.get('XyZ$bar')).to eq offset
    end

    it 'can get an existing token when frozen' do
      tokens.get('blup')
      tokens.freeze!
      expect(tokens.get('blup')).to eq offset
    end

    it 'cannot get a new token when frozen' do
      tokens.get('blup')
      tokens.freeze!
      expect(tokens.get('blabla')).to be_nil
    end
  end

  describe '#find' do
    it 'can find an existing token' do
      tokens.get('blup')
      i = tokens.get('blah')
      expect(tokens.find(i)).to eq 'blah'
    end

    it 'returns nil for a non-existing token' do
      tokens.get('blup')
      expect(tokens.find(offset + 1)).to eq nil
    end

    it 'removes the prefix' do
      i = tokens.get('blup', prefix: 'FOO$')
      expect(tokens.find(i, prefix: 'FOO$')).to eq 'blup'
    end
  end

  describe '#indexes' do
    it 'is empty without tokens' do
      expect(tokens.indexes).to eq []
    end

    it 'returns the expected indexes' do
      tokens.get('foo')
      tokens.get('blup')
      expect(tokens.indexes).to eq [offset, offset + 1]
    end
  end

  describe '#offset' do
    it 'has a default' do
      expect(described_class.new.offset).to eq offset
    end

    it 'can override the default' do
      expect(described_class.new(offset: 5).offset).to eq 5
    end

    it 'affects the first number' do
      tokens = described_class.new(offset: 12)
      expect(tokens.get('hi')).to eq 12
    end
  end

  describe '#frozen?' do
    it 'is not frozen by default' do
      expect(tokens.frozen?).to be false
    end

    it 'can be frozen' do
      tokens.freeze!
      expect(tokens.frozen?).to be true
    end

    it 'can be thawed' do
      tokens.freeze!
      tokens.thaw!
      expect(tokens.frozen?).to be false
    end
  end

  describe '#limit!' do
    it 'limits to most frequent tokens by max_size' do
      tokens.get('foo')
      tokens.get('blup')
      tokens.get('blup')
      tokens.limit!(max_size: 1)
      expect(tokens.indexes).to eq [offset + 1]
    end

    it 'limits by min_occurence' do
      tokens.get('foo')
      tokens.get('blup')
      tokens.get('foo')
      tokens.limit!(min_occurence: 2)
      expect(tokens.indexes).to eq [offset]
    end
  end

  describe '#load' do
    let(:file) { Tempfile.new('tokens') }
    after { file.unlink }

    it 'saves and loads tokens' do
      tokens.get('foo')
      tokens.get('bar')
      tokens.save(file.path)
      expect(File.exists?(file.path)).to be true
      expect(File.zero?(file.path)).to be false

      ntokens = described_class.new
      ntokens.load(file.path)
      expect(ntokens.get('bar')).to eq (offset + 1)
    end
  end
end
data/tokkens.gemspec
ADDED
@@ -0,0 +1,29 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'tokkens/version'

Gem::Specification.new do |spec|
  spec.name          = "tokkens"
  spec.version       = Tokkens::VERSION
  spec.authors       = ["wvengen"]
  spec.email         = ["dev-rails@willem.engen.nl"]
  spec.summary       = %q{Simple text to numbers tokenizer}
  spec.homepage      = "https://github.com/q-m/ruby-tokkens"
  spec.license       = "MIT"
  spec.description   = <<-EOD
    Tokkens makes it easy to apply a vector space model to text documents,
    targeted towards use with machine learning. It provides a mapping between
    numbers and tokens (strings)
  EOD

  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.required_ruby_version = '>= 2.0'
  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.5.0"
end
metadata
ADDED
@@ -0,0 +1,108 @@
--- !ruby/object:Gem::Specification
name: tokkens
version: !ruby/object:Gem::Version
  version: 0.1.0
platform: ruby
authors:
- wvengen
autorequire:
bindir: bin
cert_chain: []
date: 2017-02-08 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.7'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.7'
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
- !ruby/object:Gem::Dependency
  name: rspec
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: 3.5.0
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: 3.5.0
description: |2
  Tokkens makes it easy to apply a vector space model to text documents,
  targeted towards use with machine learning. It provides a mapping between
  numbers and tokens (strings)
email:
- dev-rails@willem.engen.nl
executables: []
extensions: []
extra_rdoc_files: []
files:
- ".gitignore"
- ".travis.yml"
- CHANGELOG.md
- Gemfile
- LICENSE.md
- README.md
- Rakefile
- examples/classify.rb
- lib/tokkens.rb
- lib/tokkens/tokenizer.rb
- lib/tokkens/tokens.rb
- lib/tokkens/version.rb
- spec/spec_helper.rb
- spec/tokenizer_spec.rb
- spec/tokens_spec.rb
- tokkens.gemspec
homepage: https://github.com/q-m/ruby-tokkens
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '2.0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.4.3
signing_key:
specification_version: 4
summary: Simple text to numbers tokenizer
test_files:
- spec/spec_helper.rb
- spec/tokenizer_spec.rb
- spec/tokens_spec.rb