vector_embed 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/CHANGELOG +3 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/LICENSE.txt +22 -0
- data/README.md +111 -0
- data/Rakefile +8 -0
- data/bin/vector_embed +3 -0
- data/lib/vector_embed.rb +66 -0
- data/lib/vector_embed/maker.rb +46 -0
- data/lib/vector_embed/maker/boolean.rb +42 -0
- data/lib/vector_embed/maker/ngram.rb +39 -0
- data/lib/vector_embed/maker/number.rb +37 -0
- data/lib/vector_embed/maker/phrase.rb +18 -0
- data/lib/vector_embed/stop_word.rb +24 -0
- data/lib/vector_embed/version.rb +3 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/vector_embed_spec.rb +219 -0
- data/vector_embed.gemspec +23 -0
- metadata +135 -0
data/.gitignore
ADDED
data/.rspec
ADDED
data/CHANGELOG
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Seamus Abshere
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Seamus Abshere
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
# VectorEmbed
|
2
|
+
|
3
|
+
Vector embedding of strings, booleans, numerics, and arrays into [LIBSVM](http://www.csie.ntu.edu.tw/~cjlin/libsvm/) / [LIBLINEAR](http://www.csie.ntu.edu.tw/~cjlin/liblinear/) format.
|
4
|
+
|
5
|
+
Inspired by [Sally](http://www.mlsec.org/sally/), except `VectorEmbed` is meant to handle categorical and continuous data at the same time.
|
6
|
+
|
7
|
+
## Usage
|
8
|
+
|
9
|
+
Create a `VectorEmbed` instance, which auto-detects and then remembers what kind of data goes into each feature:
|
10
|
+
|
11
|
+
>> require 'vector_embed'
|
12
|
+
=> true
|
13
|
+
>> v = VectorEmbed.new
|
14
|
+
=> #<VectorEmbed:0x007fd605815208 [...]>
|
15
|
+
|
16
|
+
Output a line with a label and arbitrary features:
|
17
|
+
|
18
|
+
>> label = 1
|
19
|
+
=> 1
|
20
|
+
>> features = { color: 'red', year: 1995, weight: 5.4e9 }
|
21
|
+
=> {:color=>"red", :year=>1995, :weight=>5400000000.0}
|
22
|
+
>> v.line(label, features)
|
23
|
+
=> "1 1997960:1 5556418:5400000000.0 8227451:1995"
|
24
|
+
|
25
|
+
Output another line:
|
26
|
+
|
27
|
+
>> label = 0
|
28
|
+
=> 0
|
29
|
+
>> features = { color: 'blue', year: 1821, weight: 3.3 }
|
30
|
+
=> {:color=>"blue", :year=>1821, :weight=>3.3}
|
31
|
+
>> v.line(label, features)
|
32
|
+
=> "0 1089740:1 5556418:3.3 8227451:1821"
|
33
|
+
|
34
|
+
Note that `color: 'red'` and `color: 'blue'` are being translated into categories:
|
35
|
+
|
36
|
+
1997960:1 # murmur3("color\x00red"):1
|
37
|
+
1089740:1 # murmur3("color\x00blue"):1
|
38
|
+
|
39
|
+
A similar thing happens with `true`/`false`:
|
40
|
+
|
41
|
+
>> v.line(1, yes: true, no: false)
|
42
|
+
=> "1 1559987:1 3324244:1"
|
43
|
+
|
44
|
+
i.e.
|
45
|
+
|
46
|
+
1559987:1 # murmur3("yes\x00true"):1
|
47
|
+
3324244:1 # murmur3("no\x00false"):1
|
48
|
+
|
49
|
+
## N-grams
|
50
|
+
|
51
|
+
Currently uses same parameter names as [Sally](http://www.mlsec.org/sally/manual.html).
|
52
|
+
|
53
|
+
### Word ngrams
|
54
|
+
|
55
|
+
>> v = VectorEmbed.new ngram_len: 1, ngram_delim: /\s+/, stop_words: %w{the and or}
|
56
|
+
=> #<VectorEmbed:0x007fd6033b77f8 [...]>
|
57
|
+
>> v.line(1, notes: 'the quick brown fox')
|
58
|
+
=> "1 1512788:1 3426202:1 5079692:1"
|
59
|
+
|
60
|
+
You get the idea: ("the" has been filtered out by stop words)
|
61
|
+
|
62
|
+
1512788:1 # murmur3("notes\x00quick"):1
|
63
|
+
3426202:1 # murmur3("notes\x00brown"):1
|
64
|
+
5079692:1 # murmur3("notes\x00fox"):1
|
65
|
+
|
66
|
+
### Byte n-grams
|
67
|
+
|
68
|
+
>> v = VectorEmbed.new ngram_len: 2, ngram_delim: ''
|
69
|
+
=> #<VectorEmbed:0x007fd60337ea20 [...]>
|
70
|
+
>> v.line(1, notes: 'foobar')
|
71
|
+
=> "1 2148745:1 2878919:1 3600333:1 3621715:1 5885921:1"
|
72
|
+
|
73
|
+
So therefore:
|
74
|
+
|
75
|
+
2148745:1 # murmur3("notes\x00fo"):1
|
76
|
+
2878919:1 # murmur3("notes\x00oo"):1
|
77
|
+
3600333:1 # murmur3("notes\x00ob"):1
|
78
|
+
3621715:1 # murmur3("notes\x00ba"):1
|
79
|
+
5885921:1 # murmur3("notes\x00ar"):1
|
80
|
+
|
81
|
+
## Debugging
|
82
|
+
|
83
|
+
`VectorEmbed` tries to do the right thing, but if it doesn't, try turning on debugging:
|
84
|
+
|
85
|
+
>> v = VectorEmbed.new
|
86
|
+
=> #<VectorEmbed:0x007fd6034020a0 [...]>
|
87
|
+
>> v.logger.level = Logger::DEBUG
|
88
|
+
=> 0
|
89
|
+
>> v.line(1, '3' => 7, foo: 'bar', truthy: false, nullity: nil)
|
90
|
+
D, [2013-02-20T16:55:00.139299 #21595] DEBUG -- : Interpreting "3" as Number given first value 7
|
91
|
+
D, [2013-02-20T16:55:00.139561 #21595] DEBUG -- : Interpreting :foo as Phrase given first value "bar"
|
92
|
+
D, [2013-02-20T16:55:00.139671 #21595] DEBUG -- : Interpreting :truthy as Boolean given first value false
|
93
|
+
D, [2013-02-20T16:55:00.139755 #21595] DEBUG -- : Interpreting :nullity as Boolean given first value nil
|
94
|
+
D, [2013-02-20T16:55:00.139872 #21595] DEBUG -- : Interpreting "label" as Number given first value 1
|
95
|
+
=> "1 2647413:7 4091306:1 7123386:1 9259635:1"
|
96
|
+
|
97
|
+
One thing it doesn't like: (assuming you have already performed the lines above)
|
98
|
+
|
99
|
+
>> v.line(1, '3' => 'bar')
|
100
|
+
ArgumentError: Can't embed "bar" in number feature "3".
|
101
|
+
|
102
|
+
It's saying that, given you first passed it `7`, it thought `"3"` was a feature that held numbers.
|
103
|
+
|
104
|
+
## Gotchas
|
105
|
+
|
106
|
+
* Following Sally, feature indices are truncated murmur hashes. Sally keeps the first 22 bits; this gem currently keeps the first 7 decimal digits of the 32-bit hash... more and LIBSVM seems to choke.
|
107
|
+
* Stop words are currently filtered out of feature indices... probably shouldn't be.
|
108
|
+
|
109
|
+
## Copyright
|
110
|
+
|
111
|
+
Copyright 2013 Seamus Abshere
|
data/Rakefile
ADDED
data/bin/vector_embed
ADDED
data/lib/vector_embed.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
require 'vector_embed/version'
|
4
|
+
require 'vector_embed/maker'
|
5
|
+
|
6
|
+
require 'vector_embed/stop_word'
|
7
|
+
|
8
|
+
# Turns a label plus a hash of features into a single text line of the form
# "label index:value index:value ...", with the pairs ordered by index.
#
# The kind of each feature (and of the label) is decided by Maker.pick the
# first time it is seen and is remembered for later calls.
class VectorEmbed
  # Matches a string that contains nothing but one number (optionally signed,
  # optionally with a fraction and/or exponent), ignoring surrounding whitespace.
  # http://stackoverflow.com/questions/638565/parsing-scientific-notation-sensibly
  JUST_A_NUMBER = /\A\s*[+\-]?(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s*\z/
  # Matches an empty or whitespace-only string.
  BLANK = /\A\s*\z/
  # Separator used when several parts are joined into one key.
  NULL_BYTE = "\x00"

  attr_reader :options
  attr_accessor :logger

  # options - Hash of settings. Keys read in this class are :logger and
  #           :stop_words; makers read further keys from #options.
  #           When no :logger is given, an INFO-level logger on $stderr is used.
  def initialize(options = {})
    @mutex = Mutex.new
    @feature_makers = {}
    @logger = options[:logger] || Logger.new($stderr).tap { |fallback| fallback.level = Logger::INFO }
    @options = options.dup
  end

  # Builds one output line.
  #
  # label    - value handed to the label maker (Boolean or Number).
  # features - Hash of feature key => value. An Array value is expanded into
  #            one feature per element, keyed by "<key>\x00<position>".
  #
  # Returns a String: the label value followed by "index:value" pairs sorted
  # by index, all separated by single spaces.
  def line(label, features = {})
    collected = []
    features.each do |k, v|
      if v.is_a?(Array)
        v.each_with_index do |vv, i|
          collected.concat feature_maker([k, i].join(NULL_BYTE), vv).pairs(vv)
        end
      else
        collected.concat feature_maker(k, v).pairs(v)
      end
    end
    rendered = collected.compact.sort_by { |index, _| index }.map { |pair| pair.join(':') }
    # The label maker is resolved after the features, as before.
    [label_maker(label).value(label), *rendered].join(' ')
  end

  # Strips the configured stop words from +v+ (see StopWord.remove).
  def preprocess(v)
    StopWord.remove(stop_words, v)
  end

  private

  # StopWord objects built once from options[:stop_words] (default: none).
  def stop_words
    @stop_words ||= options.fetch(:stop_words, []).map { |raw_stop_word| StopWord.new(raw_stop_word) }
  end

  # The maker used for every label; picked from the first label seen.
  # Creation happens inside the mutex.
  def label_maker(label)
    return @label_maker if @label_maker

    @mutex.synchronize do
      @label_maker ||= Maker.pick([Maker::Boolean, Maker::Number], 'label', label, self)
    end
  end

  # The maker for feature +k+; picked from the first value +v+ seen for it.
  # Creation happens inside the mutex.
  def feature_maker(k, v)
    existing = @feature_makers[k]
    return existing if existing

    @mutex.synchronize do
      @feature_makers[k] ||= Maker.pick([Maker::Boolean, Maker::Number, Maker::Ngram, Maker::Phrase], k, v, self)
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'vector_embed/maker/phrase'
|
2
|
+
require 'vector_embed/maker/ngram'
|
3
|
+
require 'vector_embed/maker/number'
|
4
|
+
require 'vector_embed/maker/boolean'
|
5
|
+
|
6
|
+
require 'murmurhash3'
|
7
|
+
|
8
|
+
class VectorEmbed
  # Base class for the objects that turn one feature's values into
  # [index, value] pairs. Subclasses provide +want?+ (class level) and either
  # +value+ or their own +pairs+.
  class Maker
    # Returns a new instance of the first class in +choices+ whose +want?+
    # accepts (k, first_v, parent). Raises RuntimeError when none does.
    def self.pick(choices, k, first_v, parent)
      chosen = choices.find { |candidate| candidate.want?(k, first_v, parent) }
      unless chosen
        raise "Can't use #{first_v.class} for #{k.inspect} given #{first_v.inspect} and choices #{choices.inspect}"
      end

      parent.logger.debug { "Interpreting #{k.inspect} as #{chosen.name.split('::').last} given first value #{first_v.inspect}" }
      chosen.new k, parent
    end

    # Joins +parts+ with NULL_BYTE, hashes the result with
    # MurmurHash3::V32.str_hash and keeps the first 7 characters of the
    # hash's decimal representation as an Integer.
    def self.index(*parts)
      hashed = MurmurHash3::V32.str_hash(parts.join(NULL_BYTE))
      hashed.to_s[0..6].to_i
    end

    attr_reader :parent, :k

    # k      - the feature key this maker is responsible for.
    # parent - the object that created it (provides logger/options/preprocess).
    def initialize(k, parent)
      @k = k
      @parent = parent
    end

    # Returns an Array of [index, value] pairs for +v+. An Array +v+ yields
    # one pair per element, indexed by (k, position); anything else yields a
    # single pair indexed by k alone.
    def pairs(v)
      return [[Maker.index(k), value(v)]] unless v.is_a?(Array)

      v.each_with_index.map { |vv, i| [Maker.index(k, i), value(vv)] }
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'vector_embed/maker'
|
2
|
+
|
3
|
+
class VectorEmbed
  class Maker
    # Maker for true/false/null features and for boolean labels.
    class Boolean < Maker
      # Values treated as "true" by #value and #pairs.
      TRUE_VALUES = [true, 'true', 't', 'yes', 'on'].freeze
      # Values treated as "false" by #value and #pairs.
      FALSE_VALUES = [false, 'false', 'f', 'no', 'off'].freeze
      # First values that make this maker claim a feature.
      DETECTED_VALUES = [nil, true, false, 'true', 'false', 'null'].freeze

      # True when the first value seen is nil, true, false or one of the
      # strings 'true', 'false', 'null'.
      def self.want?(k, v, parent)
        DETECTED_VALUES.include?(v)
      end

      # Returns 1 for a true-ish value and 0 for a false-ish value.
      # Raises RuntimeError for anything else.
      def value(v)
        return 1 if TRUE_VALUES.include?(v)
        return 0 if FALSE_VALUES.include?(v)

        raise "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
      end

      # Returns a single [index, 1] pair whose index is derived from k plus
      # 'true', 'false' or 'null' depending on +v+. nil, 'null' and blank
      # strings count as null. Raises ArgumentError for anything else.
      def pairs(v)
        category =
          if TRUE_VALUES.include?(v)
            'true'
          elsif FALSE_VALUES.include?(v)
            'false'
          elsif v.nil? || 'null' == v || BLANK === v
            'null'
          else
            raise ArgumentError, "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
          end
        [ [ Maker.index(k, category), 1 ] ]
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'vector_embed/maker'
|
2
|
+
|
3
|
+
class VectorEmbed
  class Maker
    # Maker that splits a string into n-grams and emits one [index, 1] pair
    # per n-gram. Only used when the parent's options contain :ngram_len.
    class Ngram < Maker
      class << self
        # Truthy when the parent was configured with :ngram_len.
        def want?(k, v, parent)
          parent.options[:ngram_len]
        end
      end

      attr_reader :len
      attr_reader :delim

      # Reads :ngram_len (must be > 0) and :ngram_delim from parent.options.
      #
      # Raises ArgumentError when :ngram_len is not a positive integer.
      def initialize(k, parent)
        super
        @len = parent.options[:ngram_len].to_i
        raise ArgumentError, ":ngram_len must be > 0" unless @len > 0
        @delim = parent.options[:ngram_delim]
      end

      # Returns a flat Array of [index, 1] pairs, one per n-gram of +v+ after
      # the parent's preprocessing. The pairs are deliberately flat (the same
      # shape Maker#pairs returns): a nested [[index, 1]] element cannot be
      # sorted together with other makers' [index, value] pairs.
      #
      # Supported modes:
      #   len == 1    - "word mode": split on delim.
      #   delim == '' - "byte mode": every substring of +len+ characters.
      #
      # Raises RuntimeError when +v+ is not a String or the mode is unsupported.
      def pairs(v)
        raise "Ngram can't handle #{v.inspect}, only a single string for now" unless v.is_a?(String)
        text = parent.preprocess v.to_s
        ngrams =
          if len == 1
            # word mode
            text.split delim
          elsif delim == ''
            # byte mode; an input shorter than len yields no n-grams
            (0..text.length - len).map { |i| text[i, len] }
          else
            raise "Word n-gram not supported yet"
          end
        ngrams.map { |ngram| [ Maker.index(k, 'ngram', ngram), 1 ] }
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'vector_embed/maker'
|
2
|
+
|
3
|
+
class VectorEmbed
  class Maker
    # Maker for numeric features and numeric labels. Accepts Numeric objects
    # and strings that match JUST_A_NUMBER.
    class Number < Maker
      class << self
        # Truthy when +v+ is a Numeric or a string holding just a number.
        #
        # Uses Regexp#=== rather than =~ so that arbitrary objects (which no
        # longer respond to =~ on Ruby >= 3.2) are rejected instead of raising.
        def want?(k, v, parent)
          v.is_a?(::Numeric) || JUST_A_NUMBER === v
        end

        # Converts +v+ to a number. Strings containing '.', 'e' or 'E' are
        # parsed as Float (JUST_A_NUMBER allows both exponent cases, and
        # String#to_i would silently stop at the exponent marker); other
        # strings are parsed as Integer; non-strings are used as they are.
        #
        # Returns the number, or a '%.10e'-formatted String when it exceeds 1e10.
        def numify(v)
          num =
            if v.is_a?(String)
              v =~ /[.eE]/ ? v.to_f : v.to_i
            else
              v
            end
          num > 1e10 ? ('%.10e' % num) : num
        end
      end

      # Returns the numified value of +v+; nil counts as 0.
      #
      # Raises ArgumentError when +v+ is neither numeric, a numeric string, nor nil.
      def value(v)
        case v
        when Numeric, JUST_A_NUMBER
          Number.numify v
        when NilClass
          0
        else
          raise ArgumentError, "Can't embed #{v.inspect} in number feature #{k.inspect}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'vector_embed/maker'
|
2
|
+
|
3
|
+
class VectorEmbed
  class Maker
    # Catch-all maker: treats the whole (preprocessed) value as one category.
    class Phrase < Maker
      # Accepts every value.
      def self.want?(k, v, parent)
        true
      end

      # Returns a single [index, 1] pair whose index is derived from k and the
      # string form of +v+ after the parent's preprocessing.
      def pairs(v)
        cleaned = parent.preprocess(v.to_s)
        [ [ Maker.index(k, cleaned), 1 ] ]
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'vector_embed'
|
2
|
+
|
3
|
+
class VectorEmbed
  # One stop word, compiled to a case-insensitive, word-bounded pattern.
  class StopWord
    class << self
      # Returns a copy of +str+ with every stop word removed, runs of
      # whitespace collapsed to a single space and the ends stripped.
      # +str+ itself is not modified.
      #
      # stop_words - Array of StopWord instances.
      def remove(stop_words, str)
        memo = str.dup
        stop_words.each { |stop_word| stop_word.apply! memo }
        memo.gsub!(/\s+/, ' ')
        memo.strip!
        memo
      end
    end

    # raw_stop_word - the word to filter. Anything that is not already a
    #                 Regexp is escaped, so characters such as '.' or '+' are
    #                 matched literally instead of being read as regex syntax.
    #                 A Regexp is embedded as given.
    def initialize(raw_stop_word)
      source = raw_stop_word.is_a?(Regexp) ? raw_stop_word : Regexp.escape(raw_stop_word.to_s)
      @pattern = /\s*\b#{source}\b\s*/i
    end

    # Replaces each occurrence (with its surrounding whitespace) in +str+ by a
    # single space, in place. Returns nil when nothing matched (String#gsub!).
    def apply!(str)
      str.gsub! @pattern, ' '
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
+
# loaded once.
|
5
|
+
#
|
6
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
|
+
RSpec.configure do |config|
|
8
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
9
|
+
config.run_all_when_everything_filtered = true
|
10
|
+
config.filter_run :focus
|
11
|
+
|
12
|
+
# Run specs in random order to surface order dependencies. If you find an
|
13
|
+
# order dependency and want to debug it, you can fix the order by providing
|
14
|
+
# the seed, which is printed after each run.
|
15
|
+
# --seed 1234
|
16
|
+
config.order = 'random'
|
17
|
+
end
|
18
|
+
|
19
|
+
require 'pry'
|
20
|
+
|
21
|
+
require 'vector_embed'
|
@@ -0,0 +1,219 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe VectorEmbed do
|
4
|
+
describe 'in labels' do
|
5
|
+
it "stores true/false as 1/0" do
|
6
|
+
v = VectorEmbed.new
|
7
|
+
v.line(true).should == '1'
|
8
|
+
v.line(false).should == '0'
|
9
|
+
v.line('true').should == '1'
|
10
|
+
v.line('false').should == '0'
|
11
|
+
end
|
12
|
+
|
13
|
+
it "stores numbers as numbers" do
|
14
|
+
v = VectorEmbed.new
|
15
|
+
v.line(5.4).should == '5.4'
|
16
|
+
v.line(-3.9).should == '-3.9'
|
17
|
+
end
|
18
|
+
|
19
|
+
it "doesn't allow strings" do
|
20
|
+
v = VectorEmbed.new
|
21
|
+
lambda { v.line('foo') }.should raise_error(/string.*label/i)
|
22
|
+
end
|
23
|
+
|
24
|
+
it "doesn't allow mixing" do
|
25
|
+
v = VectorEmbed.new
|
26
|
+
v.line(5.4)
|
27
|
+
lambda { v.line(true) }.should raise_error(/Can't embed.*number/)
|
28
|
+
v = VectorEmbed.new
|
29
|
+
v.line(true)
|
30
|
+
lambda { v.line(5.4) }.should raise_error(/Can't embed.*boolean/)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
# aka dimension indexes
|
35
|
+
describe 'in feature keys' do
|
36
|
+
it "stores values as their string equivalents" do
|
37
|
+
v = VectorEmbed.new
|
38
|
+
v.line(1, 1 => 9).should == "1 #{l_h('1')}:9"
|
39
|
+
v.line(1, 5.4 => 9).should == "1 #{l_h('5.4')}:9"
|
40
|
+
v.line(1, '5.4' => 9).should == "1 #{l_h('5.4')}:9"
|
41
|
+
v.line(1, '5.4 ' => 9).should == "1 #{l_h('5.4 ')}:9"
|
42
|
+
v.line(1, 'foo' => 9).should == "1 #{l_h('foo')}:9"
|
43
|
+
v.line(1, 'foo bar' => 9).should == "1 #{l_h('foo bar')}:9"
|
44
|
+
v.line(1, true => 9).should == "1 #{l_h('true')}:9"
|
45
|
+
v.line(1, 'true' => 9).should == "1 #{l_h('true')}:9"
|
46
|
+
v.line(1, false => 9).should == "1 #{l_h('false')}:9"
|
47
|
+
v.line(1, 'false' => 9).should == "1 #{l_h('false')}:9"
|
48
|
+
end
|
49
|
+
|
50
|
+
it "treats nil as a blank string" do
|
51
|
+
v = VectorEmbed.new
|
52
|
+
v.line(1, nil => 9).should == "1 #{l_h('')}:9"
|
53
|
+
end
|
54
|
+
|
55
|
+
it "leaves whitespace alone" do
|
56
|
+
v = VectorEmbed.new
|
57
|
+
v.line(1, '' => 9).should == "1 #{l_h('')}:9"
|
58
|
+
v.line(1, ' ' => 9).should == "1 #{l_h(' ')}:9"
|
59
|
+
v.line(1, ' ' => 9).should == "1 #{l_h(' ')}:9"
|
60
|
+
v.line(1, ' foo ' => 9).should == "1 #{l_h(' foo ')}:9"
|
61
|
+
v.line(1, '5.4 ' => 9).should == "1 #{l_h('5.4 ')}:9"
|
62
|
+
end
|
63
|
+
|
64
|
+
it "orders feature names" do
|
65
|
+
v = VectorEmbed.new
|
66
|
+
v.line(1, 1 => 3, 2 => 7).should == "1 #{l_h('2')}:7 #{l_h('1')}:3"
|
67
|
+
end
|
68
|
+
|
69
|
+
it "allows mixed string and number feature values" do
|
70
|
+
v = VectorEmbed.new
|
71
|
+
v.line(1, a: :b).should == "1 #{l_h("a\x00b")}:1"
|
72
|
+
v.line(1, a: 13).should == "1 #{l_h("a\x0013")}:1"
|
73
|
+
v.line(1, 1 => 9).should == "1 #{l_h('1')}:9" # 9 is not hashed, 1 is
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
describe 'feature values' do
|
78
|
+
it "stores true/false/nil as (1,0,0)/(0,1,0)/(0,0,1)" do
|
79
|
+
v = VectorEmbed.new
|
80
|
+
v.line(1, 1 => true).should == "1 #{l_h("1\x00true")}:1"
|
81
|
+
v.line(1, 1 => 'true').should == "1 #{l_h("1\x00true")}:1"
|
82
|
+
v.line(1, 1 => false).should == "1 #{l_h("1\x00false")}:1"
|
83
|
+
v.line(1, 1 => 'false').should == "1 #{l_h("1\x00false")}:1"
|
84
|
+
v.line(1, 1 => nil).should == "1 #{l_h("1\x00null")}:1"
|
85
|
+
v.line(1, 1 => 'null').should == "1 #{l_h("1\x00null")}:1"
|
86
|
+
end
|
87
|
+
|
88
|
+
it "stores numbers as numbers" do
|
89
|
+
v = VectorEmbed.new
|
90
|
+
v.line(1, 1 => 9).should == "1 #{l_h('1')}:9"
|
91
|
+
v.line(1, 1 => '9').should == "1 #{l_h('1')}:9"
|
92
|
+
v.line(1, 1 => 5.4).should == "1 #{l_h('1')}:5.4"
|
93
|
+
v.line(1, 1 => '5.4').should == "1 #{l_h('1')}:5.4"
|
94
|
+
v.line(1, 1 => 9e9).should == "1 #{l_h('1')}:9000000000.0"
|
95
|
+
v.line(1, 1 => '9e9').should == "1 #{l_h('1')}:9000000000.0"
|
96
|
+
end
|
97
|
+
|
98
|
+
it "stores strings as m-category attributes" do
|
99
|
+
v = VectorEmbed.new
|
100
|
+
v.line(1, 1 => 'sfh').should == "1 #{l_h("1\x00sfh")}:1"
|
101
|
+
v.line(1, 1 => 'mfh').should == "1 #{l_h("1\x00mfh")}:1"
|
102
|
+
v.line(1, 1 => 'foo bar').should == "1 #{l_h("1\x00foo bar")}:1"
|
103
|
+
v.line(1, 1 => 'foo bar ').should == "1 #{l_h("1\x00foo bar")}:1"
|
104
|
+
v.line(1, 1 => ' foo bar ').should == "1 #{l_h("1\x00foo bar")}:1"
|
105
|
+
end
|
106
|
+
|
107
|
+
it "in string mode, treats true/false/nil as strings" do
|
108
|
+
v = VectorEmbed.new
|
109
|
+
v.line(1, 1 => 'foo').should == "1 #{l_h("1\x00foo")}:1"
|
110
|
+
v.line(1, 1 => true).should == "1 #{l_h("1\x00true")}:1"
|
111
|
+
v.line(1, 1 => false).should == "1 #{l_h("1\x00false")}:1"
|
112
|
+
v.line(1, 1 => nil).should == "1 #{l_h("1\x00")}:1"
|
113
|
+
end
|
114
|
+
|
115
|
+
it "in string mode, treats numbers as strings" do
|
116
|
+
v = VectorEmbed.new
|
117
|
+
v.line(1, 1 => 'foo').should == "1 #{l_h("1\x00foo")}:1"
|
118
|
+
v.line(1, 1 => 1).should == "1 #{l_h("1\x001")}:1"
|
119
|
+
v.line(1, 1 => 5.4).should == "1 #{l_h("1\x005.4")}:1"
|
120
|
+
v.line(1, 1 => 9e9).should == "1 #{l_h("1\x00" + 9e9.to_s)}:1"
|
121
|
+
end
|
122
|
+
|
123
|
+
it "flattens and stores arrays" do
|
124
|
+
v = VectorEmbed.new
|
125
|
+
v.line(1, 'foo' => [7,13,19]).should == sortme("1 #{l_h("foo\x001")}:13 #{l_h("foo\x000")}:7 #{l_h("foo\x002")}:19")
|
126
|
+
v.line(1, 'bar' => ['a','b','c']).should == sortme("1 #{l_h("bar\x001\x00b")}:1 #{l_h("bar\x000\x00a")}:1 #{l_h("bar\x002\x00c")}:1")
|
127
|
+
end
|
128
|
+
|
129
|
+
it "in number mode, treats null as 0" do
|
130
|
+
v = VectorEmbed.new
|
131
|
+
v.line(1, 1 => 9).should == "1 #{l_h('1')}:9"
|
132
|
+
v.line(1, 1 => nil).should == "1 #{l_h('1')}:0"
|
133
|
+
end
|
134
|
+
|
135
|
+
it "doesn't allow embedding boolean in number mode or vice-versa" do
|
136
|
+
v = VectorEmbed.new
|
137
|
+
v.line(1, 1 => true)
|
138
|
+
v.line(1, 2 => 5.4) # that's fine, different dimension
|
139
|
+
lambda { v.line(1, 1 => 5.4) }.should raise_error(ArgumentError)
|
140
|
+
v = VectorEmbed.new
|
141
|
+
v.line(1, 1 => 5.4)
|
142
|
+
v.line(1, 2 => true) # that's fine, diff dim
|
143
|
+
lambda { v.line(1, 1 => true) }.should raise_error(ArgumentError)
|
144
|
+
end
|
145
|
+
|
146
|
+
it "doesn't allow embedding string in number mode" do
|
147
|
+
v = VectorEmbed.new
|
148
|
+
v.line(1, 1 => 9)
|
149
|
+
v.line(1, 2 => 'foo') # that's fine, different dimension
|
150
|
+
lambda { v.line(1, 1 => 'foo') }.should raise_error(ArgumentError)
|
151
|
+
end
|
152
|
+
|
153
|
+
it "uses scientific notation for large numbers" do
|
154
|
+
v = VectorEmbed.new
|
155
|
+
v.line(5, 1 => 8.12e13).should == "5 #{l_h('1')}:8.1200000000e+13"
|
156
|
+
end
|
157
|
+
|
158
|
+
it "detects numbers in strings" do
|
159
|
+
v = VectorEmbed.new
|
160
|
+
v.line(5, 1 => '8.12e13').should == "5 #{l_h('1')}:8.1200000000e+13"
|
161
|
+
end
|
162
|
+
|
163
|
+
it "allows 2 byte n-grams" do
|
164
|
+
v = VectorEmbed.new ngram_len: 2, ngram_delim: ''
|
165
|
+
v.line(1, 1 => 'foo').should == sortme("1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1")
|
166
|
+
v.line(1, 1 => 'bar').should == sortme("1 #{l_h("1\x00ngram\x00ar")}:1 #{l_h("1\x00ngram\x00ba")}:1")
|
167
|
+
v.line(1, 1 => 'baz').should == sortme("1 #{l_h("1\x00ngram\x00az")}:1 #{l_h("1\x00ngram\x00ba")}:1")
|
168
|
+
v.line(1, 1 => 'foobar').should == sortme("1 #{l_h("1\x00ngram\x00ar")}:1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("1\x00ngram\x00ob")}:1 #{l_h("1\x00ngram\x00ba")}:1")
|
169
|
+
v.line(1, 1 => 'foo bar').should == sortme("1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("1\x00ngram\x00o ")}:1 #{l_h("1\x00ngram\x00 b")}:1 #{l_h("1\x00ngram\x00ba")}:1 #{l_h("1\x00ngram\x00ar")}:1")
|
170
|
+
end
|
171
|
+
|
172
|
+
it "allows word-grams" do
|
173
|
+
v = VectorEmbed.new ngram_len: 1, ngram_delim: /\s+/
|
174
|
+
v.line(1, 1 => 'foo').should == sortme("1 #{l_h("1\x00ngram\x00foo")}:1")
|
175
|
+
v.line(1, 1 => 'foobar').should == sortme("1 #{l_h("1\x00ngram\x00foobar")}:1")
|
176
|
+
v.line(1, 1 => 'foo bar').should == sortme("1 #{l_h("1\x00ngram\x00bar")}:1 #{l_h("1\x00ngram\x00foo")}:1")
|
177
|
+
end
|
178
|
+
|
179
|
+
it "allows 2 byte n-grams with stop words" do
|
180
|
+
v = VectorEmbed.new ngram_len: 2, ngram_delim: '', stop_words: %w{the and or}
|
181
|
+
v.line(1, 1 => 'foo or').should == sortme("1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1")
|
182
|
+
v.line(1, 1 => 'the bar').should == sortme("1 #{l_h("1\x00ngram\x00ar")}:1 #{l_h("1\x00ngram\x00ba")}:1")
|
183
|
+
v.line(1, 1 => 'and baz').should == sortme("1 #{l_h("1\x00ngram\x00az")}:1 #{l_h("1\x00ngram\x00ba")}:1")
|
184
|
+
v.line(1, 1 => 'foobar or the and').should == sortme("1 #{l_h("1\x00ngram\x00ar")}:1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("1\x00ngram\x00ob")}:1 #{l_h("1\x00ngram\x00ba")}:1")
|
185
|
+
v.line(1, 1 => 'foo or and the bar').should == sortme("1 #{l_h("1\x00ngram\x00 b")}:1 #{l_h("1\x00ngram\x00ar")}:1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("1\x00ngram\x00o ")}:1 #{l_h("1\x00ngram\x00ba")}:1")
|
186
|
+
end
|
187
|
+
|
188
|
+
it "allows word-grams with stop words" do
|
189
|
+
v = VectorEmbed.new ngram_len: 1, ngram_delim: /\s+/, stop_words: %w{the and or}
|
190
|
+
v.line(1, 1 => 'foo or').should == "1 #{l_h("1\x00ngram\x00foo")}:1"
|
191
|
+
v.line(1, 1 => 'foo the bar').should == "1 #{l_h("1\x00ngram\x00bar")}:1 #{l_h("1\x00ngram\x00foo")}:1"
|
192
|
+
v.line(1, 1 => 'foo bar and').should == "1 #{l_h("1\x00ngram\x00bar")}:1 #{l_h("1\x00ngram\x00foo")}:1"
|
193
|
+
end
|
194
|
+
|
195
|
+
it "doesn't do anything weird when you have multiple features" do
|
196
|
+
v = VectorEmbed.new
|
197
|
+
v.line(1, 1 => 'foo', 2 => 'bar', 'baz' => 'zoo').should == sortme("1 #{l_h("1\x00foo")}:1 #{l_h("2\x00bar")}:1 #{l_h("baz\x00zoo")}:1")
|
198
|
+
end
|
199
|
+
|
200
|
+
end
|
201
|
+
|
202
|
+
private
|
203
|
+
|
204
|
+
# MurmurHash3::V32.str_hash of +v+.
def h(v)
  MurmurHash3::V32.str_hash(v)
end

# for labels: the first 7 characters of the hash's decimal form, as an Integer
def l_h(v)
  h(v).to_s[0, 7].to_i
end

# Re-orders the "index:value" parts of an expected line by numeric index,
# leaving the leading label in place.
def sortme(line)
  label, *rest = line.split(' ')
  features = rest.map { |part| part.split(':') }
                 .sort_by { |index, _| index.to_i }
                 .map { |index, value| "#{index}:#{value}" }
  [label, *features].join(' ')
end
|
219
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/vector_embed/version', __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |gem|
  gem.name          = "vector_embed"
  gem.version       = VectorEmbed::VERSION
  gem.authors       = ["Seamus Abshere"]
  gem.email         = ["seamus@abshere.net"]
  gem.description   = %q{Vector embedding of strings, booleans, numerics, and arrays into LIBSVM / LIBLINEAR format.}
  gem.summary       = %q{Vector embedding of strings, booleans, numerics, and arrays into LIBSVM / LIBLINEAR format.}
  gem.homepage      = "https://github.com/seamusabshere/vector_embed"
  # The package ships an MIT LICENSE file; declare it so the built gem's
  # metadata no longer reports an empty license list.
  gem.license       = "MIT"

  # Package everything tracked by git; executables and test files are
  # derived from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency 'murmurhash3'

  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'pry'
  gem.add_development_dependency 'yard'
end
|
metadata
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: vector_embed
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Seamus Abshere
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-02-20 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: murmurhash3
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rspec
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: pry
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: yard
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
description: Vector embedding of strings, booleans, numerics, and arrays into LIBSVM
|
79
|
+
/ LIBLINEAR format.
|
80
|
+
email:
|
81
|
+
- seamus@abshere.net
|
82
|
+
executables:
|
83
|
+
- vector_embed
|
84
|
+
extensions: []
|
85
|
+
extra_rdoc_files: []
|
86
|
+
files:
|
87
|
+
- .gitignore
|
88
|
+
- .rspec
|
89
|
+
- CHANGELOG
|
90
|
+
- Gemfile
|
91
|
+
- LICENSE
|
92
|
+
- LICENSE.txt
|
93
|
+
- README.md
|
94
|
+
- Rakefile
|
95
|
+
- bin/vector_embed
|
96
|
+
- lib/vector_embed.rb
|
97
|
+
- lib/vector_embed/maker.rb
|
98
|
+
- lib/vector_embed/maker/boolean.rb
|
99
|
+
- lib/vector_embed/maker/ngram.rb
|
100
|
+
- lib/vector_embed/maker/number.rb
|
101
|
+
- lib/vector_embed/maker/phrase.rb
|
102
|
+
- lib/vector_embed/stop_word.rb
|
103
|
+
- lib/vector_embed/version.rb
|
104
|
+
- spec/spec_helper.rb
|
105
|
+
- spec/vector_embed_spec.rb
|
106
|
+
- vector_embed.gemspec
|
107
|
+
homepage: https://github.com/seamusabshere/vector_embed
|
108
|
+
licenses: []
|
109
|
+
post_install_message:
|
110
|
+
rdoc_options: []
|
111
|
+
require_paths:
|
112
|
+
- lib
|
113
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
114
|
+
none: false
|
115
|
+
requirements:
|
116
|
+
- - ! '>='
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
version: '0'
|
119
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
120
|
+
none: false
|
121
|
+
requirements:
|
122
|
+
- - ! '>='
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
requirements: []
|
126
|
+
rubyforge_project:
|
127
|
+
rubygems_version: 1.8.25
|
128
|
+
signing_key:
|
129
|
+
specification_version: 3
|
130
|
+
summary: Vector embedding of strings, booleans, numerics, and arrays into LIBSVM /
|
131
|
+
LIBLINEAR format.
|
132
|
+
test_files:
|
133
|
+
- spec/spec_helper.rb
|
134
|
+
- spec/vector_embed_spec.rb
|
135
|
+
has_rdoc:
|