vector_embed 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/CHANGELOG +3 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/LICENSE.txt +22 -0
- data/README.md +111 -0
- data/Rakefile +8 -0
- data/bin/vector_embed +3 -0
- data/lib/vector_embed.rb +66 -0
- data/lib/vector_embed/maker.rb +46 -0
- data/lib/vector_embed/maker/boolean.rb +42 -0
- data/lib/vector_embed/maker/ngram.rb +39 -0
- data/lib/vector_embed/maker/number.rb +37 -0
- data/lib/vector_embed/maker/phrase.rb +18 -0
- data/lib/vector_embed/stop_word.rb +24 -0
- data/lib/vector_embed/version.rb +3 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/vector_embed_spec.rb +219 -0
- data/vector_embed.gemspec +23 -0
- metadata +135 -0
data/.gitignore
ADDED
data/.rspec
ADDED
data/CHANGELOG
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Seamus Abshere
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Seamus Abshere
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
# VectorEmbed
|
2
|
+
|
3
|
+
Vector embedding of strings, booleans, numerics, and arrays into [LIBSVM](http://www.csie.ntu.edu.tw/~cjlin/libsvm/) / [LIBLINEAR](http://www.csie.ntu.edu.tw/~cjlin/liblinear/) format.
|
4
|
+
|
5
|
+
Inspired by [Sally](http://www.mlsec.org/sally/), except `VectorEmbed` is meant to handle categorical and continuous data at the same time.
|
6
|
+
|
7
|
+
## Usage
|
8
|
+
|
9
|
+
Create a `VectorEmbed` instance, which auto-detects and then remembers what kind of data goes into each feature:
|
10
|
+
|
11
|
+
>> require 'vector_embed'
|
12
|
+
=> true
|
13
|
+
>> v = VectorEmbed.new
|
14
|
+
=> #<VectorEmbed:0x007fd605815208 [...]>
|
15
|
+
|
16
|
+
Output a line with a label and arbitrary features:
|
17
|
+
|
18
|
+
>> label = 1
|
19
|
+
=> 1
|
20
|
+
>> features = { color: 'red', year: 1995, weight: 5.4e9 }
|
21
|
+
=> {:color=>"red", :year=>1995, :weight=>5400000000.0}
|
22
|
+
>> v.line(label, features)
|
23
|
+
=> "1 1997960:1 5556418:5400000000.0 8227451:1995"
|
24
|
+
|
25
|
+
Output another line:
|
26
|
+
|
27
|
+
>> label = 0
|
28
|
+
=> 0
|
29
|
+
>> features = { color: 'blue', year: 1821, weight: 3.3 }
|
30
|
+
=> {:color=>"blue", :year=>1821, :weight=>3.3}
|
31
|
+
>> v.line(label, features)
|
32
|
+
=> "0 1089740:1 5556418:3.3 8227451:1821"
|
33
|
+
|
34
|
+
Note that `color: 'red'` and `color: 'blue'` are being translated into categories:
|
35
|
+
|
36
|
+
1997960:1 # murmur3("color\x00red"):1
|
37
|
+
1089740:1 # murmur3("color\x00blue"):1
|
38
|
+
|
39
|
+
A similar thing happens with `true`/`false`:
|
40
|
+
|
41
|
+
>> v.line(1, yes: true, no: false)
|
42
|
+
=> "1 1559987:1 3324244:1"
|
43
|
+
|
44
|
+
i.e.
|
45
|
+
|
46
|
+
1559987:1 # murmur3("yes\x00true"):1
|
47
|
+
3324244:1 # murmur3("no\x00false"):1
|
48
|
+
|
49
|
+
## N-grams
|
50
|
+
|
51
|
+
Currently uses the same parameter names as [Sally](http://www.mlsec.org/sally/manual.html).
|
52
|
+
|
53
|
+
### Word ngrams
|
54
|
+
|
55
|
+
>> v = VectorEmbed.new ngram_len: 1, ngram_delim: /\s+/, stop_words: %w{the and or}
|
56
|
+
=> #<VectorEmbed:0x007fd6033b77f8 [...]>
|
57
|
+
>> v.line(1, notes: 'the quick brown fox')
|
58
|
+
=> "1 1512788:1 3426202:1 5079692:1"
|
59
|
+
|
60
|
+
You get the idea (note that "the" has been filtered out as a stop word):
|
61
|
+
|
62
|
+
1512788:1 # murmur3("notes\x00ngram\x00quick"):1
|
63
|
+
3426202:1 # murmur3("notes\x00ngram\x00brown"):1
|
64
|
+
5079692:1 # murmur3("notes\x00ngram\x00fox"):1
|
65
|
+
|
66
|
+
### Byte n-grams
|
67
|
+
|
68
|
+
>> v = VectorEmbed.new ngram_len: 2, ngram_delim: ''
|
69
|
+
=> #<VectorEmbed:0x007fd60337ea20 [...]>
|
70
|
+
>> v.line(1, notes: 'foobar')
|
71
|
+
=> "1 2148745:1 2878919:1 3600333:1 3621715:1 5885921:1"
|
72
|
+
|
73
|
+
So therefore:
|
74
|
+
|
75
|
+
2148745:1 # murmur3("notes\x00ngram\x00fo"):1
|
76
|
+
2878919:1 # murmur3("notes\x00ngram\x00oo"):1
|
77
|
+
3600333:1 # murmur3("notes\x00ngram\x00ob"):1
|
78
|
+
3621715:1 # murmur3("notes\x00ngram\x00ba"):1
|
79
|
+
5885921:1 # murmur3("notes\x00ngram\x00ar"):1
|
80
|
+
|
81
|
+
## Debugging
|
82
|
+
|
83
|
+
`VectorEmbed` tries to do the right thing, but if it doesn't, try turning on debugging:
|
84
|
+
|
85
|
+
>> v = VectorEmbed.new
|
86
|
+
=> #<VectorEmbed:0x007fd6034020a0 [...]>
|
87
|
+
>> v.logger.level = Logger::DEBUG
|
88
|
+
=> 0
|
89
|
+
>> v.line(1, '3' => 7, foo: 'bar', truthy: false, nullity: nil)
|
90
|
+
D, [2013-02-20T16:55:00.139299 #21595] DEBUG -- : Interpreting "3" as Number given first value 7
|
91
|
+
D, [2013-02-20T16:55:00.139561 #21595] DEBUG -- : Interpreting :foo as Phrase given first value "bar"
|
92
|
+
D, [2013-02-20T16:55:00.139671 #21595] DEBUG -- : Interpreting :truthy as Boolean given first value false
|
93
|
+
D, [2013-02-20T16:55:00.139755 #21595] DEBUG -- : Interpreting :nullity as Boolean given first value nil
|
94
|
+
D, [2013-02-20T16:55:00.139872 #21595] DEBUG -- : Interpreting "label" as Number given first value 1
|
95
|
+
=> "1 2647413:7 4091306:1 7123386:1 9259635:1"
|
96
|
+
|
97
|
+
One thing it doesn't like (assuming you have already run the lines above):
|
98
|
+
|
99
|
+
>> v.line(1, '3' => 'bar')
|
100
|
+
ArgumentError: Can't embed "bar" in number feature "3".
|
101
|
+
|
102
|
+
It's saying that, given you first passed it `7`, it thought `"3"` was a feature that held numbers.
|
103
|
+
|
104
|
+
## Gotchas
|
105
|
+
|
106
|
+
* Following Sally, it truncates the murmur hash for feature indices (nominally to 22 bits; `Maker.index` currently keeps the first 7 decimal digits of the hash)... more and LIBSVM seems to choke.
|
107
|
+
* Stop words are currently filtered out of feature indices... probably shouldn't be.
|
108
|
+
|
109
|
+
## Copyright
|
110
|
+
|
111
|
+
Copyright 2013 Seamus Abshere
|
data/Rakefile
ADDED
data/bin/vector_embed
ADDED
data/lib/vector_embed.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
require 'vector_embed/version'
|
4
|
+
require 'vector_embed/maker'
|
5
|
+
|
6
|
+
require 'vector_embed/stop_word'
|
7
|
+
|
8
|
+
# Builds "label index:value index:value ..." lines from a label and a hash of
# features.
#
# The first value seen for the label, and for each feature key, decides which
# Maker class handles it; that maker is then cached on the instance and reused
# for every later call.
class VectorEmbed
  # http://stackoverflow.com/questions/638565/parsing-scientific-notation-sensibly
  JUST_A_NUMBER = /\A\s*[+\-]?(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s*\z/
  # Matches strings that are empty or contain only whitespace.
  BLANK = /\A\s*\z/
  # Separator used when several key parts are joined before hashing.
  NULL_BYTE = "\x00"

  attr_reader :options
  attr_accessor :logger

  # @param options [Hash] settings read elsewhere in the library via #options
  #   (:logger, :stop_words, :ngram_len, :ngram_delim). A copy is stored.
  def initialize(options = {})
    @mutex = Mutex.new
    @feature_makers = {}
    @logger = options[:logger] || default_logger
    @options = options.dup
  end

  # Renders one output line.
  #
  # Array feature values are expanded element by element; each element gets
  # its own maker, keyed by the feature key and the element position joined
  # with NULL_BYTE.
  #
  # @param label [Object] the label; converted by the cached label maker
  # @param features [Hash] feature key => value (or Array of values)
  # @return [String] the label followed by "index:value" pairs sorted by index
  def line(label, features = {})
    collected = []
    features.each do |key, val|
      if val.is_a?(Array)
        val.each_with_index do |element, position|
          collected.concat feature_maker([key, position].join(NULL_BYTE), element).pairs(element)
        end
      else
        collected.concat feature_maker(key, val).pairs(val)
      end
    end
    rendered = collected.compact.sort_by { |index, _| index }.map { |pair| pair.join(':') }
    [label_maker(label).value(label), *rendered].join(' ')
  end

  # Strips the configured stop words from +v+ (see StopWord.remove).
  def preprocess(v)
    StopWord.remove(stop_words, v)
  end

  private

  # Logger used when the caller did not supply one: $stderr at INFO level.
  def default_logger
    fallback = Logger.new($stderr)
    fallback.level = Logger::INFO
    fallback
  end

  # StopWord objects built once from options[:stop_words] (default: none).
  def stop_words
    @stop_words ||= options.fetch(:stop_words, []).map { |raw| StopWord.new(raw) }
  end

  # Maker for labels, chosen from Boolean or Number on first use.
  def label_maker(label)
    return @label_maker if @label_maker

    @mutex.synchronize do
      @label_maker ||= Maker.pick([Maker::Boolean, Maker::Number], 'label', label, self)
    end
  end

  # Maker for feature key +k+, chosen on first use from the value +v+.
  def feature_maker(k, v)
    cached = @feature_makers[k]
    return cached if cached

    @mutex.synchronize do
      @feature_makers[k] ||= Maker.pick([Maker::Boolean, Maker::Number, Maker::Ngram, Maker::Phrase], k, v, self)
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'vector_embed/maker/phrase'
|
2
|
+
require 'vector_embed/maker/ngram'
|
3
|
+
require 'vector_embed/maker/number'
|
4
|
+
require 'vector_embed/maker/boolean'
|
5
|
+
|
6
|
+
require 'murmurhash3'
|
7
|
+
|
8
|
+
class VectorEmbed
  # Base class for the objects that turn one feature (or the label) into
  # [index, value] pairs. Subclasses provide +want?+ and either +value+ or
  # their own +pairs+.
  class Maker
    class << self
      # Returns a new instance of the first class in +choices+ whose +want?+
      # accepts the key and first value.
      #
      # @param choices [Array<Class>] candidate maker classes, in priority order
      # @param k [Object] feature key (or 'label')
      # @param first_v [Object] first value seen for that key
      # @param parent [VectorEmbed] owner; supplies the logger
      # @raise [RuntimeError] when no candidate wants the value
      def pick(choices, k, first_v, parent)
        chosen = choices.find { |candidate| candidate.want?(k, first_v, parent) }
        unless chosen
          raise "Can't use #{first_v.class} for #{k.inspect} given #{first_v.inspect} and choices #{choices.inspect}"
        end

        parent.logger.debug { "Interpreting #{k.inspect} as #{chosen.name.split('::').last} given first value #{first_v.inspect}" }
        chosen.new(k, parent)
      end

      # Joins +parts+ with NULL_BYTE, hashes the result with MurmurHash3 and
      # keeps the first seven decimal digits of the hash as an Integer.
      def index(*parts)
        digest = MurmurHash3::V32.str_hash(parts.join(NULL_BYTE))
        digest.to_s[0..6].to_i
      end
    end

    attr_reader :parent
    attr_reader :k

    def initialize(k, parent)
      @k = k
      @parent = parent
    end

    # Default pair builder: one [index, value] pair for a scalar, or one pair
    # per element (indexed by key and position) for an Array. Relies on the
    # subclass's +value+.
    def pairs(v)
      return [[Maker.index(k), value(v)]] unless v.is_a?(Array)

      v.each_with_index.map do |element, position|
        [Maker.index(k, position), value(element)]
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'vector_embed/maker'
|
2
|
+
|
3
|
+
class VectorEmbed
  class Maker
    # Maker for true / false / null values.
    #
    # Note that +want?+ only recognises the strings 'true', 'false' and
    # 'null', while +value+ and +pairs+ additionally accept 't', 'yes', 'on',
    # 'f', 'no' and 'off' once the maker has been chosen.
    class Boolean < Maker
      TRUE_VALUES = [TrueClass, 'true', 't', 'yes', 'on'].freeze
      FALSE_VALUES = [FalseClass, 'false', 'f', 'no', 'off'].freeze

      # True when the first value is nil, true, false or one of the strings
      # 'true', 'false', 'null'.
      def self.want?(k, v, parent)
        case v
        when NilClass, TrueClass, FalseClass, 'true', 'false', 'null' then true
        else false
        end
      end

      # Scalar form: 1 for true-ish, 0 for false-ish.
      #
      # @raise [RuntimeError] for anything else, including nil and 'null'
      def value(v)
        case v
        when *TRUE_VALUES then 1
        when *FALSE_VALUES then 0
        else
          raise "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
        end
      end

      # Category form: a single pair whose index is hashed from the key plus
      # 'true', 'false' or 'null', always with value 1.
      #
      # @raise [ArgumentError] when +v+ is not a recognised boolean/null value
      def pairs(v)
        category =
          case v
          when *TRUE_VALUES then 'true'
          when *FALSE_VALUES then 'false'
          # BLANK is looked up here, at call time, not while the class loads.
          when NilClass, 'null', BLANK then 'null'
          else
            raise ArgumentError, "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
          end
        [[Maker.index(k, category), 1]]
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'vector_embed/maker'
|
2
|
+
|
3
|
+
class VectorEmbed
  class Maker
    # Maker that splits a string into n-grams and emits one category pair per
    # n-gram. It is selected whenever the parent has an :ngram_len option.
    class Ngram < Maker
      class << self
        # Truthy when the parent was configured with :ngram_len.
        def want?(k, v, parent)
          parent.options[:ngram_len]
        end
      end

      attr_reader :len
      attr_reader :delim

      # @raise [ArgumentError] when :ngram_len does not convert to a positive integer
      def initialize(k, parent)
        super
        @len = parent.options[:ngram_len].to_i
        raise ArgumentError, ":ngram_len must be > 0" unless @len > 0
        @delim = parent.options[:ngram_delim]
      end

      # Returns a flat list of [index, 1] pairs, one per n-gram of +v+ after
      # stop-word preprocessing.
      #
      # Earlier each pair was wrapped in an extra array. That only rendered
      # correctly because Array#join flattens, and it made VectorEmbed#line
      # fail when sorting n-gram pairs together with pairs from other makers
      # (Array vs Integer comparison). The pairs now have the same shape as
      # every other maker's.
      #
      # @param v [String] the text to split
      # @return [Array<Array(Integer, Integer)>]
      # @raise [RuntimeError] when +v+ is not a String, or when len > 1 is
      #   combined with a non-empty delimiter (word n-grams are unsupported)
      def pairs(v)
        raise "Ngram can't handle #{v.inspect}, only a single string for now" unless v.is_a?(String)

        text = parent.preprocess(v)
        ngrams =
          if len == 1
            # word mode
            text.split(delim)
          elsif delim == ''
            # byte mode: every run of +len+ consecutive characters
            text.each_char.each_cons(len).map(&:join)
          else
            raise "Word n-gram not supported yet"
          end
        ngrams.map { |ngram| [Maker.index(k, 'ngram', ngram), 1] }
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'vector_embed/maker'
|
2
|
+
|
3
|
+
class VectorEmbed
  class Maker
    # Maker for numeric values, given either as Numeric objects or as strings
    # that match JUST_A_NUMBER.
    class Number < Maker
      class << self
        # Truthy when +v+ is Numeric or a string matching JUST_A_NUMBER.
        #
        # Regexp#=== is used instead of =~ so that arbitrary objects (which no
        # longer respond to =~ on recent Rubies) are simply not wanted rather
        # than raising NoMethodError.
        def want?(k, v, parent)
          v.is_a?(::Numeric) || JUST_A_NUMBER === v
        end

        # Converts +v+ to a number. Strings containing a decimal point or an
        # exponent marker become Floats, other strings become Integers, and
        # non-strings pass through unchanged.
        #
        # The exponent check accepts both 'e' and 'E': JUST_A_NUMBER allows
        # either, and checking only the lowercase form turned "9E9" into 9.
        #
        # @return [Numeric, String] the number, or a '%.10e' formatted string
        #   when it is greater than 1e10
        def numify(v)
          num =
            if v.is_a?(String)
              v =~ /[.eE]/ ? v.to_f : v.to_i
            else
              v
            end
          num > 1e10 ? ('%.10e' % num) : num
        end
      end

      # @return [Numeric, String] numified value; nil is embedded as 0
      # @raise [ArgumentError] when +v+ is neither numeric, number-like nor nil
      def value(v)
        case v
        when Numeric, JUST_A_NUMBER
          Number.numify v
        when NilClass
          0
        else
          raise ArgumentError, "Can't embed #{v.inspect} in number feature #{k.inspect}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'vector_embed/maker'
|
2
|
+
|
3
|
+
class VectorEmbed
  class Maker
    # Catch-all maker: treats the whole value as a single category.
    class Phrase < Maker
      # Always true, so Phrase accepts whatever the earlier choices declined.
      def self.want?(k, v, parent)
        true
      end

      # Stringifies +v+, runs it through the parent's preprocessing, and
      # returns one pair whose index is hashed from the key and that text.
      def pairs(v)
        cleaned = parent.preprocess(v.to_s)
        [[Maker.index(k, cleaned), 1]]
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'vector_embed'
|
2
|
+
|
3
|
+
class VectorEmbed
  # A single stop word, compiled to a case-insensitive whole-word pattern.
  class StopWord
    class << self
      # Returns a copy of +str+ with every stop word removed, runs of
      # whitespace collapsed to one space, and leading/trailing whitespace
      # stripped. +str+ itself is not modified.
      #
      # @param stop_words [Array<StopWord>]
      # @param str [String]
      # @return [String]
      def remove(stop_words, str)
        memo = str.dup
        stop_words.each { |stop_word| stop_word.apply!(memo) }
        memo.gsub!(/\s+/, ' ')
        memo.strip!
        memo
      end
    end

    # @param raw_stop_word [String, Regexp, #to_s] the word to filter out.
    #   Strings are escaped so that regex metacharacters in a word (e.g. the
    #   dot in "a.b") match literally instead of acting as a pattern; pass a
    #   Regexp when pattern matching is wanted.
    def initialize(raw_stop_word)
      core = raw_stop_word.is_a?(Regexp) ? raw_stop_word : Regexp.escape(raw_stop_word.to_s)
      @pattern = /\s*\b#{core}\b\s*/i
    end

    # Replaces each occurrence (with surrounding whitespace) in +str+ by a
    # single space, in place. Returns nil when nothing matched, like
    # String#gsub!.
    def apply!(str)
      str.gsub!(@pattern, ' ')
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
+
# loaded once.
|
5
|
+
#
|
6
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
|
+
RSpec.configure do |config|
|
8
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
9
|
+
config.run_all_when_everything_filtered = true
|
10
|
+
config.filter_run :focus
|
11
|
+
|
12
|
+
# Run specs in random order to surface order dependencies. If you find an
|
13
|
+
# order dependency and want to debug it, you can fix the order by providing
|
14
|
+
# the seed, which is printed after each run.
|
15
|
+
# --seed 1234
|
16
|
+
config.order = 'random'
|
17
|
+
end
|
18
|
+
|
19
|
+
require 'pry'
|
20
|
+
|
21
|
+
require 'vector_embed'
|
@@ -0,0 +1,219 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe VectorEmbed do
|
4
|
+
describe 'in labels' do
|
5
|
+
it "stores true/false as 1/0" do
|
6
|
+
v = VectorEmbed.new
|
7
|
+
v.line(true).should == '1'
|
8
|
+
v.line(false).should == '0'
|
9
|
+
v.line('true').should == '1'
|
10
|
+
v.line('false').should == '0'
|
11
|
+
end
|
12
|
+
|
13
|
+
it "stores numbers as numbers" do
|
14
|
+
v = VectorEmbed.new
|
15
|
+
v.line(5.4).should == '5.4'
|
16
|
+
v.line(-3.9).should == '-3.9'
|
17
|
+
end
|
18
|
+
|
19
|
+
it "doesn't allow strings" do
|
20
|
+
v = VectorEmbed.new
|
21
|
+
lambda { v.line('foo') }.should raise_error(/string.*label/i)
|
22
|
+
end
|
23
|
+
|
24
|
+
it "doesn't allow mixing" do
|
25
|
+
v = VectorEmbed.new
|
26
|
+
v.line(5.4)
|
27
|
+
lambda { v.line(true) }.should raise_error(/Can't embed.*number/)
|
28
|
+
v = VectorEmbed.new
|
29
|
+
v.line(true)
|
30
|
+
lambda { v.line(5.4) }.should raise_error(/Can't embed.*boolean/)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
# aka dimension indexes
|
35
|
+
describe 'in feature keys' do
|
36
|
+
it "stores values as their string equivalents" do
|
37
|
+
v = VectorEmbed.new
|
38
|
+
v.line(1, 1 => 9).should == "1 #{l_h('1')}:9"
|
39
|
+
v.line(1, 5.4 => 9).should == "1 #{l_h('5.4')}:9"
|
40
|
+
v.line(1, '5.4' => 9).should == "1 #{l_h('5.4')}:9"
|
41
|
+
v.line(1, '5.4 ' => 9).should == "1 #{l_h('5.4 ')}:9"
|
42
|
+
v.line(1, 'foo' => 9).should == "1 #{l_h('foo')}:9"
|
43
|
+
v.line(1, 'foo bar' => 9).should == "1 #{l_h('foo bar')}:9"
|
44
|
+
v.line(1, true => 9).should == "1 #{l_h('true')}:9"
|
45
|
+
v.line(1, 'true' => 9).should == "1 #{l_h('true')}:9"
|
46
|
+
v.line(1, false => 9).should == "1 #{l_h('false')}:9"
|
47
|
+
v.line(1, 'false' => 9).should == "1 #{l_h('false')}:9"
|
48
|
+
end
|
49
|
+
|
50
|
+
it "treats nil as a blank string" do
|
51
|
+
v = VectorEmbed.new
|
52
|
+
v.line(1, nil => 9).should == "1 #{l_h('')}:9"
|
53
|
+
end
|
54
|
+
|
55
|
+
it "leaves whitespace alone" do
|
56
|
+
v = VectorEmbed.new
|
57
|
+
v.line(1, '' => 9).should == "1 #{l_h('')}:9"
|
58
|
+
v.line(1, ' ' => 9).should == "1 #{l_h(' ')}:9"
|
59
|
+
v.line(1, ' ' => 9).should == "1 #{l_h(' ')}:9"
|
60
|
+
v.line(1, ' foo ' => 9).should == "1 #{l_h(' foo ')}:9"
|
61
|
+
v.line(1, '5.4 ' => 9).should == "1 #{l_h('5.4 ')}:9"
|
62
|
+
end
|
63
|
+
|
64
|
+
it "orders feature names" do
|
65
|
+
v = VectorEmbed.new
|
66
|
+
v.line(1, 1 => 3, 2 => 7).should == "1 #{l_h('2')}:7 #{l_h('1')}:3"
|
67
|
+
end
|
68
|
+
|
69
|
+
it "allows mixed string and number feature values" do
|
70
|
+
v = VectorEmbed.new
|
71
|
+
v.line(1, a: :b).should == "1 #{l_h("a\x00b")}:1"
|
72
|
+
v.line(1, a: 13).should == "1 #{l_h("a\x0013")}:1"
|
73
|
+
v.line(1, 1 => 9).should == "1 #{l_h('1')}:9" # 9 is not hashed, 1 is
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
describe 'feature values' do
|
78
|
+
it "stores true/false/nil as (1,0,0)/(0,1,0)/(0,0,1)" do
|
79
|
+
v = VectorEmbed.new
|
80
|
+
v.line(1, 1 => true).should == "1 #{l_h("1\x00true")}:1"
|
81
|
+
v.line(1, 1 => 'true').should == "1 #{l_h("1\x00true")}:1"
|
82
|
+
v.line(1, 1 => false).should == "1 #{l_h("1\x00false")}:1"
|
83
|
+
v.line(1, 1 => 'false').should == "1 #{l_h("1\x00false")}:1"
|
84
|
+
v.line(1, 1 => nil).should == "1 #{l_h("1\x00null")}:1"
|
85
|
+
v.line(1, 1 => 'null').should == "1 #{l_h("1\x00null")}:1"
|
86
|
+
end
|
87
|
+
|
88
|
+
it "stores numbers as numbers" do
|
89
|
+
v = VectorEmbed.new
|
90
|
+
v.line(1, 1 => 9).should == "1 #{l_h('1')}:9"
|
91
|
+
v.line(1, 1 => '9').should == "1 #{l_h('1')}:9"
|
92
|
+
v.line(1, 1 => 5.4).should == "1 #{l_h('1')}:5.4"
|
93
|
+
v.line(1, 1 => '5.4').should == "1 #{l_h('1')}:5.4"
|
94
|
+
v.line(1, 1 => 9e9).should == "1 #{l_h('1')}:9000000000.0"
|
95
|
+
v.line(1, 1 => '9e9').should == "1 #{l_h('1')}:9000000000.0"
|
96
|
+
end
|
97
|
+
|
98
|
+
it "stores strings as m-category attributes" do
|
99
|
+
v = VectorEmbed.new
|
100
|
+
v.line(1, 1 => 'sfh').should == "1 #{l_h("1\x00sfh")}:1"
|
101
|
+
v.line(1, 1 => 'mfh').should == "1 #{l_h("1\x00mfh")}:1"
|
102
|
+
v.line(1, 1 => 'foo bar').should == "1 #{l_h("1\x00foo bar")}:1"
|
103
|
+
v.line(1, 1 => 'foo bar ').should == "1 #{l_h("1\x00foo bar")}:1"
|
104
|
+
v.line(1, 1 => ' foo bar ').should == "1 #{l_h("1\x00foo bar")}:1"
|
105
|
+
end
|
106
|
+
|
107
|
+
it "in string mode, treats true/false/nil as strings" do
|
108
|
+
v = VectorEmbed.new
|
109
|
+
v.line(1, 1 => 'foo').should == "1 #{l_h("1\x00foo")}:1"
|
110
|
+
v.line(1, 1 => true).should == "1 #{l_h("1\x00true")}:1"
|
111
|
+
v.line(1, 1 => false).should == "1 #{l_h("1\x00false")}:1"
|
112
|
+
v.line(1, 1 => nil).should == "1 #{l_h("1\x00")}:1"
|
113
|
+
end
|
114
|
+
|
115
|
+
it "in string mode, treats numbers as strings" do
|
116
|
+
v = VectorEmbed.new
|
117
|
+
v.line(1, 1 => 'foo').should == "1 #{l_h("1\x00foo")}:1"
|
118
|
+
v.line(1, 1 => 1).should == "1 #{l_h("1\x001")}:1"
|
119
|
+
v.line(1, 1 => 5.4).should == "1 #{l_h("1\x005.4")}:1"
|
120
|
+
v.line(1, 1 => 9e9).should == "1 #{l_h("1\x00" + 9e9.to_s)}:1"
|
121
|
+
end
|
122
|
+
|
123
|
+
it "flattens and stores arrays" do
|
124
|
+
v = VectorEmbed.new
|
125
|
+
v.line(1, 'foo' => [7,13,19]).should == sortme("1 #{l_h("foo\x001")}:13 #{l_h("foo\x000")}:7 #{l_h("foo\x002")}:19")
|
126
|
+
v.line(1, 'bar' => ['a','b','c']).should == sortme("1 #{l_h("bar\x001\x00b")}:1 #{l_h("bar\x000\x00a")}:1 #{l_h("bar\x002\x00c")}:1")
|
127
|
+
end
|
128
|
+
|
129
|
+
it "in number mode, treats null as 0" do
|
130
|
+
v = VectorEmbed.new
|
131
|
+
v.line(1, 1 => 9).should == "1 #{l_h('1')}:9"
|
132
|
+
v.line(1, 1 => nil).should == "1 #{l_h('1')}:0"
|
133
|
+
end
|
134
|
+
|
135
|
+
it "doesn't allow embedding boolean in number mode or vice-versa" do
|
136
|
+
v = VectorEmbed.new
|
137
|
+
v.line(1, 1 => true)
|
138
|
+
v.line(1, 2 => 5.4) # that's fine, different dimension
|
139
|
+
lambda { v.line(1, 1 => 5.4) }.should raise_error(ArgumentError)
|
140
|
+
v = VectorEmbed.new
|
141
|
+
v.line(1, 1 => 5.4)
|
142
|
+
v.line(1, 2 => true) # that's fine, diff dim
|
143
|
+
lambda { v.line(1, 1 => true) }.should raise_error(ArgumentError)
|
144
|
+
end
|
145
|
+
|
146
|
+
it "doesn't allow embedding string in number mode" do
|
147
|
+
v = VectorEmbed.new
|
148
|
+
v.line(1, 1 => 9)
|
149
|
+
v.line(1, 2 => 'foo') # that's fine, different dimension
|
150
|
+
lambda { v.line(1, 1 => 'foo') }.should raise_error(ArgumentError)
|
151
|
+
end
|
152
|
+
|
153
|
+
it "uses scientific notation for large numbers" do
|
154
|
+
v = VectorEmbed.new
|
155
|
+
v.line(5, 1 => 8.12e13).should == "5 #{l_h('1')}:8.1200000000e+13"
|
156
|
+
end
|
157
|
+
|
158
|
+
it "detects numbers in strings" do
|
159
|
+
v = VectorEmbed.new
|
160
|
+
v.line(5, 1 => '8.12e13').should == "5 #{l_h('1')}:8.1200000000e+13"
|
161
|
+
end
|
162
|
+
|
163
|
+
it "allows 2 byte n-grams" do
|
164
|
+
v = VectorEmbed.new ngram_len: 2, ngram_delim: ''
|
165
|
+
v.line(1, 1 => 'foo').should == sortme("1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1")
|
166
|
+
v.line(1, 1 => 'bar').should == sortme("1 #{l_h("1\x00ngram\x00ar")}:1 #{l_h("1\x00ngram\x00ba")}:1")
|
167
|
+
v.line(1, 1 => 'baz').should == sortme("1 #{l_h("1\x00ngram\x00az")}:1 #{l_h("1\x00ngram\x00ba")}:1")
|
168
|
+
v.line(1, 1 => 'foobar').should == sortme("1 #{l_h("1\x00ngram\x00ar")}:1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("1\x00ngram\x00ob")}:1 #{l_h("1\x00ngram\x00ba")}:1")
|
169
|
+
v.line(1, 1 => 'foo bar').should == sortme("1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("1\x00ngram\x00o ")}:1 #{l_h("1\x00ngram\x00 b")}:1 #{l_h("1\x00ngram\x00ba")}:1 #{l_h("1\x00ngram\x00ar")}:1")
|
170
|
+
end
|
171
|
+
|
172
|
+
it "allows word-grams" do
|
173
|
+
v = VectorEmbed.new ngram_len: 1, ngram_delim: /\s+/
|
174
|
+
v.line(1, 1 => 'foo').should == sortme("1 #{l_h("1\x00ngram\x00foo")}:1")
|
175
|
+
v.line(1, 1 => 'foobar').should == sortme("1 #{l_h("1\x00ngram\x00foobar")}:1")
|
176
|
+
v.line(1, 1 => 'foo bar').should == sortme("1 #{l_h("1\x00ngram\x00bar")}:1 #{l_h("1\x00ngram\x00foo")}:1")
|
177
|
+
end
|
178
|
+
|
179
|
+
it "allows 2 byte n-grams with stop words" do
|
180
|
+
v = VectorEmbed.new ngram_len: 2, ngram_delim: '', stop_words: %w{the and or}
|
181
|
+
v.line(1, 1 => 'foo or').should == sortme("1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1")
|
182
|
+
v.line(1, 1 => 'the bar').should == sortme("1 #{l_h("1\x00ngram\x00ar")}:1 #{l_h("1\x00ngram\x00ba")}:1")
|
183
|
+
v.line(1, 1 => 'and baz').should == sortme("1 #{l_h("1\x00ngram\x00az")}:1 #{l_h("1\x00ngram\x00ba")}:1")
|
184
|
+
v.line(1, 1 => 'foobar or the and').should == sortme("1 #{l_h("1\x00ngram\x00ar")}:1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("1\x00ngram\x00ob")}:1 #{l_h("1\x00ngram\x00ba")}:1")
|
185
|
+
v.line(1, 1 => 'foo or and the bar').should == sortme("1 #{l_h("1\x00ngram\x00 b")}:1 #{l_h("1\x00ngram\x00ar")}:1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("1\x00ngram\x00o ")}:1 #{l_h("1\x00ngram\x00ba")}:1")
|
186
|
+
end
|
187
|
+
|
188
|
+
it "allows word-grams with stop words" do
|
189
|
+
v = VectorEmbed.new ngram_len: 1, ngram_delim: /\s+/, stop_words: %w{the and or}
|
190
|
+
v.line(1, 1 => 'foo or').should == "1 #{l_h("1\x00ngram\x00foo")}:1"
|
191
|
+
v.line(1, 1 => 'foo the bar').should == "1 #{l_h("1\x00ngram\x00bar")}:1 #{l_h("1\x00ngram\x00foo")}:1"
|
192
|
+
v.line(1, 1 => 'foo bar and').should == "1 #{l_h("1\x00ngram\x00bar")}:1 #{l_h("1\x00ngram\x00foo")}:1"
|
193
|
+
end
|
194
|
+
|
195
|
+
it "doesn't do anything weird when you have multiple features" do
|
196
|
+
v = VectorEmbed.new
|
197
|
+
v.line(1, 1 => 'foo', 2 => 'bar', 'baz' => 'zoo').should == sortme("1 #{l_h("1\x00foo")}:1 #{l_h("2\x00bar")}:1 #{l_h("baz\x00zoo")}:1")
|
198
|
+
end
|
199
|
+
|
200
|
+
end
|
201
|
+
|
202
|
+
private
|
203
|
+
|
204
|
+
# Spec helper: returns MurmurHash3::V32.str_hash of +v+, untruncated.
def h(v)
|
205
|
+
MurmurHash3::V32.str_hash v
|
206
|
+
end
|
207
|
+
|
208
|
+
# Spec helper: expected feature index for the already-joined key string +v+.
# Keeps the first seven decimal digits of h(v), the same truncation that
# Maker.index applies.
|
209
|
+
def l_h(v)
|
210
|
+
h(v).to_s[0..6].to_i
|
211
|
+
end
|
212
|
+
|
213
|
+
# Spec helper: re-orders the "index:value" tokens of an expected line by
# numeric index, leaving the leading label in place.
def sortme(line)
  label, *tokens = line.split(' ')
  split_pairs = tokens.map { |token| token.split(':') }
  ordered = split_pairs.sort_by { |index, _value| index.to_i }
  rendered = ordered.map { |index, value| [index, value].join(':') }
  [label, *rendered].join(' ')
end
|
219
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/vector_embed/version', __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |gem|
  gem.name = "vector_embed"
  gem.version = VectorEmbed::VERSION
  gem.authors = ["Seamus Abshere"]
  gem.email = ["seamus@abshere.net"]
  gem.description = %q{Vector embedding of strings, booleans, numerics, and arrays into LIBSVM / LIBLINEAR format.}
  gem.summary = %q{Vector embedding of strings, booleans, numerics, and arrays into LIBSVM / LIBLINEAR format.}
  gem.homepage = "https://github.com/seamusabshere/vector_embed"
  # The package ships an MIT LICENSE file; declare it so the built metadata
  # no longer reports an empty licenses list.
  gem.license = "MIT"

  # Package every file tracked by git.
  gem.files = `git ls-files`.split($/)
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency 'murmurhash3'

  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'pry'
  gem.add_development_dependency 'yard'
end
|
metadata
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: vector_embed
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Seamus Abshere
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-02-20 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: murmurhash3
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rspec
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: pry
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: yard
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
description: Vector embedding of strings, booleans, numerics, and arrays into LIBSVM
|
79
|
+
/ LIBLINEAR format.
|
80
|
+
email:
|
81
|
+
- seamus@abshere.net
|
82
|
+
executables:
|
83
|
+
- vector_embed
|
84
|
+
extensions: []
|
85
|
+
extra_rdoc_files: []
|
86
|
+
files:
|
87
|
+
- .gitignore
|
88
|
+
- .rspec
|
89
|
+
- CHANGELOG
|
90
|
+
- Gemfile
|
91
|
+
- LICENSE
|
92
|
+
- LICENSE.txt
|
93
|
+
- README.md
|
94
|
+
- Rakefile
|
95
|
+
- bin/vector_embed
|
96
|
+
- lib/vector_embed.rb
|
97
|
+
- lib/vector_embed/maker.rb
|
98
|
+
- lib/vector_embed/maker/boolean.rb
|
99
|
+
- lib/vector_embed/maker/ngram.rb
|
100
|
+
- lib/vector_embed/maker/number.rb
|
101
|
+
- lib/vector_embed/maker/phrase.rb
|
102
|
+
- lib/vector_embed/stop_word.rb
|
103
|
+
- lib/vector_embed/version.rb
|
104
|
+
- spec/spec_helper.rb
|
105
|
+
- spec/vector_embed_spec.rb
|
106
|
+
- vector_embed.gemspec
|
107
|
+
homepage: https://github.com/seamusabshere/vector_embed
|
108
|
+
licenses: []
|
109
|
+
post_install_message:
|
110
|
+
rdoc_options: []
|
111
|
+
require_paths:
|
112
|
+
- lib
|
113
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
114
|
+
none: false
|
115
|
+
requirements:
|
116
|
+
- - ! '>='
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
version: '0'
|
119
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
120
|
+
none: false
|
121
|
+
requirements:
|
122
|
+
- - ! '>='
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
requirements: []
|
126
|
+
rubyforge_project:
|
127
|
+
rubygems_version: 1.8.25
|
128
|
+
signing_key:
|
129
|
+
specification_version: 3
|
130
|
+
summary: Vector embedding of strings, booleans, numerics, and arrays into LIBSVM /
|
131
|
+
LIBLINEAR format.
|
132
|
+
test_files:
|
133
|
+
- spec/spec_helper.rb
|
134
|
+
- spec/vector_embed_spec.rb
|
135
|
+
has_rdoc:
|