vector_embed 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/CHANGELOG ADDED
@@ -0,0 +1,3 @@
1
+ 0.0.1 / 2013-02-20
2
+
3
+ * First release!
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in vector_embed.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Seamus Abshere
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Seamus Abshere
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,111 @@
1
+ # VectorEmbed
2
+
3
+ Vector embedding of strings, booleans, numerics, and arrays into [LIBSVM](http://www.csie.ntu.edu.tw/~cjlin/libsvm/) / [LIBLINEAR](http://www.csie.ntu.edu.tw/~cjlin/liblinear/) format.
4
+
5
+ Inspired by [Sally](http://www.mlsec.org/sally/), except `VectorEmbed` is meant to handle categorical and continuous data at the same time.
6
+
7
+ ## Usage
8
+
9
+ Create a `VectorEmbed` instance, which auto-detects and then remembers what kind of data goes into each feature:
10
+
11
+ >> require 'vector_embed'
12
+ => true
13
+ >> v = VectorEmbed.new
14
+ => #<VectorEmbed:0x007fd605815208 [...]>
15
+
16
+ Output a line with a label and arbitrary features:
17
+
18
+ >> label = 1
19
+ => 1
20
+ >> features = { color: 'red', year: 1995, weight: 5.4e9 }
21
+ => {:color=>"red", :year=>1995, :weight=>5400000000.0}
22
+ >> v.line(label, features)
23
+ => "1 1997960:1 5556418:5400000000.0 8227451:1995"
24
+
25
+ Output another line:
26
+
27
+ >> label = 0
28
+ => 0
29
+ >> features = { color: 'blue', year: 1821, weight: 3.3 }
30
+ => {:color=>"blue", :year=>1821, :weight=>3.3}
31
+ >> v.line(label, features)
32
+ => "0 1089740:1 5556418:3.3 8227451:1821"
33
+
34
+ Note that `color: 'red'` and `color: 'blue'` are being translated into categories:
35
+
36
+ 1997960:1 # murmur3("color\x00red"):1
37
+ 1089740:1 # murmur3("color\x00blue"):1
38
+
39
+ A similar thing happens with `true`/`false`:
40
+
41
+ >> v.line(1, yes: true, no: false)
42
+ => "1 1559987:1 3324244:1"
43
+
44
+ i.e.
45
+
46
+ 1559987:1 # murmur3("yes\x00true"):1
47
+ 3324244:1 # murmur3("no\x00false"):1
48
+
49
+ ## N-grams
50
+
51
+ Currently uses the same parameter names as [Sally](http://www.mlsec.org/sally/manual.html).
52
+
53
+ ### Word ngrams
54
+
55
+ >> v = VectorEmbed.new ngram_len: 1, ngram_delim: /\s+/, stop_words: %w{the and or}
56
+ => #<VectorEmbed:0x007fd6033b77f8 [...]>
57
+ >> v.line(1, notes: 'the quick brown fox')
58
+ => "1 1512788:1 3426202:1 5079692:1"
59
+
60
+ You get the idea: ("the" has been filtered out by stop words)
61
+
62
+ 1512788:1 # murmur3("notes\x00ngram\x00quick"):1
63
+ 3426202:1 # murmur3("notes\x00ngram\x00brown"):1
64
+ 5079692:1 # murmur3("notes\x00ngram\x00fox"):1
65
+
66
+ ### Byte n-grams
67
+
68
+ >> v = VectorEmbed.new ngram_len: 2, ngram_delim: ''
69
+ => #<VectorEmbed:0x007fd60337ea20 [...]>
70
+ >> v.line(1, notes: 'foobar')
71
+ => "1 2148745:1 2878919:1 3600333:1 3621715:1 5885921:1"
72
+
73
+ So therefore:
74
+
75
+ 2148745:1 # murmur3("notes\x00ngram\x00fo"):1
76
+ 2878919:1 # murmur3("notes\x00ngram\x00oo"):1
77
+ 3600333:1 # murmur3("notes\x00ngram\x00ob"):1
78
+ 3621715:1 # murmur3("notes\x00ngram\x00ba"):1
79
+ 5885921:1 # murmur3("notes\x00ngram\x00ar"):1
80
+
81
+ ## Debugging
82
+
83
+ `VectorEmbed` tries to do the right thing, but if it doesn't, try turning on debug logging:
84
+
85
+ >> v = VectorEmbed.new
86
+ => #<VectorEmbed:0x007fd6034020a0 [...]>
87
+ >> v.logger.level = Logger::DEBUG
88
+ => 0
89
+ >> v.line(1, '3' => 7, foo: 'bar', truthy: false, nullity: nil)
90
+ D, [2013-02-20T16:55:00.139299 #21595] DEBUG -- : Interpreting "3" as Number given first value 7
91
+ D, [2013-02-20T16:55:00.139561 #21595] DEBUG -- : Interpreting :foo as Phrase given first value "bar"
92
+ D, [2013-02-20T16:55:00.139671 #21595] DEBUG -- : Interpreting :truthy as Boolean given first value false
93
+ D, [2013-02-20T16:55:00.139755 #21595] DEBUG -- : Interpreting :nullity as Boolean given first value nil
94
+ D, [2013-02-20T16:55:00.139872 #21595] DEBUG -- : Interpreting "label" as Number given first value 1
95
+ => "1 2647413:7 4091306:1 7123386:1 9259635:1"
96
+
97
+ One thing it doesn't like: (assuming you have already performed the lines above)
98
+
99
+ >> v.line(1, '3' => 'bar')
100
+ ArgumentError: Can't embed "bar" in number feature "3".
101
+
102
+ It's saying that, given you first passed it `7`, it thought `"3"` was a feature that held numbers.
103
+
104
+ ## Gotchas
105
+
106
+ * Following Sally, feature indices are truncated: only the first 7 decimal digits of the murmur hash are kept (roughly 22 bits' worth)... more and LIBSVM seems to choke.
107
+ * Stop words are currently filtered out of feature indices... probably shouldn't be.
108
+
109
+ ## Copyright
110
+
111
+ Copyright 2013 Seamus Abshere
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require 'yard'
4
+ YARD::Rake::YardocTask.new
5
+
6
+ require 'rspec/core/rake_task'
7
+ RSpec::Core::RakeTask.new('spec')
8
+ task :default => :spec
data/bin/vector_embed ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'vector_embed'
@@ -0,0 +1,66 @@
1
+ require 'logger'
2
+
3
+ require 'vector_embed/version'
4
+ require 'vector_embed/maker'
5
+
6
+ require 'vector_embed/stop_word'
7
+
8
# Turns a label plus a hash of features into one line of
# "label index:value index:value ..." text.
#
# The first value seen for a feature (or for the label) decides which Maker
# handles it; that Maker is then cached and reused for later lines.
class VectorEmbed
  # http://stackoverflow.com/questions/638565/parsing-scientific-notation-sensibly
  JUST_A_NUMBER = /\A\s*[+\-]?(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s*\z/
  # Matches strings that are empty or contain only whitespace.
  BLANK = /\A\s*\z/
  # Separator placed between the parts of a composite feature key.
  NULL_BYTE = "\x00"

  attr_reader :options
  attr_accessor :logger

  # @param options [Hash] settings; this class reads :logger and :stop_words,
  #   and makers may read further keys through #options.
  def initialize(options = {})
    @mutex = Mutex.new
    @feature_makers = {}
    @logger = options[:logger] || default_logger
    @options = options.dup
  end

  # Renders one output line.
  #
  # @param label the label value, rendered by the label maker
  # @param features [Hash] feature name => value; Array values are expanded
  #   into one feature per element, keyed by name and position
  # @return [String] the label followed by "index:value" pairs sorted by index
  def line(label, features = {})
    collected = []
    features.each do |name, value|
      if Array === value
        value.each_with_index do |element, position|
          element_key = [name, position].join(NULL_BYTE)
          collected.concat feature_maker(element_key, element).pairs(element)
        end
      else
        collected.concat feature_maker(name, value).pairs(value)
      end
    end
    rendered = collected.compact.sort_by { |index, _| index }.map { |pair| pair.join(':') }
    [label_maker(label).value(label), *rendered].join(' ')
  end

  # Strips the configured stop words from a string.
  #
  # @param v [String]
  # @return [String] the result of StopWord.remove
  def preprocess(v)
    StopWord.remove(stop_words, v)
  end

  private

  # Logger used when the caller does not supply one: $stderr at INFO level.
  def default_logger
    fallback = Logger.new($stderr)
    fallback.level = Logger::INFO
    fallback
  end

  # StopWord objects built once from options[:stop_words] (default: none).
  def stop_words
    @stop_words ||= options.fetch(:stop_words, []).map { |raw| StopWord.new(raw) }
  end

  # Returns the cached label maker, picking one under the mutex on first use.
  def label_maker(label)
    return @label_maker if @label_maker
    @mutex.synchronize do
      @label_maker ||= Maker.pick([Maker::Boolean, Maker::Number], 'label', label, self)
    end
  end

  # Returns the cached maker for feature k, picking one under the mutex on first use.
  def feature_maker(k, v)
    cached = @feature_makers[k]
    return cached if cached
    @mutex.synchronize do
      @feature_makers[k] ||= Maker.pick([Maker::Boolean, Maker::Number, Maker::Ngram, Maker::Phrase], k, v, self)
    end
  end
end
@@ -0,0 +1,46 @@
1
+ require 'vector_embed/maker/phrase'
2
+ require 'vector_embed/maker/ngram'
3
+ require 'vector_embed/maker/number'
4
+ require 'vector_embed/maker/boolean'
5
+
6
+ require 'murmurhash3'
7
+
8
class VectorEmbed
  # Base class for the objects that turn one feature's values into
  # [index, value] pairs. Subclasses supply `want?` and `value`
  # (or override `pairs`).
  class Maker
    class << self
      # Instantiates the first class in +choices+ whose `want?` accepts the value.
      #
      # @param choices [Array<Class>] candidate maker classes, in priority order
      # @param k the feature key
      # @param first_v the first value seen for that key
      # @param parent the owning object; must respond to `logger`
      # @raise [RuntimeError] when no candidate wants the value
      def pick(choices, k, first_v, parent)
        chosen = choices.find { |candidate| candidate.want?(k, first_v, parent) }
        unless chosen
          raise "Can't use #{first_v.class} for #{k.inspect} given #{first_v.inspect} and choices #{choices.inspect}"
        end
        parent.logger.debug { "Interpreting #{k.inspect} as #{chosen.name.split('::').last} given first value #{first_v.inspect}" }
        chosen.new k, parent
      end

      # Hashes the NULL_BYTE-joined parts and keeps the first seven decimal
      # digits of the hash as the feature index.
      #
      # @return [Integer]
      def index(*parts)
        digest = MurmurHash3::V32.str_hash(parts.join(NULL_BYTE))
        digest.to_s[0, 7].to_i
      end
    end

    attr_reader :parent
    attr_reader :k

    def initialize(k, parent)
      @k = k
      @parent = parent
    end

    # Builds the [index, value] pairs for +v+.
    # An Array yields one pair per element, indexed by key and position.
    #
    # @return [Array<Array>]
    def pairs(v)
      return [[Maker.index(k), value(v)]] unless Array === v

      v.each_with_index.map do |element, position|
        [Maker.index(k, position), value(element)]
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require 'vector_embed/maker'
2
+
3
class VectorEmbed
  class Maker
    # Maker for true/false/null values.
    #
    # `value` renders a single 1/0 (VectorEmbed#line uses it for labels);
    # `pairs` emits one indicator pair for whichever of true/false/null the
    # value represents.
    class Boolean < Maker
      # Values rendered as true.
      TRUE_VALUES = [TrueClass, 'true', 't', 'yes', 'on'].freeze
      # Values rendered as false.
      FALSE_VALUES = [FalseClass, 'false', 'f', 'no', 'off'].freeze

      class << self
        # Only unambiguous booleans select this maker; looser spellings such as
        # 'yes' or 'off' are accepted later but never trigger auto-detection.
        def want?(k, v, parent)
          case v
          when NilClass, TrueClass, FalseClass, 'true', 'false', 'null'
            true
          else
            false
          end
        end
      end

      # @return [Integer] 1 for a true value, 0 for a false value
      # @raise [ArgumentError] for anything else, matching `pairs` and the
      #   other makers' error type
      def value(v)
        case v
        when *TRUE_VALUES
          1
        when *FALSE_VALUES
          0
        else
          raise ArgumentError, "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
        end
      end

      # @return [Array<Array>] a single [index, 1] pair whose index encodes
      #   the key together with 'true', 'false' or 'null'
      # @raise [ArgumentError] when the value is none of those
      def pairs(v)
        category = case v
                   when *TRUE_VALUES then 'true'
                   when *FALSE_VALUES then 'false'
                   when NilClass, 'null', BLANK then 'null'
                   else
                     raise ArgumentError, "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
                   end
        [[Maker.index(k, category), 1]]
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ require 'vector_embed/maker'
2
+
3
class VectorEmbed
  class Maker
    # Maker that splits a string into n-grams and emits one indicator pair
    # per n-gram. It is selected whenever the parent has :ngram_len set.
    class Ngram < Maker
      class << self
        # Truthy when the parent's options include :ngram_len.
        def want?(k, v, parent)
          parent.options[:ngram_len]
        end
      end

      attr_reader :len
      attr_reader :delim

      # Reads :ngram_len and :ngram_delim from the parent's options.
      #
      # @raise [ArgumentError] when :ngram_len is not a positive integer
      def initialize(k, parent)
        super
        @len = parent.options[:ngram_len].to_i
        raise ArgumentError, ":ngram_len must be > 0" unless @len > 0
        @delim = parent.options[:ngram_delim]
      end

      # Builds one flat [index, 1] pair per n-gram of +v+.
      #
      # Each pair is a plain two-element array, the same shape the other
      # makers return, so pairs from different makers can be sorted together
      # by index.
      #
      # @param v [String]
      # @return [Array<Array>]
      # @raise [RuntimeError] when +v+ is not a String, or for an unsupported
      #   len/delim combination
      def pairs(v)
        raise "Ngram can't handle #{v.inspect}, only a single string for now" unless v.is_a?(String)
        cleaned = parent.preprocess v.to_s
        ngrams(cleaned).map do |ngram|
          [Maker.index(k, 'ngram', ngram), 1]
        end
      end

      private

      # Splits the preprocessed string into n-grams.
      def ngrams(str)
        if len == 1
          # word mode
          str.split delim
        elsif delim == ''
          # byte mode: every window of `len` characters
          (0..str.length - len).map { |i| str[i, len] }
        else
          raise "Word n-gram not supported yet"
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require 'vector_embed/maker'
2
+
3
class VectorEmbed
  class Maker
    # Maker for values that are numbers, or strings that look like numbers
    # according to JUST_A_NUMBER.
    class Number < Maker
      # Magnitudes above this are rendered in scientific notation.
      SCIENTIFIC_THRESHOLD = 1e10

      class << self
        # True for Numerics and for strings/symbols matching JUST_A_NUMBER.
        # Regexp#=== is used instead of =~ so that arbitrary objects simply
        # don't match instead of raising on Rubies where Object#=~ is gone.
        def want?(k, v, parent)
          v.is_a?(::Numeric) || JUST_A_NUMBER === v
        end

        # Converts a value to the number that is written to the output.
        #
        # @param v [Numeric, String, Symbol] strings containing '.', 'e' or
        #   'E' become Floats, other strings become Integers
        # @return [Numeric, String] the number itself, or a '%.10e' formatted
        #   String when its magnitude exceeds SCIENTIFIC_THRESHOLD
        def numify(v)
          num = case v
                when ::String, ::Symbol then parse(v.to_s)
                else v
                end
          # compare the magnitude so large negative numbers are formatted too
          num.abs > SCIENTIFIC_THRESHOLD ? ('%.10e' % num) : num
        end

        private

        # JUST_A_NUMBER allows both 'e' and 'E' exponents, so check for both
        # before deciding between Float and Integer.
        def parse(str)
          str =~ /[.eE]/ ? str.to_f : str.to_i
        end
      end

      # @return [Numeric, String] the numified value; nil counts as 0
      # @raise [ArgumentError] when the value is not number-like
      def value(v)
        case v
        when Numeric, JUST_A_NUMBER
          Number.numify v
        when NilClass
          0
        else
          raise ArgumentError, "Can't embed #{v.inspect} in number feature #{k.inspect}"
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'vector_embed/maker'
2
+
3
class VectorEmbed
  class Maker
    # Catch-all maker: treats the whole (preprocessed) value as one category.
    class Phrase < Maker
      # Accepts any value.
      def self.want?(k, v, parent)
        true
      end

      # @return [Array<Array>] a single [index, 1] pair whose index is derived
      #   from the key and the preprocessed string form of +v+
      def pairs(v)
        cleaned = parent.preprocess(v.to_s)
        index = Maker.index(k, cleaned)
        [[index, 1]]
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ require 'vector_embed'
2
+
3
class VectorEmbed
  # One stop word, compiled to a case-insensitive whole-word pattern.
  class StopWord
    class << self
      # Removes every stop word from a copy of +str+, then collapses runs of
      # whitespace and trims the ends.
      #
      # @param stop_words [Array<StopWord>]
      # @param str [String] left unmodified
      # @return [String]
      def remove(stop_words, str)
        memo = str.dup
        stop_words.each do |stop_word|
          stop_word.apply! memo
        end
        memo.gsub!(/\s+/, ' ')
        memo.strip!
        memo
      end
    end

    # @param raw_stop_word [String, Regexp] a String is matched literally
    #   (regexp metacharacters are escaped); a Regexp is embedded as given
    def initialize(raw_stop_word)
      source = raw_stop_word.is_a?(Regexp) ? raw_stop_word : Regexp.escape(raw_stop_word.to_s)
      @pattern = /\s*\b#{source}\b\s*/i
    end

    # Replaces each occurrence (with its surrounding whitespace) by a single
    # space, mutating +str+ in place.
    #
    # @return [String, nil] the result of String#gsub!, nil when nothing matched
    def apply!(str)
      str.gsub! @pattern, ' '
    end
  end
end
@@ -0,0 +1,3 @@
1
# Reopened here only to carry the gem's version constant.
class VectorEmbed
  # Release version of the vector_embed gem.
  VERSION = '0.0.1'
end
@@ -0,0 +1,21 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # Require this file using `require "spec_helper"` to ensure that it is only
4
+ # loaded once.
5
+ #
6
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
+ RSpec.configure do |config|
8
+ config.treat_symbols_as_metadata_keys_with_true_values = true
9
+ config.run_all_when_everything_filtered = true
10
+ config.filter_run :focus
11
+
12
+ # Run specs in random order to surface order dependencies. If you find an
13
+ # order dependency and want to debug it, you can fix the order by providing
14
+ # the seed, which is printed after each run.
15
+ # --seed 1234
16
+ config.order = 'random'
17
+ end
18
+
19
+ require 'pry'
20
+
21
+ require 'vector_embed'
@@ -0,0 +1,219 @@
1
+ require 'spec_helper'
2
+
3
+ describe VectorEmbed do
4
+ describe 'in labels' do
5
+ it "stores true/false as 1/0" do
6
+ v = VectorEmbed.new
7
+ v.line(true).should == '1'
8
+ v.line(false).should == '0'
9
+ v.line('true').should == '1'
10
+ v.line('false').should == '0'
11
+ end
12
+
13
+ it "stores numbers as numbers" do
14
+ v = VectorEmbed.new
15
+ v.line(5.4).should == '5.4'
16
+ v.line(-3.9).should == '-3.9'
17
+ end
18
+
19
+ it "doesn't allow strings" do
20
+ v = VectorEmbed.new
21
+ lambda { v.line('foo') }.should raise_error(/string.*label/i)
22
+ end
23
+
24
+ it "doesn't allow mixing" do
25
+ v = VectorEmbed.new
26
+ v.line(5.4)
27
+ lambda { v.line(true) }.should raise_error(/Can't embed.*number/)
28
+ v = VectorEmbed.new
29
+ v.line(true)
30
+ lambda { v.line(5.4) }.should raise_error(/Can't embed.*boolean/)
31
+ end
32
+ end
33
+
34
+ # aka dimension indexes
35
+ describe 'in feature keys' do
36
+ it "stores values as their string equivalents" do
37
+ v = VectorEmbed.new
38
+ v.line(1, 1 => 9).should == "1 #{l_h('1')}:9"
39
+ v.line(1, 5.4 => 9).should == "1 #{l_h('5.4')}:9"
40
+ v.line(1, '5.4' => 9).should == "1 #{l_h('5.4')}:9"
41
+ v.line(1, '5.4 ' => 9).should == "1 #{l_h('5.4 ')}:9"
42
+ v.line(1, 'foo' => 9).should == "1 #{l_h('foo')}:9"
43
+ v.line(1, 'foo bar' => 9).should == "1 #{l_h('foo bar')}:9"
44
+ v.line(1, true => 9).should == "1 #{l_h('true')}:9"
45
+ v.line(1, 'true' => 9).should == "1 #{l_h('true')}:9"
46
+ v.line(1, false => 9).should == "1 #{l_h('false')}:9"
47
+ v.line(1, 'false' => 9).should == "1 #{l_h('false')}:9"
48
+ end
49
+
50
+ it "treats nil as a blank string" do
51
+ v = VectorEmbed.new
52
+ v.line(1, nil => 9).should == "1 #{l_h('')}:9"
53
+ end
54
+
55
+ it "leaves whitespace alone" do
56
+ v = VectorEmbed.new
57
+ v.line(1, '' => 9).should == "1 #{l_h('')}:9"
58
+ v.line(1, ' ' => 9).should == "1 #{l_h(' ')}:9"
59
+ v.line(1, ' ' => 9).should == "1 #{l_h(' ')}:9"
60
+ v.line(1, ' foo ' => 9).should == "1 #{l_h(' foo ')}:9"
61
+ v.line(1, '5.4 ' => 9).should == "1 #{l_h('5.4 ')}:9"
62
+ end
63
+
64
+ it "orders feature names" do
65
+ v = VectorEmbed.new
66
+ v.line(1, 1 => 3, 2 => 7).should == "1 #{l_h('2')}:7 #{l_h('1')}:3"
67
+ end
68
+
69
+ it "allows mixed string and number feature values" do
70
+ v = VectorEmbed.new
71
+ v.line(1, a: :b).should == "1 #{l_h("a\x00b")}:1"
72
+ v.line(1, a: 13).should == "1 #{l_h("a\x0013")}:1"
73
+ v.line(1, 1 => 9).should == "1 #{l_h('1')}:9" # 9 is not hashed, 1 is
74
+ end
75
+ end
76
+
77
+ describe 'feature values' do
78
+ it "stores true/false/nil as (1,0,0)/(0,1,0)/(0,0,1)" do
79
+ v = VectorEmbed.new
80
+ v.line(1, 1 => true).should == "1 #{l_h("1\x00true")}:1"
81
+ v.line(1, 1 => 'true').should == "1 #{l_h("1\x00true")}:1"
82
+ v.line(1, 1 => false).should == "1 #{l_h("1\x00false")}:1"
83
+ v.line(1, 1 => 'false').should == "1 #{l_h("1\x00false")}:1"
84
+ v.line(1, 1 => nil).should == "1 #{l_h("1\x00null")}:1"
85
+ v.line(1, 1 => 'null').should == "1 #{l_h("1\x00null")}:1"
86
+ end
87
+
88
+ it "stores numbers as numbers" do
89
+ v = VectorEmbed.new
90
+ v.line(1, 1 => 9).should == "1 #{l_h('1')}:9"
91
+ v.line(1, 1 => '9').should == "1 #{l_h('1')}:9"
92
+ v.line(1, 1 => 5.4).should == "1 #{l_h('1')}:5.4"
93
+ v.line(1, 1 => '5.4').should == "1 #{l_h('1')}:5.4"
94
+ v.line(1, 1 => 9e9).should == "1 #{l_h('1')}:9000000000.0"
95
+ v.line(1, 1 => '9e9').should == "1 #{l_h('1')}:9000000000.0"
96
+ end
97
+
98
+ it "stores strings as m-category attributes" do
99
+ v = VectorEmbed.new
100
+ v.line(1, 1 => 'sfh').should == "1 #{l_h("1\x00sfh")}:1"
101
+ v.line(1, 1 => 'mfh').should == "1 #{l_h("1\x00mfh")}:1"
102
+ v.line(1, 1 => 'foo bar').should == "1 #{l_h("1\x00foo bar")}:1"
103
+ v.line(1, 1 => 'foo bar ').should == "1 #{l_h("1\x00foo bar")}:1"
104
+ v.line(1, 1 => ' foo bar ').should == "1 #{l_h("1\x00foo bar")}:1"
105
+ end
106
+
107
+ it "in string mode, treats true/false/nil as strings" do
108
+ v = VectorEmbed.new
109
+ v.line(1, 1 => 'foo').should == "1 #{l_h("1\x00foo")}:1"
110
+ v.line(1, 1 => true).should == "1 #{l_h("1\x00true")}:1"
111
+ v.line(1, 1 => false).should == "1 #{l_h("1\x00false")}:1"
112
+ v.line(1, 1 => nil).should == "1 #{l_h("1\x00")}:1"
113
+ end
114
+
115
+ it "in string mode, treats numbers as strings" do
116
+ v = VectorEmbed.new
117
+ v.line(1, 1 => 'foo').should == "1 #{l_h("1\x00foo")}:1"
118
+ v.line(1, 1 => 1).should == "1 #{l_h("1\x001")}:1"
119
+ v.line(1, 1 => 5.4).should == "1 #{l_h("1\x005.4")}:1"
120
+ v.line(1, 1 => 9e9).should == "1 #{l_h("1\x00" + 9e9.to_s)}:1"
121
+ end
122
+
123
+ it "flattens and stores arrays" do
124
+ v = VectorEmbed.new
125
+ v.line(1, 'foo' => [7,13,19]).should == sortme("1 #{l_h("foo\x001")}:13 #{l_h("foo\x000")}:7 #{l_h("foo\x002")}:19")
126
+ v.line(1, 'bar' => ['a','b','c']).should == sortme("1 #{l_h("bar\x001\x00b")}:1 #{l_h("bar\x000\x00a")}:1 #{l_h("bar\x002\x00c")}:1")
127
+ end
128
+
129
+ it "in number mode, treats null as 0" do
130
+ v = VectorEmbed.new
131
+ v.line(1, 1 => 9).should == "1 #{l_h('1')}:9"
132
+ v.line(1, 1 => nil).should == "1 #{l_h('1')}:0"
133
+ end
134
+
135
+ it "doesn't allow embedding boolean in number mode or vice-versa" do
136
+ v = VectorEmbed.new
137
+ v.line(1, 1 => true)
138
+ v.line(1, 2 => 5.4) # that's fine, different dimension
139
+ lambda { v.line(1, 1 => 5.4) }.should raise_error(ArgumentError)
140
+ v = VectorEmbed.new
141
+ v.line(1, 1 => 5.4)
142
+ v.line(1, 2 => true) # that's fine, diff dim
143
+ lambda { v.line(1, 1 => true) }.should raise_error(ArgumentError)
144
+ end
145
+
146
+ it "doesn't allow embedding string in number mode" do
147
+ v = VectorEmbed.new
148
+ v.line(1, 1 => 9)
149
+ v.line(1, 2 => 'foo') # that's fine, different dimension
150
+ lambda { v.line(1, 1 => 'foo') }.should raise_error(ArgumentError)
151
+ end
152
+
153
+ it "uses scientific notation for large numbers" do
154
+ v = VectorEmbed.new
155
+ v.line(5, 1 => 8.12e13).should == "5 #{l_h('1')}:8.1200000000e+13"
156
+ end
157
+
158
+ it "detects numbers in strings" do
159
+ v = VectorEmbed.new
160
+ v.line(5, 1 => '8.12e13').should == "5 #{l_h('1')}:8.1200000000e+13"
161
+ end
162
+
163
+ it "allows 2 byte n-grams" do
164
+ v = VectorEmbed.new ngram_len: 2, ngram_delim: ''
165
+ v.line(1, 1 => 'foo').should == sortme("1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1")
166
+ v.line(1, 1 => 'bar').should == sortme("1 #{l_h("1\x00ngram\x00ar")}:1 #{l_h("1\x00ngram\x00ba")}:1")
167
+ v.line(1, 1 => 'baz').should == sortme("1 #{l_h("1\x00ngram\x00az")}:1 #{l_h("1\x00ngram\x00ba")}:1")
168
+ v.line(1, 1 => 'foobar').should == sortme("1 #{l_h("1\x00ngram\x00ar")}:1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("1\x00ngram\x00ob")}:1 #{l_h("1\x00ngram\x00ba")}:1")
169
+ v.line(1, 1 => 'foo bar').should == sortme("1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("1\x00ngram\x00o ")}:1 #{l_h("1\x00ngram\x00 b")}:1 #{l_h("1\x00ngram\x00ba")}:1 #{l_h("1\x00ngram\x00ar")}:1")
170
+ end
171
+
172
+ it "allows word-grams" do
173
+ v = VectorEmbed.new ngram_len: 1, ngram_delim: /\s+/
174
+ v.line(1, 1 => 'foo').should == sortme("1 #{l_h("1\x00ngram\x00foo")}:1")
175
+ v.line(1, 1 => 'foobar').should == sortme("1 #{l_h("1\x00ngram\x00foobar")}:1")
176
+ v.line(1, 1 => 'foo bar').should == sortme("1 #{l_h("1\x00ngram\x00bar")}:1 #{l_h("1\x00ngram\x00foo")}:1")
177
+ end
178
+
179
+ it "allows 2 byte n-grams with stop words" do
180
+ v = VectorEmbed.new ngram_len: 2, ngram_delim: '', stop_words: %w{the and or}
181
+ v.line(1, 1 => 'foo or').should == sortme("1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1")
182
+ v.line(1, 1 => 'the bar').should == sortme("1 #{l_h("1\x00ngram\x00ar")}:1 #{l_h("1\x00ngram\x00ba")}:1")
183
+ v.line(1, 1 => 'and baz').should == sortme("1 #{l_h("1\x00ngram\x00az")}:1 #{l_h("1\x00ngram\x00ba")}:1")
184
+ v.line(1, 1 => 'foobar or the and').should == sortme("1 #{l_h("1\x00ngram\x00ar")}:1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("1\x00ngram\x00ob")}:1 #{l_h("1\x00ngram\x00ba")}:1")
185
+ v.line(1, 1 => 'foo or and the bar').should == sortme("1 #{l_h("1\x00ngram\x00 b")}:1 #{l_h("1\x00ngram\x00ar")}:1 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("1\x00ngram\x00o ")}:1 #{l_h("1\x00ngram\x00ba")}:1")
186
+ end
187
+
188
+ it "allows word-grams with stop words" do
189
+ v = VectorEmbed.new ngram_len: 1, ngram_delim: /\s+/, stop_words: %w{the and or}
190
+ v.line(1, 1 => 'foo or').should == "1 #{l_h("1\x00ngram\x00foo")}:1"
191
+ v.line(1, 1 => 'foo the bar').should == "1 #{l_h("1\x00ngram\x00bar")}:1 #{l_h("1\x00ngram\x00foo")}:1"
192
+ v.line(1, 1 => 'foo bar and').should == "1 #{l_h("1\x00ngram\x00bar")}:1 #{l_h("1\x00ngram\x00foo")}:1"
193
+ end
194
+
195
+ it "doesn't do anything weird when you have multiple features" do
196
+ v = VectorEmbed.new
197
+ v.line(1, 1 => 'foo', 2 => 'bar', 'baz' => 'zoo').should == sortme("1 #{l_h("1\x00foo")}:1 #{l_h("2\x00bar")}:1 #{l_h("baz\x00zoo")}:1")
198
+ end
199
+
200
+ end
201
+
202
+ private
203
+
204
# Raw 32-bit murmur3 hash of a string, as computed by the murmurhash3 gem.
def h(v)
  MurmurHash3::V32.str_hash(v)
end
207
+
208
# for labels: the hash of +v+ truncated to its first seven decimal digits
def l_h(v)
  digits = h(v).to_s
  digits[0, 7].to_i
end
212
+
213
# Re-orders the "index:value" features of an expected line by numeric index,
# leaving the leading label in place.
def sortme(line)
  label, *tokens = line.split(' ')
  ordered = tokens
    .map { |token| token.split(':') }
    .sort_by { |index, _value| index.to_i }
    .map { |index, value| [index, value].join(':') }
  [label, *ordered].join(' ')
end
219
+ end
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/vector_embed/version', __FILE__)
3
+
4
# Gem specification for vector_embed. The file list comes from `git ls-files`,
# so the gem must be built from inside the git checkout.
Gem::Specification.new do |gem|
  gem.name          = "vector_embed"
  gem.version       = VectorEmbed::VERSION
  gem.authors       = ["Seamus Abshere"]
  gem.email         = ["seamus@abshere.net"]
  gem.description   = %q{Vector embedding of strings, booleans, numerics, and arrays into LIBSVM / LIBLINEAR format.}
  gem.summary       = %q{Vector embedding of strings, booleans, numerics, and arrays into LIBSVM / LIBLINEAR format.}
  gem.homepage      = "https://github.com/seamusabshere/vector_embed"
  # The package ships an MIT LICENSE file; declare it so the built gem's
  # metadata no longer reports an empty license list.
  gem.licenses      = ["MIT"]

  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency 'murmurhash3'

  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'pry'
  gem.add_development_dependency 'yard'
end
metadata ADDED
@@ -0,0 +1,135 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: vector_embed
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Seamus Abshere
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-02-20 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: murmurhash3
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rspec
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: pry
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: yard
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: Vector embedding of strings, booleans, numerics, and arrays into LIBSVM
79
+ / LIBLINEAR format.
80
+ email:
81
+ - seamus@abshere.net
82
+ executables:
83
+ - vector_embed
84
+ extensions: []
85
+ extra_rdoc_files: []
86
+ files:
87
+ - .gitignore
88
+ - .rspec
89
+ - CHANGELOG
90
+ - Gemfile
91
+ - LICENSE
92
+ - LICENSE.txt
93
+ - README.md
94
+ - Rakefile
95
+ - bin/vector_embed
96
+ - lib/vector_embed.rb
97
+ - lib/vector_embed/maker.rb
98
+ - lib/vector_embed/maker/boolean.rb
99
+ - lib/vector_embed/maker/ngram.rb
100
+ - lib/vector_embed/maker/number.rb
101
+ - lib/vector_embed/maker/phrase.rb
102
+ - lib/vector_embed/stop_word.rb
103
+ - lib/vector_embed/version.rb
104
+ - spec/spec_helper.rb
105
+ - spec/vector_embed_spec.rb
106
+ - vector_embed.gemspec
107
+ homepage: https://github.com/seamusabshere/vector_embed
108
+ licenses: []
109
+ post_install_message:
110
+ rdoc_options: []
111
+ require_paths:
112
+ - lib
113
+ required_ruby_version: !ruby/object:Gem::Requirement
114
+ none: false
115
+ requirements:
116
+ - - ! '>='
117
+ - !ruby/object:Gem::Version
118
+ version: '0'
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ none: false
121
+ requirements:
122
+ - - ! '>='
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ requirements: []
126
+ rubyforge_project:
127
+ rubygems_version: 1.8.25
128
+ signing_key:
129
+ specification_version: 3
130
+ summary: Vector embedding of strings, booleans, numerics, and arrays into LIBSVM /
131
+ LIBLINEAR format.
132
+ test_files:
133
+ - spec/spec_helper.rb
134
+ - spec/vector_embed_spec.rb
135
+ has_rdoc: