vector_embed 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/CHANGELOG ADDED
@@ -0,0 +1,3 @@
1
+ 0.0.1 / 2013-02-20
2
+
3
+ * First release!
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in vector_embed.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Seamus Abshere
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Seamus Abshere
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,111 @@
1
+ # VectorEmbed
2
+
3
+ Vector embedding of strings, booleans, numerics, and arrays into [LIBSVM](http://www.csie.ntu.edu.tw/~cjlin/libsvm/) / [LIBLINEAR](http://www.csie.ntu.edu.tw/~cjlin/liblinear/) format.
4
+
5
+ Inspired by [Sally](http://www.mlsec.org/sally/), except `VectorEmbed` is meant to handle categorical and continuous data at the same time.
6
+
7
+ ## Usage
8
+
9
+ Create a `VectorEmbed` instance, which auto-detects and then remembers what kind of data goes into each feature:
10
+
11
+ >> require 'vector_embed'
12
+ => true
13
+ >> v = VectorEmbed.new
14
+ => #<VectorEmbed:0x007fd605815208 [...]>
15
+
16
+ Output a line with a label and arbitrary features:
17
+
18
+ >> label = 1
19
+ => 1
20
+ >> features = { color: 'red', year: 1995, weight: 5.4e9 }
21
+ => {:color=>"red", :year=>1995, :weight=>5400000000.0}
22
+ >> v.line(label, features)
23
+ => "1 1997960:1 5556418:5400000000.0 8227451:1995"
24
+
25
+ Output another line:
26
+
27
+ >> label = 0
28
+ => 0
29
+ >> features = { color: 'blue', year: 1821, weight: 3.3 }
30
+ => {:color=>"blue", :year=>1821, :weight=>3.3}
31
+ >> v.line(label, features)
32
+ => "0 1089740:1 5556418:3.3 8227451:1821"
33
+
34
+ Note that `color: 'red'` and `color: 'blue'` are being translated into categories:
35
+
36
+ 1997960:1 # murmur3("color\x00red"):1
37
+ 1089740:1 # murmur3("color\x00blue"):1
38
+
39
+ A similar thing happens with `true`/`false`:
40
+
41
+ >> v.line(1, yes: true, no: false)
42
+ => "1 1559987:1 3324244:1"
43
+
44
+ i.e.
45
+
46
+ 1559987:1 # murmur3("yes\x00true"):1
47
+ 3324244:1 # murmur3("no\x00false"):1
48
+
49
+ ## N-grams
50
+
51
+ Currently uses the same parameter names as [Sally](http://www.mlsec.org/sally/manual.html).
52
+
53
+ ### Word ngrams
54
+
55
+ >> v = VectorEmbed.new ngram_len: 1, ngram_delim: /\s+/, stop_words: %w{the and or}
56
+ => #<VectorEmbed:0x007fd6033b77f8 [...]>
57
+ >> v.line(1, notes: 'the quick brown fox')
58
+ => "1 1512788:1 3426202:1 5079692:1"
59
+
60
+ You get the idea: ("the" has been filtered out by stop words)
61
+
62
+ 1512788:1 # murmur3("notes\x00quick"):1
63
+ 3426202:1 # murmur3("notes\x00brown"):1
64
+ 5079692:1 # murmur3("notes\x00fox"):1
65
+
66
+ ### Byte n-grams
67
+
68
+ >> v = VectorEmbed.new ngram_len: 2, ngram_delim: ''
69
+ => #<VectorEmbed:0x007fd60337ea20 [...]>
70
+ >> v.line(1, notes: 'foobar')
71
+ => "1 2148745:1 2878919:1 3600333:1 3621715:1 5885921:1"
72
+
73
+ So therefore:
74
+
75
+ 2148745:1 # murmur3("notes\x00fo"):1
76
+ 2878919:1 # murmur3("notes\x00oo"):1
77
+ 3600333:1 # murmur3("notes\x00ob"):1
78
+ 3621715:1 # murmur3("notes\x00ba"):1
79
+ 5885921:1 # murmur3("notes\x00ar"):1
80
+
81
+ ## Debugging
82
+
83
+ `VectorEmbed` tries to do the right thing, but if it doesn't, try turning on debugging:
84
+
85
+ >> v = VectorEmbed.new
86
+ => #<VectorEmbed:0x007fd6034020a0 [...]>
87
+ >> v.logger.level = Logger::DEBUG
88
+ => 0
89
+ >> v.line(1, '3' => 7, foo: 'bar', truthy: false, nullity: nil)
90
+ D, [2013-02-20T16:55:00.139299 #21595] DEBUG -- : Interpreting "3" as Number given first value 7
91
+ D, [2013-02-20T16:55:00.139561 #21595] DEBUG -- : Interpreting :foo as Phrase given first value "bar"
92
+ D, [2013-02-20T16:55:00.139671 #21595] DEBUG -- : Interpreting :truthy as Boolean given first value false
93
+ D, [2013-02-20T16:55:00.139755 #21595] DEBUG -- : Interpreting :nullity as Boolean given first value nil
94
+ D, [2013-02-20T16:55:00.139872 #21595] DEBUG -- : Interpreting "label" as Number given first value 1
95
+ => "1 2647413:7 4091306:1 7123386:1 9259635:1"
96
+
97
+ One thing it doesn't like: (assuming you have already performed the lines above)
98
+
99
+ >> v.line(1, '3' => 'bar')
100
+ ArgumentError: Can't embed "bar" in number feature "3".
101
+
102
+ It's saying that, given you first passed it `7`, it thought `"3"` was a feature that held numbers.
103
+
104
+ ## Gotchas
105
+
106
+ * Following Sally's 22-bit limit on feature indices, it only keeps the first 7 decimal digits of the murmur hash as the feature index... more and LIBSVM seems to choke.
107
+ * Stop words are currently filtered out of feature indices... probably shouldn't be.
108
+
109
+ ## Copyright
110
+
111
+ Copyright 2013 Seamus Abshere
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
# Tasks provided by Bundler's gem helpers.
require 'bundler/gem_tasks'

# Documentation task provided by YARD.
require 'yard'
YARD::Rake::YardocTask.new

# Spec task; it is also what a bare `rake` runs.
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new('spec')
task default: :spec
data/bin/vector_embed ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'vector_embed'
@@ -0,0 +1,66 @@
1
+ require 'logger'
2
+
3
+ require 'vector_embed/version'
4
+ require 'vector_embed/maker'
5
+
6
+ require 'vector_embed/stop_word'
7
+
8
# Turns a label plus a hash of features into one line of LIBSVM / LIBLINEAR
# style text ("label index:value index:value ...").
#
# The first value seen for the label, and for each feature key, decides which
# Maker is used for it; that Maker is then remembered by this instance.
class VectorEmbed
  # http://stackoverflow.com/questions/638565/parsing-scientific-notation-sensibly
  JUST_A_NUMBER = /\A\s*[+\-]?(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s*\z/
  BLANK = /\A\s*\z/
  NULL_BYTE = "\x00"

  attr_reader :options
  attr_accessor :logger

  # @param options [Hash] configuration; this class reads :logger and
  #   :stop_words, and the makers read further keys through #options.
  #   A copy of the hash is kept, not the caller's object.
  def initialize(options = {})
    @mutex = Mutex.new
    @feature_makers = {}
    @logger = options[:logger] || default_logger
    @options = options.dup
  end

  # Builds one output line.
  #
  # @param label the label value, rendered by the label maker
  # @param features [Hash] feature key => value; an Array value is expanded
  #   into one feature per element, keyed by "key\x00position"
  # @return [String] the label followed by "index:value" pairs, sorted by
  #   index and separated by single spaces
  def line(label, features = {})
    collected = features.each_with_object([]) do |(key, value), acc|
      if value.is_a?(Array)
        value.each_with_index do |element, position|
          slot = [key, position].join(NULL_BYTE)
          acc.concat feature_maker(slot, element).pairs(element)
        end
      else
        acc.concat feature_maker(key, value).pairs(value)
      end
    end

    rendered = collected.compact.sort_by { |index, _| index }.map { |pair| pair.join(':') }
    [label_maker(label).value(label), *rendered].join(' ')
  end

  # Strips the configured stop words from +v+. Public because the makers call
  # it through their parent.
  def preprocess(v)
    StopWord.remove stop_words, v
  end

  private

  # Logger used when the caller did not supply one: writes to $stderr at INFO.
  def default_logger
    Logger.new($stderr).tap { |log| log.level = Logger::INFO }
  end

  # StopWord objects built once from the :stop_words option (default: none).
  def stop_words
    @stop_words ||= options.fetch(:stop_words, []).map { |raw| StopWord.new(raw) }
  end

  # Maker for the label, chosen from the first label seen. The existing maker
  # is returned without taking the mutex; creation happens inside it.
  def label_maker(label)
    return @label_maker if @label_maker

    @mutex.synchronize do
      @label_maker ||= Maker.pick([Maker::Boolean, Maker::Number], 'label', label, self)
    end
  end

  # Maker for feature +k+, chosen from the first value +v+ seen for that key.
  # Lookup and creation follow the same scheme as #label_maker.
  def feature_maker(k, v)
    existing = @feature_makers[k]
    return existing if existing

    @mutex.synchronize do
      @feature_makers[k] ||= Maker.pick([Maker::Boolean, Maker::Number, Maker::Ngram, Maker::Phrase], k, v, self)
    end
  end
end
@@ -0,0 +1,46 @@
1
+ require 'vector_embed/maker/phrase'
2
+ require 'vector_embed/maker/ngram'
3
+ require 'vector_embed/maker/number'
4
+ require 'vector_embed/maker/boolean'
5
+
6
+ require 'murmurhash3'
7
+
8
class VectorEmbed
  # Base class of the per-feature embedders. A Maker is bound to one feature
  # key (+k+) and to the VectorEmbed that owns it (+parent+). Subclasses
  # supply +want?+ and either +value+ or their own +pairs+.
  class Maker
    class << self
      # Returns a new instance of the first class in +choices+ whose +want?+
      # accepts the first value seen for +k+.
      #
      # @raise [RuntimeError] when none of the choices wants the value
      def pick(choices, k, first_v, parent)
        chosen = choices.find { |candidate| candidate.want?(k, first_v, parent) }
        unless chosen
          raise "Can't use #{first_v.class} for #{k.inspect} given #{first_v.inspect} and choices #{choices.inspect}"
        end

        parent.logger.debug { "Interpreting #{k.inspect} as #{chosen.name.split('::').last} given first value #{first_v.inspect}" }
        chosen.new k, parent
      end

      # Feature index for +parts+: the parts are joined with a null byte,
      # hashed with 32-bit MurmurHash3, and the first seven decimal digits of
      # the hash are kept as an Integer.
      def index(*parts)
        digest = MurmurHash3::V32.str_hash(parts.join(NULL_BYTE))
        digest.to_s[0, 7].to_i
      end
    end

    attr_reader :parent
    attr_reader :k

    def initialize(k, parent)
      @k = k
      @parent = parent
    end

    # Index/value pairs for +v+. A scalar gives one pair indexed by the key;
    # an Array gives one pair per element, indexed by key and position.
    def pairs(v)
      return [[Maker.index(k), value(v)]] unless v.is_a?(Array)

      v.each_with_index.map do |element, position|
        [Maker.index(k, position), value(element)]
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require 'vector_embed/maker'
2
+
3
class VectorEmbed
  class Maker
    # Embeds true / false / null style values. As a label it yields 1 or 0;
    # as a feature it yields one indicator pair for the category "true",
    # "false" or "null".
    class Boolean < Maker
      class << self
        # Accepts only unambiguous first values: nil, true, false and the
        # strings 'true', 'false' and 'null'.
        def want?(k, v, parent)
          case v
          when NilClass, TrueClass, FalseClass, 'true', 'false', 'null' then true
          else false
          end
        end
      end

      # @return [Integer] 1 for a truthy spelling, 0 for a falsy one
      # @raise [RuntimeError] for anything else, including nil
      def value(v)
        return 1 if truthy?(v)
        return 0 if falsy?(v)

        raise "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
      end

      # @return [Array] a single [index, 1] pair for the value's category
      # @raise [ArgumentError] when the value is not true, false or null-like
      def pairs(v)
        category =
          if truthy?(v) then 'true'
          elsif falsy?(v) then 'false'
          elsif nullish?(v) then 'null'
          else raise ArgumentError, "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
          end
        [[Maker.index(k, category), 1]]
      end

      private

      # true itself or one of its accepted string spellings.
      def truthy?(v)
        case v
        when TrueClass, 'true', 't', 'yes', 'on' then true
        else false
        end
      end

      # false itself or one of its accepted string spellings.
      def falsy?(v)
        case v
        when FalseClass, 'false', 'f', 'no', 'off' then true
        else false
        end
      end

      # nil, the string 'null', or a blank string.
      def nullish?(v)
        case v
        when NilClass, 'null', BLANK then true
        else false
        end
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ require 'vector_embed/maker'
2
+
3
class VectorEmbed
  class Maker
    # Splits a string value into n-grams and emits one indicator pair per
    # n-gram. Two modes are implemented: single "words" split on :ngram_delim
    # (when :ngram_len is 1) and a sliding window of :ngram_len characters
    # (when :ngram_delim is the empty string).
    class Ngram < Maker
      class << self
        # Wants every value it is offered as long as the parent was configured
        # with :ngram_len. Returns the raw option, so only its truthiness is
        # meaningful.
        def want?(k, v, parent)
          parent.options[:ngram_len]
        end
      end

      attr_reader :len
      attr_reader :delim

      # @raise [ArgumentError] when :ngram_len does not convert to a positive
      #   integer
      def initialize(k, parent)
        super
        @len = parent.options[:ngram_len].to_i
        raise ArgumentError, ":ngram_len must be > 0" unless @len > 0
        @delim = parent.options[:ngram_delim]
      end

      # Index/value pairs for the n-grams of +v+, after the parent's
      # preprocessing (stop-word removal) has been applied.
      #
      # Each n-gram yields a flat [index, 1] pair, the same shape every other
      # maker returns. The earlier version wrapped each pair in an extra
      # array, which made VectorEmbed#line fail while sorting as soon as an
      # n-gram feature shared a line with a numeric or boolean feature.
      #
      # @param v [String] the text to split
      # @return [Array<Array>] one [index, 1] pair per n-gram, in text order
      # @raise [RuntimeError] when +v+ is not a String, or when the configured
      #   mode (length > 1 with a non-empty delimiter) is not supported
      def pairs(v)
        raise "Ngram can't handle #{v.inspect}, only a single string for now" unless v.is_a?(String)

        ngrams(parent.preprocess(v.to_s)).map do |ngram|
          [Maker.index(k, 'ngram', ngram), 1]
        end
      end

      private

      # Splits already-preprocessed +text+ according to the configured mode.
      def ngrams(text)
        if len == 1
          # word mode
          text.split delim
        elsif delim == ''
          # byte mode: every window of +len+ characters; a text shorter than
          # +len+ gives an empty range and therefore no n-grams
          (0..text.length - len).map { |i| text[i, len] }
        else
          raise "Word n-gram not supported yet"
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require 'vector_embed/maker'
2
+
3
class VectorEmbed
  class Maker
    # Embeds numeric values: real Numerics and strings (or symbols) that look
    # like a plain decimal or scientific-notation number.
    class Number < Maker
      class << self
        # True for Numerics and for text matching JUST_A_NUMBER.
        #
        # Matching goes through +case+ (Regexp#===), which is simply false for
        # objects that are neither strings nor symbols. Calling +=~+ on an
        # arbitrary object raises NoMethodError on Ruby versions that no
        # longer define Object#=~.
        def want?(k, v, parent)
          case v
          when ::Numeric, JUST_A_NUMBER then true
          else false
          end
        end

        # Converts +v+ to the number that is written to the output line.
        #
        # Text containing a decimal point or an exponent marker in either
        # case ('e' or 'E') is parsed as a Float, other text as an Integer.
        # Anything that is not a String or Symbol is used as-is.
        #
        # @return [Numeric, String] the number itself, or for values above
        #   1e10 a "%.10e" formatted String
        def numify(v)
          num =
            case v
            when String, Symbol
              text = v.to_s
              text =~ /[.eE]/ ? text.to_f : text.to_i
            else
              v
            end
          num > 1e10 ? ('%.10e' % num) : num
        end
      end

      # @return [Numeric, String] the numified value; nil counts as 0
      # @raise [ArgumentError] when +v+ is neither nil nor number-like
      def value(v)
        case v
        when Numeric, JUST_A_NUMBER
          Number.numify v
        when NilClass
          0
        else
          raise ArgumentError, "Can't embed #{v.inspect} in number feature #{k.inspect}"
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'vector_embed/maker'
2
+
3
class VectorEmbed
  class Maker
    # Catch-all maker: the whole value, converted to a string and
    # preprocessed by the parent, is treated as a single category.
    class Phrase < Maker
      # Accepts every value.
      def self.want?(k, v, parent)
        true
      end

      # @return [Array] one [index, 1] pair whose index is derived from the
      #   feature key and the preprocessed text
      def pairs(v)
        phrase = parent.preprocess(v.to_s)
        category_index = Maker.index(k, phrase)
        [[category_index, 1]]
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ require 'vector_embed'
2
+
3
class VectorEmbed
  # One stop word, compiled to a case-insensitive whole-word pattern that
  # also swallows the whitespace around each occurrence.
  class StopWord
    class << self
      # Returns a copy of +str+ with every stop word removed, runs of
      # whitespace collapsed to a single space, and the ends stripped.
      # +str+ itself is not modified.
      #
      # @param stop_words [Array<StopWord>]
      # @param str [String]
      # @return [String]
      def remove(stop_words, str)
        memo = str.dup
        stop_words.each { |stop_word| stop_word.apply! memo }
        memo.gsub!(/\s+/, ' ')
        memo.strip!
        memo
      end
    end

    # @param raw_stop_word [String, Regexp, #to_s] the word to remove. Plain
    #   text is escaped, so characters such as "." or "+" are matched
    #   literally instead of acting as regex operators. A Regexp is embedded
    #   unchanged.
    def initialize(raw_stop_word)
      source = raw_stop_word.is_a?(Regexp) ? raw_stop_word : Regexp.escape(raw_stop_word.to_s)
      @pattern = /\s*\b#{source}\b\s*/i
    end

    # Replaces every occurrence in +str+ (in place) with a single space.
    # Returns nil when nothing matched, as String#gsub! does.
    def apply!(str)
      str.gsub! @pattern, ' '
    end
  end
end
@@ -0,0 +1,3 @@
1
class VectorEmbed
  # Gem version. Frozen so the shared constant cannot be mutated in place.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,21 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # Require this file using `require "spec_helper"` to ensure that it is only
4
+ # loaded once.
5
+ #
6
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
RSpec.configure do |c|
  # Lets a bare symbol such as :focus act as metadata with the value true.
  c.treat_symbols_as_metadata_keys_with_true_values = true

  # Run only the examples tagged :focus; when the filter leaves nothing to
  # run, run everything instead.
  c.filter_run :focus
  c.run_all_when_everything_filtered = true

  # Randomised order surfaces order dependencies. To reproduce a failing
  # order, pass the seed printed after the run:
  #     --seed 1234
  c.order = 'random'
end
18
+
19
+ require 'pry'
20
+
21
+ require 'vector_embed'
@@ -0,0 +1,219 @@
1
+ require 'spec_helper'
2
+
3
describe VectorEmbed do
  describe 'in labels' do
    it "stores true/false as 1/0" do
      v = VectorEmbed.new
      { true => '1', false => '0', 'true' => '1', 'false' => '0' }.each do |label, expected|
        v.line(label).should == expected
      end
    end

    it "stores numbers as numbers" do
      v = VectorEmbed.new
      { 5.4 => '5.4', -3.9 => '-3.9' }.each do |label, expected|
        v.line(label).should == expected
      end
    end

    it "doesn't allow strings" do
      v = VectorEmbed.new
      lambda { v.line('foo') }.should raise_error(/string.*label/i)
    end

    it "doesn't allow mixing" do
      number_first = VectorEmbed.new
      number_first.line(5.4)
      lambda { number_first.line(true) }.should raise_error(/Can't embed.*number/)

      boolean_first = VectorEmbed.new
      boolean_first.line(true)
      lambda { boolean_first.line(5.4) }.should raise_error(/Can't embed.*boolean/)
    end
  end

  # aka dimension indexes
  describe 'in feature keys' do
    it "stores values as their string equivalents" do
      v = VectorEmbed.new
      [1, 5.4, '5.4', '5.4 ', 'foo', 'foo bar', true, 'true', false, 'false'].each do |key|
        v.line(1, key => 9).should == "1 #{l_h(key.to_s)}:9"
      end
    end

    it "treats nil as a blank string" do
      v = VectorEmbed.new
      v.line(1, nil => 9).should == "1 #{l_h('')}:9"
    end

    it "leaves whitespace alone" do
      v = VectorEmbed.new
      ['', ' ', ' ', ' foo ', '5.4 '].each do |key|
        v.line(1, key => 9).should == "1 #{l_h(key)}:9"
      end
    end

    it "orders feature names" do
      v = VectorEmbed.new
      v.line(1, 1 => 3, 2 => 7).should == "1 #{l_h('2')}:7 #{l_h('1')}:3"
    end

    it "allows mixed string and number feature values" do
      v = VectorEmbed.new
      v.line(1, a: :b).should == "1 #{l_h("a\x00b")}:1"
      v.line(1, a: 13).should == "1 #{l_h("a\x0013")}:1"
      v.line(1, 1 => 9).should == "1 #{l_h('1')}:9" # 9 is not hashed, 1 is
    end
  end

  describe 'feature values' do
    it "stores true/false/nil as (1,0,0)/(0,1,0)/(0,0,1)" do
      v = VectorEmbed.new
      categories = {
        true => 'true', 'true' => 'true',
        false => 'false', 'false' => 'false',
        nil => 'null', 'null' => 'null'
      }
      categories.each do |value, category|
        v.line(1, 1 => value).should == "1 #{l_h("1\x00#{category}")}:1"
      end
    end

    it "stores numbers as numbers" do
      v = VectorEmbed.new
      [
        [9, '9'], ['9', '9'],
        [5.4, '5.4'], ['5.4', '5.4'],
        [9e9, '9000000000.0'], ['9e9', '9000000000.0']
      ].each do |value, rendered|
        v.line(1, 1 => value).should == "1 #{l_h('1')}:#{rendered}"
      end
    end

    it "stores strings as m-category attributes" do
      v = VectorEmbed.new
      [
        ['sfh', 'sfh'],
        ['mfh', 'mfh'],
        ['foo bar', 'foo bar'],
        ['foo bar ', 'foo bar'],
        [' foo bar ', 'foo bar']
      ].each do |value, category|
        v.line(1, 1 => value).should == "1 #{l_h("1\x00#{category}")}:1"
      end
    end

    it "in string mode, treats true/false/nil as strings" do
      v = VectorEmbed.new
      [['foo', 'foo'], [true, 'true'], [false, 'false'], [nil, '']].each do |value, category|
        v.line(1, 1 => value).should == "1 #{l_h("1\x00#{category}")}:1"
      end
    end

    it "in string mode, treats numbers as strings" do
      v = VectorEmbed.new
      ['foo', 1, 5.4, 9e9].each do |value|
        v.line(1, 1 => value).should == "1 #{l_h("1\x00#{value}")}:1"
      end
    end

    it "flattens and stores arrays" do
      v = VectorEmbed.new
      v.line(1, 'foo' => [7,13,19]).should == sortme("1 #{l_h("foo\x001")}:13 #{l_h("foo\x000")}:7 #{l_h("foo\x002")}:19")
      v.line(1, 'bar' => ['a','b','c']).should == sortme("1 #{l_h("bar\x001\x00b")}:1 #{l_h("bar\x000\x00a")}:1 #{l_h("bar\x002\x00c")}:1")
    end

    it "in number mode, treats null as 0" do
      v = VectorEmbed.new
      v.line(1, 1 => 9).should == "1 #{l_h('1')}:9"
      v.line(1, 1 => nil).should == "1 #{l_h('1')}:0"
    end

    it "doesn't allow embedding boolean in number mode or vice-versa" do
      boolean_first = VectorEmbed.new
      boolean_first.line(1, 1 => true)
      boolean_first.line(1, 2 => 5.4) # that's fine, different dimension
      lambda { boolean_first.line(1, 1 => 5.4) }.should raise_error(ArgumentError)

      number_first = VectorEmbed.new
      number_first.line(1, 1 => 5.4)
      number_first.line(1, 2 => true) # that's fine, diff dim
      lambda { number_first.line(1, 1 => true) }.should raise_error(ArgumentError)
    end

    it "doesn't allow embedding string in number mode" do
      v = VectorEmbed.new
      v.line(1, 1 => 9)
      v.line(1, 2 => 'foo') # that's fine, different dimension
      lambda { v.line(1, 1 => 'foo') }.should raise_error(ArgumentError)
    end

    it "uses scientific notation for large numbers" do
      v = VectorEmbed.new
      v.line(5, 1 => 8.12e13).should == "5 #{l_h('1')}:8.1200000000e+13"
    end

    it "detects numbers in strings" do
      v = VectorEmbed.new
      v.line(5, 1 => '8.12e13').should == "5 #{l_h('1')}:8.1200000000e+13"
    end

    it "allows 2 byte n-grams" do
      v = VectorEmbed.new ngram_len: 2, ngram_delim: ''
      v.line(1, 1 => 'foo').should == sorted_ngram_line('fo', 'oo')
      v.line(1, 1 => 'bar').should == sorted_ngram_line('ar', 'ba')
      v.line(1, 1 => 'baz').should == sorted_ngram_line('az', 'ba')
      v.line(1, 1 => 'foobar').should == sorted_ngram_line('ar', 'fo', 'oo', 'ob', 'ba')
      v.line(1, 1 => 'foo bar').should == sorted_ngram_line('fo', 'oo', 'o ', ' b', 'ba', 'ar')
    end

    it "allows word-grams" do
      v = VectorEmbed.new ngram_len: 1, ngram_delim: /\s+/
      v.line(1, 1 => 'foo').should == sorted_ngram_line('foo')
      v.line(1, 1 => 'foobar').should == sorted_ngram_line('foobar')
      v.line(1, 1 => 'foo bar').should == sorted_ngram_line('bar', 'foo')
    end

    it "allows 2 byte n-grams with stop words" do
      v = VectorEmbed.new ngram_len: 2, ngram_delim: '', stop_words: %w{the and or}
      v.line(1, 1 => 'foo or').should == sorted_ngram_line('fo', 'oo')
      v.line(1, 1 => 'the bar').should == sorted_ngram_line('ar', 'ba')
      v.line(1, 1 => 'and baz').should == sorted_ngram_line('az', 'ba')
      v.line(1, 1 => 'foobar or the and').should == sorted_ngram_line('ar', 'fo', 'oo', 'ob', 'ba')
      v.line(1, 1 => 'foo or and the bar').should == sorted_ngram_line(' b', 'ar', 'fo', 'oo', 'o ', 'ba')
    end

    it "allows word-grams with stop words" do
      v = VectorEmbed.new ngram_len: 1, ngram_delim: /\s+/, stop_words: %w{the and or}
      # These expectations are deliberately not passed through sortme.
      v.line(1, 1 => 'foo or').should == "1 #{ngram_h('foo')}:1"
      v.line(1, 1 => 'foo the bar').should == "1 #{ngram_h('bar')}:1 #{ngram_h('foo')}:1"
      v.line(1, 1 => 'foo bar and').should == "1 #{ngram_h('bar')}:1 #{ngram_h('foo')}:1"
    end

    it "doesn't do anything weird when you have multiple features" do
      v = VectorEmbed.new
      v.line(1, 1 => 'foo', 2 => 'bar', 'baz' => 'zoo').should == sortme("1 #{l_h("1\x00foo")}:1 #{l_h("2\x00bar")}:1 #{l_h("baz\x00zoo")}:1")
    end
  end

  private

  # Raw 32-bit murmur hash of +v+.
  def h(v)
    MurmurHash3::V32.str_hash v
  end

  # for labels: the hash truncated to its first seven decimal digits
  def l_h(v)
    h(v).to_s[0..6].to_i
  end

  # Expected index of n-gram +gram+ of the feature named 1.
  def ngram_h(gram)
    l_h("1\x00ngram\x00#{gram}")
  end

  # Expected line for label 1 and the given n-grams of feature 1, with the
  # features ordered by sortme.
  def sorted_ngram_line(*grams)
    sortme("1 " + grams.map { |gram| "#{ngram_h(gram)}:1" }.join(' '))
  end

  # Re-orders the features of an expected line by numeric index.
  def sortme(line)
    label, *raw_features = line.split(' ')
    ordered = raw_features.map { |feature| feature.split(':') }.sort_by { |index, _| index.to_i }
    ([label] + ordered.map { |index, value| [index, value].join(':') }).join(' ')
  end
end
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/vector_embed/version', __FILE__)
3
+
4
# Package definition for the vector_embed gem.
Gem::Specification.new do |gem|
  gem.name          = "vector_embed"
  gem.version       = VectorEmbed::VERSION
  gem.authors       = ["Seamus Abshere"]
  gem.email         = ["seamus@abshere.net"]
  gem.description   = %q{Vector embedding of strings, booleans, numerics, and arrays into LIBSVM / LIBLINEAR format.}
  gem.summary       = %q{Vector embedding of strings, booleans, numerics, and arrays into LIBSVM / LIBLINEAR format.}
  gem.homepage      = "https://github.com/seamusabshere/vector_embed"
  # Matches the MIT text shipped in LICENSE / LICENSE.txt, so the published
  # metadata no longer reports an empty licence list.
  gem.license       = "MIT"

  # Package exactly what git tracks; executables and test files are derived
  # from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency 'murmurhash3'

  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'pry'
  gem.add_development_dependency 'yard'
end
metadata ADDED
@@ -0,0 +1,135 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: vector_embed
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Seamus Abshere
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-02-20 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: murmurhash3
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rspec
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: pry
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: yard
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: Vector embedding of strings, booleans, numerics, and arrays into LIBSVM
79
+ / LIBLINEAR format.
80
+ email:
81
+ - seamus@abshere.net
82
+ executables:
83
+ - vector_embed
84
+ extensions: []
85
+ extra_rdoc_files: []
86
+ files:
87
+ - .gitignore
88
+ - .rspec
89
+ - CHANGELOG
90
+ - Gemfile
91
+ - LICENSE
92
+ - LICENSE.txt
93
+ - README.md
94
+ - Rakefile
95
+ - bin/vector_embed
96
+ - lib/vector_embed.rb
97
+ - lib/vector_embed/maker.rb
98
+ - lib/vector_embed/maker/boolean.rb
99
+ - lib/vector_embed/maker/ngram.rb
100
+ - lib/vector_embed/maker/number.rb
101
+ - lib/vector_embed/maker/phrase.rb
102
+ - lib/vector_embed/stop_word.rb
103
+ - lib/vector_embed/version.rb
104
+ - spec/spec_helper.rb
105
+ - spec/vector_embed_spec.rb
106
+ - vector_embed.gemspec
107
+ homepage: https://github.com/seamusabshere/vector_embed
108
+ licenses: []
109
+ post_install_message:
110
+ rdoc_options: []
111
+ require_paths:
112
+ - lib
113
+ required_ruby_version: !ruby/object:Gem::Requirement
114
+ none: false
115
+ requirements:
116
+ - - ! '>='
117
+ - !ruby/object:Gem::Version
118
+ version: '0'
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ none: false
121
+ requirements:
122
+ - - ! '>='
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ requirements: []
126
+ rubyforge_project:
127
+ rubygems_version: 1.8.25
128
+ signing_key:
129
+ specification_version: 3
130
+ summary: Vector embedding of strings, booleans, numerics, and arrays into LIBSVM /
131
+ LIBLINEAR format.
132
+ test_files:
133
+ - spec/spec_helper.rb
134
+ - spec/vector_embed_spec.rb
135
+ has_rdoc: