vector_embed 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,14 @@
1
+ 0.1.0 / 2013-02-20
2
+
3
+ * Enhancements
4
+
5
+ * csv2libsvm binary
6
+ * "null" and "NULL" are treated as nil (0 in number mode)
7
+
8
+ * Breaking changes
9
+
10
+ * 'yes', 'on' / 'no', 'off' no longer treated as true/false
11
+
1
12
  0.0.1 / 2013-02-20
2
13
 
3
14
  * First release!
data/README.md CHANGED
@@ -4,6 +4,21 @@ Vector embedding of strings, booleans, numerics, and arrays into [LIBSVM](http:/
4
4
 
5
5
  Inspired by [Sally](http://www.mlsec.org/sally/), except `VectorEmbed` is meant to handle categorical and continuous data at the same time.
6
6
 
7
+ ## csv2libsvm
8
+
9
+ $ cat houses.csv
10
+ label,household_income,zip_code_id,year
11
+ +1,35893,53703,1904
12
+ -1,43708,53711,1977
13
+ +1,103214,53719,NULL
14
+ -1,49250,53704,1950
15
+
16
+ $ csv2libsvm houses.csv
17
+ 1 1243483:35893 6439848:53703 8227451:1904
18
+ -1 1243483:43708 6439848:53711 8227451:1977
19
+ 1 1243483:103214 6439848:53719 8227451:0
20
+ -1 1243483:49250 6439848:53704 8227451:1950
21
+
7
22
  ## Usage
8
23
 
9
24
  Create a `VectorEmbed` instance, which auto-detects and then remembers what kind of data goes into each feature:
data/bin/csv2libsvm ADDED
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ if File.exist?('Gemfile')
4
+ require 'bundler/setup'
5
+ end
6
+
7
+ require 'csv'
8
+ require 'vector_embed'
9
+
10
+ csv_path = ARGV[0]
11
+
12
+ v = VectorEmbed.new
13
+ CSV.foreach(csv_path, headers: :first_row) do |row|
14
+ features = row.to_hash
15
+ label = features.delete('label')
16
+ puts v.line(label, features)
17
+ end
@@ -6,7 +6,7 @@ class VectorEmbed
6
6
  class << self
7
7
  def want?(k, v, parent)
8
8
  case v
9
- when NilClass, TrueClass, FalseClass, 'true', 'false', 'null'
9
+ when NilClass, TrueClass, FalseClass, TRUE, FALSE, T, F, NULL, SLASH_N
10
10
  true
11
11
  else
12
12
  false
@@ -16,9 +16,9 @@ class VectorEmbed
16
16
 
17
17
  def value(v)
18
18
  case v
19
- when TrueClass, 'true', 't', 'yes', 'on'
19
+ when TrueClass, TRUE, T
20
20
  1
21
- when FalseClass, 'false', 'f', 'no', 'off'
21
+ when FalseClass, FALSE, F
22
22
  0
23
23
  else
24
24
  raise "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
@@ -27,11 +27,11 @@ class VectorEmbed
27
27
 
28
28
  def pairs(v)
29
29
  case v
30
- when TrueClass, 'true', 't', 'yes', 'on'
30
+ when TrueClass, TRUE, T
31
31
  [ [ Maker.index(k, 'true'), 1 ] ]
32
- when FalseClass, 'false', 'f', 'no', 'off'
32
+ when FalseClass, FALSE, F
33
33
  [ [ Maker.index(k, 'false'), 1 ] ]
34
- when NilClass, 'null', BLANK
34
+ when NilClass, NULL, SLASH_N, BLANK
35
35
  [ [ Maker.index(k, 'null'), 1 ] ]
36
36
  else
37
37
  raise ArgumentError, "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
@@ -26,7 +26,7 @@ class VectorEmbed
26
26
  case v
27
27
  when Numeric, JUST_A_NUMBER
28
28
  Number.numify v
29
- when NilClass
29
+ when NilClass, NULL, SLASH_N
30
30
  0
31
31
  else
32
32
  raise ArgumentError, "Can't embed #{v.inspect} in number feature #{k.inspect}"
@@ -1,3 +1,3 @@
1
1
  class VectorEmbed
2
- VERSION = "0.0.1"
2
+ VERSION = "0.1.0"
3
3
  end
data/lib/vector_embed.rb CHANGED
@@ -9,6 +9,12 @@ class VectorEmbed
9
9
  # http://stackoverflow.com/questions/638565/parsing-scientific-notation-sensibly
10
10
  JUST_A_NUMBER = /\A\s*[+\-]?(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s*\z/
11
11
  BLANK = /\A\s*\z/
12
+ NULL = /\Anull\z/i
13
+ SLASH_N = '\N'
14
+ TRUE = /\Atrue\z/i
15
+ T = /\At\z/i
16
+ FALSE = /\Afalse\z/i
17
+ F = /\Af\z/i
12
18
  NULL_BYTE = "\x00"
13
19
 
14
20
  attr_reader :options
@@ -79,10 +79,18 @@ describe VectorEmbed do
79
79
  v = VectorEmbed.new
80
80
  v.line(1, 1 => true).should == "1 #{l_h("1\x00true")}:1"
81
81
  v.line(1, 1 => 'true').should == "1 #{l_h("1\x00true")}:1"
82
+ v.line(1, 1 => 'TRUE').should == "1 #{l_h("1\x00true")}:1"
83
+ v.line(1, 1 => 't').should == "1 #{l_h("1\x00true")}:1"
84
+ v.line(1, 1 => 'T').should == "1 #{l_h("1\x00true")}:1"
82
85
  v.line(1, 1 => false).should == "1 #{l_h("1\x00false")}:1"
83
86
  v.line(1, 1 => 'false').should == "1 #{l_h("1\x00false")}:1"
87
+ v.line(1, 1 => 'FALSE').should == "1 #{l_h("1\x00false")}:1"
88
+ v.line(1, 1 => 'f').should == "1 #{l_h("1\x00false")}:1"
89
+ v.line(1, 1 => 'F').should == "1 #{l_h("1\x00false")}:1"
84
90
  v.line(1, 1 => nil).should == "1 #{l_h("1\x00null")}:1"
85
91
  v.line(1, 1 => 'null').should == "1 #{l_h("1\x00null")}:1"
92
+ v.line(1, 1 => 'NULL').should == "1 #{l_h("1\x00null")}:1"
93
+ v.line(1, 1 => '\N').should == "1 #{l_h("1\x00null")}:1"
86
94
  end
87
95
 
88
96
  it "stores numbers as numbers" do
@@ -130,6 +138,9 @@ describe VectorEmbed do
130
138
  v = VectorEmbed.new
131
139
  v.line(1, 1 => 9).should == "1 #{l_h('1')}:9"
132
140
  v.line(1, 1 => nil).should == "1 #{l_h('1')}:0"
141
+ v.line(1, 1 => 'null').should == "1 #{l_h('1')}:0"
142
+ v.line(1, 1 => 'NULL').should == "1 #{l_h('1')}:0"
143
+ v.line(1, 1 => '\N').should == "1 #{l_h('1')}:0"
133
144
  end
134
145
 
135
146
  it "doesn't allow embedding boolean in number mode or vice-versa" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vector_embed
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-20 00:00:00.000000000 Z
12
+ date: 2013-02-22 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: murmurhash3
@@ -80,7 +80,7 @@ description: Vector embedding of strings, booleans, numerics, and arrays into LI
80
80
  email:
81
81
  - seamus@abshere.net
82
82
  executables:
83
- - vector_embed
83
+ - csv2libsvm
84
84
  extensions: []
85
85
  extra_rdoc_files: []
86
86
  files:
@@ -92,7 +92,7 @@ files:
92
92
  - LICENSE.txt
93
93
  - README.md
94
94
  - Rakefile
95
- - bin/vector_embed
95
+ - bin/csv2libsvm
96
96
  - lib/vector_embed.rb
97
97
  - lib/vector_embed/maker.rb
98
98
  - lib/vector_embed/maker/boolean.rb
data/bin/vector_embed DELETED
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'vector_embed'