vector_embed 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,14 @@
1
+ 0.1.0 / 2013-02-20
2
+
3
+ * Enhancements
4
+
5
+ * csv2libsvm binary
6
+ * "null" and "NULL" are treated as nil (0 in number mode)
7
+
8
+ * Breaking changes
9
+
10
+ * 'yes', 'on' / 'no', 'off' no longer treated as true/false
11
+
1
12
  0.0.1 / 2013-02-20
2
13
 
3
14
  * First release!
data/README.md CHANGED
@@ -4,6 +4,21 @@ Vector embedding of strings, booleans, numerics, and arrays into [LIBSVM](http:/
4
4
 
5
5
  Inspired by [Sally](http://www.mlsec.org/sally/), except `VectorEmbed` is meant to handle categorical and continuous data at the same time.
6
6
 
7
+ ## csv2libsvm
8
+
9
+ $ cat houses.csv
10
+ label,household_income,zip_code_id,year
11
+ +1,35893,53703,1904
12
+ -1,43708,53711,1977
13
+ +1,103214,53719,NULL
14
+ -1,49250,53704,1950
15
+
16
+ $ csv2libsvm houses.csv
17
+ 1 1243483:35893 6439848:53703 8227451:1904
18
+ -1 1243483:43708 6439848:53711 8227451:1977
19
+ 1 1243483:103214 6439848:53719 8227451:0
20
+ -1 1243483:49250 6439848:53704 8227451:1950
21
+
7
22
  ## Usage
8
23
 
9
24
  Create a `VectorEmbed` instance, which auto-detects and then remembers what kind of data goes into each feature:
data/bin/csv2libsvm ADDED
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ if File.exist?('Gemfile')
4
+ require 'bundler/setup'
5
+ end
6
+
7
+ require 'csv'
8
+ require 'vector_embed'
9
+
10
+ csv_path = ARGV[0]
11
+
12
+ v = VectorEmbed.new
13
+ CSV.foreach(csv_path, headers: :first_row) do |row|
14
+ features = row.to_hash
15
+ label = features.delete('label')
16
+ puts v.line(label, features)
17
+ end
@@ -6,7 +6,7 @@ class VectorEmbed
6
6
  class << self
7
7
  def want?(k, v, parent)
8
8
  case v
9
- when NilClass, TrueClass, FalseClass, 'true', 'false', 'null'
9
+ when NilClass, TrueClass, FalseClass, TRUE, FALSE, T, F, NULL, SLASH_N
10
10
  true
11
11
  else
12
12
  false
@@ -16,9 +16,9 @@ class VectorEmbed
16
16
 
17
17
  def value(v)
18
18
  case v
19
- when TrueClass, 'true', 't', 'yes', 'on'
19
+ when TrueClass, TRUE, T
20
20
  1
21
- when FalseClass, 'false', 'f', 'no', 'off'
21
+ when FalseClass, FALSE, F
22
22
  0
23
23
  else
24
24
  raise "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
@@ -27,11 +27,11 @@ class VectorEmbed
27
27
 
28
28
  def pairs(v)
29
29
  case v
30
- when TrueClass, 'true', 't', 'yes', 'on'
30
+ when TrueClass, TRUE, T
31
31
  [ [ Maker.index(k, 'true'), 1 ] ]
32
- when FalseClass, 'false', 'f', 'no', 'off'
32
+ when FalseClass, FALSE, F
33
33
  [ [ Maker.index(k, 'false'), 1 ] ]
34
- when NilClass, 'null', BLANK
34
+ when NilClass, NULL, SLASH_N, BLANK
35
35
  [ [ Maker.index(k, 'null'), 1 ] ]
36
36
  else
37
37
  raise ArgumentError, "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
@@ -26,7 +26,7 @@ class VectorEmbed
26
26
  case v
27
27
  when Numeric, JUST_A_NUMBER
28
28
  Number.numify v
29
- when NilClass
29
+ when NilClass, NULL, SLASH_N
30
30
  0
31
31
  else
32
32
  raise ArgumentError, "Can't embed #{v.inspect} in number feature #{k.inspect}"
@@ -1,3 +1,3 @@
1
1
  class VectorEmbed
2
- VERSION = "0.0.1"
2
+ VERSION = "0.1.0"
3
3
  end
data/lib/vector_embed.rb CHANGED
@@ -9,6 +9,12 @@ class VectorEmbed
9
9
  # http://stackoverflow.com/questions/638565/parsing-scientific-notation-sensibly
10
10
  JUST_A_NUMBER = /\A\s*[+\-]?(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s*\z/
11
11
  BLANK = /\A\s*\z/
12
+ NULL = /\Anull\z/i
13
+ SLASH_N = '\N'
14
+ TRUE = /\Atrue\z/i
15
+ T = /\At\z/i
16
+ FALSE = /\Afalse\z/i
17
+ F = /\Af\z/i
12
18
  NULL_BYTE = "\x00"
13
19
 
14
20
  attr_reader :options
@@ -79,10 +79,18 @@ describe VectorEmbed do
79
79
  v = VectorEmbed.new
80
80
  v.line(1, 1 => true).should == "1 #{l_h("1\x00true")}:1"
81
81
  v.line(1, 1 => 'true').should == "1 #{l_h("1\x00true")}:1"
82
+ v.line(1, 1 => 'TRUE').should == "1 #{l_h("1\x00true")}:1"
83
+ v.line(1, 1 => 't').should == "1 #{l_h("1\x00true")}:1"
84
+ v.line(1, 1 => 'T').should == "1 #{l_h("1\x00true")}:1"
82
85
  v.line(1, 1 => false).should == "1 #{l_h("1\x00false")}:1"
83
86
  v.line(1, 1 => 'false').should == "1 #{l_h("1\x00false")}:1"
87
+ v.line(1, 1 => 'FALSE').should == "1 #{l_h("1\x00false")}:1"
88
+ v.line(1, 1 => 'f').should == "1 #{l_h("1\x00false")}:1"
89
+ v.line(1, 1 => 'F').should == "1 #{l_h("1\x00false")}:1"
84
90
  v.line(1, 1 => nil).should == "1 #{l_h("1\x00null")}:1"
85
91
  v.line(1, 1 => 'null').should == "1 #{l_h("1\x00null")}:1"
92
+ v.line(1, 1 => 'NULL').should == "1 #{l_h("1\x00null")}:1"
93
+ v.line(1, 1 => '\N').should == "1 #{l_h("1\x00null")}:1"
86
94
  end
87
95
 
88
96
  it "stores numbers as numbers" do
@@ -130,6 +138,9 @@ describe VectorEmbed do
130
138
  v = VectorEmbed.new
131
139
  v.line(1, 1 => 9).should == "1 #{l_h('1')}:9"
132
140
  v.line(1, 1 => nil).should == "1 #{l_h('1')}:0"
141
+ v.line(1, 1 => 'null').should == "1 #{l_h('1')}:0"
142
+ v.line(1, 1 => 'NULL').should == "1 #{l_h('1')}:0"
143
+ v.line(1, 1 => '\N').should == "1 #{l_h('1')}:0"
133
144
  end
134
145
 
135
146
  it "doesn't allow embedding boolean in number mode or vice-versa" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vector_embed
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-20 00:00:00.000000000 Z
12
+ date: 2013-02-22 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: murmurhash3
@@ -80,7 +80,7 @@ description: Vector embedding of strings, booleans, numerics, and arrays into LI
80
80
  email:
81
81
  - seamus@abshere.net
82
82
  executables:
83
- - vector_embed
83
+ - csv2libsvm
84
84
  extensions: []
85
85
  extra_rdoc_files: []
86
86
  files:
@@ -92,7 +92,7 @@ files:
92
92
  - LICENSE.txt
93
93
  - README.md
94
94
  - Rakefile
95
- - bin/vector_embed
95
+ - bin/csv2libsvm
96
96
  - lib/vector_embed.rb
97
97
  - lib/vector_embed/maker.rb
98
98
  - lib/vector_embed/maker/boolean.rb
data/bin/vector_embed DELETED
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'vector_embed'