vector_embed 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +11 -0
- data/README.md +15 -0
- data/bin/csv2libsvm +17 -0
- data/lib/vector_embed/maker/boolean.rb +6 -6
- data/lib/vector_embed/maker/number.rb +1 -1
- data/lib/vector_embed/version.rb +1 -1
- data/lib/vector_embed.rb +6 -0
- data/spec/vector_embed_spec.rb +11 -0
- metadata +4 -4
- data/bin/vector_embed +0 -3
data/CHANGELOG
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
0.1.0 / 2013-02-20
|
2
|
+
|
3
|
+
* Enhancements
|
4
|
+
|
5
|
+
* csv2libsvm binary
|
6
|
+
* "null" and "NULL" are treated as nil (0 in number mode)
|
7
|
+
|
8
|
+
* Breaking changes
|
9
|
+
|
10
|
+
* 'yes', 'on' / 'no', 'off' no longer treated as true/false
|
11
|
+
|
1
12
|
0.0.1 / 2013-02-20
|
2
13
|
|
3
14
|
* First release!
|
data/README.md
CHANGED
@@ -4,6 +4,21 @@ Vector embedding of strings, booleans, numerics, and arrays into [LIBSVM](http:/
|
|
4
4
|
|
5
5
|
Inspired by [Sally](http://www.mlsec.org/sally/), except `VectorEmbed` is meant to handle categorical and continuous data at the same time.
|
6
6
|
|
7
|
+
## csv2libsvm
|
8
|
+
|
9
|
+
$ cat houses.csv
|
10
|
+
label,household_income,zip_code_id,year
|
11
|
+
+1,35893,53703,1904
|
12
|
+
-1,43708,53711,1977
|
13
|
+
+1,103214,53719,NULL
|
14
|
+
-1,49250,53704,1950
|
15
|
+
|
16
|
+
$ csv2libsvm houses.csv
|
17
|
+
1 1243483:35893 6439848:53703 8227451:1904
|
18
|
+
-1 1243483:43708 6439848:53711 8227451:1977
|
19
|
+
1 1243483:103214 6439848:53719 8227451:0
|
20
|
+
-1 1243483:49250 6439848:53704 8227451:1950
|
21
|
+
|
7
22
|
## Usage
|
8
23
|
|
9
24
|
Create a `VectorEmbed` instance, which auto-detects and then remembers what kind of data goes into each feature:
|
data/bin/csv2libsvm
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
if File.exist?('Gemfile')
|
4
|
+
require 'bundler/setup'
|
5
|
+
end
|
6
|
+
|
7
|
+
require 'csv'
|
8
|
+
require 'vector_embed'
|
9
|
+
|
10
|
+
csv_path = ARGV[0]
|
11
|
+
|
12
|
+
v = VectorEmbed.new
|
13
|
+
CSV.foreach(csv_path, headers: :first_row) do |row|
|
14
|
+
features = row.to_hash
|
15
|
+
label = features.delete('label')
|
16
|
+
puts v.line(label, features)
|
17
|
+
end
|
data/lib/vector_embed/maker/boolean.rb
CHANGED
@@ -6,7 +6,7 @@ class VectorEmbed
|
|
6
6
|
class << self
|
7
7
|
def want?(k, v, parent)
|
8
8
|
case v
|
9
|
-
when NilClass, TrueClass, FalseClass,
|
9
|
+
when NilClass, TrueClass, FalseClass, TRUE, FALSE, T, F, NULL, SLASH_N
|
10
10
|
true
|
11
11
|
else
|
12
12
|
false
|
@@ -16,9 +16,9 @@ class VectorEmbed
|
|
16
16
|
|
17
17
|
def value(v)
|
18
18
|
case v
|
19
|
-
when TrueClass,
|
19
|
+
when TrueClass, TRUE, T
|
20
20
|
1
|
21
|
-
when FalseClass,
|
21
|
+
when FalseClass, FALSE, F
|
22
22
|
0
|
23
23
|
else
|
24
24
|
raise "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
|
@@ -27,11 +27,11 @@ class VectorEmbed
|
|
27
27
|
|
28
28
|
def pairs(v)
|
29
29
|
case v
|
30
|
-
when TrueClass,
|
30
|
+
when TrueClass, TRUE, T
|
31
31
|
[ [ Maker.index(k, 'true'), 1 ] ]
|
32
|
-
when FalseClass,
|
32
|
+
when FalseClass, FALSE, F
|
33
33
|
[ [ Maker.index(k, 'false'), 1 ] ]
|
34
|
-
when NilClass,
|
34
|
+
when NilClass, NULL, SLASH_N, BLANK
|
35
35
|
[ [ Maker.index(k, 'null'), 1 ] ]
|
36
36
|
else
|
37
37
|
raise ArgumentError, "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
|
data/lib/vector_embed/version.rb
CHANGED
data/lib/vector_embed.rb
CHANGED
@@ -9,6 +9,12 @@ class VectorEmbed
|
|
9
9
|
# http://stackoverflow.com/questions/638565/parsing-scientific-notation-sensibly
|
10
10
|
JUST_A_NUMBER = /\A\s*[+\-]?(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s*\z/
|
11
11
|
BLANK = /\A\s*\z/
|
12
|
+
NULL = /\Anull\z/i
|
13
|
+
SLASH_N = '\N'
|
14
|
+
TRUE = /\Atrue\z/i
|
15
|
+
T = /\At\z/i
|
16
|
+
FALSE = /\Afalse\z/i
|
17
|
+
F = /\Af\z/i
|
12
18
|
NULL_BYTE = "\x00"
|
13
19
|
|
14
20
|
attr_reader :options
|
data/spec/vector_embed_spec.rb
CHANGED
@@ -79,10 +79,18 @@ describe VectorEmbed do
|
|
79
79
|
v = VectorEmbed.new
|
80
80
|
v.line(1, 1 => true).should == "1 #{l_h("1\x00true")}:1"
|
81
81
|
v.line(1, 1 => 'true').should == "1 #{l_h("1\x00true")}:1"
|
82
|
+
v.line(1, 1 => 'TRUE').should == "1 #{l_h("1\x00true")}:1"
|
83
|
+
v.line(1, 1 => 't').should == "1 #{l_h("1\x00true")}:1"
|
84
|
+
v.line(1, 1 => 'T').should == "1 #{l_h("1\x00true")}:1"
|
82
85
|
v.line(1, 1 => false).should == "1 #{l_h("1\x00false")}:1"
|
83
86
|
v.line(1, 1 => 'false').should == "1 #{l_h("1\x00false")}:1"
|
87
|
+
v.line(1, 1 => 'FALSE').should == "1 #{l_h("1\x00false")}:1"
|
88
|
+
v.line(1, 1 => 'f').should == "1 #{l_h("1\x00false")}:1"
|
89
|
+
v.line(1, 1 => 'F').should == "1 #{l_h("1\x00false")}:1"
|
84
90
|
v.line(1, 1 => nil).should == "1 #{l_h("1\x00null")}:1"
|
85
91
|
v.line(1, 1 => 'null').should == "1 #{l_h("1\x00null")}:1"
|
92
|
+
v.line(1, 1 => 'NULL').should == "1 #{l_h("1\x00null")}:1"
|
93
|
+
v.line(1, 1 => '\N').should == "1 #{l_h("1\x00null")}:1"
|
86
94
|
end
|
87
95
|
|
88
96
|
it "stores numbers as numbers" do
|
@@ -130,6 +138,9 @@ describe VectorEmbed do
|
|
130
138
|
v = VectorEmbed.new
|
131
139
|
v.line(1, 1 => 9).should == "1 #{l_h('1')}:9"
|
132
140
|
v.line(1, 1 => nil).should == "1 #{l_h('1')}:0"
|
141
|
+
v.line(1, 1 => 'null').should == "1 #{l_h('1')}:0"
|
142
|
+
v.line(1, 1 => 'NULL').should == "1 #{l_h('1')}:0"
|
143
|
+
v.line(1, 1 => '\N').should == "1 #{l_h('1')}:0"
|
133
144
|
end
|
134
145
|
|
135
146
|
it "doesn't allow embedding boolean in number mode or vice-versa" do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vector_embed
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-02-
|
12
|
+
date: 2013-02-22 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: murmurhash3
|
@@ -80,7 +80,7 @@ description: Vector embedding of strings, booleans, numerics, and arrays into LI
|
|
80
80
|
email:
|
81
81
|
- seamus@abshere.net
|
82
82
|
executables:
|
83
|
-
- vector_embed
|
83
|
+
- csv2libsvm
|
84
84
|
extensions: []
|
85
85
|
extra_rdoc_files: []
|
86
86
|
files:
|
@@ -92,7 +92,7 @@ files:
|
|
92
92
|
- LICENSE.txt
|
93
93
|
- README.md
|
94
94
|
- Rakefile
|
95
|
-
- bin/vector_embed
|
95
|
+
- bin/csv2libsvm
|
96
96
|
- lib/vector_embed.rb
|
97
97
|
- lib/vector_embed/maker.rb
|
98
98
|
- lib/vector_embed/maker/boolean.rb
|
data/bin/vector_embed
DELETED