vector_embed 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/CHANGELOG +7 -0
- data/bin/csv2libsvm +3 -1
- data/lib/vector_embed/maker/boolean.rb +4 -4
- data/lib/vector_embed/maker/ngram.rb +2 -2
- data/lib/vector_embed/maker/number.rb +1 -1
- data/lib/vector_embed/maker/phrase.rb +2 -2
- data/lib/vector_embed/maker.rb +3 -9
- data/lib/vector_embed/version.rb +1 -1
- data/lib/vector_embed.rb +23 -3
- data/spec/vector_embed_spec.rb +23 -0
- metadata +5 -15
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
YWRiOGI3YTdkMDIwNDljNTA0NDkzOGNiNGVkNThiZjczNjdlNDU3OQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
ZDIxY2Q1NDFjMjkxNDFkOGJkZTk0NTZiMDc4NTgwNjYwZGE5MDI1MQ==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
YzAwOGQ0NjlmNThiYzNmZmQxNDk0ZTY2ZTIxMTA2M2NjMjZiNGE4MGU0NzE3
|
10
|
+
YTZhNTkzM2ViMWU2M2FhZDE5NTk5YTdkOWU3NGYxYjVjNmJkYmZjNGVhMTU0
|
11
|
+
ODY1OTkzNmFhNDIwODY4MjUwYTJjODU0ZDgzNWYzNmE1ZTljYzg=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
ZjM3OGUyNDk1YzIxMjc5OTgyYmI3ZWY3NGEwYjRjYWY1MTk0MjJhYzhjODU0
|
14
|
+
NTg0OWYxNWU4ZTQ3YTQ0ZDFlOWVjYzMyYmZhOTE0MjNlMDRjMjMyMDkwNGMy
|
15
|
+
NDNhMjhkYjUzNGRhZmRhNjliMGFiNzY1NmNiNzk5MDhhOTUyOTU=
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
0.1.1 / 2013-04-04
|
2
|
+
|
3
|
+
* Enhancements
|
4
|
+
|
5
|
+
* If you pass a Hash as :dict, it will start feature labels at 1 rather than murmurhashing; you can then serialize VectorEmbed#dict
|
6
|
+
* Better error message if you run csv2libsvm on a CSV without a "label" column
|
7
|
+
|
1
8
|
0.1.0 / 2013-02-20
|
2
9
|
|
3
10
|
* Enhancements
|
data/bin/csv2libsvm
CHANGED
@@ -12,6 +12,8 @@ csv_path = ARGV[0]
|
|
12
12
|
v = VectorEmbed.new
|
13
13
|
CSV.foreach(csv_path, headers: :first_row) do |row|
|
14
14
|
features = row.to_hash
|
15
|
-
label = features.delete('label')
|
15
|
+
unless label = features.delete('label')
|
16
|
+
raise "No label found - do you have a column named 'label'?"
|
17
|
+
end
|
16
18
|
puts v.line(label, features)
|
17
19
|
end
|
@@ -4,7 +4,7 @@ class VectorEmbed
|
|
4
4
|
class Maker
|
5
5
|
class Boolean < Maker
|
6
6
|
class << self
|
7
|
-
def want?(
|
7
|
+
def want?(v, parent)
|
8
8
|
case v
|
9
9
|
when NilClass, TrueClass, FalseClass, TRUE, FALSE, T, F, NULL, SLASH_N
|
10
10
|
true
|
@@ -28,11 +28,11 @@ class VectorEmbed
|
|
28
28
|
def pairs(v)
|
29
29
|
case v
|
30
30
|
when TrueClass, TRUE, T
|
31
|
-
[ [
|
31
|
+
[ [ parent.index([k, 'true']), 1 ] ]
|
32
32
|
when FalseClass, FALSE, F
|
33
|
-
[ [
|
33
|
+
[ [ parent.index([k, 'false']), 1 ] ]
|
34
34
|
when NilClass, NULL, SLASH_N, BLANK
|
35
|
-
[ [
|
35
|
+
[ [ parent.index([k, 'null']), 1 ] ]
|
36
36
|
else
|
37
37
|
raise ArgumentError, "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
|
38
38
|
end
|
@@ -4,7 +4,7 @@ class VectorEmbed
|
|
4
4
|
class Maker
|
5
5
|
class Ngram < Maker
|
6
6
|
class << self
|
7
|
-
def want?(
|
7
|
+
def want?(v, parent)
|
8
8
|
parent.options[:ngram_len]
|
9
9
|
end
|
10
10
|
end
|
@@ -31,7 +31,7 @@ class VectorEmbed
|
|
31
31
|
else
|
32
32
|
raise "Word n-gram not supported yet"
|
33
33
|
end.map do |ngram|
|
34
|
-
[ [
|
34
|
+
[ [ parent.index([k, 'ngram', ngram]), 1 ] ]
|
35
35
|
end
|
36
36
|
end
|
37
37
|
end
|
@@ -4,14 +4,14 @@ class VectorEmbed
|
|
4
4
|
class Maker
|
5
5
|
class Phrase < Maker
|
6
6
|
class << self
|
7
|
-
def want?(
|
7
|
+
def want?(v, parent)
|
8
8
|
true
|
9
9
|
end
|
10
10
|
end
|
11
11
|
|
12
12
|
def pairs(v)
|
13
13
|
v = parent.preprocess v.to_s
|
14
|
-
[ [
|
14
|
+
[ [ parent.index([k, v]), 1 ] ]
|
15
15
|
end
|
16
16
|
end
|
17
17
|
end
|
data/lib/vector_embed/maker.rb
CHANGED
@@ -3,23 +3,17 @@ require 'vector_embed/maker/ngram'
|
|
3
3
|
require 'vector_embed/maker/number'
|
4
4
|
require 'vector_embed/maker/boolean'
|
5
5
|
|
6
|
-
require 'murmurhash3'
|
7
|
-
|
8
6
|
class VectorEmbed
|
9
7
|
class Maker
|
10
8
|
class << self
|
11
9
|
def pick(choices, k, first_v, parent)
|
12
|
-
if klass = choices.detect { |klass| klass.want?(
|
10
|
+
if klass = choices.detect { |klass| klass.want?(first_v, parent) }
|
13
11
|
parent.logger.debug { "Interpreting #{k.inspect} as #{klass.name.split('::').last} given first value #{first_v.inspect}" }
|
14
12
|
klass.new k, parent
|
15
13
|
else
|
16
14
|
raise "Can't use #{first_v.class} for #{k.inspect} given #{first_v.inspect} and choices #{choices.inspect}"
|
17
15
|
end
|
18
16
|
end
|
19
|
-
|
20
|
-
def index(*parts)
|
21
|
-
MurmurHash3::V32.str_hash(parts.join(NULL_BYTE)).to_s[0..6].to_i
|
22
|
-
end
|
23
17
|
end
|
24
18
|
|
25
19
|
attr_reader :parent
|
@@ -35,11 +29,11 @@ class VectorEmbed
|
|
35
29
|
when Array
|
36
30
|
memo = []
|
37
31
|
v.each_with_index do |vv, i|
|
38
|
-
memo << [
|
32
|
+
memo << [ parent.index([k, i]), value(vv) ]
|
39
33
|
end
|
40
34
|
memo
|
41
35
|
else
|
42
|
-
[ [
|
36
|
+
[ [ parent.index([k]), value(v) ] ]
|
43
37
|
end
|
44
38
|
end
|
45
39
|
end
|
data/lib/vector_embed/version.rb
CHANGED
data/lib/vector_embed.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
require 'logger'
|
2
|
+
require 'digest/md5'
|
3
|
+
require 'murmurhash3'
|
2
4
|
|
3
5
|
require 'vector_embed/version'
|
4
6
|
require 'vector_embed/maker'
|
@@ -16,15 +18,21 @@ class VectorEmbed
|
|
16
18
|
FALSE = /\Afalse\z/i
|
17
19
|
F = /\Af\z/i
|
18
20
|
NULL_BYTE = "\x00"
|
21
|
+
LABEL_MAKERS = [Maker::Boolean, Maker::Number]
|
22
|
+
FEATURE_MAKERS = [Maker::Boolean, Maker::Number, Maker::Ngram, Maker::Phrase]
|
19
23
|
|
20
24
|
attr_reader :options
|
21
25
|
attr_accessor :logger
|
26
|
+
attr_reader :dict
|
22
27
|
|
23
28
|
def initialize(options = {})
|
29
|
+
@options = options.dup
|
24
30
|
@mutex = Mutex.new
|
25
31
|
@feature_makers = {}
|
26
32
|
@logger = options[:logger] || (l = Logger.new($stderr); l.level = Logger::INFO; l)
|
27
|
-
|
33
|
+
if dict = @options.delete(:dict)
|
34
|
+
@dict = dict.dup
|
35
|
+
end
|
28
36
|
end
|
29
37
|
|
30
38
|
def line(label, features = {})
|
@@ -50,6 +58,18 @@ class VectorEmbed
|
|
50
58
|
StopWord.remove stop_words, v
|
51
59
|
end
|
52
60
|
|
61
|
+
def index(parts)
|
62
|
+
k = parts.join NULL_BYTE
|
63
|
+
if dict
|
64
|
+
k = Digest::MD5.digest k
|
65
|
+
dict[k] || @mutex.synchronize do
|
66
|
+
dict[k] ||= dict.length + 1
|
67
|
+
end
|
68
|
+
else
|
69
|
+
MurmurHash3::V32.str_hash(k).to_s[0..6].to_i
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
53
73
|
private
|
54
74
|
|
55
75
|
def stop_words
|
@@ -60,13 +80,13 @@ class VectorEmbed
|
|
60
80
|
|
61
81
|
def label_maker(label)
|
62
82
|
@label_maker || @mutex.synchronize do
|
63
|
-
@label_maker ||= Maker.pick(
|
83
|
+
@label_maker ||= Maker.pick(LABEL_MAKERS, 'label', label, self)
|
64
84
|
end
|
65
85
|
end
|
66
86
|
|
67
87
|
def feature_maker(k, v)
|
68
88
|
@feature_makers[k] || @mutex.synchronize do
|
69
|
-
@feature_makers[k] ||= Maker.pick(
|
89
|
+
@feature_makers[k] ||= Maker.pick(FEATURE_MAKERS, k, v, self)
|
70
90
|
end
|
71
91
|
end
|
72
92
|
end
|
data/spec/vector_embed_spec.rb
CHANGED
@@ -31,6 +31,29 @@ describe VectorEmbed do
|
|
31
31
|
end
|
32
32
|
end
|
33
33
|
|
34
|
+
describe 'using a dictionary' do
|
35
|
+
it "starts at feature label 1" do
|
36
|
+
v = VectorEmbed.new dict: {}
|
37
|
+
v.line(1, 'foo' => 5).should == "1 1:5"
|
38
|
+
v.line(1, 'bar' => 3).should == "1 2:3"
|
39
|
+
v.line(1, 'foo' => 3, 'bar' => 5).should == "1 1:3 2:5"
|
40
|
+
end
|
41
|
+
|
42
|
+
it "does not modify the original dict" do
|
43
|
+
orig = {}
|
44
|
+
v = VectorEmbed.new dict: orig
|
45
|
+
v.line(1, 'foo' => 5)
|
46
|
+
orig.should == {}
|
47
|
+
end
|
48
|
+
|
49
|
+
it "provides the latest dict on demand" do
|
50
|
+
require 'digest/md5'
|
51
|
+
v = VectorEmbed.new dict: {}
|
52
|
+
v.line(1, 'foo' => 5)
|
53
|
+
v.dict.should == { Digest::MD5.digest('foo') => 1 }
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
34
57
|
# aka dimension indexes
|
35
58
|
describe 'in feature keys' do
|
36
59
|
it "stores values as their string equivalents" do
|
metadata
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vector_embed
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.1
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Seamus Abshere
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2013-
|
11
|
+
date: 2013-04-09 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: murmurhash3
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
17
|
- - ! '>='
|
20
18
|
- !ruby/object:Gem::Version
|
@@ -22,7 +20,6 @@ dependencies:
|
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
24
|
- - ! '>='
|
28
25
|
- !ruby/object:Gem::Version
|
@@ -30,7 +27,6 @@ dependencies:
|
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: rspec
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
31
|
- - ! '>='
|
36
32
|
- !ruby/object:Gem::Version
|
@@ -38,7 +34,6 @@ dependencies:
|
|
38
34
|
type: :development
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
38
|
- - ! '>='
|
44
39
|
- !ruby/object:Gem::Version
|
@@ -46,7 +41,6 @@ dependencies:
|
|
46
41
|
- !ruby/object:Gem::Dependency
|
47
42
|
name: pry
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
44
|
requirements:
|
51
45
|
- - ! '>='
|
52
46
|
- !ruby/object:Gem::Version
|
@@ -54,7 +48,6 @@ dependencies:
|
|
54
48
|
type: :development
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
51
|
requirements:
|
59
52
|
- - ! '>='
|
60
53
|
- !ruby/object:Gem::Version
|
@@ -62,7 +55,6 @@ dependencies:
|
|
62
55
|
- !ruby/object:Gem::Dependency
|
63
56
|
name: yard
|
64
57
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
58
|
requirements:
|
67
59
|
- - ! '>='
|
68
60
|
- !ruby/object:Gem::Version
|
@@ -70,7 +62,6 @@ dependencies:
|
|
70
62
|
type: :development
|
71
63
|
prerelease: false
|
72
64
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
65
|
requirements:
|
75
66
|
- - ! '>='
|
76
67
|
- !ruby/object:Gem::Version
|
@@ -106,27 +97,26 @@ files:
|
|
106
97
|
- vector_embed.gemspec
|
107
98
|
homepage: https://github.com/seamusabshere/vector_embed
|
108
99
|
licenses: []
|
100
|
+
metadata: {}
|
109
101
|
post_install_message:
|
110
102
|
rdoc_options: []
|
111
103
|
require_paths:
|
112
104
|
- lib
|
113
105
|
required_ruby_version: !ruby/object:Gem::Requirement
|
114
|
-
none: false
|
115
106
|
requirements:
|
116
107
|
- - ! '>='
|
117
108
|
- !ruby/object:Gem::Version
|
118
109
|
version: '0'
|
119
110
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
120
|
-
none: false
|
121
111
|
requirements:
|
122
112
|
- - ! '>='
|
123
113
|
- !ruby/object:Gem::Version
|
124
114
|
version: '0'
|
125
115
|
requirements: []
|
126
116
|
rubyforge_project:
|
127
|
-
rubygems_version:
|
117
|
+
rubygems_version: 2.0.3
|
128
118
|
signing_key:
|
129
|
-
specification_version:
|
119
|
+
specification_version: 4
|
130
120
|
summary: Vector embedding of strings, booleans, numerics, and arrays into LIBSVM /
|
131
121
|
LIBLINEAR format.
|
132
122
|
test_files:
|