vector_embed 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ YWRiOGI3YTdkMDIwNDljNTA0NDkzOGNiNGVkNThiZjczNjdlNDU3OQ==
5
+ data.tar.gz: !binary |-
6
+ ZDIxY2Q1NDFjMjkxNDFkOGJkZTk0NTZiMDc4NTgwNjYwZGE5MDI1MQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ YzAwOGQ0NjlmNThiYzNmZmQxNDk0ZTY2ZTIxMTA2M2NjMjZiNGE4MGU0NzE3
10
+ YTZhNTkzM2ViMWU2M2FhZDE5NTk5YTdkOWU3NGYxYjVjNmJkYmZjNGVhMTU0
11
+ ODY1OTkzNmFhNDIwODY4MjUwYTJjODU0ZDgzNWYzNmE1ZTljYzg=
12
+ data.tar.gz: !binary |-
13
+ ZjM3OGUyNDk1YzIxMjc5OTgyYmI3ZWY3NGEwYjRjYWY1MTk0MjJhYzhjODU0
14
+ NTg0OWYxNWU4ZTQ3YTQ0ZDFlOWVjYzMyYmZhOTE0MjNlMDRjMjMyMDkwNGMy
15
+ NDNhMjhkYjUzNGRhZmRhNjliMGFiNzY1NmNiNzk5MDhhOTUyOTU=
data/CHANGELOG CHANGED
@@ -1,3 +1,10 @@
1
+ 0.1.1 / 2013-04-04
2
+
3
+ * Enhancements
4
+
5
+ * If you pass a Hash as :dict, feature indexes are assigned sequentially starting at 1 rather than by murmurhashing; you can then serialize VectorEmbed#dict
6
+ * Better error message if you run csv2libsvm on a CSV without a "label" column
7
+
1
8
  0.1.0 / 2013-02-20
2
9
 
3
10
  * Enhancements
data/bin/csv2libsvm CHANGED
@@ -12,6 +12,8 @@ csv_path = ARGV[0]
12
12
  v = VectorEmbed.new
13
13
  CSV.foreach(csv_path, headers: :first_row) do |row|
14
14
  features = row.to_hash
15
- label = features.delete('label')
15
+ unless label = features.delete('label')
16
+ raise "No label found - do you have a column named 'label'?"
17
+ end
16
18
  puts v.line(label, features)
17
19
  end
@@ -4,7 +4,7 @@ class VectorEmbed
4
4
  class Maker
5
5
  class Boolean < Maker
6
6
  class << self
7
- def want?(k, v, parent)
7
+ def want?(v, parent)
8
8
  case v
9
9
  when NilClass, TrueClass, FalseClass, TRUE, FALSE, T, F, NULL, SLASH_N
10
10
  true
@@ -28,11 +28,11 @@ class VectorEmbed
28
28
  def pairs(v)
29
29
  case v
30
30
  when TrueClass, TRUE, T
31
- [ [ Maker.index(k, 'true'), 1 ] ]
31
+ [ [ parent.index([k, 'true']), 1 ] ]
32
32
  when FalseClass, FALSE, F
33
- [ [ Maker.index(k, 'false'), 1 ] ]
33
+ [ [ parent.index([k, 'false']), 1 ] ]
34
34
  when NilClass, NULL, SLASH_N, BLANK
35
- [ [ Maker.index(k, 'null'), 1 ] ]
35
+ [ [ parent.index([k, 'null']), 1 ] ]
36
36
  else
37
37
  raise ArgumentError, "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
38
38
  end
@@ -4,7 +4,7 @@ class VectorEmbed
4
4
  class Maker
5
5
  class Ngram < Maker
6
6
  class << self
7
- def want?(k, v, parent)
7
+ def want?(v, parent)
8
8
  parent.options[:ngram_len]
9
9
  end
10
10
  end
@@ -31,7 +31,7 @@ class VectorEmbed
31
31
  else
32
32
  raise "Word n-gram not supported yet"
33
33
  end.map do |ngram|
34
- [ [ Maker.index(k, 'ngram', ngram), 1 ] ]
34
+ [ [ parent.index([k, 'ngram', ngram]), 1 ] ]
35
35
  end
36
36
  end
37
37
  end
@@ -4,7 +4,7 @@ class VectorEmbed
4
4
  class Maker
5
5
  class Number < Maker
6
6
  class << self
7
- def want?(k, v, parent)
7
+ def want?(v, parent)
8
8
  v.is_a?(::Numeric) or v =~ JUST_A_NUMBER
9
9
  end
10
10
 
@@ -4,14 +4,14 @@ class VectorEmbed
4
4
  class Maker
5
5
  class Phrase < Maker
6
6
  class << self
7
- def want?(k, v, parent)
7
+ def want?(v, parent)
8
8
  true
9
9
  end
10
10
  end
11
11
 
12
12
  def pairs(v)
13
13
  v = parent.preprocess v.to_s
14
- [ [ Maker.index(k, v), 1 ] ]
14
+ [ [ parent.index([k, v]), 1 ] ]
15
15
  end
16
16
  end
17
17
  end
@@ -3,23 +3,17 @@ require 'vector_embed/maker/ngram'
3
3
  require 'vector_embed/maker/number'
4
4
  require 'vector_embed/maker/boolean'
5
5
 
6
- require 'murmurhash3'
7
-
8
6
  class VectorEmbed
9
7
  class Maker
10
8
  class << self
11
9
  def pick(choices, k, first_v, parent)
12
- if klass = choices.detect { |klass| klass.want?(k, first_v, parent) }
10
+ if klass = choices.detect { |klass| klass.want?(first_v, parent) }
13
11
  parent.logger.debug { "Interpreting #{k.inspect} as #{klass.name.split('::').last} given first value #{first_v.inspect}" }
14
12
  klass.new k, parent
15
13
  else
16
14
  raise "Can't use #{first_v.class} for #{k.inspect} given #{first_v.inspect} and choices #{choices.inspect}"
17
15
  end
18
16
  end
19
-
20
- def index(*parts)
21
- MurmurHash3::V32.str_hash(parts.join(NULL_BYTE)).to_s[0..6].to_i
22
- end
23
17
  end
24
18
 
25
19
  attr_reader :parent
@@ -35,11 +29,11 @@ class VectorEmbed
35
29
  when Array
36
30
  memo = []
37
31
  v.each_with_index do |vv, i|
38
- memo << [ Maker.index(k, i), value(vv) ]
32
+ memo << [ parent.index([k, i]), value(vv) ]
39
33
  end
40
34
  memo
41
35
  else
42
- [ [ Maker.index(k), value(v) ] ]
36
+ [ [ parent.index([k]), value(v) ] ]
43
37
  end
44
38
  end
45
39
  end
@@ -1,3 +1,3 @@
1
1
  class VectorEmbed
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
data/lib/vector_embed.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require 'logger'
2
+ require 'digest/md5'
3
+ require 'murmurhash3'
2
4
 
3
5
  require 'vector_embed/version'
4
6
  require 'vector_embed/maker'
@@ -16,15 +18,21 @@ class VectorEmbed
16
18
  FALSE = /\Afalse\z/i
17
19
  F = /\Af\z/i
18
20
  NULL_BYTE = "\x00"
21
+ LABEL_MAKERS = [Maker::Boolean, Maker::Number]
22
+ FEATURE_MAKERS = [Maker::Boolean, Maker::Number, Maker::Ngram, Maker::Phrase]
19
23
 
20
24
  attr_reader :options
21
25
  attr_accessor :logger
26
+ attr_reader :dict
22
27
 
23
28
  def initialize(options = {})
29
+ @options = options.dup
24
30
  @mutex = Mutex.new
25
31
  @feature_makers = {}
26
32
  @logger = options[:logger] || (l = Logger.new($stderr); l.level = Logger::INFO; l)
27
- @options = options.dup
33
+ if dict = @options.delete(:dict)
34
+ @dict = dict.dup
35
+ end
28
36
  end
29
37
 
30
38
  def line(label, features = {})
@@ -50,6 +58,18 @@ class VectorEmbed
50
58
  StopWord.remove stop_words, v
51
59
  end
52
60
 
61
+ def index(parts)
62
+ k = parts.join NULL_BYTE
63
+ if dict
64
+ k = Digest::MD5.digest k
65
+ dict[k] || @mutex.synchronize do
66
+ dict[k] ||= dict.length + 1
67
+ end
68
+ else
69
+ MurmurHash3::V32.str_hash(k).to_s[0..6].to_i
70
+ end
71
+ end
72
+
53
73
  private
54
74
 
55
75
  def stop_words
@@ -60,13 +80,13 @@ class VectorEmbed
60
80
 
61
81
  def label_maker(label)
62
82
  @label_maker || @mutex.synchronize do
63
- @label_maker ||= Maker.pick([Maker::Boolean, Maker::Number], 'label', label, self)
83
+ @label_maker ||= Maker.pick(LABEL_MAKERS, 'label', label, self)
64
84
  end
65
85
  end
66
86
 
67
87
  def feature_maker(k, v)
68
88
  @feature_makers[k] || @mutex.synchronize do
69
- @feature_makers[k] ||= Maker.pick([Maker::Boolean, Maker::Number, Maker::Ngram, Maker::Phrase], k, v, self)
89
+ @feature_makers[k] ||= Maker.pick(FEATURE_MAKERS, k, v, self)
70
90
  end
71
91
  end
72
92
  end
@@ -31,6 +31,29 @@ describe VectorEmbed do
31
31
  end
32
32
  end
33
33
 
34
+ describe 'using a dictionary' do
35
+ it "starts at feature label 1" do
36
+ v = VectorEmbed.new dict: {}
37
+ v.line(1, 'foo' => 5).should == "1 1:5"
38
+ v.line(1, 'bar' => 3).should == "1 2:3"
39
+ v.line(1, 'foo' => 3, 'bar' => 5).should == "1 1:3 2:5"
40
+ end
41
+
42
+ it "does not modify the original dict" do
43
+ orig = {}
44
+ v = VectorEmbed.new dict: orig
45
+ v.line(1, 'foo' => 5)
46
+ orig.should == {}
47
+ end
48
+
49
+ it "provides the latest dict on demand" do
50
+ require 'digest/md5'
51
+ v = VectorEmbed.new dict: {}
52
+ v.line(1, 'foo' => 5)
53
+ v.dict.should == { Digest::MD5.digest('foo') => 1 }
54
+ end
55
+ end
56
+
34
57
  # aka dimension indexes
35
58
  describe 'in feature keys' do
36
59
  it "stores values as their string equivalents" do
metadata CHANGED
@@ -1,20 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vector_embed
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
5
- prerelease:
4
+ version: 0.1.1
6
5
  platform: ruby
7
6
  authors:
8
7
  - Seamus Abshere
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-02-22 00:00:00.000000000 Z
11
+ date: 2013-04-09 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: murmurhash3
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
17
  - - ! '>='
20
18
  - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
24
  - - ! '>='
28
25
  - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: rspec
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
31
  - - ! '>='
36
32
  - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
38
34
  type: :development
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
38
  - - ! '>='
44
39
  - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: pry
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
45
  - - ! '>='
52
46
  - !ruby/object:Gem::Version
@@ -54,7 +48,6 @@ dependencies:
54
48
  type: :development
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
52
  - - ! '>='
60
53
  - !ruby/object:Gem::Version
@@ -62,7 +55,6 @@ dependencies:
62
55
  - !ruby/object:Gem::Dependency
63
56
  name: yard
64
57
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
58
  requirements:
67
59
  - - ! '>='
68
60
  - !ruby/object:Gem::Version
@@ -70,7 +62,6 @@ dependencies:
70
62
  type: :development
71
63
  prerelease: false
72
64
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
65
  requirements:
75
66
  - - ! '>='
76
67
  - !ruby/object:Gem::Version
@@ -106,27 +97,26 @@ files:
106
97
  - vector_embed.gemspec
107
98
  homepage: https://github.com/seamusabshere/vector_embed
108
99
  licenses: []
100
+ metadata: {}
109
101
  post_install_message:
110
102
  rdoc_options: []
111
103
  require_paths:
112
104
  - lib
113
105
  required_ruby_version: !ruby/object:Gem::Requirement
114
- none: false
115
106
  requirements:
116
107
  - - ! '>='
117
108
  - !ruby/object:Gem::Version
118
109
  version: '0'
119
110
  required_rubygems_version: !ruby/object:Gem::Requirement
120
- none: false
121
111
  requirements:
122
112
  - - ! '>='
123
113
  - !ruby/object:Gem::Version
124
114
  version: '0'
125
115
  requirements: []
126
116
  rubyforge_project:
127
- rubygems_version: 1.8.25
117
+ rubygems_version: 2.0.3
128
118
  signing_key:
129
- specification_version: 3
119
+ specification_version: 4
130
120
  summary: Vector embedding of strings, booleans, numerics, and arrays into LIBSVM /
131
121
  LIBLINEAR format.
132
122
  test_files: