vector_embed 0.1.0 → 0.1.1

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ YWRiOGI3YTdkMDIwNDljNTA0NDkzOGNiNGVkNThiZjczNjdlNDU3OQ==
5
+ data.tar.gz: !binary |-
6
+ ZDIxY2Q1NDFjMjkxNDFkOGJkZTk0NTZiMDc4NTgwNjYwZGE5MDI1MQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ YzAwOGQ0NjlmNThiYzNmZmQxNDk0ZTY2ZTIxMTA2M2NjMjZiNGE4MGU0NzE3
10
+ YTZhNTkzM2ViMWU2M2FhZDE5NTk5YTdkOWU3NGYxYjVjNmJkYmZjNGVhMTU0
11
+ ODY1OTkzNmFhNDIwODY4MjUwYTJjODU0ZDgzNWYzNmE1ZTljYzg=
12
+ data.tar.gz: !binary |-
13
+ ZjM3OGUyNDk1YzIxMjc5OTgyYmI3ZWY3NGEwYjRjYWY1MTk0MjJhYzhjODU0
14
+ NTg0OWYxNWU4ZTQ3YTQ0ZDFlOWVjYzMyYmZhOTE0MjNlMDRjMjMyMDkwNGMy
15
+ NDNhMjhkYjUzNGRhZmRhNjliMGFiNzY1NmNiNzk5MDhhOTUyOTU=
data/CHANGELOG CHANGED
@@ -1,3 +1,10 @@
1
+ 0.1.1 / 2013-04-04
2
+
3
+ * Enhancements
4
+
5
+ * If you pass a Hash as :dict, it will start feature labels at 1 rather than murmurhashing; you can then serialize VectorEmbed#dict
6
+ * Better error message if you run csv2libsvm on a CSV without a "label" column
7
+
1
8
  0.1.0 / 2013-02-20
2
9
 
3
10
  * Enhancements
data/bin/csv2libsvm CHANGED
@@ -12,6 +12,8 @@ csv_path = ARGV[0]
12
12
  v = VectorEmbed.new
13
13
  CSV.foreach(csv_path, headers: :first_row) do |row|
14
14
  features = row.to_hash
15
- label = features.delete('label')
15
+ unless label = features.delete('label')
16
+ raise "No label found - do you have a column named 'label'?"
17
+ end
16
18
  puts v.line(label, features)
17
19
  end
@@ -4,7 +4,7 @@ class VectorEmbed
4
4
  class Maker
5
5
  class Boolean < Maker
6
6
  class << self
7
- def want?(k, v, parent)
7
+ def want?(v, parent)
8
8
  case v
9
9
  when NilClass, TrueClass, FalseClass, TRUE, FALSE, T, F, NULL, SLASH_N
10
10
  true
@@ -28,11 +28,11 @@ class VectorEmbed
28
28
  def pairs(v)
29
29
  case v
30
30
  when TrueClass, TRUE, T
31
- [ [ Maker.index(k, 'true'), 1 ] ]
31
+ [ [ parent.index([k, 'true']), 1 ] ]
32
32
  when FalseClass, FALSE, F
33
- [ [ Maker.index(k, 'false'), 1 ] ]
33
+ [ [ parent.index([k, 'false']), 1 ] ]
34
34
  when NilClass, NULL, SLASH_N, BLANK
35
- [ [ Maker.index(k, 'null'), 1 ] ]
35
+ [ [ parent.index([k, 'null']), 1 ] ]
36
36
  else
37
37
  raise ArgumentError, "Can't embed #{v.inspect} in boolean feature #{k.inspect}"
38
38
  end
@@ -4,7 +4,7 @@ class VectorEmbed
4
4
  class Maker
5
5
  class Ngram < Maker
6
6
  class << self
7
- def want?(k, v, parent)
7
+ def want?(v, parent)
8
8
  parent.options[:ngram_len]
9
9
  end
10
10
  end
@@ -31,7 +31,7 @@ class VectorEmbed
31
31
  else
32
32
  raise "Word n-gram not supported yet"
33
33
  end.map do |ngram|
34
- [ [ Maker.index(k, 'ngram', ngram), 1 ] ]
34
+ [ [ parent.index([k, 'ngram', ngram]), 1 ] ]
35
35
  end
36
36
  end
37
37
  end
@@ -4,7 +4,7 @@ class VectorEmbed
4
4
  class Maker
5
5
  class Number < Maker
6
6
  class << self
7
- def want?(k, v, parent)
7
+ def want?(v, parent)
8
8
  v.is_a?(::Numeric) or v =~ JUST_A_NUMBER
9
9
  end
10
10
 
@@ -4,14 +4,14 @@ class VectorEmbed
4
4
  class Maker
5
5
  class Phrase < Maker
6
6
  class << self
7
- def want?(k, v, parent)
7
+ def want?(v, parent)
8
8
  true
9
9
  end
10
10
  end
11
11
 
12
12
  def pairs(v)
13
13
  v = parent.preprocess v.to_s
14
- [ [ Maker.index(k, v), 1 ] ]
14
+ [ [ parent.index([k, v]), 1 ] ]
15
15
  end
16
16
  end
17
17
  end
@@ -3,23 +3,17 @@ require 'vector_embed/maker/ngram'
3
3
  require 'vector_embed/maker/number'
4
4
  require 'vector_embed/maker/boolean'
5
5
 
6
- require 'murmurhash3'
7
-
8
6
  class VectorEmbed
9
7
  class Maker
10
8
  class << self
11
9
  def pick(choices, k, first_v, parent)
12
- if klass = choices.detect { |klass| klass.want?(k, first_v, parent) }
10
+ if klass = choices.detect { |klass| klass.want?(first_v, parent) }
13
11
  parent.logger.debug { "Interpreting #{k.inspect} as #{klass.name.split('::').last} given first value #{first_v.inspect}" }
14
12
  klass.new k, parent
15
13
  else
16
14
  raise "Can't use #{first_v.class} for #{k.inspect} given #{first_v.inspect} and choices #{choices.inspect}"
17
15
  end
18
16
  end
19
-
20
- def index(*parts)
21
- MurmurHash3::V32.str_hash(parts.join(NULL_BYTE)).to_s[0..6].to_i
22
- end
23
17
  end
24
18
 
25
19
  attr_reader :parent
@@ -35,11 +29,11 @@ class VectorEmbed
35
29
  when Array
36
30
  memo = []
37
31
  v.each_with_index do |vv, i|
38
- memo << [ Maker.index(k, i), value(vv) ]
32
+ memo << [ parent.index([k, i]), value(vv) ]
39
33
  end
40
34
  memo
41
35
  else
42
- [ [ Maker.index(k), value(v) ] ]
36
+ [ [ parent.index([k]), value(v) ] ]
43
37
  end
44
38
  end
45
39
  end
@@ -1,3 +1,3 @@
1
1
  class VectorEmbed
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
data/lib/vector_embed.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require 'logger'
2
+ require 'digest/md5'
3
+ require 'murmurhash3'
2
4
 
3
5
  require 'vector_embed/version'
4
6
  require 'vector_embed/maker'
@@ -16,15 +18,21 @@ class VectorEmbed
16
18
  FALSE = /\Afalse\z/i
17
19
  F = /\Af\z/i
18
20
  NULL_BYTE = "\x00"
21
+ LABEL_MAKERS = [Maker::Boolean, Maker::Number]
22
+ FEATURE_MAKERS = [Maker::Boolean, Maker::Number, Maker::Ngram, Maker::Phrase]
19
23
 
20
24
  attr_reader :options
21
25
  attr_accessor :logger
26
+ attr_reader :dict
22
27
 
23
28
  def initialize(options = {})
29
+ @options = options.dup
24
30
  @mutex = Mutex.new
25
31
  @feature_makers = {}
26
32
  @logger = options[:logger] || (l = Logger.new($stderr); l.level = Logger::INFO; l)
27
- @options = options.dup
33
+ if dict = @options.delete(:dict)
34
+ @dict = dict.dup
35
+ end
28
36
  end
29
37
 
30
38
  def line(label, features = {})
@@ -50,6 +58,18 @@ class VectorEmbed
50
58
  StopWord.remove stop_words, v
51
59
  end
52
60
 
61
+ def index(parts)
62
+ k = parts.join NULL_BYTE
63
+ if dict
64
+ k = Digest::MD5.digest k
65
+ dict[k] || @mutex.synchronize do
66
+ dict[k] ||= dict.length + 1
67
+ end
68
+ else
69
+ MurmurHash3::V32.str_hash(k).to_s[0..6].to_i
70
+ end
71
+ end
72
+
53
73
  private
54
74
 
55
75
  def stop_words
@@ -60,13 +80,13 @@ class VectorEmbed
60
80
 
61
81
  def label_maker(label)
62
82
  @label_maker || @mutex.synchronize do
63
- @label_maker ||= Maker.pick([Maker::Boolean, Maker::Number], 'label', label, self)
83
+ @label_maker ||= Maker.pick(LABEL_MAKERS, 'label', label, self)
64
84
  end
65
85
  end
66
86
 
67
87
  def feature_maker(k, v)
68
88
  @feature_makers[k] || @mutex.synchronize do
69
- @feature_makers[k] ||= Maker.pick([Maker::Boolean, Maker::Number, Maker::Ngram, Maker::Phrase], k, v, self)
89
+ @feature_makers[k] ||= Maker.pick(FEATURE_MAKERS, k, v, self)
70
90
  end
71
91
  end
72
92
  end
@@ -31,6 +31,29 @@ describe VectorEmbed do
31
31
  end
32
32
  end
33
33
 
34
+ describe 'using a dictionary' do
35
+ it "starts at feature label 1" do
36
+ v = VectorEmbed.new dict: {}
37
+ v.line(1, 'foo' => 5).should == "1 1:5"
38
+ v.line(1, 'bar' => 3).should == "1 2:3"
39
+ v.line(1, 'foo' => 3, 'bar' => 5).should == "1 1:3 2:5"
40
+ end
41
+
42
+ it "does not modify the original dict" do
43
+ orig = {}
44
+ v = VectorEmbed.new dict: orig
45
+ v.line(1, 'foo' => 5)
46
+ orig.should == {}
47
+ end
48
+
49
+ it "provides the latest dict on demand" do
50
+ require 'digest/md5'
51
+ v = VectorEmbed.new dict: {}
52
+ v.line(1, 'foo' => 5)
53
+ v.dict.should == { Digest::MD5.digest('foo') => 1 }
54
+ end
55
+ end
56
+
34
57
  # aka dimension indexes
35
58
  describe 'in feature keys' do
36
59
  it "stores values as their string equivalents" do
metadata CHANGED
@@ -1,20 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vector_embed
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
5
- prerelease:
4
+ version: 0.1.1
6
5
  platform: ruby
7
6
  authors:
8
7
  - Seamus Abshere
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-02-22 00:00:00.000000000 Z
11
+ date: 2013-04-09 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: murmurhash3
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
17
  - - ! '>='
20
18
  - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
24
  - - ! '>='
28
25
  - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: rspec
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
31
  - - ! '>='
36
32
  - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
38
34
  type: :development
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
38
  - - ! '>='
44
39
  - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: pry
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
45
  - - ! '>='
52
46
  - !ruby/object:Gem::Version
@@ -54,7 +48,6 @@ dependencies:
54
48
  type: :development
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
52
  - - ! '>='
60
53
  - !ruby/object:Gem::Version
@@ -62,7 +55,6 @@ dependencies:
62
55
  - !ruby/object:Gem::Dependency
63
56
  name: yard
64
57
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
58
  requirements:
67
59
  - - ! '>='
68
60
  - !ruby/object:Gem::Version
@@ -70,7 +62,6 @@ dependencies:
70
62
  type: :development
71
63
  prerelease: false
72
64
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
65
  requirements:
75
66
  - - ! '>='
76
67
  - !ruby/object:Gem::Version
@@ -106,27 +97,26 @@ files:
106
97
  - vector_embed.gemspec
107
98
  homepage: https://github.com/seamusabshere/vector_embed
108
99
  licenses: []
100
+ metadata: {}
109
101
  post_install_message:
110
102
  rdoc_options: []
111
103
  require_paths:
112
104
  - lib
113
105
  required_ruby_version: !ruby/object:Gem::Requirement
114
- none: false
115
106
  requirements:
116
107
  - - ! '>='
117
108
  - !ruby/object:Gem::Version
118
109
  version: '0'
119
110
  required_rubygems_version: !ruby/object:Gem::Requirement
120
- none: false
121
111
  requirements:
122
112
  - - ! '>='
123
113
  - !ruby/object:Gem::Version
124
114
  version: '0'
125
115
  requirements: []
126
116
  rubyforge_project:
127
- rubygems_version: 1.8.25
117
+ rubygems_version: 2.0.3
128
118
  signing_key:
129
- specification_version: 3
119
+ specification_version: 4
130
120
  summary: Vector embedding of strings, booleans, numerics, and arrays into LIBSVM /
131
121
  LIBLINEAR format.
132
122
  test_files: