vector_embed 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,13 @@
1
+ 0.4.0 / 2013-07-01
2
+
3
+ * Breaking changes
4
+
5
+ * Treat 1/0 and '1'/'0' as true/false, respectively (per @ihough)
6
+
7
+ * Enhancements
8
+
9
+ * Add VectorEmbed#stats_report, which lists types and cardinalities of features (thanks @ihough)
10
+
1
11
  0.3.3 / 2013-06-14
2
12
 
3
13
  * Enhancement
data/lib/vector_embed.rb CHANGED
@@ -21,15 +21,15 @@ class VectorEmbed
21
21
  LABEL_MAKERS = [Maker::Boolean, Maker::Number]
22
22
  FEATURE_MAKERS = [Maker::Boolean, Maker::Date, Maker::Number, Maker::Ngram, Maker::Phrase]
23
23
 
24
- attr_reader :options
25
24
  attr_accessor :logger
26
25
  attr_reader :dict
26
+ attr_reader :options
27
27
 
28
28
  def initialize(options = {})
29
29
  @options = options.dup
30
30
  @mutex = Mutex.new
31
31
  @feature_makers = {}
32
- @logger = options[:logger] || (l = Logger.new($stderr); l.level = Logger::INFO; l)
32
+ @logger = options[:logger] || (l = Logger.new($stderr); l.level = (ENV['VERBOSE'] == 'true') ? Logger::DEBUG : Logger::INFO; l)
33
33
  if dict = @options.delete(:dict)
34
34
  @dict = dict.dup
35
35
  end
@@ -59,15 +59,43 @@ class VectorEmbed
59
59
  end
60
60
 
61
61
  def index(parts)
62
- k = parts.join NULL_BYTE
62
+ sig = parts.join NULL_BYTE
63
63
  if dict
64
- k = Digest::MD5.digest k
65
- dict[k] || @mutex.synchronize do
66
- dict[k] ||= dict.length + 1
64
+ sig = Digest::MD5.digest sig
65
+ dict[sig] || @mutex.synchronize do
66
+ dict[sig] ||= begin
67
+ k = parts[0]
68
+ @feature_makers[k].cardinality += 1
69
+ dict[sig] = dict.length + 1
70
+ end
67
71
  end
68
72
  else
69
- MurmurHash3::V32.str_hash(k).to_s[0..6].to_i
73
+ MurmurHash3::V32.str_hash(sig).to_s[0..6].to_i
74
+ end
75
+ end
76
+
77
+ def stats_report
78
+ report = @feature_makers.map do |feature, maker|
79
+ [feature, maker.class, maker.cardinality]
80
+ end
81
+ total_cardinality = report.inject(0) { |sum, row| sum += row[2]; sum }
82
+
83
+ report.unshift %w{ Feature Class Cardinality }
84
+ feature_width = report.map { |row| row[0].to_s.length }.max
85
+ class_width = report.map { |row| row[1].to_s.length }.max
86
+ cardinality_width = report.map { |row| row[2].to_s.length }.max
87
+
88
+ report = report.map do |row|
89
+ [
90
+ row[0].to_s.ljust(feature_width),
91
+ row[1].to_s.ljust(class_width),
92
+ row[2].to_s.rjust(cardinality_width),
93
+ ].join(' | ')
70
94
  end
95
+ total_width = report.first.length
96
+ report.insert(1, ''.ljust(total_width, '-'))
97
+ report.push(total_cardinality.to_s.rjust(total_width))
98
+ report.push('').join("\n")
71
99
  end
72
100
 
73
101
  private
@@ -20,12 +20,14 @@ class VectorEmbed
20
20
  end
21
21
  end
22
22
 
23
+ attr_accessor :cardinality
23
24
  attr_reader :parent
24
25
  attr_reader :k
25
26
 
26
27
  def initialize(k, parent)
27
28
  @k = k
28
29
  @parent = parent
30
+ @cardinality = 0
29
31
  end
30
32
 
31
33
  def pairs(v)
@@ -42,7 +44,7 @@ class VectorEmbed
42
44
  if (vv = value(v)).nil?
43
45
  []
44
46
  else
45
- [ [ parent.index([k]), value(v) ] ]
47
+ [ [ parent.index([k]), vv ] ]
46
48
  end
47
49
  end
48
50
  end
@@ -27,9 +27,9 @@ class VectorEmbed
27
27
 
28
28
  def pairs(v)
29
29
  case v
30
- when TrueClass, TRUE, T
30
+ when TrueClass, TRUE, T, 1, '1'
31
31
  [ [ parent.index([k, 'true']), 1 ] ]
32
- when FalseClass, FALSE, F
32
+ when FalseClass, FALSE, F, 0, '0'
33
33
  [ [ parent.index([k, 'false']), 1 ] ]
34
34
  when NilClass, NULL, SLASH_N, BLANK
35
35
  [ [ parent.index([k, 'null']), 1 ] ]
@@ -1,3 +1,3 @@
1
1
  class VectorEmbed
2
- VERSION = '0.3.3'
2
+ VERSION = '0.4.0'
3
3
  end
@@ -101,20 +101,28 @@ describe VectorEmbed do
101
101
  describe 'in boolean attributes' do
102
102
  it "stores true/false/nil as (1,0,0)/(0,1,0)/(0,0,1)" do
103
103
  v = VectorEmbed.new
104
- v.line(1, 1 => true).should == "1 #{l_h("1\x00true")}:1"
104
+ v.line(1, 1 => true).should == "1 #{l_h("1\x00true")}:1"
105
105
  v.line(1, 1 => 'true').should == "1 #{l_h("1\x00true")}:1"
106
106
  v.line(1, 1 => 'TRUE').should == "1 #{l_h("1\x00true")}:1"
107
- v.line(1, 1 => 't').should == "1 #{l_h("1\x00true")}:1"
108
- v.line(1, 1 => 'T').should == "1 #{l_h("1\x00true")}:1"
109
- v.line(1, 1 => false).should == "1 #{l_h("1\x00false")}:1"
107
+ v.line(1, 1 => 't').should == "1 #{l_h("1\x00true")}:1"
108
+ v.line(1, 1 => 'T').should == "1 #{l_h("1\x00true")}:1"
109
+ v.line(1, 1 => 1).should == "1 #{l_h("1\x00true")}:1"
110
+ v.line(1, 1 => '1').should == "1 #{l_h("1\x00true")}:1"
111
+
112
+ v.line(1, 1 => false).should == "1 #{l_h("1\x00false")}:1"
110
113
  v.line(1, 1 => 'false').should == "1 #{l_h("1\x00false")}:1"
111
114
  v.line(1, 1 => 'FALSE').should == "1 #{l_h("1\x00false")}:1"
112
- v.line(1, 1 => 'f').should == "1 #{l_h("1\x00false")}:1"
113
- v.line(1, 1 => 'F').should == "1 #{l_h("1\x00false")}:1"
114
- v.line(1, 1 => nil).should == "1 #{l_h("1\x00null")}:1"
115
+ v.line(1, 1 => 'f').should == "1 #{l_h("1\x00false")}:1"
116
+ v.line(1, 1 => 'F').should == "1 #{l_h("1\x00false")}:1"
117
+ v.line(1, 1 => 0).should == "1 #{l_h("1\x00false")}:1"
118
+ v.line(1, 1 => '0').should == "1 #{l_h("1\x00false")}:1"
119
+
120
+ v.line(1, 1 => nil).should == "1 #{l_h("1\x00null")}:1"
115
121
  v.line(1, 1 => 'null').should == "1 #{l_h("1\x00null")}:1"
116
122
  v.line(1, 1 => 'NULL').should == "1 #{l_h("1\x00null")}:1"
117
- v.line(1, 1 => '\N').should == "1 #{l_h("1\x00null")}:1"
123
+ v.line(1, 1 => '\N').should == "1 #{l_h("1\x00null")}:1"
124
+ v.line(1, 1 => '').should == "1 #{l_h("1\x00null")}:1"
125
+ v.line(1, 1 => " \t ").should == "1 #{l_h("1\x00null")}:1"
118
126
  end
119
127
  end
120
128
 
@@ -171,7 +179,7 @@ describe VectorEmbed do
171
179
  v.line(3, 1 => '\N').should == v.line(3, 1 => 0)
172
180
  end
173
181
 
174
- it "stores strings as m-category attributes" do
182
+ it "stores strings as m-category attributes, ignoring whitespace" do
175
183
  v = VectorEmbed.new
176
184
  v.line(1, 1 => 'sfh').should == "1 #{l_h("1\x00sfh")}:1"
177
185
  v.line(1, 1 => 'mfh').should == "1 #{l_h("1\x00mfh")}:1"
@@ -307,8 +315,35 @@ describe VectorEmbed do
307
315
  end
308
316
  end
309
317
 
310
- describe 'debug mode' do
311
-
318
+ describe '#stats_report' do
319
+ it "reports statistics on the embedded features" do
320
+ v = VectorEmbed.new dict: {}
321
+ v.line(1, 1 => 1)
322
+ v.line(1, 1 => 2)
323
+ v.line(1, 2 => 1)
324
+ v.line(1, 2 => nil)
325
+ v.line(1, 3 => '2010-01-01')
326
+ v.line(1, 3 => '2011-01-01')
327
+ v.line(1, 4 => true)
328
+ v.line(1, 5 => true)
329
+ v.line(1, 5 => false)
330
+ v.line(1, 5 => nil)
331
+ v.line(1, 'foo' => 'bar')
332
+ v.line(1, 'foo' => 'biz')
333
+ v.line(1, 'foo' => 'baz')
334
+ v.stats_report.should == [
335
+ 'Feature | Class | Cardinality',
336
+ '---------------------------------------------------',
337
+ '1 | VectorEmbed::Maker::Number | 1',
338
+ '2 | VectorEmbed::Maker::Number | 1',
339
+ '3 | VectorEmbed::Maker::Date | 1',
340
+ '4 | VectorEmbed::Maker::Boolean | 1',
341
+ '5 | VectorEmbed::Maker::Boolean | 3',
342
+ 'foo | VectorEmbed::Maker::Phrase | 3',
343
+ ' 10',
344
+ '',
345
+ ].join("\n")
346
+ end
312
347
  end
313
348
 
314
349
  private
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vector_embed
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.4.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-06-14 00:00:00.000000000 Z
12
+ date: 2013-07-01 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: murmurhash3