vector_embed 0.3.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,13 @@
1
+ 0.4.0 / 2013-07-01
2
+
3
+ * Breaking changes
4
+
5
+ * Treat 1/0 and '1'/'0' as true/false (per @ihough)
6
+
7
+ * Enhancements
8
+
9
+ * Add VectorEmbed#stats_report, which lists types and cardinalities of features (thanks @ihough)
10
+
1
11
  0.3.3 / 2013-06-14
2
12
 
3
13
  * Enhancement
data/lib/vector_embed.rb CHANGED
@@ -21,15 +21,15 @@ class VectorEmbed
21
21
  LABEL_MAKERS = [Maker::Boolean, Maker::Number]
22
22
  FEATURE_MAKERS = [Maker::Boolean, Maker::Date, Maker::Number, Maker::Ngram, Maker::Phrase]
23
23
 
24
- attr_reader :options
25
24
  attr_accessor :logger
26
25
  attr_reader :dict
26
+ attr_reader :options
27
27
 
28
28
  def initialize(options = {})
29
29
  @options = options.dup
30
30
  @mutex = Mutex.new
31
31
  @feature_makers = {}
32
- @logger = options[:logger] || (l = Logger.new($stderr); l.level = Logger::INFO; l)
32
+ @logger = options[:logger] || (l = Logger.new($stderr); l.level = (ENV['VERBOSE'] == 'true') ? Logger::DEBUG : Logger::INFO; l)
33
33
  if dict = @options.delete(:dict)
34
34
  @dict = dict.dup
35
35
  end
@@ -59,15 +59,43 @@ class VectorEmbed
59
59
  end
60
60
 
61
61
  def index(parts)
62
- k = parts.join NULL_BYTE
62
+ sig = parts.join NULL_BYTE
63
63
  if dict
64
- k = Digest::MD5.digest k
65
- dict[k] || @mutex.synchronize do
66
- dict[k] ||= dict.length + 1
64
+ sig = Digest::MD5.digest sig
65
+ dict[sig] || @mutex.synchronize do
66
+ dict[sig] ||= begin
67
+ k = parts[0]
68
+ @feature_makers[k].cardinality += 1
69
+ dict[sig] = dict.length + 1
70
+ end
67
71
  end
68
72
  else
69
- MurmurHash3::V32.str_hash(k).to_s[0..6].to_i
73
+ MurmurHash3::V32.str_hash(sig).to_s[0..6].to_i
74
+ end
75
+ end
76
+
77
+ def stats_report
78
+ report = @feature_makers.map do |feature, maker|
79
+ [feature, maker.class, maker.cardinality]
80
+ end
81
+ total_cardinality = report.inject(0) { |sum, row| sum += row[2]; sum }
82
+
83
+ report.unshift %w{ Feature Class Cardinality }
84
+ feature_width = report.map { |row| row[0].to_s.length }.max
85
+ class_width = report.map { |row| row[1].to_s.length }.max
86
+ cardinality_width = report.map { |row| row[2].to_s.length }.max
87
+
88
+ report = report.map do |row|
89
+ [
90
+ row[0].to_s.ljust(feature_width),
91
+ row[1].to_s.ljust(class_width),
92
+ row[2].to_s.rjust(cardinality_width),
93
+ ].join(' | ')
70
94
  end
95
+ total_width = report.first.length
96
+ report.insert(1, ''.ljust(total_width, '-'))
97
+ report.push(total_cardinality.to_s.rjust(total_width))
98
+ report.push('').join("\n")
71
99
  end
72
100
 
73
101
  private
@@ -20,12 +20,14 @@ class VectorEmbed
20
20
  end
21
21
  end
22
22
 
23
+ attr_accessor :cardinality
23
24
  attr_reader :parent
24
25
  attr_reader :k
25
26
 
26
27
  def initialize(k, parent)
27
28
  @k = k
28
29
  @parent = parent
30
+ @cardinality = 0
29
31
  end
30
32
 
31
33
  def pairs(v)
@@ -42,7 +44,7 @@ class VectorEmbed
42
44
  if (vv = value(v)).nil?
43
45
  []
44
46
  else
45
- [ [ parent.index([k]), value(v) ] ]
47
+ [ [ parent.index([k]), vv ] ]
46
48
  end
47
49
  end
48
50
  end
@@ -27,9 +27,9 @@ class VectorEmbed
27
27
 
28
28
  def pairs(v)
29
29
  case v
30
- when TrueClass, TRUE, T
30
+ when TrueClass, TRUE, T, 1, '1'
31
31
  [ [ parent.index([k, 'true']), 1 ] ]
32
- when FalseClass, FALSE, F
32
+ when FalseClass, FALSE, F, 0, '0'
33
33
  [ [ parent.index([k, 'false']), 1 ] ]
34
34
  when NilClass, NULL, SLASH_N, BLANK
35
35
  [ [ parent.index([k, 'null']), 1 ] ]
@@ -1,3 +1,3 @@
1
1
  class VectorEmbed
2
- VERSION = '0.3.3'
2
+ VERSION = '0.4.0'
3
3
  end
@@ -101,20 +101,28 @@ describe VectorEmbed do
101
101
  describe 'in boolean attributes' do
102
102
  it "stores true/false/nil as (1,0,0)/(0,1,0)/(0,0,1)" do
103
103
  v = VectorEmbed.new
104
- v.line(1, 1 => true).should == "1 #{l_h("1\x00true")}:1"
104
+ v.line(1, 1 => true).should == "1 #{l_h("1\x00true")}:1"
105
105
  v.line(1, 1 => 'true').should == "1 #{l_h("1\x00true")}:1"
106
106
  v.line(1, 1 => 'TRUE').should == "1 #{l_h("1\x00true")}:1"
107
- v.line(1, 1 => 't').should == "1 #{l_h("1\x00true")}:1"
108
- v.line(1, 1 => 'T').should == "1 #{l_h("1\x00true")}:1"
109
- v.line(1, 1 => false).should == "1 #{l_h("1\x00false")}:1"
107
+ v.line(1, 1 => 't').should == "1 #{l_h("1\x00true")}:1"
108
+ v.line(1, 1 => 'T').should == "1 #{l_h("1\x00true")}:1"
109
+ v.line(1, 1 => 1).should == "1 #{l_h("1\x00true")}:1"
110
+ v.line(1, 1 => '1').should == "1 #{l_h("1\x00true")}:1"
111
+
112
+ v.line(1, 1 => false).should == "1 #{l_h("1\x00false")}:1"
110
113
  v.line(1, 1 => 'false').should == "1 #{l_h("1\x00false")}:1"
111
114
  v.line(1, 1 => 'FALSE').should == "1 #{l_h("1\x00false")}:1"
112
- v.line(1, 1 => 'f').should == "1 #{l_h("1\x00false")}:1"
113
- v.line(1, 1 => 'F').should == "1 #{l_h("1\x00false")}:1"
114
- v.line(1, 1 => nil).should == "1 #{l_h("1\x00null")}:1"
115
+ v.line(1, 1 => 'f').should == "1 #{l_h("1\x00false")}:1"
116
+ v.line(1, 1 => 'F').should == "1 #{l_h("1\x00false")}:1"
117
+ v.line(1, 1 => 0).should == "1 #{l_h("1\x00false")}:1"
118
+ v.line(1, 1 => '0').should == "1 #{l_h("1\x00false")}:1"
119
+
120
+ v.line(1, 1 => nil).should == "1 #{l_h("1\x00null")}:1"
115
121
  v.line(1, 1 => 'null').should == "1 #{l_h("1\x00null")}:1"
116
122
  v.line(1, 1 => 'NULL').should == "1 #{l_h("1\x00null")}:1"
117
- v.line(1, 1 => '\N').should == "1 #{l_h("1\x00null")}:1"
123
+ v.line(1, 1 => '\N').should == "1 #{l_h("1\x00null")}:1"
124
+ v.line(1, 1 => '').should == "1 #{l_h("1\x00null")}:1"
125
+ v.line(1, 1 => " \t ").should == "1 #{l_h("1\x00null")}:1"
118
126
  end
119
127
  end
120
128
 
@@ -171,7 +179,7 @@ describe VectorEmbed do
171
179
  v.line(3, 1 => '\N').should == v.line(3, 1 => 0)
172
180
  end
173
181
 
174
- it "stores strings as m-category attributes" do
182
+ it "stores strings as m-category attributes, ignoring whitespace" do
175
183
  v = VectorEmbed.new
176
184
  v.line(1, 1 => 'sfh').should == "1 #{l_h("1\x00sfh")}:1"
177
185
  v.line(1, 1 => 'mfh').should == "1 #{l_h("1\x00mfh")}:1"
@@ -307,8 +315,35 @@ describe VectorEmbed do
307
315
  end
308
316
  end
309
317
 
310
- describe 'debug mode' do
311
-
318
+ describe '#stats_report' do
319
+ it "reports statistics on the embedded features" do
320
+ v = VectorEmbed.new dict: {}
321
+ v.line(1, 1 => 1)
322
+ v.line(1, 1 => 2)
323
+ v.line(1, 2 => 1)
324
+ v.line(1, 2 => nil)
325
+ v.line(1, 3 => '2010-01-01')
326
+ v.line(1, 3 => '2011-01-01')
327
+ v.line(1, 4 => true)
328
+ v.line(1, 5 => true)
329
+ v.line(1, 5 => false)
330
+ v.line(1, 5 => nil)
331
+ v.line(1, 'foo' => 'bar')
332
+ v.line(1, 'foo' => 'biz')
333
+ v.line(1, 'foo' => 'baz')
334
+ v.stats_report.should == [
335
+ 'Feature | Class | Cardinality',
336
+ '---------------------------------------------------',
337
+ '1 | VectorEmbed::Maker::Number | 1',
338
+ '2 | VectorEmbed::Maker::Number | 1',
339
+ '3 | VectorEmbed::Maker::Date | 1',
340
+ '4 | VectorEmbed::Maker::Boolean | 1',
341
+ '5 | VectorEmbed::Maker::Boolean | 3',
342
+ 'foo | VectorEmbed::Maker::Phrase | 3',
343
+ ' 10',
344
+ '',
345
+ ].join("\n")
346
+ end
312
347
  end
313
348
 
314
349
  private
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vector_embed
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.4.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-06-14 00:00:00.000000000 Z
12
+ date: 2013-07-01 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: murmurhash3