vector_embed 0.3.3 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +10 -0
- data/lib/vector_embed.rb +35 -7
- data/lib/vector_embed/maker.rb +3 -1
- data/lib/vector_embed/maker/boolean.rb +2 -2
- data/lib/vector_embed/version.rb +1 -1
- data/spec/vector_embed_spec.rb +46 -11
- metadata +2 -2
data/CHANGELOG
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
0.4.0 / 2013-07-01
|
2
|
+
|
3
|
+
* Breaking changes
|
4
|
+
|
5
|
+
* Treat 1/0 and '1'/'0' as true/false (per @ihough)
|
6
|
+
|
7
|
+
* Enhancements
|
8
|
+
|
9
|
+
* Add VectorEmbed#stats_report, which lists types and cardinalities of features (thanks @ihough)
|
10
|
+
|
1
11
|
0.3.3 / 2013-06-14
|
2
12
|
|
3
13
|
* Enhancement
|
data/lib/vector_embed.rb
CHANGED
@@ -21,15 +21,15 @@ class VectorEmbed
|
|
21
21
|
LABEL_MAKERS = [Maker::Boolean, Maker::Number]
|
22
22
|
FEATURE_MAKERS = [Maker::Boolean, Maker::Date, Maker::Number, Maker::Ngram, Maker::Phrase]
|
23
23
|
|
24
|
-
attr_reader :options
|
25
24
|
attr_accessor :logger
|
26
25
|
attr_reader :dict
|
26
|
+
attr_reader :options
|
27
27
|
|
28
28
|
def initialize(options = {})
|
29
29
|
@options = options.dup
|
30
30
|
@mutex = Mutex.new
|
31
31
|
@feature_makers = {}
|
32
|
-
@logger = options[:logger] || (l = Logger.new($stderr); l.level = Logger::INFO; l)
|
32
|
+
@logger = options[:logger] || (l = Logger.new($stderr); l.level = (ENV['VERBOSE'] == 'true') ? Logger::DEBUG : Logger::INFO; l)
|
33
33
|
if dict = @options.delete(:dict)
|
34
34
|
@dict = dict.dup
|
35
35
|
end
|
@@ -59,15 +59,43 @@ class VectorEmbed
|
|
59
59
|
end
|
60
60
|
|
61
61
|
def index(parts)
|
62
|
-
|
62
|
+
sig = parts.join NULL_BYTE
|
63
63
|
if dict
|
64
|
-
|
65
|
-
dict[
|
66
|
-
dict[
|
64
|
+
sig = Digest::MD5.digest sig
|
65
|
+
dict[sig] || @mutex.synchronize do
|
66
|
+
dict[sig] ||= begin
|
67
|
+
k = parts[0]
|
68
|
+
@feature_makers[k].cardinality += 1
|
69
|
+
dict[sig] = dict.length + 1
|
70
|
+
end
|
67
71
|
end
|
68
72
|
else
|
69
|
-
MurmurHash3::V32.str_hash(
|
73
|
+
MurmurHash3::V32.str_hash(sig).to_s[0..6].to_i
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
def stats_report
|
78
|
+
report = @feature_makers.map do |feature, maker|
|
79
|
+
[feature, maker.class, maker.cardinality]
|
80
|
+
end
|
81
|
+
total_cardinality = report.inject(0) { |sum, row| sum += row[2]; sum }
|
82
|
+
|
83
|
+
report.unshift %w{ Feature Class Cardinality }
|
84
|
+
feature_width = report.map { |row| row[0].to_s.length }.max
|
85
|
+
class_width = report.map { |row| row[1].to_s.length }.max
|
86
|
+
cardinality_width = report.map { |row| row[2].to_s.length }.max
|
87
|
+
|
88
|
+
report = report.map do |row|
|
89
|
+
[
|
90
|
+
row[0].to_s.ljust(feature_width),
|
91
|
+
row[1].to_s.ljust(class_width),
|
92
|
+
row[2].to_s.rjust(cardinality_width),
|
93
|
+
].join(' | ')
|
70
94
|
end
|
95
|
+
total_width = report.first.length
|
96
|
+
report.insert(1, ''.ljust(total_width, '-'))
|
97
|
+
report.push(total_cardinality.to_s.rjust(total_width))
|
98
|
+
report.push('').join("\n")
|
71
99
|
end
|
72
100
|
|
73
101
|
private
|
data/lib/vector_embed/maker.rb
CHANGED
@@ -20,12 +20,14 @@ class VectorEmbed
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
+
attr_accessor :cardinality
|
23
24
|
attr_reader :parent
|
24
25
|
attr_reader :k
|
25
26
|
|
26
27
|
def initialize(k, parent)
|
27
28
|
@k = k
|
28
29
|
@parent = parent
|
30
|
+
@cardinality = 0
|
29
31
|
end
|
30
32
|
|
31
33
|
def pairs(v)
|
@@ -42,7 +44,7 @@ class VectorEmbed
|
|
42
44
|
if (vv = value(v)).nil?
|
43
45
|
[]
|
44
46
|
else
|
45
|
-
[ [ parent.index([k]),
|
47
|
+
[ [ parent.index([k]), vv ] ]
|
46
48
|
end
|
47
49
|
end
|
48
50
|
end
|
@@ -27,9 +27,9 @@ class VectorEmbed
|
|
27
27
|
|
28
28
|
def pairs(v)
|
29
29
|
case v
|
30
|
-
when TrueClass, TRUE, T
|
30
|
+
when TrueClass, TRUE, T, 1, '1'
|
31
31
|
[ [ parent.index([k, 'true']), 1 ] ]
|
32
|
-
when FalseClass, FALSE, F
|
32
|
+
when FalseClass, FALSE, F, 0, '0'
|
33
33
|
[ [ parent.index([k, 'false']), 1 ] ]
|
34
34
|
when NilClass, NULL, SLASH_N, BLANK
|
35
35
|
[ [ parent.index([k, 'null']), 1 ] ]
|
data/lib/vector_embed/version.rb
CHANGED
data/spec/vector_embed_spec.rb
CHANGED
@@ -101,20 +101,28 @@ describe VectorEmbed do
|
|
101
101
|
describe 'in boolean attributes' do
|
102
102
|
it "stores true/false/nil as (1,0,0)/(0,1,0)/(0,0,1)" do
|
103
103
|
v = VectorEmbed.new
|
104
|
-
v.line(1, 1 => true).should
|
104
|
+
v.line(1, 1 => true).should == "1 #{l_h("1\x00true")}:1"
|
105
105
|
v.line(1, 1 => 'true').should == "1 #{l_h("1\x00true")}:1"
|
106
106
|
v.line(1, 1 => 'TRUE').should == "1 #{l_h("1\x00true")}:1"
|
107
|
-
v.line(1, 1 => 't').should
|
108
|
-
v.line(1, 1 => 'T').should
|
109
|
-
v.line(1, 1 =>
|
107
|
+
v.line(1, 1 => 't').should == "1 #{l_h("1\x00true")}:1"
|
108
|
+
v.line(1, 1 => 'T').should == "1 #{l_h("1\x00true")}:1"
|
109
|
+
v.line(1, 1 => 1).should == "1 #{l_h("1\x00true")}:1"
|
110
|
+
v.line(1, 1 => '1').should == "1 #{l_h("1\x00true")}:1"
|
111
|
+
|
112
|
+
v.line(1, 1 => false).should == "1 #{l_h("1\x00false")}:1"
|
110
113
|
v.line(1, 1 => 'false').should == "1 #{l_h("1\x00false")}:1"
|
111
114
|
v.line(1, 1 => 'FALSE').should == "1 #{l_h("1\x00false")}:1"
|
112
|
-
v.line(1, 1 => 'f').should
|
113
|
-
v.line(1, 1 => 'F').should
|
114
|
-
v.line(1, 1 =>
|
115
|
+
v.line(1, 1 => 'f').should == "1 #{l_h("1\x00false")}:1"
|
116
|
+
v.line(1, 1 => 'F').should == "1 #{l_h("1\x00false")}:1"
|
117
|
+
v.line(1, 1 => 0).should == "1 #{l_h("1\x00false")}:1"
|
118
|
+
v.line(1, 1 => '0').should == "1 #{l_h("1\x00false")}:1"
|
119
|
+
|
120
|
+
v.line(1, 1 => nil).should == "1 #{l_h("1\x00null")}:1"
|
115
121
|
v.line(1, 1 => 'null').should == "1 #{l_h("1\x00null")}:1"
|
116
122
|
v.line(1, 1 => 'NULL').should == "1 #{l_h("1\x00null")}:1"
|
117
|
-
v.line(1, 1 => '\N').should
|
123
|
+
v.line(1, 1 => '\N').should == "1 #{l_h("1\x00null")}:1"
|
124
|
+
v.line(1, 1 => '').should == "1 #{l_h("1\x00null")}:1"
|
125
|
+
v.line(1, 1 => " \t ").should == "1 #{l_h("1\x00null")}:1"
|
118
126
|
end
|
119
127
|
end
|
120
128
|
|
@@ -171,7 +179,7 @@ describe VectorEmbed do
|
|
171
179
|
v.line(3, 1 => '\N').should == v.line(3, 1 => 0)
|
172
180
|
end
|
173
181
|
|
174
|
-
it "stores strings as m-category attributes" do
|
182
|
+
it "stores strings as m-category attributes, ignoring whitespace" do
|
175
183
|
v = VectorEmbed.new
|
176
184
|
v.line(1, 1 => 'sfh').should == "1 #{l_h("1\x00sfh")}:1"
|
177
185
|
v.line(1, 1 => 'mfh').should == "1 #{l_h("1\x00mfh")}:1"
|
@@ -307,8 +315,35 @@ describe VectorEmbed do
|
|
307
315
|
end
|
308
316
|
end
|
309
317
|
|
310
|
-
describe '
|
311
|
-
|
318
|
+
describe '#stats_report' do
|
319
|
+
it "reports statistics on the embedded features" do
|
320
|
+
v = VectorEmbed.new dict: {}
|
321
|
+
v.line(1, 1 => 1)
|
322
|
+
v.line(1, 1 => 2)
|
323
|
+
v.line(1, 2 => 1)
|
324
|
+
v.line(1, 2 => nil)
|
325
|
+
v.line(1, 3 => '2010-01-01')
|
326
|
+
v.line(1, 3 => '2011-01-01')
|
327
|
+
v.line(1, 4 => true)
|
328
|
+
v.line(1, 5 => true)
|
329
|
+
v.line(1, 5 => false)
|
330
|
+
v.line(1, 5 => nil)
|
331
|
+
v.line(1, 'foo' => 'bar')
|
332
|
+
v.line(1, 'foo' => 'biz')
|
333
|
+
v.line(1, 'foo' => 'baz')
|
334
|
+
v.stats_report.should == [
|
335
|
+
'Feature | Class | Cardinality',
|
336
|
+
'---------------------------------------------------',
|
337
|
+
'1 | VectorEmbed::Maker::Number | 1',
|
338
|
+
'2 | VectorEmbed::Maker::Number | 1',
|
339
|
+
'3 | VectorEmbed::Maker::Date | 1',
|
340
|
+
'4 | VectorEmbed::Maker::Boolean | 1',
|
341
|
+
'5 | VectorEmbed::Maker::Boolean | 3',
|
342
|
+
'foo | VectorEmbed::Maker::Phrase | 3',
|
343
|
+
' 10',
|
344
|
+
'',
|
345
|
+
].join("\n")
|
346
|
+
end
|
312
347
|
end
|
313
348
|
|
314
349
|
private
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vector_embed
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-07-01 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: murmurhash3
|