vector_embed 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +10 -0
- data/lib/vector_embed.rb +35 -7
- data/lib/vector_embed/maker.rb +3 -1
- data/lib/vector_embed/maker/boolean.rb +2 -2
- data/lib/vector_embed/version.rb +1 -1
- data/spec/vector_embed_spec.rb +46 -11
- metadata +2 -2
data/CHANGELOG
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
0.4.0 / 2013-07-01
|
2
|
+
|
3
|
+
* Breaking changes
|
4
|
+
|
5
|
+
* Treat 1/0 and '1'/'0' as true/false (per @ihough)
|
6
|
+
|
7
|
+
* Enhancements
|
8
|
+
|
9
|
+
* Add VectorEmbed#stats_report, which lists types and cardinalities of features (thanks @ihough)
|
10
|
+
|
1
11
|
0.3.3 / 2013-06-14
|
2
12
|
|
3
13
|
* Enhancement
|
data/lib/vector_embed.rb
CHANGED
@@ -21,15 +21,15 @@ class VectorEmbed
|
|
21
21
|
LABEL_MAKERS = [Maker::Boolean, Maker::Number]
|
22
22
|
FEATURE_MAKERS = [Maker::Boolean, Maker::Date, Maker::Number, Maker::Ngram, Maker::Phrase]
|
23
23
|
|
24
|
-
attr_reader :options
|
25
24
|
attr_accessor :logger
|
26
25
|
attr_reader :dict
|
26
|
+
attr_reader :options
|
27
27
|
|
28
28
|
def initialize(options = {})
|
29
29
|
@options = options.dup
|
30
30
|
@mutex = Mutex.new
|
31
31
|
@feature_makers = {}
|
32
|
-
@logger = options[:logger] || (l = Logger.new($stderr); l.level = Logger::INFO; l)
|
32
|
+
@logger = options[:logger] || (l = Logger.new($stderr); l.level = (ENV['VERBOSE'] == 'true') ? Logger::DEBUG : Logger::INFO; l)
|
33
33
|
if dict = @options.delete(:dict)
|
34
34
|
@dict = dict.dup
|
35
35
|
end
|
@@ -59,15 +59,43 @@ class VectorEmbed
|
|
59
59
|
end
|
60
60
|
|
61
61
|
def index(parts)
|
62
|
-
|
62
|
+
sig = parts.join NULL_BYTE
|
63
63
|
if dict
|
64
|
-
|
65
|
-
dict[
|
66
|
-
dict[
|
64
|
+
sig = Digest::MD5.digest sig
|
65
|
+
dict[sig] || @mutex.synchronize do
|
66
|
+
dict[sig] ||= begin
|
67
|
+
k = parts[0]
|
68
|
+
@feature_makers[k].cardinality += 1
|
69
|
+
dict[sig] = dict.length + 1
|
70
|
+
end
|
67
71
|
end
|
68
72
|
else
|
69
|
-
MurmurHash3::V32.str_hash(
|
73
|
+
MurmurHash3::V32.str_hash(sig).to_s[0..6].to_i
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
def stats_report
|
78
|
+
report = @feature_makers.map do |feature, maker|
|
79
|
+
[feature, maker.class, maker.cardinality]
|
80
|
+
end
|
81
|
+
total_cardinality = report.inject(0) { |sum, row| sum += row[2]; sum }
|
82
|
+
|
83
|
+
report.unshift %w{ Feature Class Cardinality }
|
84
|
+
feature_width = report.map { |row| row[0].to_s.length }.max
|
85
|
+
class_width = report.map { |row| row[1].to_s.length }.max
|
86
|
+
cardinality_width = report.map { |row| row[2].to_s.length }.max
|
87
|
+
|
88
|
+
report = report.map do |row|
|
89
|
+
[
|
90
|
+
row[0].to_s.ljust(feature_width),
|
91
|
+
row[1].to_s.ljust(class_width),
|
92
|
+
row[2].to_s.rjust(cardinality_width),
|
93
|
+
].join(' | ')
|
70
94
|
end
|
95
|
+
total_width = report.first.length
|
96
|
+
report.insert(1, ''.ljust(total_width, '-'))
|
97
|
+
report.push(total_cardinality.to_s.rjust(total_width))
|
98
|
+
report.push('').join("\n")
|
71
99
|
end
|
72
100
|
|
73
101
|
private
|
data/lib/vector_embed/maker.rb
CHANGED
@@ -20,12 +20,14 @@ class VectorEmbed
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
+
attr_accessor :cardinality
|
23
24
|
attr_reader :parent
|
24
25
|
attr_reader :k
|
25
26
|
|
26
27
|
def initialize(k, parent)
|
27
28
|
@k = k
|
28
29
|
@parent = parent
|
30
|
+
@cardinality = 0
|
29
31
|
end
|
30
32
|
|
31
33
|
def pairs(v)
|
@@ -42,7 +44,7 @@ class VectorEmbed
|
|
42
44
|
if (vv = value(v)).nil?
|
43
45
|
[]
|
44
46
|
else
|
45
|
-
[ [ parent.index([k]),
|
47
|
+
[ [ parent.index([k]), vv ] ]
|
46
48
|
end
|
47
49
|
end
|
48
50
|
end
|
@@ -27,9 +27,9 @@ class VectorEmbed
|
|
27
27
|
|
28
28
|
def pairs(v)
|
29
29
|
case v
|
30
|
-
when TrueClass, TRUE, T
|
30
|
+
when TrueClass, TRUE, T, 1, '1'
|
31
31
|
[ [ parent.index([k, 'true']), 1 ] ]
|
32
|
-
when FalseClass, FALSE, F
|
32
|
+
when FalseClass, FALSE, F, 0, '0'
|
33
33
|
[ [ parent.index([k, 'false']), 1 ] ]
|
34
34
|
when NilClass, NULL, SLASH_N, BLANK
|
35
35
|
[ [ parent.index([k, 'null']), 1 ] ]
|
data/lib/vector_embed/version.rb
CHANGED
data/spec/vector_embed_spec.rb
CHANGED
@@ -101,20 +101,28 @@ describe VectorEmbed do
|
|
101
101
|
describe 'in boolean attributes' do
|
102
102
|
it "stores true/false/nil as (1,0,0)/(0,1,0)/(0,0,1)" do
|
103
103
|
v = VectorEmbed.new
|
104
|
-
v.line(1, 1 => true).should
|
104
|
+
v.line(1, 1 => true).should == "1 #{l_h("1\x00true")}:1"
|
105
105
|
v.line(1, 1 => 'true').should == "1 #{l_h("1\x00true")}:1"
|
106
106
|
v.line(1, 1 => 'TRUE').should == "1 #{l_h("1\x00true")}:1"
|
107
|
-
v.line(1, 1 => 't').should
|
108
|
-
v.line(1, 1 => 'T').should
|
109
|
-
v.line(1, 1 =>
|
107
|
+
v.line(1, 1 => 't').should == "1 #{l_h("1\x00true")}:1"
|
108
|
+
v.line(1, 1 => 'T').should == "1 #{l_h("1\x00true")}:1"
|
109
|
+
v.line(1, 1 => 1).should == "1 #{l_h("1\x00true")}:1"
|
110
|
+
v.line(1, 1 => '1').should == "1 #{l_h("1\x00true")}:1"
|
111
|
+
|
112
|
+
v.line(1, 1 => false).should == "1 #{l_h("1\x00false")}:1"
|
110
113
|
v.line(1, 1 => 'false').should == "1 #{l_h("1\x00false")}:1"
|
111
114
|
v.line(1, 1 => 'FALSE').should == "1 #{l_h("1\x00false")}:1"
|
112
|
-
v.line(1, 1 => 'f').should
|
113
|
-
v.line(1, 1 => 'F').should
|
114
|
-
v.line(1, 1 =>
|
115
|
+
v.line(1, 1 => 'f').should == "1 #{l_h("1\x00false")}:1"
|
116
|
+
v.line(1, 1 => 'F').should == "1 #{l_h("1\x00false")}:1"
|
117
|
+
v.line(1, 1 => 0).should == "1 #{l_h("1\x00false")}:1"
|
118
|
+
v.line(1, 1 => '0').should == "1 #{l_h("1\x00false")}:1"
|
119
|
+
|
120
|
+
v.line(1, 1 => nil).should == "1 #{l_h("1\x00null")}:1"
|
115
121
|
v.line(1, 1 => 'null').should == "1 #{l_h("1\x00null")}:1"
|
116
122
|
v.line(1, 1 => 'NULL').should == "1 #{l_h("1\x00null")}:1"
|
117
|
-
v.line(1, 1 => '\N').should
|
123
|
+
v.line(1, 1 => '\N').should == "1 #{l_h("1\x00null")}:1"
|
124
|
+
v.line(1, 1 => '').should == "1 #{l_h("1\x00null")}:1"
|
125
|
+
v.line(1, 1 => " \t ").should == "1 #{l_h("1\x00null")}:1"
|
118
126
|
end
|
119
127
|
end
|
120
128
|
|
@@ -171,7 +179,7 @@ describe VectorEmbed do
|
|
171
179
|
v.line(3, 1 => '\N').should == v.line(3, 1 => 0)
|
172
180
|
end
|
173
181
|
|
174
|
-
it "stores strings as m-category attributes" do
|
182
|
+
it "stores strings as m-category attributes, ignoring whitespace" do
|
175
183
|
v = VectorEmbed.new
|
176
184
|
v.line(1, 1 => 'sfh').should == "1 #{l_h("1\x00sfh")}:1"
|
177
185
|
v.line(1, 1 => 'mfh').should == "1 #{l_h("1\x00mfh")}:1"
|
@@ -307,8 +315,35 @@ describe VectorEmbed do
|
|
307
315
|
end
|
308
316
|
end
|
309
317
|
|
310
|
-
describe '
|
311
|
-
|
318
|
+
describe '#stats_report' do
|
319
|
+
it "reports statistics on the embedded features" do
|
320
|
+
v = VectorEmbed.new dict: {}
|
321
|
+
v.line(1, 1 => 1)
|
322
|
+
v.line(1, 1 => 2)
|
323
|
+
v.line(1, 2 => 1)
|
324
|
+
v.line(1, 2 => nil)
|
325
|
+
v.line(1, 3 => '2010-01-01')
|
326
|
+
v.line(1, 3 => '2011-01-01')
|
327
|
+
v.line(1, 4 => true)
|
328
|
+
v.line(1, 5 => true)
|
329
|
+
v.line(1, 5 => false)
|
330
|
+
v.line(1, 5 => nil)
|
331
|
+
v.line(1, 'foo' => 'bar')
|
332
|
+
v.line(1, 'foo' => 'biz')
|
333
|
+
v.line(1, 'foo' => 'baz')
|
334
|
+
v.stats_report.should == [
|
335
|
+
'Feature | Class | Cardinality',
|
336
|
+
'---------------------------------------------------',
|
337
|
+
'1 | VectorEmbed::Maker::Number | 1',
|
338
|
+
'2 | VectorEmbed::Maker::Number | 1',
|
339
|
+
'3 | VectorEmbed::Maker::Date | 1',
|
340
|
+
'4 | VectorEmbed::Maker::Boolean | 1',
|
341
|
+
'5 | VectorEmbed::Maker::Boolean | 3',
|
342
|
+
'foo | VectorEmbed::Maker::Phrase | 3',
|
343
|
+
' 10',
|
344
|
+
'',
|
345
|
+
].join("\n")
|
346
|
+
end
|
312
347
|
end
|
313
348
|
|
314
349
|
private
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vector_embed
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-07-01 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: murmurhash3
|