vector_embed 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +10 -0
- data/lib/vector_embed.rb +1 -0
- data/lib/vector_embed/maker/ngram.rb +5 -2
- data/lib/vector_embed/maker/number.rb +3 -3
- data/lib/vector_embed/version.rb +1 -1
- data/spec/vector_embed_spec.rb +11 -0
- metadata +2 -2
data/CHANGELOG
CHANGED
data/lib/vector_embed.rb
CHANGED
@@ -10,6 +10,7 @@ require 'vector_embed/stop_word'
|
|
10
10
|
class VectorEmbed
|
11
11
|
# http://stackoverflow.com/questions/638565/parsing-scientific-notation-sensibly
|
12
12
|
JUST_A_NUMBER = /\A\s*[+\-]?(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s*\z/
|
13
|
+
UGLY_FLOAT = /\A\.\d+\z/
|
13
14
|
BLANK = /\A\s*\z/
|
14
15
|
NULL = /\Anull\z/i
|
15
16
|
SLASH_N = '\N'
|
@@ -9,6 +9,9 @@ class VectorEmbed
|
|
9
9
|
end
|
10
10
|
end
|
11
11
|
|
12
|
+
# TODO make sure you can't collide with these
|
13
|
+
IM_AN_NGRAM = 'ngram'
|
14
|
+
|
12
15
|
attr_reader :len
|
13
16
|
attr_reader :delim
|
14
17
|
|
@@ -20,7 +23,7 @@ class VectorEmbed
|
|
20
23
|
end
|
21
24
|
|
22
25
|
def pairs(v)
|
23
|
-
raise "Ngram can't handle #{v.inspect}, only a single string for now" unless v.is_a?(String)
|
26
|
+
raise "Ngram can't handle #{v.inspect}, only a single string for now" unless v.is_a?(::String)
|
24
27
|
v = parent.preprocess v.to_s
|
25
28
|
if len == 1
|
26
29
|
# word mode
|
@@ -31,7 +34,7 @@ class VectorEmbed
|
|
31
34
|
else
|
32
35
|
raise "Word n-gram not supported yet"
|
33
36
|
end.map do |ngram|
|
34
|
-
[
|
37
|
+
[ parent.index([k, IM_AN_NGRAM, ngram]), 1 ]
|
35
38
|
end
|
36
39
|
end
|
37
40
|
end
|
@@ -8,8 +8,8 @@ class VectorEmbed
|
|
8
8
|
case v
|
9
9
|
when Numeric, NilClass, NULL, SLASH_N
|
10
10
|
true
|
11
|
-
|
12
|
-
v =~ JUST_A_NUMBER
|
11
|
+
when String
|
12
|
+
v =~ JUST_A_NUMBER or v =~ UGLY_FLOAT
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
@@ -31,7 +31,7 @@ class VectorEmbed
|
|
31
31
|
|
32
32
|
def value(v)
|
33
33
|
case v
|
34
|
-
when Numeric, JUST_A_NUMBER
|
34
|
+
when Numeric, JUST_A_NUMBER, UGLY_FLOAT
|
35
35
|
Number.numify v
|
36
36
|
when NilClass, NULL, SLASH_N
|
37
37
|
nil
|
data/lib/vector_embed/version.rb
CHANGED
data/spec/vector_embed_spec.rb
CHANGED
@@ -136,6 +136,11 @@ describe VectorEmbed do
|
|
136
136
|
v.line(1, 1 => '9e9').should == "1 #{l_h('1')}:9000000000"
|
137
137
|
end
|
138
138
|
|
139
|
+
it "is ok with floats that don't start with 0" do
|
140
|
+
v = VectorEmbed.new
|
141
|
+
v.line(3, 5 => '.9').should == "3 #{l_h('5')}:0.9"
|
142
|
+
end
|
143
|
+
|
139
144
|
it "stores dates as days since 2000-01-01" do
|
140
145
|
v = VectorEmbed.new
|
141
146
|
v.line(1, 3 => Date.new(1999,12,31)).should == "1 #{l_h('3')}:-1"
|
@@ -295,6 +300,12 @@ describe VectorEmbed do
|
|
295
300
|
v.line(1, 1 => 'foo', 2 => 'bar', 'baz' => 'zoo').should == sortme("1 #{l_h("1\x00foo")}:1 #{l_h("2\x00bar")}:1 #{l_h("baz\x00zoo")}:1")
|
296
301
|
end
|
297
302
|
|
303
|
+
it "simultaneously produces phrases and n-grams" do
|
304
|
+
v = VectorEmbed.new ngram_len: 2, ngram_delim: '', features: { 2 => :Phrase }
|
305
|
+
v.line(3, 1 => 'foo', 2 => 'bar').should == sortme("3 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("2\x00bar")}:1")
|
306
|
+
|
307
|
+
end
|
308
|
+
|
298
309
|
end
|
299
310
|
|
300
311
|
describe 'hinting' do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vector_embed
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.5.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: murmurhash3
|