vector_embed 0.4.0 → 0.5.0
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +10 -0
- data/lib/vector_embed.rb +1 -0
- data/lib/vector_embed/maker/ngram.rb +5 -2
- data/lib/vector_embed/maker/number.rb +3 -3
- data/lib/vector_embed/version.rb +1 -1
- data/spec/vector_embed_spec.rb +11 -0
- metadata +2 -2
data/CHANGELOG
CHANGED
data/lib/vector_embed.rb
CHANGED
@@ -10,6 +10,7 @@ require 'vector_embed/stop_word'
|
|
10
10
|
class VectorEmbed
|
11
11
|
# http://stackoverflow.com/questions/638565/parsing-scientific-notation-sensibly
|
12
12
|
JUST_A_NUMBER = /\A\s*[+\-]?(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s*\z/
|
13
|
+
UGLY_FLOAT = /\A\.\d+\z/
|
13
14
|
BLANK = /\A\s*\z/
|
14
15
|
NULL = /\Anull\z/i
|
15
16
|
SLASH_N = '\N'
|
@@ -9,6 +9,9 @@ class VectorEmbed
|
|
9
9
|
end
|
10
10
|
end
|
11
11
|
|
12
|
+
# TODO make sure you can't collide with these
|
13
|
+
IM_AN_NGRAM = 'ngram'
|
14
|
+
|
12
15
|
attr_reader :len
|
13
16
|
attr_reader :delim
|
14
17
|
|
@@ -20,7 +23,7 @@ class VectorEmbed
|
|
20
23
|
end
|
21
24
|
|
22
25
|
def pairs(v)
|
23
|
-
raise "Ngram can't handle #{v.inspect}, only a single string for now" unless v.is_a?(String)
|
26
|
+
raise "Ngram can't handle #{v.inspect}, only a single string for now" unless v.is_a?(::String)
|
24
27
|
v = parent.preprocess v.to_s
|
25
28
|
if len == 1
|
26
29
|
# word mode
|
@@ -31,7 +34,7 @@ class VectorEmbed
|
|
31
34
|
else
|
32
35
|
raise "Word n-gram not supported yet"
|
33
36
|
end.map do |ngram|
|
34
|
-
[
|
37
|
+
[ parent.index([k, IM_AN_NGRAM, ngram]), 1 ]
|
35
38
|
end
|
36
39
|
end
|
37
40
|
end
|
@@ -8,8 +8,8 @@ class VectorEmbed
|
|
8
8
|
case v
|
9
9
|
when Numeric, NilClass, NULL, SLASH_N
|
10
10
|
true
|
11
|
-
|
12
|
-
v =~ JUST_A_NUMBER
|
11
|
+
when String
|
12
|
+
v =~ JUST_A_NUMBER or v =~ UGLY_FLOAT
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
@@ -31,7 +31,7 @@ class VectorEmbed
|
|
31
31
|
|
32
32
|
def value(v)
|
33
33
|
case v
|
34
|
-
when Numeric, JUST_A_NUMBER
|
34
|
+
when Numeric, JUST_A_NUMBER, UGLY_FLOAT
|
35
35
|
Number.numify v
|
36
36
|
when NilClass, NULL, SLASH_N
|
37
37
|
nil
|
data/lib/vector_embed/version.rb
CHANGED
data/spec/vector_embed_spec.rb
CHANGED
@@ -136,6 +136,11 @@ describe VectorEmbed do
|
|
136
136
|
v.line(1, 1 => '9e9').should == "1 #{l_h('1')}:9000000000"
|
137
137
|
end
|
138
138
|
|
139
|
+
it "is ok with floats that don't start with 0" do
|
140
|
+
v = VectorEmbed.new
|
141
|
+
v.line(3, 5 => '.9').should == "3 #{l_h('5')}:0.9"
|
142
|
+
end
|
143
|
+
|
139
144
|
it "stores dates as days since 2000-01-01" do
|
140
145
|
v = VectorEmbed.new
|
141
146
|
v.line(1, 3 => Date.new(1999,12,31)).should == "1 #{l_h('3')}:-1"
|
@@ -295,6 +300,12 @@ describe VectorEmbed do
|
|
295
300
|
v.line(1, 1 => 'foo', 2 => 'bar', 'baz' => 'zoo').should == sortme("1 #{l_h("1\x00foo")}:1 #{l_h("2\x00bar")}:1 #{l_h("baz\x00zoo")}:1")
|
296
301
|
end
|
297
302
|
|
303
|
+
it "simultaneously produces phrases and n-grams" do
|
304
|
+
v = VectorEmbed.new ngram_len: 2, ngram_delim: '', features: { 2 => :Phrase }
|
305
|
+
v.line(3, 1 => 'foo', 2 => 'bar').should == sortme("3 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("2\x00bar")}:1")
|
306
|
+
|
307
|
+
end
|
308
|
+
|
298
309
|
end
|
299
310
|
|
300
311
|
describe 'hinting' do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: vector_embed
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.5.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: murmurhash3
|