vector_embed 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,13 @@
1
+ 0.5.0 / 2013-07-08
2
+
3
+ * Breaking changes
4
+
5
+ * Treat .1 as 0.1
6
+
7
+ * Bug fixes
8
+
9
+ * Fix error when mixing ngrams and phrases
10
+
1
11
  0.4.0 / 2013-07-01
2
12
 
3
13
  * Breaking changes
@@ -10,6 +10,7 @@ require 'vector_embed/stop_word'
10
10
  class VectorEmbed
11
11
  # http://stackoverflow.com/questions/638565/parsing-scientific-notation-sensibly
12
12
  JUST_A_NUMBER = /\A\s*[+\-]?(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s*\z/
13
+ UGLY_FLOAT = /\A\.\d+\z/
13
14
  BLANK = /\A\s*\z/
14
15
  NULL = /\Anull\z/i
15
16
  SLASH_N = '\N'
@@ -9,6 +9,9 @@ class VectorEmbed
9
9
  end
10
10
  end
11
11
 
12
+ # TODO make sure you can't collide with these
13
+ IM_AN_NGRAM = 'ngram'
14
+
12
15
  attr_reader :len
13
16
  attr_reader :delim
14
17
 
@@ -20,7 +23,7 @@ class VectorEmbed
20
23
  end
21
24
 
22
25
  def pairs(v)
23
- raise "Ngram can't handle #{v.inspect}, only a single string for now" unless v.is_a?(String)
26
+ raise "Ngram can't handle #{v.inspect}, only a single string for now" unless v.is_a?(::String)
24
27
  v = parent.preprocess v.to_s
25
28
  if len == 1
26
29
  # word mode
@@ -31,7 +34,7 @@ class VectorEmbed
31
34
  else
32
35
  raise "Word n-gram not supported yet"
33
36
  end.map do |ngram|
34
- [ [ parent.index([k, 'ngram', ngram]), 1 ] ]
37
+ [ parent.index([k, IM_AN_NGRAM, ngram]), 1 ]
35
38
  end
36
39
  end
37
40
  end
@@ -8,8 +8,8 @@ class VectorEmbed
8
8
  case v
9
9
  when Numeric, NilClass, NULL, SLASH_N
10
10
  true
11
- else
12
- v =~ JUST_A_NUMBER
11
+ when String
12
+ v =~ JUST_A_NUMBER or v =~ UGLY_FLOAT
13
13
  end
14
14
  end
15
15
 
@@ -31,7 +31,7 @@ class VectorEmbed
31
31
 
32
32
  def value(v)
33
33
  case v
34
- when Numeric, JUST_A_NUMBER
34
+ when Numeric, JUST_A_NUMBER, UGLY_FLOAT
35
35
  Number.numify v
36
36
  when NilClass, NULL, SLASH_N
37
37
  nil
@@ -1,3 +1,3 @@
1
1
  class VectorEmbed
2
- VERSION = '0.4.0'
2
+ VERSION = '0.5.0'
3
3
  end
@@ -136,6 +136,11 @@ describe VectorEmbed do
136
136
  v.line(1, 1 => '9e9').should == "1 #{l_h('1')}:9000000000"
137
137
  end
138
138
 
139
+ it "is ok with floats that don't start with 0" do
140
+ v = VectorEmbed.new
141
+ v.line(3, 5 => '.9').should == "3 #{l_h('5')}:0.9"
142
+ end
143
+
139
144
  it "stores dates as days since 2000-01-01" do
140
145
  v = VectorEmbed.new
141
146
  v.line(1, 3 => Date.new(1999,12,31)).should == "1 #{l_h('3')}:-1"
@@ -295,6 +300,12 @@ describe VectorEmbed do
295
300
  v.line(1, 1 => 'foo', 2 => 'bar', 'baz' => 'zoo').should == sortme("1 #{l_h("1\x00foo")}:1 #{l_h("2\x00bar")}:1 #{l_h("baz\x00zoo")}:1")
296
301
  end
297
302
 
303
+ it "simultaneously produces phrases and n-grams" do
304
+ v = VectorEmbed.new ngram_len: 2, ngram_delim: '', features: { 2 => :Phrase }
305
+ v.line(3, 1 => 'foo', 2 => 'bar').should == sortme("3 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("2\x00bar")}:1")
306
+
307
+ end
308
+
298
309
  end
299
310
 
300
311
  describe 'hinting' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vector_embed
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-07-01 00:00:00.000000000 Z
12
+ date: 2013-07-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: murmurhash3