vector_embed 0.4.0 → 0.5.0

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,13 @@
1
+ 0.5.0 / 2013-07-08
2
+
3
+ * Breaking changes
4
+
5
+ * Treat .1 as 0.1
6
+
7
+ * Bug fixes
8
+
9
+ * Fix error when mixing ngrams and phrases
10
+
1
11
  0.4.0 / 2013-07-01
2
12
 
3
13
  * Breaking changes
@@ -10,6 +10,7 @@ require 'vector_embed/stop_word'
10
10
  class VectorEmbed
11
11
  # http://stackoverflow.com/questions/638565/parsing-scientific-notation-sensibly
12
12
  JUST_A_NUMBER = /\A\s*[+\-]?(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s*\z/
13
+ UGLY_FLOAT = /\A\.\d+\z/
13
14
  BLANK = /\A\s*\z/
14
15
  NULL = /\Anull\z/i
15
16
  SLASH_N = '\N'
@@ -9,6 +9,9 @@ class VectorEmbed
9
9
  end
10
10
  end
11
11
 
12
+ # TODO make sure you can't collide with these
13
+ IM_AN_NGRAM = 'ngram'
14
+
12
15
  attr_reader :len
13
16
  attr_reader :delim
14
17
 
@@ -20,7 +23,7 @@ class VectorEmbed
20
23
  end
21
24
 
22
25
  def pairs(v)
23
- raise "Ngram can't handle #{v.inspect}, only a single string for now" unless v.is_a?(String)
26
+ raise "Ngram can't handle #{v.inspect}, only a single string for now" unless v.is_a?(::String)
24
27
  v = parent.preprocess v.to_s
25
28
  if len == 1
26
29
  # word mode
@@ -31,7 +34,7 @@ class VectorEmbed
31
34
  else
32
35
  raise "Word n-gram not supported yet"
33
36
  end.map do |ngram|
34
- [ [ parent.index([k, 'ngram', ngram]), 1 ] ]
37
+ [ parent.index([k, IM_AN_NGRAM, ngram]), 1 ]
35
38
  end
36
39
  end
37
40
  end
@@ -8,8 +8,8 @@ class VectorEmbed
8
8
  case v
9
9
  when Numeric, NilClass, NULL, SLASH_N
10
10
  true
11
- else
12
- v =~ JUST_A_NUMBER
11
+ when String
12
+ v =~ JUST_A_NUMBER or v =~ UGLY_FLOAT
13
13
  end
14
14
  end
15
15
 
@@ -31,7 +31,7 @@ class VectorEmbed
31
31
 
32
32
  def value(v)
33
33
  case v
34
- when Numeric, JUST_A_NUMBER
34
+ when Numeric, JUST_A_NUMBER, UGLY_FLOAT
35
35
  Number.numify v
36
36
  when NilClass, NULL, SLASH_N
37
37
  nil
@@ -1,3 +1,3 @@
1
1
  class VectorEmbed
2
- VERSION = '0.4.0'
2
+ VERSION = '0.5.0'
3
3
  end
@@ -136,6 +136,11 @@ describe VectorEmbed do
136
136
  v.line(1, 1 => '9e9').should == "1 #{l_h('1')}:9000000000"
137
137
  end
138
138
 
139
+ it "is ok with floats that don't start with 0" do
140
+ v = VectorEmbed.new
141
+ v.line(3, 5 => '.9').should == "3 #{l_h('5')}:0.9"
142
+ end
143
+
139
144
  it "stores dates as days since 2000-01-01" do
140
145
  v = VectorEmbed.new
141
146
  v.line(1, 3 => Date.new(1999,12,31)).should == "1 #{l_h('3')}:-1"
@@ -295,6 +300,12 @@ describe VectorEmbed do
295
300
  v.line(1, 1 => 'foo', 2 => 'bar', 'baz' => 'zoo').should == sortme("1 #{l_h("1\x00foo")}:1 #{l_h("2\x00bar")}:1 #{l_h("baz\x00zoo")}:1")
296
301
  end
297
302
 
303
+ it "simultaneously produces phrases and n-grams" do
304
+ v = VectorEmbed.new ngram_len: 2, ngram_delim: '', features: { 2 => :Phrase }
305
+ v.line(3, 1 => 'foo', 2 => 'bar').should == sortme("3 #{l_h("1\x00ngram\x00fo")}:1 #{l_h("1\x00ngram\x00oo")}:1 #{l_h("2\x00bar")}:1")
306
+
307
+ end
308
+
298
309
  end
299
310
 
300
311
  describe 'hinting' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vector_embed
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-07-01 00:00:00.000000000 Z
12
+ date: 2013-07-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: murmurhash3