vss 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -1,6 +1,6 @@
1
1
  require "echoe"
2
2
 
3
- Echoe.new("vss", "0.1.1") do |p|
3
+ Echoe.new("vss", "0.1.2") do |p|
4
4
  p.description = "Simple vector space search engine"
5
5
  p.url = "http://github.com/mkdynamic/vss"
6
6
  p.author = "Mark Dodwell"
@@ -24,20 +24,20 @@ module VSS
24
24
  record.instance_eval %{def rank; #{ranks[i]}; end}
25
25
  end
26
26
 
27
- # sort by rank and return
28
- @records.sort { |a,b| b.rank <=> a.rank } # highest to lowest
27
+ # exclude 0 rank (no match) and sort by rank
28
+ @records.reject { |r| r.rank == 0 }.sort { |a,b| b.rank <=> a.rank }
29
29
  end
30
30
 
31
31
  private
32
32
 
33
33
  # ranks from 0 to 100
34
34
  def cosine_rank(vector1, vector2)
35
- (cosine(vector1, vector2) + 1) / 2 * 100
35
+ cosine(vector1, vector2) / 1 * 100
36
36
  end
37
37
 
38
38
  # see http://www.ltcconline.net/greenl/courses/107/vectors/DOTCROS.HTM
39
39
  # and http://ruby-doc.org/stdlib/libdoc/matrix/rdoc/index.html
40
- # will be in range -1 to 1
40
+ # will be in range 0 to 1, as vector components are never negative
41
41
  def cosine(vector1, vector2)
42
42
  dot_product = vector1.inner_product(vector2)
43
43
  dot_product / (vector1.r * vector2.r) # Vector#r is same as ||v||
@@ -47,35 +47,45 @@ module VSS
47
47
  make_vector(query, true)
48
48
  end
49
49
 
50
- # NOTE: will choke if string contains words not in vocab
50
+ # NOTE: will choke if string contains tokens not in vocab
51
51
  # this is why, when we make the query vector, we do an
52
52
  # intersection of tokens with the vocab
53
- def make_vector(string, ensure_words_in_vocab = false)
53
+ def make_vector(string, ensure_tokens_in_vocab = false)
54
54
  @vector_cache = {}
55
55
  @vector_cache[string] ||= begin
56
- arr = Array.new(vector_keyword_index.size, 0)
57
-
58
- # uses tf*idf (http://en.wikipedia.org/wiki/Tf-idf)
59
- words = tokenize(string)
60
- words &= @vocab if ensure_words_in_vocab
61
- words.uniq.each do |word|
62
- tf = count_in_array(words, word)
63
- idf = @documents.size / count_in_array(@documents, proc { |doc| tokenize(doc).include?(word) })
64
-
65
- index = vector_keyword_index[word]
66
- arr[index] = tf * idf
56
+ arr = Array.new(vector_token_index.size, 0)
57
+
58
+ tokens = tokenize(string)
59
+ tokens &= @vocab if ensure_tokens_in_vocab
60
+ tokens.uniq.each do |token|
61
+ index = vector_token_index[token]
62
+ arr[index] = tf_idf(token, tokens, @documents)
67
63
  end
68
64
 
69
65
  Vector.elements(arr, false)
70
66
  end
71
67
  end
68
+
69
+ def tf(token, tokens)
70
+ count_in_array(tokens, token)
71
+ end
72
+
73
+ def idf(token, docs)
74
+ docs_with_token_count = count_in_array(docs, proc { |doc| tokenize(doc).include?(token) })
75
+ docs.size / docs_with_token_count
76
+ end
77
+
78
+ # http://en.wikipedia.org/wiki/Tf-idf
79
+ def tf_idf(token, tokens, docs)
80
+ tf(token, tokens) * idf(token, @documents)
81
+ end
72
82
 
73
- def vector_keyword_index
74
- @vector_keyword_index ||= begin
83
+ def vector_token_index
84
+ @vector_token_index ||= begin
75
85
  index, offset = {}, 0
76
86
 
77
- @vocab.each do |keyword|
78
- index[keyword] = offset
87
+ @vocab.each do |token|
88
+ index[token] = offset
79
89
  offset += 1
80
90
  end
81
91
 
@@ -88,7 +98,7 @@ module VSS
88
98
  @tokenize_cache[string] ||= Tokenizer.tokenize(string)
89
99
  end
90
100
 
91
- # could use Array#count, but 1.8.6 on Heroku don't have it only 1.8.7 >
101
+ # could use Array#count, but it is only available in Ruby >= 1.8.7
92
102
  def count_in_array(array, item)
93
103
  count = 0
94
104
  if item.is_a? Proc
@@ -7,7 +7,8 @@ class SearchTest < Test::Unit::TestCase
7
7
  @doc2 = "The Wire is the best thing ever. Fact."
8
8
  @doc3 = "Some would argue that Lost got a bit too wierd after season 2."
9
9
  @doc4 = "Lost is surely not in the same league as The Wire."
10
- @docs = [@doc1, @doc2, @doc3, @doc4]
10
+ @doc5 = "You cannot compare the The Wire and Lost."
11
+ @docs = [@doc1, @doc2, @doc3, @doc4, @doc5]
11
12
  @engine = VSS::Engine.new(@docs)
12
13
  end
13
14
 
@@ -18,17 +19,17 @@ class SearchTest < Test::Unit::TestCase
18
19
 
19
20
  def test_ordering
20
21
  results = @engine.search("How can you compare The Wire with Lost?")
21
- assert_equal @doc4, results[0]
22
- assert_equal @doc2, results[1]
23
- assert_equal @doc3, results[2]
24
- assert_equal @doc1, results[3]
22
+ assert_equal @doc5, results[0]
23
+ assert_equal @doc4, results[1]
24
+ assert_equal @doc2, results[2]
25
+ assert_equal @doc3, results[3]
25
26
  end
26
27
 
27
28
  def test_ranking
28
29
  results = @engine.search("How can you compare The Wire with Lost?")
29
- assert_equal 68.2574185835055, results[0].rank
30
- assert_equal 58.5749292571254, results[1].rank
31
- assert_equal 55.5215763037423, results[2].rank
32
- assert_equal 50.0, results[3].rank
30
+ assert_equal 82.1781403613318, results[0].rank
31
+ assert_equal 3.08166775680683, results[1].rank
32
+ assert_equal 1.3798683116522, results[2].rank
33
+ assert_equal 0.875309148136544, results[3].rank
33
34
  end
34
35
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{vss}
5
- s.version = "0.1.1"
5
+ s.version = "0.1.2"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Mark Dodwell"]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Dodwell