vss 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -1,6 +1,6 @@
1
1
  require "echoe"
2
2
 
3
- Echoe.new("vss", "0.1.1") do |p|
3
+ Echoe.new("vss", "0.1.2") do |p|
4
4
  p.description = "Simple vector space search engine"
5
5
  p.url = "http://github.com/mkdynamic/vss"
6
6
  p.author = "Mark Dodwell"
@@ -24,20 +24,20 @@ module VSS
24
24
  record.instance_eval %{def rank; #{ranks[i]}; end}
25
25
  end
26
26
 
27
- # sort by rank and return
28
- @records.sort { |a,b| b.rank <=> a.rank } # highest to lowest
27
+ # exclude 0 rank (no match) and sort by rank
28
+ @records.reject { |r| r.rank == 0 }.sort { |a,b| b.rank <=> a.rank }
29
29
  end
30
30
 
31
31
  private
32
32
 
33
33
  # ranks from 0 to 100
34
34
  def cosine_rank(vector1, vector2)
35
- (cosine(vector1, vector2) + 1) / 2 * 100
35
+ cosine(vector1, vector2) / 1 * 100
36
36
  end
37
37
 
38
38
  # see http://www.ltcconline.net/greenl/courses/107/vectors/DOTCROS.HTM
39
39
  # and http://ruby-doc.org/stdlib/libdoc/matrix/rdoc/index.html
40
- # will be in range -1 to 1
40
+ # will be in range 0 to 1, as vector components are never negative
41
41
  def cosine(vector1, vector2)
42
42
  dot_product = vector1.inner_product(vector2)
43
43
  dot_product / (vector1.r * vector2.r) # Vector#r is same as ||v||
@@ -47,35 +47,45 @@ module VSS
47
47
  make_vector(query, true)
48
48
  end
49
49
 
50
- # NOTE: will choke if string contains words not in vocab
50
+ # NOTE: will choke if string contains tokens not in vocab
51
51
  # this is why, when we make the query vector, we do an
52
52
  # intersection of tokens with the vocab
53
- def make_vector(string, ensure_words_in_vocab = false)
53
+ def make_vector(string, ensure_tokens_in_vocab = false)
54
54
  @vector_cache = {}
55
55
  @vector_cache[string] ||= begin
56
- arr = Array.new(vector_keyword_index.size, 0)
57
-
58
- # uses tf*idf (http://en.wikipedia.org/wiki/Tf-idf)
59
- words = tokenize(string)
60
- words &= @vocab if ensure_words_in_vocab
61
- words.uniq.each do |word|
62
- tf = count_in_array(words, word)
63
- idf = @documents.size / count_in_array(@documents, proc { |doc| tokenize(doc).include?(word) })
64
-
65
- index = vector_keyword_index[word]
66
- arr[index] = tf * idf
56
+ arr = Array.new(vector_token_index.size, 0)
57
+
58
+ tokens = tokenize(string)
59
+ tokens &= @vocab if ensure_tokens_in_vocab
60
+ tokens.uniq.each do |token|
61
+ index = vector_token_index[token]
62
+ arr[index] = tf_idf(token, tokens, @documents)
67
63
  end
68
64
 
69
65
  Vector.elements(arr, false)
70
66
  end
71
67
  end
68
+
69
+ def tf(token, tokens)
70
+ count_in_array(tokens, token)
71
+ end
72
+
73
+ def idf(token, docs)
74
+ docs_with_token_count = count_in_array(docs, proc { |doc| tokenize(doc).include?(token) })
75
+ docs.size / docs_with_token_count
76
+ end
77
+
78
+ # http://en.wikipedia.org/wiki/Tf-idf
79
+ def tf_idf(token, tokens, docs)
80
+ tf(token, tokens) * idf(token, @documents)
81
+ end
72
82
 
73
- def vector_keyword_index
74
- @vector_keyword_index ||= begin
83
+ def vector_token_index
84
+ @vector_token_index ||= begin
75
85
  index, offset = {}, 0
76
86
 
77
- @vocab.each do |keyword|
78
- index[keyword] = offset
87
+ @vocab.each do |token|
88
+ index[token] = offset
79
89
  offset += 1
80
90
  end
81
91
 
@@ -88,7 +98,7 @@ module VSS
88
98
  @tokenize_cache[string] ||= Tokenizer.tokenize(string)
89
99
  end
90
100
 
91
- # could use Array#count, but 1.8.6 on Heroku don't have it only 1.8.7 >
101
+ # could use Array#count, but it is only available in Ruby >= 1.8.7
92
102
  def count_in_array(array, item)
93
103
  count = 0
94
104
  if item.is_a? Proc
@@ -7,7 +7,8 @@ class SearchTest < Test::Unit::TestCase
7
7
  @doc2 = "The Wire is the best thing ever. Fact."
8
8
  @doc3 = "Some would argue that Lost got a bit too wierd after season 2."
9
9
  @doc4 = "Lost is surely not in the same league as The Wire."
10
- @docs = [@doc1, @doc2, @doc3, @doc4]
10
+ @doc5 = "You cannot compare the The Wire and Lost."
11
+ @docs = [@doc1, @doc2, @doc3, @doc4, @doc5]
11
12
  @engine = VSS::Engine.new(@docs)
12
13
  end
13
14
 
@@ -18,17 +19,17 @@ class SearchTest < Test::Unit::TestCase
18
19
 
19
20
  def test_ordering
20
21
  results = @engine.search("How can you compare The Wire with Lost?")
21
- assert_equal @doc4, results[0]
22
- assert_equal @doc2, results[1]
23
- assert_equal @doc3, results[2]
24
- assert_equal @doc1, results[3]
22
+ assert_equal @doc5, results[0]
23
+ assert_equal @doc4, results[1]
24
+ assert_equal @doc2, results[2]
25
+ assert_equal @doc3, results[3]
25
26
  end
26
27
 
27
28
  def test_ranking
28
29
  results = @engine.search("How can you compare The Wire with Lost?")
29
- assert_equal 68.2574185835055, results[0].rank
30
- assert_equal 58.5749292571254, results[1].rank
31
- assert_equal 55.5215763037423, results[2].rank
32
- assert_equal 50.0, results[3].rank
30
+ assert_equal 82.1781403613318, results[0].rank
31
+ assert_equal 3.08166775680683, results[1].rank
32
+ assert_equal 1.3798683116522, results[2].rank
33
+ assert_equal 0.875309148136544, results[3].rank
33
34
  end
34
35
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{vss}
5
- s.version = "0.1.1"
5
+ s.version = "0.1.2"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Mark Dodwell"]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mark Dodwell