vss 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -1
- data/lib/vss/engine.rb +32 -22
- data/test/search_test.rb +10 -9
- data/vss.gemspec +1 -1
- metadata +1 -1
data/Rakefile
CHANGED
data/lib/vss/engine.rb
CHANGED
@@ -24,20 +24,20 @@ module VSS
|
|
24
24
|
record.instance_eval %{def rank; #{ranks[i]}; end}
|
25
25
|
end
|
26
26
|
|
27
|
-
#
|
28
|
-
@records.sort { |a,b| b.rank <=> a.rank }
|
27
|
+
# exclude 0 rank (no match) and sort by rank
|
28
|
+
@records.reject { |r| r.rank == 0 }.sort { |a,b| b.rank <=> a.rank }
|
29
29
|
end
|
30
30
|
|
31
31
|
private
|
32
32
|
|
33
33
|
# ranks from 0 to 100
|
34
34
|
def cosine_rank(vector1, vector2)
|
35
|
-
|
35
|
+
cosine(vector1, vector2) / 1 * 100
|
36
36
|
end
|
37
37
|
|
38
38
|
# see http://www.ltcconline.net/greenl/courses/107/vectors/DOTCROS.HTM
|
39
39
|
# and http://ruby-doc.org/stdlib/libdoc/matrix/rdoc/index.html
|
40
|
-
# will be in range
|
40
|
+
# result will be in the range 0 to 1, as vector components are never negative
|
41
41
|
def cosine(vector1, vector2)
|
42
42
|
dot_product = vector1.inner_product(vector2)
|
43
43
|
dot_product / (vector1.r * vector2.r) # Vector#r is same as ||v||
|
@@ -47,35 +47,45 @@ module VSS
|
|
47
47
|
make_vector(query, true)
|
48
48
|
end
|
49
49
|
|
50
|
-
# NOTE: will choke if string contains
|
50
|
+
# NOTE: will choke if string contains tokens not in vocab
|
51
51
|
# this is why, when we make the query vector, we do an
|
52
52
|
# intersection of tokens with the vocab
|
53
|
-
def make_vector(string,
|
53
|
+
def make_vector(string, ensure_tokens_in_vocab = false)
|
54
54
|
@vector_cache = {}
|
55
55
|
@vector_cache[string] ||= begin
|
56
|
-
arr = Array.new(
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
idf = @documents.size / count_in_array(@documents, proc { |doc| tokenize(doc).include?(word) })
|
64
|
-
|
65
|
-
index = vector_keyword_index[word]
|
66
|
-
arr[index] = tf * idf
|
56
|
+
arr = Array.new(vector_token_index.size, 0)
|
57
|
+
|
58
|
+
tokens = tokenize(string)
|
59
|
+
tokens &= @vocab if ensure_tokens_in_vocab
|
60
|
+
tokens.uniq.each do |token|
|
61
|
+
index = vector_token_index[token]
|
62
|
+
arr[index] = tf_idf(token, tokens, @documents)
|
67
63
|
end
|
68
64
|
|
69
65
|
Vector.elements(arr, false)
|
70
66
|
end
|
71
67
|
end
|
68
|
+
|
69
|
+
def tf(token, tokens)
|
70
|
+
count_in_array(tokens, token)
|
71
|
+
end
|
72
|
+
|
73
|
+
def idf(token, docs)
|
74
|
+
docs_with_token_count = count_in_array(docs, proc { |doc| tokenize(doc).include?(token) })
|
75
|
+
docs.size / docs_with_token_count
|
76
|
+
end
|
77
|
+
|
78
|
+
# http://en.wikipedia.org/wiki/Tf-idf
|
79
|
+
def tf_idf(token, tokens, docs)
|
80
|
+
tf(token, tokens) * idf(token, @documents)
|
81
|
+
end
|
72
82
|
|
73
|
-
def
|
74
|
-
@
|
83
|
+
def vector_token_index
|
84
|
+
@vector_token_index ||= begin
|
75
85
|
index, offset = {}, 0
|
76
86
|
|
77
|
-
@vocab.each do |
|
78
|
-
index[
|
87
|
+
@vocab.each do |token|
|
88
|
+
index[token] = offset
|
79
89
|
offset += 1
|
80
90
|
end
|
81
91
|
|
@@ -88,7 +98,7 @@ module VSS
|
|
88
98
|
@tokenize_cache[string] ||= Tokenizer.tokenize(string)
|
89
99
|
end
|
90
100
|
|
91
|
-
# could use Array#count, but
|
101
|
+
# could use Array#count, but it is only available in Ruby >= 1.8.7
|
92
102
|
def count_in_array(array, item)
|
93
103
|
count = 0
|
94
104
|
if item.is_a? Proc
|
data/test/search_test.rb
CHANGED
@@ -7,7 +7,8 @@ class SearchTest < Test::Unit::TestCase
|
|
7
7
|
@doc2 = "The Wire is the best thing ever. Fact."
|
8
8
|
@doc3 = "Some would argue that Lost got a bit too wierd after season 2."
|
9
9
|
@doc4 = "Lost is surely not in the same league as The Wire."
|
10
|
-
@
|
10
|
+
@doc5 = "You cannot compare the The Wire and Lost."
|
11
|
+
@docs = [@doc1, @doc2, @doc3, @doc4, @doc5]
|
11
12
|
@engine = VSS::Engine.new(@docs)
|
12
13
|
end
|
13
14
|
|
@@ -18,17 +19,17 @@ class SearchTest < Test::Unit::TestCase
|
|
18
19
|
|
19
20
|
def test_ordering
|
20
21
|
results = @engine.search("How can you compare The Wire with Lost?")
|
21
|
-
assert_equal @
|
22
|
-
assert_equal @
|
23
|
-
assert_equal @
|
24
|
-
assert_equal @
|
22
|
+
assert_equal @doc5, results[0]
|
23
|
+
assert_equal @doc4, results[1]
|
24
|
+
assert_equal @doc2, results[2]
|
25
|
+
assert_equal @doc3, results[3]
|
25
26
|
end
|
26
27
|
|
27
28
|
def test_ranking
|
28
29
|
results = @engine.search("How can you compare The Wire with Lost?")
|
29
|
-
assert_equal
|
30
|
-
assert_equal
|
31
|
-
assert_equal
|
32
|
-
assert_equal
|
30
|
+
assert_equal 82.1781403613318, results[0].rank
|
31
|
+
assert_equal 3.08166775680683, results[1].rank
|
32
|
+
assert_equal 1.3798683116522, results[2].rank
|
33
|
+
assert_equal 0.875309148136544, results[3].rank
|
33
34
|
end
|
34
35
|
end
|
data/vss.gemspec
CHANGED