vss 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/lib/vss/engine.rb +32 -22
- data/test/search_test.rb +10 -9
- data/vss.gemspec +1 -1
- metadata +1 -1
data/Rakefile
CHANGED
data/lib/vss/engine.rb
CHANGED
@@ -24,20 +24,20 @@ module VSS
|
|
24
24
|
record.instance_eval %{def rank; #{ranks[i]}; end}
|
25
25
|
end
|
26
26
|
|
27
|
-
#
|
28
|
-
@records.sort { |a,b| b.rank <=> a.rank }
|
27
|
+
# exclude 0 rank (no match) and sort by rank
|
28
|
+
@records.reject { |r| r.rank == 0 }.sort { |a,b| b.rank <=> a.rank }
|
29
29
|
end
|
30
30
|
|
31
31
|
private
|
32
32
|
|
33
33
|
# ranks from 0 to 100
|
34
34
|
def cosine_rank(vector1, vector2)
|
35
|
-
|
35
|
+
cosine(vector1, vector2) / 1 * 100
|
36
36
|
end
|
37
37
|
|
38
38
|
# see http://www.ltcconline.net/greenl/courses/107/vectors/DOTCROS.HTM
|
39
39
|
# and http://ruby-doc.org/stdlib/libdoc/matrix/rdoc/index.html
|
40
|
-
# will be in range
|
40
|
+
# will be in range 0 to 1, as vector components are always non-negative
|
41
41
|
def cosine(vector1, vector2)
|
42
42
|
dot_product = vector1.inner_product(vector2)
|
43
43
|
dot_product / (vector1.r * vector2.r) # Vector#r is same as ||v||
|
@@ -47,35 +47,45 @@ module VSS
|
|
47
47
|
make_vector(query, true)
|
48
48
|
end
|
49
49
|
|
50
|
-
# NOTE: will choke if string contains
|
50
|
+
# NOTE: will choke if string contains tokens not in vocab
|
51
51
|
# this is why, when we make the query vector, we do an
|
52
52
|
# intersection of tokens with the vocab
|
53
|
-
def make_vector(string,
|
53
|
+
def make_vector(string, ensure_tokens_in_vocab = false)
|
54
54
|
@vector_cache = {}
|
55
55
|
@vector_cache[string] ||= begin
|
56
|
-
arr = Array.new(
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
idf = @documents.size / count_in_array(@documents, proc { |doc| tokenize(doc).include?(word) })
|
64
|
-
|
65
|
-
index = vector_keyword_index[word]
|
66
|
-
arr[index] = tf * idf
|
56
|
+
arr = Array.new(vector_token_index.size, 0)
|
57
|
+
|
58
|
+
tokens = tokenize(string)
|
59
|
+
tokens &= @vocab if ensure_tokens_in_vocab
|
60
|
+
tokens.uniq.each do |token|
|
61
|
+
index = vector_token_index[token]
|
62
|
+
arr[index] = tf_idf(token, tokens, @documents)
|
67
63
|
end
|
68
64
|
|
69
65
|
Vector.elements(arr, false)
|
70
66
|
end
|
71
67
|
end
|
68
|
+
|
69
|
+
def tf(token, tokens)
|
70
|
+
count_in_array(tokens, token)
|
71
|
+
end
|
72
|
+
|
73
|
+
def idf(token, docs)
|
74
|
+
docs_with_token_count = count_in_array(docs, proc { |doc| tokenize(doc).include?(token) })
|
75
|
+
docs.size / docs_with_token_count
|
76
|
+
end
|
77
|
+
|
78
|
+
# http://en.wikipedia.org/wiki/Tf-idf
|
79
|
+
def tf_idf(token, tokens, docs)
|
80
|
+
tf(token, tokens) * idf(token, @documents)
|
81
|
+
end
|
72
82
|
|
73
|
-
def
|
74
|
-
@
|
83
|
+
def vector_token_index
|
84
|
+
@vector_token_index ||= begin
|
75
85
|
index, offset = {}, 0
|
76
86
|
|
77
|
-
@vocab.each do |
|
78
|
-
index[
|
87
|
+
@vocab.each do |token|
|
88
|
+
index[token] = offset
|
79
89
|
offset += 1
|
80
90
|
end
|
81
91
|
|
@@ -88,7 +98,7 @@ module VSS
|
|
88
98
|
@tokenize_cache[string] ||= Tokenizer.tokenize(string)
|
89
99
|
end
|
90
100
|
|
91
|
-
# could use Array#count, but
|
101
|
+
# could use Array#count, but it is only available in Ruby >= 1.8.7
|
92
102
|
def count_in_array(array, item)
|
93
103
|
count = 0
|
94
104
|
if item.is_a? Proc
|
data/test/search_test.rb
CHANGED
@@ -7,7 +7,8 @@ class SearchTest < Test::Unit::TestCase
|
|
7
7
|
@doc2 = "The Wire is the best thing ever. Fact."
|
8
8
|
@doc3 = "Some would argue that Lost got a bit too wierd after season 2."
|
9
9
|
@doc4 = "Lost is surely not in the same league as The Wire."
|
10
|
-
@
|
10
|
+
@doc5 = "You cannot compare the The Wire and Lost."
|
11
|
+
@docs = [@doc1, @doc2, @doc3, @doc4, @doc5]
|
11
12
|
@engine = VSS::Engine.new(@docs)
|
12
13
|
end
|
13
14
|
|
@@ -18,17 +19,17 @@ class SearchTest < Test::Unit::TestCase
|
|
18
19
|
|
19
20
|
def test_ordering
|
20
21
|
results = @engine.search("How can you compare The Wire with Lost?")
|
21
|
-
assert_equal @
|
22
|
-
assert_equal @
|
23
|
-
assert_equal @
|
24
|
-
assert_equal @
|
22
|
+
assert_equal @doc5, results[0]
|
23
|
+
assert_equal @doc4, results[1]
|
24
|
+
assert_equal @doc2, results[2]
|
25
|
+
assert_equal @doc3, results[3]
|
25
26
|
end
|
26
27
|
|
27
28
|
def test_ranking
|
28
29
|
results = @engine.search("How can you compare The Wire with Lost?")
|
29
|
-
assert_equal
|
30
|
-
assert_equal
|
31
|
-
assert_equal
|
32
|
-
assert_equal
|
30
|
+
assert_equal 82.1781403613318, results[0].rank
|
31
|
+
assert_equal 3.08166775680683, results[1].rank
|
32
|
+
assert_equal 1.3798683116522, results[2].rank
|
33
|
+
assert_equal 0.875309148136544, results[3].rank
|
33
34
|
end
|
34
35
|
end
|
data/vss.gemspec
CHANGED