retrieval_lite 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/lib/retrieval_lite.rb +57 -1
- data/lib/retrieval_lite/boolean_retrieval.rb +55 -10
- data/lib/retrieval_lite/corpus.rb +5 -0
- data/lib/retrieval_lite/document.rb +9 -8
- data/lib/retrieval_lite/tfidf_retrieval.rb +42 -20
- data/lib/retrieval_lite/tokenizer.rb +3 -1
- data/lib/retrieval_lite/vector.rb +1 -0
- data/lib/version.rb +2 -1
- data/spec/boolean_retrieval_spec.rb +84 -17
- data/spec/document_spec.rb +6 -0
- data/spec/retrieval_lite_spec.rb +65 -0
- data/spec/spec_helper.rb +3 -0
- data/spec/tfidf_retrieval_spec.rb +25 -9
- data/spec/vector_spec.rb +12 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4e4e7dd3aca0e8ccc8ff59095e5e223b6f8a1f04
|
4
|
+
data.tar.gz: 84517e16668414f490e3ce284fccf53f55ebdd20
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 67e4537932eea13f79d4009ff9c48dc2a9f7e05ca3ca4c6b4e31564c057d9c87e228de0cca0713649a6d5607d712b19f9febfc31c4fffdc087bbe04a14da348a
|
7
|
+
data.tar.gz: 65676b034f4cbbf7757c5b58b99ce07a453154d0230e063b6ce2a76f5fa11b200ec927a97c864872a9202cbcf48fac8b71543a9d62b0c26ea1372a893c7afc87
|
data/Gemfile
CHANGED
data/lib/retrieval_lite.rb
CHANGED
@@ -1,7 +1,63 @@
|
|
1
1
|
require "version"
|
2
|
+
require "set"
|
2
3
|
|
4
|
+
# Offers simple document retrieval from a corpus with a query
|
3
5
|
module RetrievalLite
|
4
|
-
|
6
|
+
# Queries a corpus first by filtering it using a boolean evaluator and then
|
7
|
+
# using the tf-idf ranking algorithm and cosine similarity.
|
8
|
+
# Returns documents ordered by tf-idf score.
|
9
|
+
#
|
10
|
+
# @param corpus [Corpus] the collection of documents
|
11
|
+
# @param query [String] the boolean query to be evaluated
|
12
|
+
# @option opts [Boolean] :no_bool prevent the boolean filter
|
13
|
+
# @return [Array<Document>] ordered array of documents that satisfy the query
|
14
|
+
def evaluate_query(corpus, query, opts = {})
|
15
|
+
evaluate_query_with_scores(corpus, query, opts).keys
|
16
|
+
end
|
17
|
+
|
18
|
+
# Queries a corpus first by filtering it using a boolean evaluator and then
|
19
|
+
# using the tf-idf ranking algorithm and cosine similarity.
|
20
|
+
# Returns Hash of documents to their respective TF-IDF scores
|
21
|
+
# @see evaluate_query
|
22
|
+
#
|
23
|
+
# @param corpus [Corpus] the collection of documents
|
24
|
+
# @param query [String] the boolean query to be evaluated
|
25
|
+
# @option opts [Boolean] :no_bool prevent the boolean filter
|
26
|
+
# @return [Hash<Document, Numeric>] documents that satisfy the query, mapped to their tf-idf scores
|
27
|
+
def evaluate_query_with_scores(corpus, query, opts = {})
|
28
|
+
evaluator_options = {}
|
29
|
+
|
30
|
+
# evaluate like normal if it is not a boolean expression
|
31
|
+
if opts[:no_bool] || !RetrievalLite::BooleanRetrieval.has_boolean_operators?(query)
|
32
|
+
RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, query)
|
33
|
+
else
|
34
|
+
documents = RetrievalLite::BooleanRetrieval.evaluate(corpus, query)
|
35
|
+
RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, query, { document_set: documents })
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
# Creates a new Retrieval Lite document. Upon initialization, the content
|
40
|
+
# is parsed into individual tokens, and its term frequencies are recorded.
|
41
|
+
#
|
42
|
+
# @param content [String] the text of the document
|
43
|
+
# @param opts [Hash] optional arguments to initializer
|
44
|
+
# @option opts [String] :id the id of the document. Defaults to object_id assigned by ruby
|
45
|
+
# @return a new document containing the input text
|
46
|
+
def new_document(content, opts = {})
|
47
|
+
RetrievalLite::Document.new(content, opts)
|
48
|
+
end
|
49
|
+
|
50
|
+
# Creates a new Retrieval Lite corpus, a collection of documents. Corpuses
|
51
|
+
# do not modify nor own the documents in them, meaning documents must
|
52
|
+
# be created first before adding them to the corpus.
|
53
|
+
#
|
54
|
+
# @param documents [Array<Document>] the documents of the corpus
|
55
|
+
# @param opts [Hash] optional arguments to initializer
|
56
|
+
# @option opts [Array<String>] :stop_words the words to ignore when creating tokens
|
57
|
+
# @return [Corpus] either a new empty corpus or one with those documents
|
58
|
+
def new_corpus(documents = [], opts = {})
|
59
|
+
RetrievalLite::Corpus.new(documents, opts)
|
60
|
+
end
|
5
61
|
end
|
6
62
|
|
7
63
|
require 'retrieval_lite/document'
|
@@ -1,21 +1,66 @@
|
|
1
|
+
# Gathers documents that satisfy boolean expression
|
1
2
|
module RetrievalLite::BooleanRetrieval
|
2
|
-
#
|
3
|
-
# AND, OR, NOT.
|
4
|
-
#
|
3
|
+
# Gathers up all documents of a corpus that satisfy a boolean expression
|
4
|
+
# with the standard operators: AND, OR, NOT. Does not order the documents in
|
5
|
+
# any particular way. Assumes that all boolean operators are separated by
|
6
|
+
# white space on either side.
|
5
7
|
#
|
6
8
|
# @param corpus [Corpus] the collection of documents
|
7
9
|
# @param query [String] the boolean query to be evaluated
|
8
10
|
# @return [Array<Document>] unordered array of documents that satisfy the query
|
9
11
|
def self.evaluate(corpus, query)
|
10
|
-
if !
|
11
|
-
raise "
|
12
|
+
if !is_valid_expression?(query)
|
13
|
+
raise "Each boolean operator (AND, OR, NOT) must operate on two terms."
|
12
14
|
end
|
13
15
|
|
14
|
-
|
15
|
-
|
16
|
+
# must strip all non alphanumeric characters
|
17
|
+
query = strip_query(query)
|
18
|
+
|
19
|
+
# must have spaces in front and back for next line
|
20
|
+
query = " " + query + " "
|
21
|
+
|
22
|
+
# replace the boolean keywords AND, OR, NOT with the Ruby operators &&, ||, !
|
23
|
+
query = query.gsub("AND", "\&\&").gsub("OR", "\|\|").gsub("NOT", "!")
|
24
|
+
|
25
|
+
# replace all terms with corresponding functions
|
26
|
+
query.gsub!(/[a-zA-Z0-9]+(-[a-zA-Z0-9]+)?/) do |q|
|
27
|
+
" document.contains?(\"" + q.downcase + "\") "
|
28
|
+
end
|
16
29
|
|
17
|
-
|
18
|
-
|
19
|
-
|
30
|
+
output_documents = []
|
31
|
+
corpus.documents.each do |document|
|
32
|
+
begin
|
33
|
+
if eval(query)
|
34
|
+
output_documents << document
|
35
|
+
end
|
36
|
+
rescue
|
37
|
+
raise "The boolean expression is not valid. Please check all parethensis and operators."
|
38
|
+
end
|
20
39
|
end
|
40
|
+
|
41
|
+
return output_documents
|
42
|
+
end
|
43
|
+
|
44
|
+
# @param query [String] the boolean query to be evaluated
|
45
|
+
# @return [Boolean] whether query contains any boolean operators
|
46
|
+
def self.has_boolean_operators?(query)
|
47
|
+
/AND|OR|NOT/ === query
|
48
|
+
end
|
49
|
+
|
50
|
+
# @note all other invalid expressions should be caught later on
|
51
|
+
# @param query [String] the boolean query to be evaluated
|
52
|
+
# @return [Boolean] false when AND, OR, or NOT is followed (ignoring whitespace) by a closing parenthesis, true otherwise
|
53
|
+
def self.is_valid_expression?(query)
|
54
|
+
!(/(AND|OR|NOT)\s*\)/ === query)
|
55
|
+
end
|
56
|
+
|
57
|
+
# @param query [String] the boolean query to be evaluated
|
58
|
+
# @return [String] the query with every character other than letters, digits, whitespace, parentheses, and hyphens replaced by a space, and stray hyphens removed
|
59
|
+
def self.strip_query(query)
|
60
|
+
# remove non-alphanumeric
|
61
|
+
query = query.gsub(/[^a-zA-Z0-9\s\(\)\-]/, " ")
|
62
|
+
|
63
|
+
# getting rid of stray hyphens
|
64
|
+
query = query.gsub(/\-\-+/, " ").gsub(/\s+\-\s+/, " ")
|
65
|
+
end
|
21
66
|
end
|
@@ -1,9 +1,14 @@
|
|
1
|
+
# A collection of documents
|
1
2
|
class RetrievalLite::Corpus
|
2
3
|
# the documents within the corpus
|
3
4
|
attr_reader :documents
|
4
5
|
# hash of a term to the array of documents that contain the particular term
|
5
6
|
attr_reader :term_occurrences
|
6
7
|
|
8
|
+
# Creates a new Retrieval Lite corpus, a collection of documents. Corpuses
|
9
|
+
# do not modify nor own the documents in them, meaning documents must
|
10
|
+
# be created first before adding them to the corpus.
|
11
|
+
#
|
7
12
|
# @param documents [Array<Document>] the documents of the corpus
|
8
13
|
# @param opts [Hash] optional arguments to initializer
|
9
14
|
# @option opts [Array<String>] :stop_words the words to ignore when creating tokens
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# Representation of document using content as a string and term frequencies as a hash
|
1
2
|
class RetrievalLite::Document
|
2
3
|
# the text of the document
|
3
4
|
attr_reader :content
|
@@ -6,7 +7,8 @@ class RetrievalLite::Document
|
|
6
7
|
# the id of the document
|
7
8
|
attr_reader :id
|
8
9
|
|
9
|
-
#
|
10
|
+
# Creates a new Retrieval Lite document. Upon initialization, the content
|
11
|
+
# is parsed into individual tokens, and its term frequencies are recorded.
|
10
12
|
#
|
11
13
|
# @param content [String] the text of the document
|
12
14
|
# @param opts [Hash] optional arguments to initializer
|
@@ -17,13 +19,6 @@ class RetrievalLite::Document
|
|
17
19
|
@term_frequencies = RetrievalLite::Tokenizer.parse_content(content)
|
18
20
|
end
|
19
21
|
|
20
|
-
# for debugging
|
21
|
-
def print_tokens
|
22
|
-
@term_frequencies.each do |key, value|
|
23
|
-
puts "#{key}: #{value}"
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
22
|
# @return [Integer] the total number of unique terms in the document
|
28
23
|
def term_count
|
29
24
|
@term_frequencies.size
|
@@ -44,6 +39,12 @@ class RetrievalLite::Document
|
|
44
39
|
end
|
45
40
|
end
|
46
41
|
|
42
|
+
# @param term [String]
|
43
|
+
# @return [Boolean] whether a term appears in the document
|
44
|
+
def contains?(term)
|
45
|
+
@term_frequencies.has_key?(term)
|
46
|
+
end
|
47
|
+
|
47
48
|
# @return [Integer] the total number of terms (not unique) in the document
|
48
49
|
def total_terms
|
49
50
|
count = 0
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# Scores queries using TF-IDF
|
1
2
|
# @see http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf
|
2
3
|
module RetrievalLite::TfIdfRetrieval
|
3
4
|
# Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
|
@@ -6,8 +7,8 @@ module RetrievalLite::TfIdfRetrieval
|
|
6
7
|
# @param corpus [Corpus] the collection of documents
|
7
8
|
# @param query [String] the boolean query to be evaluated
|
8
9
|
# @return [Array<Document>] ordered array of documents that satisfy the query
|
9
|
-
def self.evaluate(corpus, query)
|
10
|
-
evaluate_with_scores(corpus, query).keys
|
10
|
+
def self.evaluate(corpus, query, opts = {})
|
11
|
+
evaluate_with_scores(corpus, query, opts).keys
|
11
12
|
end
|
12
13
|
|
13
14
|
# Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
|
@@ -16,20 +17,25 @@ module RetrievalLite::TfIdfRetrieval
|
|
16
17
|
#
|
17
18
|
# @param corpus [Corpus] the collection of documents
|
18
19
|
# @param query [String] the boolean query to be evaluated
|
20
|
+
# @option opts [Array<Document>] :document_set limiting the documents to search in the corpus to only these documents
|
19
21
|
# @return [Hash<Document, Numeric>] documents that satisfy the query, mapped to their tf-idf scores in descending order of score
|
20
|
-
def self.evaluate_with_scores(corpus, query)
|
22
|
+
def self.evaluate_with_scores(corpus, query, opts = {})
|
21
23
|
query_document = RetrievalLite::Document.new(query)
|
22
24
|
terms = query_document.term_frequencies.keys
|
23
25
|
query_vector = query_document.term_frequencies.values # should be in same order as keys
|
24
26
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
27
|
+
if opts[:document_set]
|
28
|
+
documents = opts[:document_set]
|
29
|
+
else
|
30
|
+
documents = Set.new # ordering of documents doesn't matter right now
|
31
|
+
# gathering only the documents that contain at least one of those terms
|
32
|
+
terms.each do |t|
|
33
|
+
docs_with_term = corpus.documents_with(t)
|
34
|
+
if docs_with_term
|
35
|
+
docs_with_term.each do |d|
|
36
|
+
if !documents.include?(d)
|
37
|
+
documents << d
|
38
|
+
end
|
33
39
|
end
|
34
40
|
end
|
35
41
|
end
|
@@ -37,11 +43,16 @@ module RetrievalLite::TfIdfRetrieval
|
|
37
43
|
|
38
44
|
scores = {}
|
39
45
|
documents.each do |document|
|
40
|
-
|
41
|
-
|
42
|
-
|
46
|
+
vector_length = tfidf_weight_length(corpus, document)
|
47
|
+
if vector_length == 0
|
48
|
+
scores[document] = 0
|
49
|
+
else
|
50
|
+
document_vector = Array.new(terms.size)
|
51
|
+
terms.each_with_index do |term, index|
|
52
|
+
document_vector[index] = tfidf_weight(corpus, document, term)
|
53
|
+
end
|
54
|
+
scores[document] = RetrievalLite::Vector.dot_product(query_vector, document_vector) / vector_length
|
43
55
|
end
|
44
|
-
scores[document] = RetrievalLite::Vector.cosine_similarity(query_vector, document_vector)
|
45
56
|
end
|
46
57
|
|
47
58
|
# order it by score in descending order
|
@@ -72,13 +83,24 @@ module RetrievalLite::TfIdfRetrieval
|
|
72
83
|
# @param term [String]
|
73
84
|
# @return [Float] the normalized tfidf weight of the term in the document
|
74
85
|
def self.normalized_tfidf_weight(corpus, document, term)
|
75
|
-
|
86
|
+
tfidf_weight(corpus, document, term) / tfidf_weight_length(corpus, document)
|
87
|
+
end
|
88
|
+
|
89
|
+
|
90
|
+
# Computes the length of a document vector of tf-idf weights. This is
|
91
|
+
# used for normalization
|
92
|
+
#
|
93
|
+
# @param corpus [Corpus]
|
94
|
+
# @param document [Document]
|
95
|
+
# @return [Float] the length of the document vector of tf-idf weights
|
96
|
+
def self.tfidf_weight_length(corpus, document)
|
97
|
+
normalize = 0
|
76
98
|
|
77
|
-
|
78
|
-
weight = tfidf_weight(corpus,
|
79
|
-
|
99
|
+
document.terms.each do |t|
|
100
|
+
weight = tfidf_weight(corpus, document, t)
|
101
|
+
normalize += weight * weight
|
80
102
|
end
|
81
103
|
|
82
|
-
|
104
|
+
return Math.sqrt(normalize)
|
83
105
|
end
|
84
106
|
end
|
@@ -1,4 +1,6 @@
|
|
1
|
+
# Separates text into tokens used for IR
|
1
2
|
module RetrievalLite::Tokenizer
|
3
|
+
# Punctuation that is to be ignored when parsing. Does not contain the hyphen
|
2
4
|
SPECIAL_SEPARATERS = ['[', ']', '\\', ';', '\'', ',', '.', '/', '!', '@', '#', '%', '&', '*', '(', ')', '_', '{', '}', ':', '"', '?', '=', '`', '~', '$', '^', '+', '|', '<', '>']
|
3
5
|
|
4
6
|
# @param content [String] the text of the document
|
@@ -24,7 +26,7 @@ module RetrievalLite::Tokenizer
|
|
24
26
|
end
|
25
27
|
end
|
26
28
|
|
27
|
-
tokens
|
29
|
+
return tokens
|
28
30
|
end
|
29
31
|
|
30
32
|
private
|
data/lib/version.rb
CHANGED
@@ -7,38 +7,105 @@ describe RetrievalLite::BooleanRetrieval do
|
|
7
7
|
let (:document_replicated) do
|
8
8
|
RetrievalLite::Document.new("lorem ipsum dolor sit amet")
|
9
9
|
end
|
10
|
+
let (:document_one_term) do
|
11
|
+
RetrievalLite::Document.new("lorem")
|
12
|
+
end
|
10
13
|
let (:document_with_duplicates) do
|
11
14
|
RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
|
12
15
|
end
|
13
|
-
let (:
|
14
|
-
RetrievalLite::Document.new("
|
16
|
+
let (:document_strange) do
|
17
|
+
RetrievalLite::Document.new("foo bar")
|
15
18
|
end
|
16
|
-
let (:
|
17
|
-
RetrievalLite::Document.new("
|
19
|
+
let (:document_no_match) do
|
20
|
+
RetrievalLite::Document.new("no-match")
|
18
21
|
end
|
19
|
-
let (:
|
20
|
-
|
21
|
-
Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
|
22
|
-
Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
|
23
|
-
consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
|
24
|
-
vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
|
25
|
-
euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
|
26
|
-
a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
|
27
|
-
In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
|
28
|
-
)
|
22
|
+
let (:all_normal_documents) do
|
23
|
+
[document, document_replicated, document_with_duplicates, document_one_term, document_strange]
|
29
24
|
end
|
30
25
|
let (:all_documents) do
|
31
|
-
[document, document_replicated, document_with_duplicates,
|
26
|
+
[document, document_replicated, document_with_duplicates, document_one_term, document_strange, document_no_match]
|
32
27
|
end
|
33
28
|
let (:corpus) do
|
34
29
|
RetrievalLite::Corpus.new(all_documents)
|
35
30
|
end
|
31
|
+
|
32
|
+
describe "#has_boolean_operators?" do
|
33
|
+
it "should accept any uses of AND OR NOT" do
|
34
|
+
RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo AND bar").should == true
|
35
|
+
RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo OR bar").should == true
|
36
|
+
RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo NOT bar").should == true
|
37
|
+
end
|
38
|
+
it "should reject any regular non-boolean queries" do
|
39
|
+
RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo bar").should == false
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
describe "#is_valid_expression?" do
|
44
|
+
it "should accept parenthesis and spaces, as well as all alphanumeric characters" do
|
45
|
+
RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar) OR baz").should == true
|
46
|
+
end
|
47
|
+
it "should reject when there is a close parethensis but no term after AND/OR/NOT" do
|
48
|
+
RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar AND)").should == false
|
49
|
+
RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar AND )").should == false
|
50
|
+
end
|
51
|
+
it "should accept AND/OR/NOT with any begin parenthesis after it, regardless if there's a whitespace" do
|
52
|
+
RetrievalLite::BooleanRetrieval.is_valid_expression?("NOT(foo AND bar)").should == true
|
53
|
+
RetrievalLite::BooleanRetrieval.is_valid_expression?("NOT (foo AND bar)").should == true
|
54
|
+
end
|
55
|
+
it "should accept sentences" do
|
56
|
+
RetrievalLite::BooleanRetrieval.is_valid_expression?("foo bar.").should == true
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
describe "#strip_query" do
|
61
|
+
it "should strip any commas, periods, etc nonalphanumeric characters" do
|
62
|
+
RetrievalLite::BooleanRetrieval.strip_query("(This is a cat.) AND (Although, something else!)").should == "(This is a cat ) AND (Although something else )"
|
63
|
+
end
|
64
|
+
it "should strip any double, triple, etc hyphenated words" do
|
65
|
+
RetrievalLite::BooleanRetrieval.strip_query("This is it--hooray!").should == "This is it hooray "
|
66
|
+
end
|
67
|
+
it "should leave hyphenated words alone" do
|
68
|
+
RetrievalLite::BooleanRetrieval.strip_query("This is foo-bar").should == "This is foo-bar"
|
69
|
+
end
|
70
|
+
it "should remove lone hyphens" do
|
71
|
+
RetrievalLite::BooleanRetrieval.strip_query("This - is foo-bar").should == "This is foo-bar"
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
describe "invalid boolean" do
|
76
|
+
it "should error on unclosed parenthesis" do
|
77
|
+
expect { RetrievalLite::BooleanRetrieval.evaluate(corpus, "(lorem AND ipsum") }.to raise_error
|
78
|
+
end
|
79
|
+
it "should error on when not enough arguments are provided" do
|
80
|
+
expect { RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem AND ipsum OR") }.to raise_error
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
36
84
|
describe "one-term retrieval" do
|
37
85
|
it "should return array of all documents with that term" do
|
38
|
-
RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem") == [document, document_replicated, document_with_duplicates]
|
86
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem").should == [document, document_replicated, document_with_duplicates, document_one_term]
|
39
87
|
end
|
40
88
|
it "should ignore case" do
|
41
|
-
RetrievalLite::BooleanRetrieval.evaluate(corpus, "
|
89
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "LOreM").should == [document, document_replicated, document_with_duplicates, document_one_term]
|
90
|
+
end
|
91
|
+
it "should work for hyphenated words" do
|
92
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "no-match").should == [document_no_match]
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
describe "valid boolean retrieval" do
|
97
|
+
it "should work for simple two term AND" do
|
98
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem AND ipsum").should == [document, document_replicated, document_with_duplicates]
|
99
|
+
end
|
100
|
+
it "should work for simple two term OR" do
|
101
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem OR foo").should == all_normal_documents
|
102
|
+
end
|
103
|
+
it "should work for simple one term NOT" do
|
104
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "NOT lorem").should == [document_strange, document_no_match]
|
105
|
+
end
|
106
|
+
it "should work for more complex retrievals with parenthesis" do
|
107
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "foo OR (dolor AND sit)").should == [document, document_replicated, document_with_duplicates, document_strange]
|
108
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "(lorem OR foo) AND NOT ipsum").should == [document_one_term, document_strange]
|
42
109
|
end
|
43
110
|
end
|
44
111
|
end
|
data/spec/document_spec.rb
CHANGED
@@ -80,6 +80,12 @@ describe RetrievalLite::Document do
|
|
80
80
|
end
|
81
81
|
end
|
82
82
|
|
83
|
+
describe "total terms" do
|
84
|
+
it "should have the right number of terms" do
|
85
|
+
document.total_terms.should == 5
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
83
89
|
describe "for blank document" do
|
84
90
|
it "should not raise error on initialization" do
|
85
91
|
expect { RetrievalLite::Document.new("") }.to_not raise_error
|
data/spec/retrieval_lite_spec.rb
CHANGED
@@ -1,3 +1,68 @@
|
|
1
1
|
require 'spec_helper'
|
2
|
+
|
2
3
|
describe RetrievalLite do
|
4
|
+
include RetrievalLite
|
5
|
+
|
6
|
+
let (:document_one_term) do
|
7
|
+
new_document("lorem")
|
8
|
+
end
|
9
|
+
let (:document) do
|
10
|
+
new_document("lorem ipsum dolor sit amet")
|
11
|
+
end
|
12
|
+
let (:document_with_duplicates) do
|
13
|
+
new_document("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
|
14
|
+
end
|
15
|
+
let (:document_doubled) do
|
16
|
+
new_document("lorem ipsum dolor sit amet lorem ipsum dolor sit amet")
|
17
|
+
end
|
18
|
+
let (:document_both_terms) do
|
19
|
+
new_document("lorem ipsum")
|
20
|
+
end
|
21
|
+
let (:document_with_unique) do
|
22
|
+
new_document("lorem unique")
|
23
|
+
end
|
24
|
+
let (:document_no_match) do
|
25
|
+
new_document("no-match")
|
26
|
+
end
|
27
|
+
let (:all_documents) do
|
28
|
+
[document, document_with_duplicates, document_doubled, document_one_term, document_both_terms, document_with_unique, document_no_match]
|
29
|
+
end
|
30
|
+
let (:corpus) do
|
31
|
+
new_corpus(all_documents)
|
32
|
+
end
|
33
|
+
let (:corpus_different) do
|
34
|
+
new_corpus([document_one_term, document, document_with_duplicates])
|
35
|
+
end
|
36
|
+
let(:corpus_small) do
|
37
|
+
new_corpus([document_one_term, document, document_no_match])
|
38
|
+
end
|
39
|
+
|
40
|
+
describe "when no options are passed" do
|
41
|
+
it "should default to basic tf-idf" do
|
42
|
+
scores = evaluate_query_with_scores(corpus_different, "lorem dolor sit")
|
43
|
+
scores[document].should be_within(0.001).of(1.0)
|
44
|
+
scores[document_with_duplicates].should be_within(0.001).of(0.953)
|
45
|
+
scores[document_one_term].should be_within(0.001).of(0.0)
|
46
|
+
|
47
|
+
evaluate_query(corpus_different, "lorem dolor sit").should == [document, document_with_duplicates, document_one_term]
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
describe "when boolean operators are present" do
|
52
|
+
it "should first filter through boolean" do
|
53
|
+
evaluate_query(corpus, "lorem AND NOT dolor").should == [document_one_term, document_both_terms, document_with_unique]
|
54
|
+
evaluate_query(corpus, "(lorem AND unique) OR no-match").should == [document_with_unique, document_no_match]
|
55
|
+
evaluate_query(corpus, "lorem AND ipsum AND dolor AND sit AND amet").should == [document_doubled, document, document_with_duplicates]
|
56
|
+
evaluate_query(corpus, "lorem AND no-match").should == []
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
describe "with punctuation" do
|
61
|
+
it "should retrieve it as normal" do
|
62
|
+
evaluate_query(corpus, "lorem. AND NOT dolor!").should == [document_one_term, document_both_terms, document_with_unique]
|
63
|
+
evaluate_query(corpus, "(lorem-- AND !unique) OR no-match").should == [document_with_unique, document_no_match]
|
64
|
+
evaluate_query(corpus, "@lorem AND @ipsum AND @dolor AND @sit AND @amet").should == [document_doubled, document, document_with_duplicates]
|
65
|
+
evaluate_query(corpus, "||lorem AND no-match").should == []
|
66
|
+
end
|
67
|
+
end
|
3
68
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe RetrievalLite::TfIdfRetrieval do
|
4
|
+
let (:document_no_match) do
|
5
|
+
RetrievalLite::Document.new("no-match")
|
6
|
+
end
|
4
7
|
let (:document_one_term) do
|
5
8
|
RetrievalLite::Document.new("lorem")
|
6
9
|
end
|
@@ -29,6 +32,9 @@ describe RetrievalLite::TfIdfRetrieval do
|
|
29
32
|
let (:corpus_different) do
|
30
33
|
RetrievalLite::Corpus.new([document_one_term, document, document_with_duplicates])
|
31
34
|
end
|
35
|
+
let(:corpus_small) do
|
36
|
+
RetrievalLite::Corpus.new([document_one_term, document, document_no_match])
|
37
|
+
end
|
32
38
|
|
33
39
|
describe "calculating tf-idf scores" do
|
34
40
|
describe "term that all documents have" do
|
@@ -55,10 +61,10 @@ describe RetrievalLite::TfIdfRetrieval do
|
|
55
61
|
describe "calculating normalized tf-idf scores" do
|
56
62
|
describe "term that a few documents have" do
|
57
63
|
it "should have correct tf-idf" do
|
58
|
-
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.
|
59
|
-
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.
|
60
|
-
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.
|
61
|
-
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(0
|
64
|
+
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.320)
|
65
|
+
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.163)
|
66
|
+
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.320)
|
67
|
+
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(1.0)
|
62
68
|
end
|
63
69
|
end
|
64
70
|
end
|
@@ -84,9 +90,9 @@ describe RetrievalLite::TfIdfRetrieval do
|
|
84
90
|
it "should return the correct score" do
|
85
91
|
scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "ipsum")
|
86
92
|
scores.size.should == 4
|
87
|
-
scores[document].should be_within(0.001).of(
|
88
|
-
scores[document_with_duplicates].should be_within(0.001).of(
|
89
|
-
scores[document_doubled].should be_within(0.001).of(
|
93
|
+
scores[document].should be_within(0.001).of(0.320)
|
94
|
+
scores[document_with_duplicates].should be_within(0.001).of(0.163)
|
95
|
+
scores[document_doubled].should be_within(0.001).of(0.320)
|
90
96
|
scores[document_both_terms].should be_within(0.001).of(1.0)
|
91
97
|
end
|
92
98
|
end
|
@@ -122,9 +128,19 @@ describe RetrievalLite::TfIdfRetrieval do
|
|
122
128
|
end
|
123
129
|
it "should have the correct scores" do
|
124
130
|
scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_different, "lorem dolor sit")
|
125
|
-
scores[document].should be_within(0.001).of(0
|
126
|
-
scores[document_with_duplicates].should be_within(0.001).of(0.
|
131
|
+
scores[document].should be_within(0.001).of(1.0)
|
132
|
+
scores[document_with_duplicates].should be_within(0.001).of(0.953)
|
127
133
|
scores[document_one_term].should be_within(0.001).of(0.0)
|
128
134
|
end
|
129
135
|
end
|
136
|
+
|
137
|
+
describe "documents with same frequency but longer lengths" do
|
138
|
+
it "order should favor shorter documents" do
|
139
|
+
RetrievalLite::TfIdfRetrieval.evaluate(corpus_small, "lorem").should == [document_one_term, document]
|
140
|
+
end
|
141
|
+
it "shorter documents should rank higher" do
|
142
|
+
scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_small, "lorem")
|
143
|
+
scores[document_one_term].should > scores[document]
|
144
|
+
end
|
145
|
+
end
|
130
146
|
end
|
data/spec/vector_spec.rb
CHANGED
@@ -24,4 +24,16 @@ describe RetrievalLite::Vector do
|
|
24
24
|
RetrievalLite::Vector.euclidean_length([3, 4]).should == 5
|
25
25
|
end
|
26
26
|
end
|
27
|
+
|
28
|
+
describe "cosine similarity" do
|
29
|
+
it "should compute correctly for vectors length 1" do
|
30
|
+
RetrievalLite::Vector.cosine_similarity([3], [5]).should == 1
|
31
|
+
end
|
32
|
+
it "should compute correctly for longer vectors" do
|
33
|
+
RetrievalLite::Vector.cosine_similarity([2, 3], [4, 5]).should be_within(0.001).of(0.996)
|
34
|
+
end
|
35
|
+
it "should raise error for unequal sized arrays" do
|
36
|
+
expect { RetrievalLite::Vector.cosine_similarity([2, 3], [4]) }.to raise_error
|
37
|
+
end
|
38
|
+
end
|
27
39
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: retrieval_lite
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Irvin Zhan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-05-
|
11
|
+
date: 2014-05-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|