retrieval_lite 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/lib/retrieval_lite.rb +57 -1
- data/lib/retrieval_lite/boolean_retrieval.rb +55 -10
- data/lib/retrieval_lite/corpus.rb +5 -0
- data/lib/retrieval_lite/document.rb +9 -8
- data/lib/retrieval_lite/tfidf_retrieval.rb +42 -20
- data/lib/retrieval_lite/tokenizer.rb +3 -1
- data/lib/retrieval_lite/vector.rb +1 -0
- data/lib/version.rb +2 -1
- data/spec/boolean_retrieval_spec.rb +84 -17
- data/spec/document_spec.rb +6 -0
- data/spec/retrieval_lite_spec.rb +65 -0
- data/spec/spec_helper.rb +3 -0
- data/spec/tfidf_retrieval_spec.rb +25 -9
- data/spec/vector_spec.rb +12 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4e4e7dd3aca0e8ccc8ff59095e5e223b6f8a1f04
|
4
|
+
data.tar.gz: 84517e16668414f490e3ce284fccf53f55ebdd20
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 67e4537932eea13f79d4009ff9c48dc2a9f7e05ca3ca4c6b4e31564c057d9c87e228de0cca0713649a6d5607d712b19f9febfc31c4fffdc087bbe04a14da348a
|
7
|
+
data.tar.gz: 65676b034f4cbbf7757c5b58b99ce07a453154d0230e063b6ce2a76f5fa11b200ec927a97c864872a9202cbcf48fac8b71543a9d62b0c26ea1372a893c7afc87
|
data/Gemfile
CHANGED
data/lib/retrieval_lite.rb
CHANGED
@@ -1,7 +1,63 @@
|
|
1
1
|
require "version"
|
2
|
+
require "set"
|
2
3
|
|
4
|
+
# Offers simple document retrieval from a corpus with a query
|
3
5
|
module RetrievalLite
|
4
|
-
|
6
|
+
# Queries a corpus first by filtering it using a boolean evaluator and then
|
7
|
+
# using the tf-idf ranking algorithm and cosine similarity.
|
8
|
+
# Returns documents ordered by tf-idf score.
|
9
|
+
#
|
10
|
+
# @param corpus [Corpus] the collection of documents
|
11
|
+
# @param query [String] the boolean query to be evaluated
|
12
|
+
# @option opts [Boolean] :no_bool prevent the boolean filter
|
13
|
+
# @return [Array<Document>] ordered array of documents that satisfy the query
|
14
|
+
def evaluate_query(corpus, query, opts = {})
|
15
|
+
evaluate_query_with_scores(corpus, query, opts).keys
|
16
|
+
end
|
17
|
+
|
18
|
+
# Queries a corpus first by filtering it using a boolean evaluator and then
|
19
|
+
# using the tf-idf ranking algorithm and cosine similarity.
|
20
|
+
# Returns Hash of documents to their respective TF-IDF scores
|
21
|
+
# @see evaluate_query
|
22
|
+
#
|
23
|
+
# @param corpus [Corpus] the collection of documents
|
24
|
+
# @param query [String] the boolean query to be evaluated
|
25
|
+
# @option opts [Boolean] :no_bool prevent the boolean filter
|
26
|
+
# @return [Hash<Document, Numeric>] documents that satisfy the query mapped to their TF-IDF scores
|
27
|
+
def evaluate_query_with_scores(corpus, query, opts = {})
|
28
|
+
evaluator_options = {}
|
29
|
+
|
30
|
+
# evaluate like normal if it is not a boolean expression
|
31
|
+
if opts[:no_bool] || !RetrievalLite::BooleanRetrieval.has_boolean_operators?(query)
|
32
|
+
RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, query)
|
33
|
+
else
|
34
|
+
documents = RetrievalLite::BooleanRetrieval.evaluate(corpus, query)
|
35
|
+
RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, query, { document_set: documents })
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
# Creates a new Retrieval Lite document. Upon initialization, the content
|
40
|
+
# is parsed into individual tokens, and its term frequencies are recorded.
|
41
|
+
#
|
42
|
+
# @param content [String] the text of the document
|
43
|
+
# @param opts [Hash] optional arguments to initializer
|
44
|
+
# @option opts [String] :id the id of the document. Defaults to object_id assigned by ruby
|
45
|
+
# @return [Document] a new document containing the input text
|
46
|
+
def new_document(content, opts = {})
|
47
|
+
RetrievalLite::Document.new(content, opts)
|
48
|
+
end
|
49
|
+
|
50
|
+
# Creates a new Retrieval Lite corpus, a collection of documents. Corpuses
|
51
|
+
# do not modify nor own the documents in them, meaning documents must
|
52
|
+
# be created first before adding them to the corpus.
|
53
|
+
#
|
54
|
+
# @param documents [Array<Document>] the documents of the corpus
|
55
|
+
# @param opts [Hash] optional arguments to initializer
|
56
|
+
# @option opts [Array<String>] :stop_words the words to ignore when creating tokens
|
57
|
+
# @return [Corpus] either a new empty corpus or one with those documents
|
58
|
+
def new_corpus(documents = [], opts = {})
|
59
|
+
RetrievalLite::Corpus.new(documents, opts)
|
60
|
+
end
|
5
61
|
end
|
6
62
|
|
7
63
|
require 'retrieval_lite/document'
|
@@ -1,21 +1,66 @@
|
|
1
|
+
# Gathers documents that satisfy boolean expression
|
1
2
|
module RetrievalLite::BooleanRetrieval
|
2
|
-
#
|
3
|
-
# AND, OR, NOT.
|
4
|
-
#
|
3
|
+
# Gathers up all documents of a corpus that satisfy a boolean expression
|
4
|
+
# with the standard operators: AND, OR, NOT. Does not order the documents in
|
5
|
+
# any particular way. Assumes that all boolean operators are separated by
|
6
|
+
# white space on either side.
|
5
7
|
#
|
6
8
|
# @param corpus [Corpus] the collection of documents
|
7
9
|
# @param query [String] the boolean query to be evaluated
|
8
10
|
# @return [Array<Document>] unordered array of documents that satisfy the query
|
9
11
|
def self.evaluate(corpus, query)
|
10
|
-
if !
|
11
|
-
raise "
|
12
|
+
if !is_valid_expression?(query)
|
13
|
+
raise "Each boolean operator (AND, OR, NOT) must operate on two terms."
|
12
14
|
end
|
13
15
|
|
14
|
-
|
15
|
-
|
16
|
+
# must strip all non alphanumeric characters
|
17
|
+
query = strip_query(query)
|
18
|
+
|
19
|
+
# must have spaces in front and back for next line
|
20
|
+
query = " " + query + " "
|
21
|
+
|
22
|
+
# replace the boolean keywords (AND, OR, NOT) with the corresponding Ruby operators
|
23
|
+
query = query.gsub("AND", "\&\&").gsub("OR", "\|\|").gsub("NOT", "!")
|
24
|
+
|
25
|
+
# replace all terms with corresponding functions
|
26
|
+
query.gsub!(/[a-zA-Z0-9]+(-[a-zA-Z0-9]+)?/) do |q|
|
27
|
+
" document.contains?(\"" + q.downcase + "\") "
|
28
|
+
end
|
16
29
|
|
17
|
-
|
18
|
-
|
19
|
-
|
30
|
+
output_documents = []
|
31
|
+
corpus.documents.each do |document|
|
32
|
+
begin
|
33
|
+
if eval(query)
|
34
|
+
output_documents << document
|
35
|
+
end
|
36
|
+
rescue
|
37
|
+
raise "The boolean expression is not valid. Please check all parethensis and operators."
|
38
|
+
end
|
20
39
|
end
|
40
|
+
|
41
|
+
return output_documents
|
42
|
+
end
|
43
|
+
|
44
|
+
# @param query [String] the boolean query to be evaluated
|
45
|
+
# @return [Boolean] whether query contains any boolean operators
|
46
|
+
def self.has_boolean_operators?(query)
|
47
|
+
/AND|OR|NOT/ === query
|
48
|
+
end
|
49
|
+
|
50
|
+
# @note all other invalid expressions should be caught later on
|
51
|
+
# @param query [String] the boolean query to be evaluated
|
52
|
+
# @return [Boolean] false when an operator (AND, OR, NOT) is directly followed by a closing parenthesis, true otherwise
|
53
|
+
def self.is_valid_expression?(query)
|
54
|
+
!(/(AND|OR|NOT)\s*\)/ === query)
|
55
|
+
end
|
56
|
+
|
57
|
+
# @param query [String] the boolean query to be evaluated
|
58
|
+
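# @return [String] the query with every character other than letters, digits, whitespace, parentheses and hyphens replaced by a space, and with stray hyphens (runs of two or more, or a lone hyphen surrounded by whitespace) collapsed to a single space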
|
59
|
+
def self.strip_query(query)
|
60
|
+
# remove non-alphanumeric
|
61
|
+
query = query.gsub(/[^a-zA-Z0-9\s\(\)\-]/, " ")
|
62
|
+
|
63
|
+
# getting rid of stray hyphens
|
64
|
+
query = query.gsub(/\-\-+/, " ").gsub(/\s+\-\s+/, " ")
|
65
|
+
end
|
21
66
|
end
|
@@ -1,9 +1,14 @@
|
|
1
|
+
# A collection of documents
|
1
2
|
class RetrievalLite::Corpus
|
2
3
|
# the documents within the corpus
|
3
4
|
attr_reader :documents
|
4
5
|
# hash of a term to the array of documents that contain the particular term
|
5
6
|
attr_reader :term_occurrences
|
6
7
|
|
8
|
+
# Creates a new Retrieval Lite corpus, a collection of documents. Corpuses
|
9
|
+
# do not modify nor own the documents in them, meaning documents must
|
10
|
+
# be created first before adding them to the corpus.
|
11
|
+
#
|
7
12
|
# @param documents [Array<Document>] the documents of the corpus
|
8
13
|
# @param opts [Hash] optional arguments to initializer
|
9
14
|
# @option opts [Array<String>] :stop_words the words to ignore when creating tokens
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# Representation of document using content as a string and term frequencies as a hash
|
1
2
|
class RetrievalLite::Document
|
2
3
|
# the text of the document
|
3
4
|
attr_reader :content
|
@@ -6,7 +7,8 @@ class RetrievalLite::Document
|
|
6
7
|
# the id of the document
|
7
8
|
attr_reader :id
|
8
9
|
|
9
|
-
#
|
10
|
+
# Creates a new Retrieval Lite document. Upon initialization, the content
|
11
|
+
# is parsed into individual tokens, and its term frequencies are recorded.
|
10
12
|
#
|
11
13
|
# @param content [String] the text of the document
|
12
14
|
# @param opts [Hash] optional arguments to initializer
|
@@ -17,13 +19,6 @@ class RetrievalLite::Document
|
|
17
19
|
@term_frequencies = RetrievalLite::Tokenizer.parse_content(content)
|
18
20
|
end
|
19
21
|
|
20
|
-
# for debugging
|
21
|
-
def print_tokens
|
22
|
-
@term_frequencies.each do |key, value|
|
23
|
-
puts "#{key}: #{value}"
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
22
|
# @return [Integer] the total number of unique terms in the document
|
28
23
|
def term_count
|
29
24
|
@term_frequencies.size
|
@@ -44,6 +39,12 @@ class RetrievalLite::Document
|
|
44
39
|
end
|
45
40
|
end
|
46
41
|
|
42
|
+
# @param term [String]
|
43
|
+
# @return [Boolean] whether a term appears in the document
|
44
|
+
def contains?(term)
|
45
|
+
@term_frequencies.has_key?(term)
|
46
|
+
end
|
47
|
+
|
47
48
|
# @return [Integer] the total number of terms (not unique) in the document
|
48
49
|
def total_terms
|
49
50
|
count = 0
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# Scores queries using TF-IDF
|
1
2
|
# @see http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf
|
2
3
|
module RetrievalLite::TfIdfRetrieval
|
3
4
|
# Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
|
@@ -6,8 +7,8 @@ module RetrievalLite::TfIdfRetrieval
|
|
6
7
|
# @param corpus [Corpus] the collection of documents
|
7
8
|
# @param query [String] the boolean query to be evaluated
|
8
9
|
# @return [Array<Document>] ordered array of documents that satisfy the query
|
9
|
-
def self.evaluate(corpus, query)
|
10
|
-
evaluate_with_scores(corpus, query).keys
|
10
|
+
def self.evaluate(corpus, query, opts = {})
|
11
|
+
evaluate_with_scores(corpus, query, opts).keys
|
11
12
|
end
|
12
13
|
|
13
14
|
# Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
|
@@ -16,20 +17,25 @@ module RetrievalLite::TfIdfRetrieval
|
|
16
17
|
#
|
17
18
|
# @param corpus [Corpus] the collection of documents
|
18
19
|
# @param query [String] the boolean query to be evaluated
|
20
|
+
# @option opts [Array<Document>] :document_set limiting the documents to search in the corpus to only these documents
|
19
21
|
# @return [Hash<Document, Numeric>] documents mapped to their TF-IDF scores, ordered by descending score
|
20
|
-
def self.evaluate_with_scores(corpus, query)
|
22
|
+
def self.evaluate_with_scores(corpus, query, opts = {})
|
21
23
|
query_document = RetrievalLite::Document.new(query)
|
22
24
|
terms = query_document.term_frequencies.keys
|
23
25
|
query_vector = query_document.term_frequencies.values # should be in same order as keys
|
24
26
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
27
|
+
if opts[:document_set]
|
28
|
+
documents = opts[:document_set]
|
29
|
+
else
|
30
|
+
documents = Set.new # ordering of documents doesn't matter right now
|
31
|
+
# gathering only the documents that contain at least one of those terms
|
32
|
+
terms.each do |t|
|
33
|
+
docs_with_term = corpus.documents_with(t)
|
34
|
+
if docs_with_term
|
35
|
+
docs_with_term.each do |d|
|
36
|
+
if !documents.include?(d)
|
37
|
+
documents << d
|
38
|
+
end
|
33
39
|
end
|
34
40
|
end
|
35
41
|
end
|
@@ -37,11 +43,16 @@ module RetrievalLite::TfIdfRetrieval
|
|
37
43
|
|
38
44
|
scores = {}
|
39
45
|
documents.each do |document|
|
40
|
-
|
41
|
-
|
42
|
-
|
46
|
+
vector_length = tfidf_weight_length(corpus, document)
|
47
|
+
if vector_length == 0
|
48
|
+
scores[document] = 0
|
49
|
+
else
|
50
|
+
document_vector = Array.new(terms.size)
|
51
|
+
terms.each_with_index do |term, index|
|
52
|
+
document_vector[index] = tfidf_weight(corpus, document, term)
|
53
|
+
end
|
54
|
+
scores[document] = RetrievalLite::Vector.dot_product(query_vector, document_vector) / vector_length
|
43
55
|
end
|
44
|
-
scores[document] = RetrievalLite::Vector.cosine_similarity(query_vector, document_vector)
|
45
56
|
end
|
46
57
|
|
47
58
|
# order it by score in descending order
|
@@ -72,13 +83,24 @@ module RetrievalLite::TfIdfRetrieval
|
|
72
83
|
# @param term [String]
|
73
84
|
# @return [Float] the normalized tfidf weight of the term in the document
|
74
85
|
def self.normalized_tfidf_weight(corpus, document, term)
|
75
|
-
|
86
|
+
tfidf_weight(corpus, document, term) / tfidf_weight_length(corpus, document)
|
87
|
+
end
|
88
|
+
|
89
|
+
|
90
|
+
# Computes the length of a document vector of tf-idf weights. This is
|
91
|
+
# used for normalization
|
92
|
+
#
|
93
|
+
# @param corpus [Corpus]
|
94
|
+
# @param document [Document]
|
95
|
+
# @return [Float] the length of the document vector of tf-idf weights
|
96
|
+
def self.tfidf_weight_length(corpus, document)
|
97
|
+
normalize = 0
|
76
98
|
|
77
|
-
|
78
|
-
weight = tfidf_weight(corpus,
|
79
|
-
|
99
|
+
document.terms.each do |t|
|
100
|
+
weight = tfidf_weight(corpus, document, t)
|
101
|
+
normalize += weight * weight
|
80
102
|
end
|
81
103
|
|
82
|
-
|
104
|
+
return Math.sqrt(normalize)
|
83
105
|
end
|
84
106
|
end
|
@@ -1,4 +1,6 @@
|
|
1
|
+
# Separates text into tokens used for IR
|
1
2
|
module RetrievalLite::Tokenizer
|
3
|
+
# Punctuation that is to be ignored when parsing. Does not contain the hyphen
|
2
4
|
SPECIAL_SEPARATERS = ['[', ']', '\\', ';', '\'', ',', '.', '/', '!', '@', '#', '%', '&', '*', '(', ')', '_', '{', '}', ':', '"', '?', '=', '`', '~', '$', '^', '+', '|', '<', '>']
|
3
5
|
|
4
6
|
# @param content [String] the text of the document
|
@@ -24,7 +26,7 @@ module RetrievalLite::Tokenizer
|
|
24
26
|
end
|
25
27
|
end
|
26
28
|
|
27
|
-
tokens
|
29
|
+
return tokens
|
28
30
|
end
|
29
31
|
|
30
32
|
private
|
data/lib/version.rb
CHANGED
@@ -7,38 +7,105 @@ describe RetrievalLite::BooleanRetrieval do
|
|
7
7
|
let (:document_replicated) do
|
8
8
|
RetrievalLite::Document.new("lorem ipsum dolor sit amet")
|
9
9
|
end
|
10
|
+
let (:document_one_term) do
|
11
|
+
RetrievalLite::Document.new("lorem")
|
12
|
+
end
|
10
13
|
let (:document_with_duplicates) do
|
11
14
|
RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
|
12
15
|
end
|
13
|
-
let (:
|
14
|
-
RetrievalLite::Document.new("
|
16
|
+
let (:document_strange) do
|
17
|
+
RetrievalLite::Document.new("foo bar")
|
15
18
|
end
|
16
|
-
let (:
|
17
|
-
RetrievalLite::Document.new("
|
19
|
+
let (:document_no_match) do
|
20
|
+
RetrievalLite::Document.new("no-match")
|
18
21
|
end
|
19
|
-
let (:
|
20
|
-
|
21
|
-
Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
|
22
|
-
Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
|
23
|
-
consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
|
24
|
-
vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
|
25
|
-
euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
|
26
|
-
a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
|
27
|
-
In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
|
28
|
-
)
|
22
|
+
let (:all_normal_documents) do
|
23
|
+
[document, document_replicated, document_with_duplicates, document_one_term, document_strange]
|
29
24
|
end
|
30
25
|
let (:all_documents) do
|
31
|
-
[document, document_replicated, document_with_duplicates,
|
26
|
+
[document, document_replicated, document_with_duplicates, document_one_term, document_strange, document_no_match]
|
32
27
|
end
|
33
28
|
let (:corpus) do
|
34
29
|
RetrievalLite::Corpus.new(all_documents)
|
35
30
|
end
|
31
|
+
|
32
|
+
describe "#has_boolean_operators?" do
|
33
|
+
it "should accept any uses of AND OR NOT" do
|
34
|
+
RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo AND bar").should == true
|
35
|
+
RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo OR bar").should == true
|
36
|
+
RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo NOT bar").should == true
|
37
|
+
end
|
38
|
+
it "should reject any regular non-boolean queries" do
|
39
|
+
RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo bar").should == false
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
describe "#is_valid_expression?" do
|
44
|
+
it "should accept parenthesis and spaces, as well as all alphanumeric characters" do
|
45
|
+
RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar) OR baz").should == true
|
46
|
+
end
|
47
|
+
it "should reject when there is a close parethensis but no term after AND/OR/NOT" do
|
48
|
+
RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar AND)").should == false
|
49
|
+
RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar AND )").should == false
|
50
|
+
end
|
51
|
+
it "should accept AND/OR/NOT with any begin parenthesis after it, regardless if there's a whitespace" do
|
52
|
+
RetrievalLite::BooleanRetrieval.is_valid_expression?("NOT(foo AND bar)").should == true
|
53
|
+
RetrievalLite::BooleanRetrieval.is_valid_expression?("NOT (foo AND bar)").should == true
|
54
|
+
end
|
55
|
+
it "should accept sentences" do
|
56
|
+
RetrievalLite::BooleanRetrieval.is_valid_expression?("foo bar.").should == true
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
describe "#strip_query" do
|
61
|
+
it "should strip any commas, periods, etc nonalphanumeric characters" do
|
62
|
+
RetrievalLite::BooleanRetrieval.strip_query("(This is a cat.) AND (Although, something else!)").should == "(This is a cat ) AND (Although something else )"
|
63
|
+
end
|
64
|
+
it "should strip any double, triple, etc hyphenated words" do
|
65
|
+
RetrievalLite::BooleanRetrieval.strip_query("This is it--hooray!").should == "This is it hooray "
|
66
|
+
end
|
67
|
+
it "should leave hyphenated words alone" do
|
68
|
+
RetrievalLite::BooleanRetrieval.strip_query("This is foo-bar").should == "This is foo-bar"
|
69
|
+
end
|
70
|
+
it "should remove lone hyphens" do
|
71
|
+
RetrievalLite::BooleanRetrieval.strip_query("This - is foo-bar").should == "This is foo-bar"
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
describe "invalid boolean" do
|
76
|
+
it "should error on unclosed parenthesis" do
|
77
|
+
expect { RetrievalLite::BooleanRetrieval.evaluate(corpus, "(lorem AND ipsum") }.to raise_error
|
78
|
+
end
|
79
|
+
it "should error on when not enough arguments are provided" do
|
80
|
+
expect { RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem AND ipsum OR") }.to raise_error
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
36
84
|
describe "one-term retrieval" do
|
37
85
|
it "should return array of all documents with that term" do
|
38
|
-
RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem") == [document, document_replicated, document_with_duplicates]
|
86
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem").should == [document, document_replicated, document_with_duplicates, document_one_term]
|
39
87
|
end
|
40
88
|
it "should ignore case" do
|
41
|
-
RetrievalLite::BooleanRetrieval.evaluate(corpus, "
|
89
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "LOreM").should == [document, document_replicated, document_with_duplicates, document_one_term]
|
90
|
+
end
|
91
|
+
it "should work for hyphenated words" do
|
92
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "no-match").should == [document_no_match]
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
describe "valid boolean retrieval" do
|
97
|
+
it "should work for simple two term AND" do
|
98
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem AND ipsum").should == [document, document_replicated, document_with_duplicates]
|
99
|
+
end
|
100
|
+
it "should work for simple two term OR" do
|
101
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem OR foo").should == all_normal_documents
|
102
|
+
end
|
103
|
+
it "should work for simple one term NOT" do
|
104
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "NOT lorem").should == [document_strange, document_no_match]
|
105
|
+
end
|
106
|
+
it "should work for more complex retrievals with parenthesis" do
|
107
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "foo OR (dolor AND sit)").should == [document, document_replicated, document_with_duplicates, document_strange]
|
108
|
+
RetrievalLite::BooleanRetrieval.evaluate(corpus, "(lorem OR foo) AND NOT ipsum").should == [document_one_term, document_strange]
|
42
109
|
end
|
43
110
|
end
|
44
111
|
end
|
data/spec/document_spec.rb
CHANGED
@@ -80,6 +80,12 @@ describe RetrievalLite::Document do
|
|
80
80
|
end
|
81
81
|
end
|
82
82
|
|
83
|
+
describe "total terms" do
|
84
|
+
it "should have the right number of terms" do
|
85
|
+
document.total_terms.should == 5
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
83
89
|
describe "for blank document" do
|
84
90
|
it "should not raise error on initialization" do
|
85
91
|
expect { RetrievalLite::Document.new("") }.to_not raise_error
|
data/spec/retrieval_lite_spec.rb
CHANGED
@@ -1,3 +1,68 @@
|
|
1
1
|
require 'spec_helper'
|
2
|
+
|
2
3
|
describe RetrievalLite do
|
4
|
+
include RetrievalLite
|
5
|
+
|
6
|
+
let (:document_one_term) do
|
7
|
+
new_document("lorem")
|
8
|
+
end
|
9
|
+
let (:document) do
|
10
|
+
new_document("lorem ipsum dolor sit amet")
|
11
|
+
end
|
12
|
+
let (:document_with_duplicates) do
|
13
|
+
new_document("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
|
14
|
+
end
|
15
|
+
let (:document_doubled) do
|
16
|
+
new_document("lorem ipsum dolor sit amet lorem ipsum dolor sit amet")
|
17
|
+
end
|
18
|
+
let (:document_both_terms) do
|
19
|
+
new_document("lorem ipsum")
|
20
|
+
end
|
21
|
+
let (:document_with_unique) do
|
22
|
+
new_document("lorem unique")
|
23
|
+
end
|
24
|
+
let (:document_no_match) do
|
25
|
+
new_document("no-match")
|
26
|
+
end
|
27
|
+
let (:all_documents) do
|
28
|
+
[document, document_with_duplicates, document_doubled, document_one_term, document_both_terms, document_with_unique, document_no_match]
|
29
|
+
end
|
30
|
+
let (:corpus) do
|
31
|
+
new_corpus(all_documents)
|
32
|
+
end
|
33
|
+
let (:corpus_different) do
|
34
|
+
new_corpus([document_one_term, document, document_with_duplicates])
|
35
|
+
end
|
36
|
+
let(:corpus_small) do
|
37
|
+
new_corpus([document_one_term, document, document_no_match])
|
38
|
+
end
|
39
|
+
|
40
|
+
describe "when no options are passed" do
|
41
|
+
it "should default to basic tf-idf" do
|
42
|
+
scores = evaluate_query_with_scores(corpus_different, "lorem dolor sit")
|
43
|
+
scores[document].should be_within(0.001).of(1.0)
|
44
|
+
scores[document_with_duplicates].should be_within(0.001).of(0.953)
|
45
|
+
scores[document_one_term].should be_within(0.001).of(0.0)
|
46
|
+
|
47
|
+
evaluate_query(corpus_different, "lorem dolor sit").should == [document, document_with_duplicates, document_one_term]
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
describe "when boolean operators are present" do
|
52
|
+
it "should first filter through boolean" do
|
53
|
+
evaluate_query(corpus, "lorem AND NOT dolor").should == [document_one_term, document_both_terms, document_with_unique]
|
54
|
+
evaluate_query(corpus, "(lorem AND unique) OR no-match").should == [document_with_unique, document_no_match]
|
55
|
+
evaluate_query(corpus, "lorem AND ipsum AND dolor AND sit AND amet").should == [document_doubled, document, document_with_duplicates]
|
56
|
+
evaluate_query(corpus, "lorem AND no-match").should == []
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
describe "with punctuation" do
|
61
|
+
it "should retrieve it as normal" do
|
62
|
+
evaluate_query(corpus, "lorem. AND NOT dolor!").should == [document_one_term, document_both_terms, document_with_unique]
|
63
|
+
evaluate_query(corpus, "(lorem-- AND !unique) OR no-match").should == [document_with_unique, document_no_match]
|
64
|
+
evaluate_query(corpus, "@lorem AND @ipsum AND @dolor AND @sit AND @amet").should == [document_doubled, document, document_with_duplicates]
|
65
|
+
evaluate_query(corpus, "||lorem AND no-match").should == []
|
66
|
+
end
|
67
|
+
end
|
3
68
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe RetrievalLite::TfIdfRetrieval do
|
4
|
+
let (:document_no_match) do
|
5
|
+
RetrievalLite::Document.new("no-match")
|
6
|
+
end
|
4
7
|
let (:document_one_term) do
|
5
8
|
RetrievalLite::Document.new("lorem")
|
6
9
|
end
|
@@ -29,6 +32,9 @@ describe RetrievalLite::TfIdfRetrieval do
|
|
29
32
|
let (:corpus_different) do
|
30
33
|
RetrievalLite::Corpus.new([document_one_term, document, document_with_duplicates])
|
31
34
|
end
|
35
|
+
let(:corpus_small) do
|
36
|
+
RetrievalLite::Corpus.new([document_one_term, document, document_no_match])
|
37
|
+
end
|
32
38
|
|
33
39
|
describe "calculating tf-idf scores" do
|
34
40
|
describe "term that all documents have" do
|
@@ -55,10 +61,10 @@ describe RetrievalLite::TfIdfRetrieval do
|
|
55
61
|
describe "calculating normalized tf-idf scores" do
|
56
62
|
describe "term that a few documents have" do
|
57
63
|
it "should have correct tf-idf" do
|
58
|
-
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.
|
59
|
-
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.
|
60
|
-
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.
|
61
|
-
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(0
|
64
|
+
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.320)
|
65
|
+
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.163)
|
66
|
+
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.320)
|
67
|
+
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(1.0)
|
62
68
|
end
|
63
69
|
end
|
64
70
|
end
|
@@ -84,9 +90,9 @@ describe RetrievalLite::TfIdfRetrieval do
|
|
84
90
|
it "should return the correct score" do
|
85
91
|
scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "ipsum")
|
86
92
|
scores.size.should == 4
|
87
|
-
scores[document].should be_within(0.001).of(
|
88
|
-
scores[document_with_duplicates].should be_within(0.001).of(
|
89
|
-
scores[document_doubled].should be_within(0.001).of(
|
93
|
+
scores[document].should be_within(0.001).of(0.320)
|
94
|
+
scores[document_with_duplicates].should be_within(0.001).of(0.163)
|
95
|
+
scores[document_doubled].should be_within(0.001).of(0.320)
|
90
96
|
scores[document_both_terms].should be_within(0.001).of(1.0)
|
91
97
|
end
|
92
98
|
end
|
@@ -122,9 +128,19 @@ describe RetrievalLite::TfIdfRetrieval do
|
|
122
128
|
end
|
123
129
|
it "should have the correct scores" do
|
124
130
|
scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_different, "lorem dolor sit")
|
125
|
-
scores[document].should be_within(0.001).of(0
|
126
|
-
scores[document_with_duplicates].should be_within(0.001).of(0.
|
131
|
+
scores[document].should be_within(0.001).of(1.0)
|
132
|
+
scores[document_with_duplicates].should be_within(0.001).of(0.953)
|
127
133
|
scores[document_one_term].should be_within(0.001).of(0.0)
|
128
134
|
end
|
129
135
|
end
|
136
|
+
|
137
|
+
describe "documents with same frequency but longer lengths" do
|
138
|
+
it "order should favor shorter documents" do
|
139
|
+
RetrievalLite::TfIdfRetrieval.evaluate(corpus_small, "lorem").should == [document_one_term, document]
|
140
|
+
end
|
141
|
+
it "shorter documents should rank higher" do
|
142
|
+
scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_small, "lorem")
|
143
|
+
scores[document_one_term].should > scores[document]
|
144
|
+
end
|
145
|
+
end
|
130
146
|
end
|
data/spec/vector_spec.rb
CHANGED
@@ -24,4 +24,16 @@ describe RetrievalLite::Vector do
|
|
24
24
|
RetrievalLite::Vector.euclidean_length([3, 4]).should == 5
|
25
25
|
end
|
26
26
|
end
|
27
|
+
|
28
|
+
describe "cosine similarity" do
|
29
|
+
it "should compute correctly for vectors length 1" do
|
30
|
+
RetrievalLite::Vector.cosine_similarity([3], [5]).should == 1
|
31
|
+
end
|
32
|
+
it "should compute correctly for longer vectors" do
|
33
|
+
RetrievalLite::Vector.cosine_similarity([2, 3], [4, 5]).should be_within(0.001).of(0.996)
|
34
|
+
end
|
35
|
+
it "should raise error for unequal sized arrays" do
|
36
|
+
expect { RetrievalLite::Vector.cosine_similarity([2, 3], [4]) }.to raise_error
|
37
|
+
end
|
38
|
+
end
|
27
39
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: retrieval_lite
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Irvin Zhan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-05-
|
11
|
+
date: 2014-05-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|