retrieval_lite 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8e080b5b0c600b330f5d07827b5c2e576132306b
4
- data.tar.gz: f0fa5049a6f5e9df0a1fbc761107d28dc1efe857
3
+ metadata.gz: 4e4e7dd3aca0e8ccc8ff59095e5e223b6f8a1f04
4
+ data.tar.gz: 84517e16668414f490e3ce284fccf53f55ebdd20
5
5
  SHA512:
6
- metadata.gz: b1827e3f90edbc3ee841fbe25526c748ac82ad1aae6f42455605e3199dffe7c5cd309c2684741d41f695c88285152c1bf83558beb87668ec9eebb4973f7ff0a3
7
- data.tar.gz: 5d2009f175da5ce2cecb277061e9592093a14435d5012094c461afb1ac626b1d72705edf560cece8f53d4a02623f7ebe2fd3adc0f0555b0934cd5b878f6c12b2
6
+ metadata.gz: 67e4537932eea13f79d4009ff9c48dc2a9f7e05ca3ca4c6b4e31564c057d9c87e228de0cca0713649a6d5607d712b19f9febfc31c4fffdc087bbe04a14da348a
7
+ data.tar.gz: 65676b034f4cbbf7757c5b58b99ce07a453154d0230e063b6ce2a76f5fa11b200ec927a97c864872a9202cbcf48fac8b71543a9d62b0c26ea1372a893c7afc87
data/Gemfile CHANGED
@@ -2,3 +2,5 @@ source 'https://rubygems.org'
2
2
 
3
3
  # gem's dependencies specified in retrieval_lite.gemspec
4
4
  gemspec
5
+
6
+ gem 'simplecov', :require => false, :group => :test
@@ -1,7 +1,63 @@
1
1
  require "version"
2
+ require "set"
2
3
 
4
+ # Offers simple document retrieval from a corpus with a query
3
5
  module RetrievalLite
4
-
6
+ # Queries a corpus first by filtering it using a boolean evaluator and then
7
+ # using the tf-idf ranking algorithm and cosine similarity.
8
+ # Returns documents ordered by tf-idf score.
9
+ #
10
+ # @param corpus [Corpus] the collection of documents
11
+ # @param query [String] the boolean query to be evaluated
12
+ # @option opts [Boolean] :no_bool prevent the boolean filter
13
+ # @return [Array<Document>] ordered array of documents that satisfy the query
14
+ def evaluate_query(corpus, query, opts = {})
15
+ evaluate_query_with_scores(corpus, query, opts).keys
16
+ end
17
+
18
+ # Queries a corpus first by filtering it using a boolean evaluator and then
19
+ # using the tf-idf ranking algorithm and cosine similarity.
20
+ # Returns Hash of documents to their respective TF-IDF scores
21
+ # @see evaluate_query
22
+ #
23
+ # @param corpus [Corpus] the collection of documents
24
+ # @param query [String] the boolean query to be evaluated
25
+ # @option opts [Boolean] :no_bool prevent the boolean filter
26
+ # @return [Hash<Document, Numeric>] documents that satisfy the query mapped to their TF-IDF scores, ordered by score
27
+ def evaluate_query_with_scores(corpus, query, opts = {})
28
+ evaluator_options = {}
29
+
30
+ # evaluate like normal if it is not a boolean expression
31
+ if opts[:no_bool] || !RetrievalLite::BooleanRetrieval.has_boolean_operators?(query)
32
+ RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, query)
33
+ else
34
+ documents = RetrievalLite::BooleanRetrieval.evaluate(corpus, query)
35
+ RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, query, { document_set: documents })
36
+ end
37
+ end
38
+
39
+ # Creates a new Retrieval Lite document. Upon initialization, the content
40
+ # is parsed into individual tokens, and its term frequencies are recorded.
41
+ #
42
+ # @param content [String] the text of the document
43
+ # @param opts [Hash] optional arguments to initializer
44
+ # @option opts [String] :id the id of the document. Defaults to object_id assigned by ruby
45
+ # @return a new document containing the input text
46
+ def new_document(content, opts = {})
47
+ RetrievalLite::Document.new(content, opts)
48
+ end
49
+
50
+ # Creates a new Retrieval Lite corpus, a collection of documents. Corpuses
51
+ # do not modify nor own the documents in them, meaning documents must
52
+ # be created first before adding them to the corpus.
53
+ #
54
+ # @param documents [Array<Document>] the documents of the corpus
55
+ # @param opts [Hash] optional arguments to initializer
56
+ # @option opts [Array<String>] :stop_words the words to ignore when creating tokens
57
+ # @return [Corpus] either a new empty corpus or one with those documents
58
+ def new_corpus(documents = [], opts = {})
59
+ RetrievalLite::Corpus.new(documents, opts)
60
+ end
5
61
  end
6
62
 
7
63
  require 'retrieval_lite/document'
@@ -1,21 +1,66 @@
1
+ # Gathers documents that satisfy boolean expression
1
2
  module RetrievalLite::BooleanRetrieval
2
- # Queries a corpus using a boolean expression with the standard operators,
3
- # AND, OR, NOT. Only returns documents that satisfy the query, and does
4
- # not rank the documents in any way.
3
+ # Gathers up all documents of a corpus that satisfy a boolean expression
4
+ # with the standard operators: AND, OR, NOT. Does not order the documents in
5
+ # any particular way. Assumes that all boolean operators are separated by
6
+ # white space on either side.
5
7
  #
6
8
  # @param corpus [Corpus] the collection of documents
7
9
  # @param query [String] the boolean query to be evaluated
8
10
  # @return [Array<Document>] unordered array of documents that satisfy the query
9
11
  def self.evaluate(corpus, query)
10
- if !is_valid?(query)
11
- raise "Boolean expression is not valid." # TODO better validation message?
12
+ if !is_valid_expression?(query)
13
+ raise "Each boolean operator (AND, OR, NOT) must operate on two terms."
12
14
  end
13
15
 
14
- corpus.documents_with(query)
15
- end
16
+ # must strip all non alphanumeric characters
17
+ query = strip_query(query)
18
+
19
+ # must have spaces in front and back for next line
20
+ query = " " + query + " "
21
+
22
+ # replace the AND/OR/NOT keywords with the corresponding Ruby operators (&&, ||, !)
23
+ query = query.gsub("AND", "\&\&").gsub("OR", "\|\|").gsub("NOT", "!")
24
+
25
+ # replace all terms with corresponding functions
26
+ query.gsub!(/[a-zA-Z0-9]+(-[a-zA-Z0-9]+)?/) do |q|
27
+ " document.contains?(\"" + q.downcase + "\") "
28
+ end
16
29
 
17
- private
18
- def self.is_valid?(query)
19
- true
30
+ output_documents = []
31
+ corpus.documents.each do |document|
32
+ begin
33
+ if eval(query)
34
+ output_documents << document
35
+ end
36
+ rescue
37
+ raise "The boolean expression is not valid. Please check all parethensis and operators."
38
+ end
20
39
  end
40
+
41
+ return output_documents
42
+ end
43
+
44
+ # @param query [String] the boolean query to be evaluated
45
+ # @return [Boolean] whether query contains any boolean operators
46
+ def self.has_boolean_operators?(query)
47
+ /AND|OR|NOT/ === query
48
+ end
49
+
50
+ # @note all other invalid expressions should be caught later on
51
+ # @param query [String] the boolean query to be evaluated
52
+ # @return [Boolean] false when a boolean operator is immediately followed by a closing parenthesis, true otherwise
53
+ def self.is_valid_expression?(query)
54
+ !(/(AND|OR|NOT)\s*\)/ === query)
55
+ end
56
+
57
+ # @param query [String] the boolean query to be evaluated
58
+ # @return [String] the query with every character other than alphanumerics, parentheses, hyphens and whitespace replaced by a space, and with stray hyphens removed
59
+ def self.strip_query(query)
60
+ # remove non-alphanumeric
61
+ query = query.gsub(/[^a-zA-Z0-9\s\(\)\-]/, " ")
62
+
63
+ # getting rid of stray hyphens
64
+ query = query.gsub(/\-\-+/, " ").gsub(/\s+\-\s+/, " ")
65
+ end
21
66
  end
@@ -1,9 +1,14 @@
1
+ # A collection of documents
1
2
  class RetrievalLite::Corpus
2
3
  # the documents within the corpus
3
4
  attr_reader :documents
4
5
  # hash of a term to the array of documents that contain the particular term
5
6
  attr_reader :term_occurrences
6
7
 
8
+ # Creates a new Retrieval Lite corpus, a collection of documents. Corpuses
9
+ # do not modify nor own the documents in them, meaning documents must
10
+ # be created first before adding them to the corpus.
11
+ #
7
12
  # @param documents [Array<Document>] the documents of the corpus
8
13
  # @param opts [Hash] optional arguments to initializer
9
14
  # @option opts [Array<String>] :stop_words the words to ignore when creating tokens
@@ -1,3 +1,4 @@
1
+ # Representation of document using content as a string and term frequencies as a hash
1
2
  class RetrievalLite::Document
2
3
  # the text of the document
3
4
  attr_reader :content
@@ -6,7 +7,8 @@ class RetrievalLite::Document
6
7
  # the id of the document
7
8
  attr_reader :id
8
9
 
9
- # splits the text of the document into an array of tokens
10
+ # Creates a new Retrieval Lite document. Upon initialization, the content
11
+ # is parsed into individual tokens, and its term frequencies are recorded.
10
12
  #
11
13
  # @param content [String] the text of the document
12
14
  # @param opts [Hash] optional arguments to initializer
@@ -17,13 +19,6 @@ class RetrievalLite::Document
17
19
  @term_frequencies = RetrievalLite::Tokenizer.parse_content(content)
18
20
  end
19
21
 
20
- # for debugging
21
- def print_tokens
22
- @term_frequencies.each do |key, value|
23
- puts "#{key}: #{value}"
24
- end
25
- end
26
-
27
22
  # @return [Integer] the total number of unique terms in the document
28
23
  def term_count
29
24
  @term_frequencies.size
@@ -44,6 +39,12 @@ class RetrievalLite::Document
44
39
  end
45
40
  end
46
41
 
42
+ # @param term [String]
43
+ # @return [Boolean] whether a term appears in the document
44
+ def contains?(term)
45
+ @term_frequencies.has_key?(term)
46
+ end
47
+
47
48
  # @return [Integer] the total number of terms (not unique) in the document
48
49
  def total_terms
49
50
  count = 0
@@ -1,3 +1,4 @@
1
+ # Scores queries using TF-IDF
1
2
  # @see http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf
2
3
  module RetrievalLite::TfIdfRetrieval
3
4
  # Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
@@ -6,8 +7,8 @@ module RetrievalLite::TfIdfRetrieval
6
7
  # @param corpus [Corpus] the collection of documents
7
8
  # @param query [String] the boolean query to be evaluated
8
9
  # @return [Array<Document>] ordered array of documents that satisfy the query
9
- def self.evaluate(corpus, query)
10
- evaluate_with_scores(corpus, query).keys
10
+ def self.evaluate(corpus, query, opts = {})
11
+ evaluate_with_scores(corpus, query, opts).keys
11
12
  end
12
13
 
13
14
  # Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
@@ -16,20 +17,25 @@ module RetrievalLite::TfIdfRetrieval
16
17
  #
17
18
  # @param corpus [Corpus] the collection of documents
18
19
  # @param query [String] the boolean query to be evaluated
20
+ # @option opts [Array<Document>] :document_set limiting the documents to search in the corpus to only these documents
19
21
  # @return [Hash<Document, Integer>] ordered array of documents that satisfy the query
20
- def self.evaluate_with_scores(corpus, query)
22
+ def self.evaluate_with_scores(corpus, query, opts = {})
21
23
  query_document = RetrievalLite::Document.new(query)
22
24
  terms = query_document.term_frequencies.keys
23
25
  query_vector = query_document.term_frequencies.values # should be in same order as keys
24
26
 
25
- documents = Set.new # ordering of documents doesn't matter right now
26
- # gathering only the documents that contain at least one of those terms
27
- terms.each do |t|
28
- docs_with_term = corpus.documents_with(t)
29
- if docs_with_term
30
- docs_with_term.each do |d|
31
- if !documents.include?(d)
32
- documents << d
27
+ if opts[:document_set]
28
+ documents = opts[:document_set]
29
+ else
30
+ documents = Set.new # ordering of documents doesn't matter right now
31
+ # gathering only the documents that contain at least one of those terms
32
+ terms.each do |t|
33
+ docs_with_term = corpus.documents_with(t)
34
+ if docs_with_term
35
+ docs_with_term.each do |d|
36
+ if !documents.include?(d)
37
+ documents << d
38
+ end
33
39
  end
34
40
  end
35
41
  end
@@ -37,11 +43,16 @@ module RetrievalLite::TfIdfRetrieval
37
43
 
38
44
  scores = {}
39
45
  documents.each do |document|
40
- document_vector = Array.new(terms.size)
41
- terms.each_with_index do |term, index|
42
- document_vector[index] = tfidf_weight(corpus, document, term)
46
+ vector_length = tfidf_weight_length(corpus, document)
47
+ if vector_length == 0
48
+ scores[document] = 0
49
+ else
50
+ document_vector = Array.new(terms.size)
51
+ terms.each_with_index do |term, index|
52
+ document_vector[index] = tfidf_weight(corpus, document, term)
53
+ end
54
+ scores[document] = RetrievalLite::Vector.dot_product(query_vector, document_vector) / vector_length
43
55
  end
44
- scores[document] = RetrievalLite::Vector.cosine_similarity(query_vector, document_vector)
45
56
  end
46
57
 
47
58
  # order it by score in descending order
@@ -72,13 +83,24 @@ module RetrievalLite::TfIdfRetrieval
72
83
  # @param term [String]
73
84
  # @return [Float] the normalized tfidf weight of the term in the document
74
85
  def self.normalized_tfidf_weight(corpus, document, term)
75
- length_of_vector = 0
86
+ tfidf_weight(corpus, document, term) / tfidf_weight_length(corpus, document)
87
+ end
88
+
89
+
90
+ # Computes the length of a document vector of tf-idf weights. This is
91
+ # used for normalization
92
+ #
93
+ # @param corpus [Corpus]
94
+ # @param document [Document]
95
+ # @return [Float] the length of the document vector of tf-idf weights
96
+ def self.tfidf_weight_length(corpus, document)
97
+ normalize = 0
76
98
 
77
- corpus.documents_with(term).each do |d|
78
- weight = tfidf_weight(corpus, d, term)
79
- length_of_vector += weight * weight
99
+ document.terms.each do |t|
100
+ weight = tfidf_weight(corpus, document, t)
101
+ normalize += weight * weight
80
102
  end
81
103
 
82
- tfidf_weight(corpus, document, term) / Math.sqrt(length_of_vector)
104
+ return Math.sqrt(normalize)
83
105
  end
84
106
  end
@@ -1,4 +1,6 @@
1
+ # Separates text into tokens used for IR
1
2
  module RetrievalLite::Tokenizer
3
+ # Punctuation that is to be ignored when parsing. Does not contain the hyphen
2
4
  SPECIAL_SEPARATERS = ['[', ']', '\\', ';', '\'', ',', '.', '/', '!', '@', '#', '%', '&', '*', '(', ')', '_', '{', '}', ':', '"', '?', '=', '`', '~', '$', '^', '+', '|', '<', '>']
3
5
 
4
6
  # @param content [String] the text of the document
@@ -24,7 +26,7 @@ module RetrievalLite::Tokenizer
24
26
  end
25
27
  end
26
28
 
27
- tokens
29
+ return tokens
28
30
  end
29
31
 
30
32
  private
@@ -1,3 +1,4 @@
1
+ # Offers mathematical operations for vectors
1
2
  module RetrievalLite::Vector
2
3
  # @param scores1 [Array<Integer>] each term and its corresponding score in the first document
3
4
  # @param scores2 [Array<Integer>] each term and its corresponding score in the second document
data/lib/version.rb CHANGED
@@ -1,3 +1,4 @@
1
1
  module RetrievalLite
2
- VERSION = "1.0.0"
2
+ # current version of the Ruby gem
3
+ VERSION = "1.1.0"
3
4
  end
@@ -7,38 +7,105 @@ describe RetrievalLite::BooleanRetrieval do
7
7
  let (:document_replicated) do
8
8
  RetrievalLite::Document.new("lorem ipsum dolor sit amet")
9
9
  end
10
+ let (:document_one_term) do
11
+ RetrievalLite::Document.new("lorem")
12
+ end
10
13
  let (:document_with_duplicates) do
11
14
  RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
12
15
  end
13
- let (:document_two) do
14
- RetrievalLite::Document.new("Mauris ullamcorper, tortor et consequat sagittis.")
16
+ let (:document_strange) do
17
+ RetrievalLite::Document.new("foo bar")
15
18
  end
16
- let (:document_three) do
17
- RetrievalLite::Document.new("Pellentesque felis lectus, lacinia nec mauris non.")
19
+ let (:document_no_match) do
20
+ RetrievalLite::Document.new("no-match")
18
21
  end
19
- let (:document_paragraph) do
20
- RetrievalLite::Document.new("In semper enim non ullamcorper venenatis.
21
- Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
22
- Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
23
- consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
24
- vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
25
- euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
26
- a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
27
- In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
28
- )
22
+ let (:all_normal_documents) do
23
+ [document, document_replicated, document_with_duplicates, document_one_term, document_strange]
29
24
  end
30
25
  let (:all_documents) do
31
- [document, document_replicated, document_with_duplicates, document_two, document_three, document_paragraph]
26
+ [document, document_replicated, document_with_duplicates, document_one_term, document_strange, document_no_match]
32
27
  end
33
28
  let (:corpus) do
34
29
  RetrievalLite::Corpus.new(all_documents)
35
30
  end
31
+
32
+ describe "#has_boolean_operators?" do
33
+ it "should accept any uses of AND OR NOT" do
34
+ RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo AND bar").should == true
35
+ RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo OR bar").should == true
36
+ RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo NOT bar").should == true
37
+ end
38
+ it "should reject any regular non-boolean queries" do
39
+ RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo bar").should == false
40
+ end
41
+ end
42
+
43
+ describe "#is_valid_expression?" do
44
+ it "should accept parenthesis and spaces, as well as all alphanumeric characters" do
45
+ RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar) OR baz").should == true
46
+ end
47
+ it "should reject when there is a close parethensis but no term after AND/OR/NOT" do
48
+ RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar AND)").should == false
49
+ RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar AND )").should == false
50
+ end
51
+ it "should accept AND/OR/NOT with any begin parenthesis after it, regardless if there's a whitespace" do
52
+ RetrievalLite::BooleanRetrieval.is_valid_expression?("NOT(foo AND bar)").should == true
53
+ RetrievalLite::BooleanRetrieval.is_valid_expression?("NOT (foo AND bar)").should == true
54
+ end
55
+ it "should accept sentences" do
56
+ RetrievalLite::BooleanRetrieval.is_valid_expression?("foo bar.").should == true
57
+ end
58
+ end
59
+
60
+ describe "#strip_query" do
61
+ it "should strip any commas, periods, etc nonalphanumeric characters" do
62
+ RetrievalLite::BooleanRetrieval.strip_query("(This is a cat.) AND (Although, something else!)").should == "(This is a cat ) AND (Although something else )"
63
+ end
64
+ it "should strip any double, triple, etc hyphenated words" do
65
+ RetrievalLite::BooleanRetrieval.strip_query("This is it--hooray!").should == "This is it hooray "
66
+ end
67
+ it "should leave hyphenated words alone" do
68
+ RetrievalLite::BooleanRetrieval.strip_query("This is foo-bar").should == "This is foo-bar"
69
+ end
70
+ it "should remove lone hyphens" do
71
+ RetrievalLite::BooleanRetrieval.strip_query("This - is foo-bar").should == "This is foo-bar"
72
+ end
73
+ end
74
+
75
+ describe "invalid boolean" do
76
+ it "should error on unclosed parenthesis" do
77
+ expect { RetrievalLite::BooleanRetrieval.evaluate(corpus, "(lorem AND ipsum") }.to raise_error
78
+ end
79
+ it "should error on when not enough arguments are provided" do
80
+ expect { RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem AND ipsum OR") }.to raise_error
81
+ end
82
+ end
83
+
36
84
  describe "one-term retrieval" do
37
85
  it "should return array of all documents with that term" do
38
- RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem") == [document, document_replicated, document_with_duplicates]
86
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem").should == [document, document_replicated, document_with_duplicates, document_one_term]
39
87
  end
40
88
  it "should ignore case" do
41
- RetrievalLite::BooleanRetrieval.evaluate(corpus, "LOREM") == [document, document_replicated, document_with_duplicates]
89
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "LOreM").should == [document, document_replicated, document_with_duplicates, document_one_term]
90
+ end
91
+ it "should work for hyphenated words" do
92
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "no-match").should == [document_no_match]
93
+ end
94
+ end
95
+
96
+ describe "valid boolean retrieval" do
97
+ it "should work for simple two term AND" do
98
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem AND ipsum").should == [document, document_replicated, document_with_duplicates]
99
+ end
100
+ it "should work for simple two term OR" do
101
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem OR foo").should == all_normal_documents
102
+ end
103
+ it "should work for simple one term NOT" do
104
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "NOT lorem").should == [document_strange, document_no_match]
105
+ end
106
+ it "should work for more complex retrievals with parenthesis" do
107
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "foo OR (dolor AND sit)").should == [document, document_replicated, document_with_duplicates, document_strange]
108
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "(lorem OR foo) AND NOT ipsum").should == [document_one_term, document_strange]
42
109
  end
43
110
  end
44
111
  end
@@ -80,6 +80,12 @@ describe RetrievalLite::Document do
80
80
  end
81
81
  end
82
82
 
83
+ describe "total terms" do
84
+ it "should have the right number of terms" do
85
+ document.total_terms.should == 5
86
+ end
87
+ end
88
+
83
89
  describe "for blank document" do
84
90
  it "should not raise error on initialization" do
85
91
  expect { RetrievalLite::Document.new("") }.to_not raise_error
@@ -1,3 +1,68 @@
1
1
  require 'spec_helper'
2
+
2
3
  describe RetrievalLite do
4
+ include RetrievalLite
5
+
6
+ let (:document_one_term) do
7
+ new_document("lorem")
8
+ end
9
+ let (:document) do
10
+ new_document("lorem ipsum dolor sit amet")
11
+ end
12
+ let (:document_with_duplicates) do
13
+ new_document("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
14
+ end
15
+ let (:document_doubled) do
16
+ new_document("lorem ipsum dolor sit amet lorem ipsum dolor sit amet")
17
+ end
18
+ let (:document_both_terms) do
19
+ new_document("lorem ipsum")
20
+ end
21
+ let (:document_with_unique) do
22
+ new_document("lorem unique")
23
+ end
24
+ let (:document_no_match) do
25
+ new_document("no-match")
26
+ end
27
+ let (:all_documents) do
28
+ [document, document_with_duplicates, document_doubled, document_one_term, document_both_terms, document_with_unique, document_no_match]
29
+ end
30
+ let (:corpus) do
31
+ new_corpus(all_documents)
32
+ end
33
+ let (:corpus_different) do
34
+ new_corpus([document_one_term, document, document_with_duplicates])
35
+ end
36
+ let(:corpus_small) do
37
+ new_corpus([document_one_term, document, document_no_match])
38
+ end
39
+
40
+ describe "when no options are passed" do
41
+ it "should default to basic tf-idf" do
42
+ scores = evaluate_query_with_scores(corpus_different, "lorem dolor sit")
43
+ scores[document].should be_within(0.001).of(1.0)
44
+ scores[document_with_duplicates].should be_within(0.001).of(0.953)
45
+ scores[document_one_term].should be_within(0.001).of(0.0)
46
+
47
+ evaluate_query(corpus_different, "lorem dolor sit").should == [document, document_with_duplicates, document_one_term]
48
+ end
49
+ end
50
+
51
+ describe "when boolean operators are present" do
52
+ it "should first filter through boolean" do
53
+ evaluate_query(corpus, "lorem AND NOT dolor").should == [document_one_term, document_both_terms, document_with_unique]
54
+ evaluate_query(corpus, "(lorem AND unique) OR no-match").should == [document_with_unique, document_no_match]
55
+ evaluate_query(corpus, "lorem AND ipsum AND dolor AND sit AND amet").should == [document_doubled, document, document_with_duplicates]
56
+ evaluate_query(corpus, "lorem AND no-match").should == []
57
+ end
58
+ end
59
+
60
+ describe "with punctuation" do
61
+ it "should retrieve it as normal" do
62
+ evaluate_query(corpus, "lorem. AND NOT dolor!").should == [document_one_term, document_both_terms, document_with_unique]
63
+ evaluate_query(corpus, "(lorem-- AND !unique) OR no-match").should == [document_with_unique, document_no_match]
64
+ evaluate_query(corpus, "@lorem AND @ipsum AND @dolor AND @sit AND @amet").should == [document_doubled, document, document_with_duplicates]
65
+ evaluate_query(corpus, "||lorem AND no-match").should == []
66
+ end
67
+ end
3
68
  end
data/spec/spec_helper.rb CHANGED
@@ -1,3 +1,6 @@
1
+ require 'simplecov'
2
+ SimpleCov.start
3
+
1
4
  require "retrieval_lite"
2
5
  require "spec_helpers/file_helpers"
3
6
 
@@ -1,6 +1,9 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe RetrievalLite::TfIdfRetrieval do
4
+ let (:document_no_match) do
5
+ RetrievalLite::Document.new("no-match")
6
+ end
4
7
  let (:document_one_term) do
5
8
  RetrievalLite::Document.new("lorem")
6
9
  end
@@ -29,6 +32,9 @@ describe RetrievalLite::TfIdfRetrieval do
29
32
  let (:corpus_different) do
30
33
  RetrievalLite::Corpus.new([document_one_term, document, document_with_duplicates])
31
34
  end
35
+ let(:corpus_small) do
36
+ RetrievalLite::Corpus.new([document_one_term, document, document_no_match])
37
+ end
32
38
 
33
39
  describe "calculating tf-idf scores" do
34
40
  describe "term that all documents have" do
@@ -55,10 +61,10 @@ describe RetrievalLite::TfIdfRetrieval do
55
61
  describe "calculating normalized tf-idf scores" do
56
62
  describe "term that a few documents have" do
57
63
  it "should have correct tf-idf" do
58
- RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.316)
59
- RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.632)
60
- RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.632)
61
- RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(0.316)
64
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.320)
65
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.163)
66
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.320)
67
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(1.0)
62
68
  end
63
69
  end
64
70
  end
@@ -84,9 +90,9 @@ describe RetrievalLite::TfIdfRetrieval do
84
90
  it "should return the correct score" do
85
91
  scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "ipsum")
86
92
  scores.size.should == 4
87
- scores[document].should be_within(0.001).of(1.0)
88
- scores[document_with_duplicates].should be_within(0.001).of(1.0)
89
- scores[document_doubled].should be_within(0.001).of(1.0)
93
+ scores[document].should be_within(0.001).of(0.320)
94
+ scores[document_with_duplicates].should be_within(0.001).of(0.163)
95
+ scores[document_doubled].should be_within(0.001).of(0.320)
90
96
  scores[document_both_terms].should be_within(0.001).of(1.0)
91
97
  end
92
98
  end
@@ -122,9 +128,19 @@ describe RetrievalLite::TfIdfRetrieval do
122
128
  end
123
129
  it "should have the correct scores" do
124
130
  scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_different, "lorem dolor sit")
125
- scores[document].should be_within(0.001).of(0.816)
126
- scores[document_with_duplicates].should be_within(0.001).of(0.808)
131
+ scores[document].should be_within(0.001).of(1.0)
132
+ scores[document_with_duplicates].should be_within(0.001).of(0.953)
127
133
  scores[document_one_term].should be_within(0.001).of(0.0)
128
134
  end
129
135
  end
136
+
137
+ describe "documents with same frequency but longer lengths" do
138
+ it "order should favor shorter documents" do
139
+ RetrievalLite::TfIdfRetrieval.evaluate(corpus_small, "lorem").should == [document_one_term, document]
140
+ end
141
+ it "shorter documents should rank higher" do
142
+ scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_small, "lorem")
143
+ scores[document_one_term].should > scores[document]
144
+ end
145
+ end
130
146
  end
data/spec/vector_spec.rb CHANGED
@@ -24,4 +24,16 @@ describe RetrievalLite::Vector do
24
24
  RetrievalLite::Vector.euclidean_length([3, 4]).should == 5
25
25
  end
26
26
  end
27
+
28
+ describe "cosine similarity" do
29
+ it "should compute correctly for vectors length 1" do
30
+ RetrievalLite::Vector.cosine_similarity([3], [5]).should == 1
31
+ end
32
+ it "should compute correctly for longer vectors" do
33
+ RetrievalLite::Vector.cosine_similarity([2, 3], [4, 5]).should be_within(0.001).of(0.996)
34
+ end
35
+ it "should raise error for unequal sized arrays" do
36
+ expect { RetrievalLite::Vector.cosine_similarity([2, 3], [4]) }.to raise_error
37
+ end
38
+ end
27
39
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: retrieval_lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Irvin Zhan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-12 00:00:00.000000000 Z
11
+ date: 2014-05-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec