retrieval_lite 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8e080b5b0c600b330f5d07827b5c2e576132306b
4
- data.tar.gz: f0fa5049a6f5e9df0a1fbc761107d28dc1efe857
3
+ metadata.gz: 4e4e7dd3aca0e8ccc8ff59095e5e223b6f8a1f04
4
+ data.tar.gz: 84517e16668414f490e3ce284fccf53f55ebdd20
5
5
  SHA512:
6
- metadata.gz: b1827e3f90edbc3ee841fbe25526c748ac82ad1aae6f42455605e3199dffe7c5cd309c2684741d41f695c88285152c1bf83558beb87668ec9eebb4973f7ff0a3
7
- data.tar.gz: 5d2009f175da5ce2cecb277061e9592093a14435d5012094c461afb1ac626b1d72705edf560cece8f53d4a02623f7ebe2fd3adc0f0555b0934cd5b878f6c12b2
6
+ metadata.gz: 67e4537932eea13f79d4009ff9c48dc2a9f7e05ca3ca4c6b4e31564c057d9c87e228de0cca0713649a6d5607d712b19f9febfc31c4fffdc087bbe04a14da348a
7
+ data.tar.gz: 65676b034f4cbbf7757c5b58b99ce07a453154d0230e063b6ce2a76f5fa11b200ec927a97c864872a9202cbcf48fac8b71543a9d62b0c26ea1372a893c7afc87
data/Gemfile CHANGED
@@ -2,3 +2,5 @@ source 'https://rubygems.org'
2
2
 
3
3
  # gem's dependencies specified in retrieval_lite.gemspec
4
4
  gemspec
5
+
6
+ gem 'simplecov', :require => false, :group => :test
@@ -1,7 +1,63 @@
1
1
  require "version"
2
+ require "set"
2
3
 
4
+ # Offers simple document retrieval from a corpus with a query
3
5
  module RetrievalLite
4
-
6
+ # Queries a corpus first by filtering it using a boolean evaluator and then
7
+ # using the tf-idf ranking algorithm and cosine similarity.
8
+ # Returns documents ordered by tf-idf score.
9
+ #
10
+ # @param corpus [Corpus] the collection of documents
11
+ # @param query [String] the boolean query to be evaluated
12
+ # @option opts [Boolean] :no_bool prevent the boolean filter
13
+ # @return [Array<Document>] ordered array of documents that satisfy the query
14
+ def evaluate_query(corpus, query, opts = {})
15
+ evaluate_query_with_scores(corpus, query, opts).keys
16
+ end
17
+
18
+ # Queries a corpus first by filtering it using a boolean evaluator and then
19
+ # using the tf-idf ranking algorithm and cosine similarity.
20
+ # Returns Hash of documents to their respective TF-IDF scores
21
+ # @see evaluate_query
22
+ #
23
+ # @param corpus [Corpus] the collection of documents
24
+ # @param query [String] the boolean query to be evaluated
25
+ # @option opts [Boolean] :no_bool prevent the boolean filter
26
+ # @return [Hash<Document, Numeric>] documents that satisfy the query mapped to their TF-IDF scores, ordered by score
27
+ def evaluate_query_with_scores(corpus, query, opts = {})
28
+ evaluator_options = {}
29
+
30
+ # evaluate like normal if it is not a boolean expression
31
+ if opts[:no_bool] || !RetrievalLite::BooleanRetrieval.has_boolean_operators?(query)
32
+ RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, query)
33
+ else
34
+ documents = RetrievalLite::BooleanRetrieval.evaluate(corpus, query)
35
+ RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, query, { document_set: documents })
36
+ end
37
+ end
38
+
39
+ # Creates a new Retrieval Lite document. Upon initialization, the content
40
+ # is parsed into individual tokens, and its term frequencies are recorded.
41
+ #
42
+ # @param content [String] the text of the document
43
+ # @param opts [Hash] optional arguments to initializer
44
+ # @option opts [String] :id the id of the document. Defaults to object_id assigned by ruby
45
+ # @return a new document containing the input text
46
+ def new_document(content, opts = {})
47
+ RetrievalLite::Document.new(content, opts)
48
+ end
49
+
50
+ # Creates a new Retrieval Lite corpus, a collection of documents. Corpuses
51
+ # do not modify nor own the documents in them, meaning documents must
52
+ # be created first before adding them to the corpus.
53
+ #
54
+ # @param documents [Array<Document>] the documents of the corpus
55
+ # @param opts [Hash] optional arguments to initializer
56
+ # @option opts [Array<String>] :stop_words the words to ignore when creating tokens
57
+ # @return [Corpus] either a new empty corpus or one with those documents
58
+ def new_corpus(documents = [], opts = {})
59
+ RetrievalLite::Corpus.new(documents, opts)
60
+ end
5
61
  end
6
62
 
7
63
  require 'retrieval_lite/document'
@@ -1,21 +1,66 @@
1
+ # Gathers documents that satisfy boolean expression
1
2
  module RetrievalLite::BooleanRetrieval
2
- # Queries a corpus using a boolean expression with the standard operators,
3
- # AND, OR, NOT. Only returns documents that satisfy the query, and does
4
- # not rank the documents in any way.
3
+ # Gathers up all documents of a corpus that satisfy a boolean expression
4
+ # with the standard operators: AND, OR, NOT. Does not order the documents in
5
+ # any particular way. Assumes that all boolean operators are separated by
6
+ # white space on either side.
5
7
  #
6
8
  # @param corpus [Corpus] the collection of documents
7
9
  # @param query [String] the boolean query to be evaluated
8
10
  # @return [Array<Document>] unordered array of documents that satisfy the query
9
11
  def self.evaluate(corpus, query)
10
- if !is_valid?(query)
11
- raise "Boolean expression is not valid." # TODO better validation message?
12
+ if !is_valid_expression?(query)
13
+ raise "Each boolean operator (AND, OR, NOT) must operate on two terms."
12
14
  end
13
15
 
14
- corpus.documents_with(query)
15
- end
16
+ # must strip all non alphanumeric characters
17
+ query = strip_query(query)
18
+
19
+ # must have spaces in front and back for next line
20
+ query = " " + query + " "
21
+
22
+ # replace the boolean keywords (AND, OR, NOT) with the corresponding Ruby operators (&&, ||, !)
23
+ query = query.gsub("AND", "\&\&").gsub("OR", "\|\|").gsub("NOT", "!")
24
+
25
+ # replace all terms with corresponding functions
26
+ query.gsub!(/[a-zA-Z0-9]+(-[a-zA-Z0-9]+)?/) do |q|
27
+ " document.contains?(\"" + q.downcase + "\") "
28
+ end
16
29
 
17
- private
18
- def self.is_valid?(query)
19
- true
30
+ output_documents = []
31
+ corpus.documents.each do |document|
32
+ begin
33
+ if eval(query)
34
+ output_documents << document
35
+ end
36
+ rescue
37
+ raise "The boolean expression is not valid. Please check all parethensis and operators."
38
+ end
20
39
  end
40
+
41
+ return output_documents
42
+ end
43
+
44
+ # @param query [String] the boolean query to be evaluated
45
+ # @return [Boolean] whether query contains any boolean operators
46
+ def self.has_boolean_operators?(query)
47
+ /AND|OR|NOT/ === query
48
+ end
49
+
50
+ # @note all other invalid expressions should be caught later on
51
+ # @param query [String] the boolean query to be evaluated
52
+ # @return [Boolean] false if a boolean operator (AND, OR, NOT) is directly followed by a closing parenthesis, true otherwise
53
+ def self.is_valid_expression?(query)
54
+ !(/(AND|OR|NOT)\s*\)/ === query)
55
+ end
56
+
57
+ # @param query [String] the boolean query to be evaluated
58
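+ # @return [String] the query with every character other than alphanumerics, whitespace, parentheses, and hyphens replaced by a space, and stray hyphens (runs of two or more, or a lone hyphen between whitespace) replaced by a space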
59
+ def self.strip_query(query)
60
+ # remove non-alphanumeric
61
+ query = query.gsub(/[^a-zA-Z0-9\s\(\)\-]/, " ")
62
+
63
+ # getting rid of stray hyphens
64
+ query = query.gsub(/\-\-+/, " ").gsub(/\s+\-\s+/, " ")
65
+ end
21
66
  end
@@ -1,9 +1,14 @@
1
+ # A collection of documents
1
2
  class RetrievalLite::Corpus
2
3
  # the documents within the corpus
3
4
  attr_reader :documents
4
5
  # hash of a term to the array of documents that contain the particular term
5
6
  attr_reader :term_occurrences
6
7
 
8
+ # Creates a new Retrieval Lite corpus, a collection of documents. Corpuses
9
+ # do not modify nor own the documents in them, meaning documents must
10
+ # be created first before adding them to the corpus.
11
+ #
7
12
  # @param documents [Array<Document>] the documents of the corpus
8
13
  # @param opts [Hash] optional arguments to initializer
9
14
  # @option opts [Array<String>] :stop_words the words to ignore when creating tokens
@@ -1,3 +1,4 @@
1
+ # Representation of document using content as a string and term frequencies as a hash
1
2
  class RetrievalLite::Document
2
3
  # the text of the document
3
4
  attr_reader :content
@@ -6,7 +7,8 @@ class RetrievalLite::Document
6
7
  # the id of the document
7
8
  attr_reader :id
8
9
 
9
- # splits the text of the document into an array of tokens
10
+ # Creates a new Retrieval Lite document. Upon initialization, the content
11
+ # is parsed into individual tokens, and its term frequencies are recorded.
10
12
  #
11
13
  # @param content [String] the text of the document
12
14
  # @param opts [Hash] optional arguments to initializer
@@ -17,13 +19,6 @@ class RetrievalLite::Document
17
19
  @term_frequencies = RetrievalLite::Tokenizer.parse_content(content)
18
20
  end
19
21
 
20
- # for debugging
21
- def print_tokens
22
- @term_frequencies.each do |key, value|
23
- puts "#{key}: #{value}"
24
- end
25
- end
26
-
27
22
  # @return [Integer] the total number of unique terms in the document
28
23
  def term_count
29
24
  @term_frequencies.size
@@ -44,6 +39,12 @@ class RetrievalLite::Document
44
39
  end
45
40
  end
46
41
 
42
+ # @param term [String]
43
+ # @return [Boolean] whether a term appears in the document
44
+ def contains?(term)
45
+ @term_frequencies.has_key?(term)
46
+ end
47
+
47
48
  # @return [Integer] the total number of terms (not unique) in the document
48
49
  def total_terms
49
50
  count = 0
@@ -1,3 +1,4 @@
1
+ # Scores queries using TF-IDF
1
2
  # @see http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf
2
3
  module RetrievalLite::TfIdfRetrieval
3
4
  # Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
@@ -6,8 +7,8 @@ module RetrievalLite::TfIdfRetrieval
6
7
  # @param corpus [Corpus] the collection of documents
7
8
  # @param query [String] the boolean query to be evaluated
8
9
  # @return [Array<Document>] ordered array of documents that satisfy the query
9
- def self.evaluate(corpus, query)
10
- evaluate_with_scores(corpus, query).keys
10
+ def self.evaluate(corpus, query, opts = {})
11
+ evaluate_with_scores(corpus, query, opts).keys
11
12
  end
12
13
 
13
14
  # Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
@@ -16,20 +17,25 @@ module RetrievalLite::TfIdfRetrieval
16
17
  #
17
18
  # @param corpus [Corpus] the collection of documents
18
19
  # @param query [String] the boolean query to be evaluated
20
+ # @option opts [Array<Document>] :document_set limiting the documents to search in the corpus to only these documents
19
21
  # @return [Hash<Document, Numeric>] documents that satisfy the query mapped to their TF-IDF scores, ordered by score
20
- def self.evaluate_with_scores(corpus, query)
22
+ def self.evaluate_with_scores(corpus, query, opts = {})
21
23
  query_document = RetrievalLite::Document.new(query)
22
24
  terms = query_document.term_frequencies.keys
23
25
  query_vector = query_document.term_frequencies.values # should be in same order as keys
24
26
 
25
- documents = Set.new # ordering of documents doesn't matter right now
26
- # gathering only the documents that contain at least one of those terms
27
- terms.each do |t|
28
- docs_with_term = corpus.documents_with(t)
29
- if docs_with_term
30
- docs_with_term.each do |d|
31
- if !documents.include?(d)
32
- documents << d
27
+ if opts[:document_set]
28
+ documents = opts[:document_set]
29
+ else
30
+ documents = Set.new # ordering of documents doesn't matter right now
31
+ # gathering only the documents that contain at least one of those terms
32
+ terms.each do |t|
33
+ docs_with_term = corpus.documents_with(t)
34
+ if docs_with_term
35
+ docs_with_term.each do |d|
36
+ if !documents.include?(d)
37
+ documents << d
38
+ end
33
39
  end
34
40
  end
35
41
  end
@@ -37,11 +43,16 @@ module RetrievalLite::TfIdfRetrieval
37
43
 
38
44
  scores = {}
39
45
  documents.each do |document|
40
- document_vector = Array.new(terms.size)
41
- terms.each_with_index do |term, index|
42
- document_vector[index] = tfidf_weight(corpus, document, term)
46
+ vector_length = tfidf_weight_length(corpus, document)
47
+ if vector_length == 0
48
+ scores[document] = 0
49
+ else
50
+ document_vector = Array.new(terms.size)
51
+ terms.each_with_index do |term, index|
52
+ document_vector[index] = tfidf_weight(corpus, document, term)
53
+ end
54
+ scores[document] = RetrievalLite::Vector.dot_product(query_vector, document_vector) / vector_length
43
55
  end
44
- scores[document] = RetrievalLite::Vector.cosine_similarity(query_vector, document_vector)
45
56
  end
46
57
 
47
58
  # order it by score in descending order
@@ -72,13 +83,24 @@ module RetrievalLite::TfIdfRetrieval
72
83
  # @param term [String]
73
84
  # @return [Float] the normalized tfidf weight of the term in the document
74
85
  def self.normalized_tfidf_weight(corpus, document, term)
75
- length_of_vector = 0
86
+ tfidf_weight(corpus, document, term) / tfidf_weight_length(corpus, document)
87
+ end
88
+
89
+
90
+ # Computes the length of a document vector of tf-idf weights. This is
91
+ # used for normalization
92
+ #
93
+ # @param corpus [Corpus]
94
+ # @param document [Document]
95
+ # @return [Float] the length of the document vector of tf-idf weights
96
+ def self.tfidf_weight_length(corpus, document)
97
+ normalize = 0
76
98
 
77
- corpus.documents_with(term).each do |d|
78
- weight = tfidf_weight(corpus, d, term)
79
- length_of_vector += weight * weight
99
+ document.terms.each do |t|
100
+ weight = tfidf_weight(corpus, document, t)
101
+ normalize += weight * weight
80
102
  end
81
103
 
82
- tfidf_weight(corpus, document, term) / Math.sqrt(length_of_vector)
104
+ return Math.sqrt(normalize)
83
105
  end
84
106
  end
@@ -1,4 +1,6 @@
1
+ # Separates text into tokens used for IR
1
2
  module RetrievalLite::Tokenizer
3
+ # Punctuation that is to be ignored when parsing. Does not contain the hyphen
2
4
  SPECIAL_SEPARATERS = ['[', ']', '\\', ';', '\'', ',', '.', '/', '!', '@', '#', '%', '&', '*', '(', ')', '_', '{', '}', ':', '"', '?', '=', '`', '~', '$', '^', '+', '|', '<', '>']
3
5
 
4
6
  # @param content [String] the text of the document
@@ -24,7 +26,7 @@ module RetrievalLite::Tokenizer
24
26
  end
25
27
  end
26
28
 
27
- tokens
29
+ return tokens
28
30
  end
29
31
 
30
32
  private
@@ -1,3 +1,4 @@
1
+ # Offers mathematical operations for vectors
1
2
  module RetrievalLite::Vector
2
3
  # @param scores1 [Array<Integer>] each term and its corresponding score in the first document
3
4
  # @param scores2 [Array<Integer>] each term and its corresponding score in the second document
data/lib/version.rb CHANGED
@@ -1,3 +1,4 @@
1
1
  module RetrievalLite
2
- VERSION = "1.0.0"
2
+ # current version of the Ruby gem
3
+ VERSION = "1.1.0"
3
4
  end
@@ -7,38 +7,105 @@ describe RetrievalLite::BooleanRetrieval do
7
7
  let (:document_replicated) do
8
8
  RetrievalLite::Document.new("lorem ipsum dolor sit amet")
9
9
  end
10
+ let (:document_one_term) do
11
+ RetrievalLite::Document.new("lorem")
12
+ end
10
13
  let (:document_with_duplicates) do
11
14
  RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
12
15
  end
13
- let (:document_two) do
14
- RetrievalLite::Document.new("Mauris ullamcorper, tortor et consequat sagittis.")
16
+ let (:document_strange) do
17
+ RetrievalLite::Document.new("foo bar")
15
18
  end
16
- let (:document_three) do
17
- RetrievalLite::Document.new("Pellentesque felis lectus, lacinia nec mauris non.")
19
+ let (:document_no_match) do
20
+ RetrievalLite::Document.new("no-match")
18
21
  end
19
- let (:document_paragraph) do
20
- RetrievalLite::Document.new("In semper enim non ullamcorper venenatis.
21
- Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
22
- Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
23
- consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
24
- vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
25
- euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
26
- a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
27
- In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
28
- )
22
+ let (:all_normal_documents) do
23
+ [document, document_replicated, document_with_duplicates, document_one_term, document_strange]
29
24
  end
30
25
  let (:all_documents) do
31
- [document, document_replicated, document_with_duplicates, document_two, document_three, document_paragraph]
26
+ [document, document_replicated, document_with_duplicates, document_one_term, document_strange, document_no_match]
32
27
  end
33
28
  let (:corpus) do
34
29
  RetrievalLite::Corpus.new(all_documents)
35
30
  end
31
+
32
+ describe "#has_boolean_operators?" do
33
+ it "should accept any uses of AND OR NOT" do
34
+ RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo AND bar").should == true
35
+ RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo OR bar").should == true
36
+ RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo NOT bar").should == true
37
+ end
38
+ it "should reject any regular non-boolean queries" do
39
+ RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo bar").should == false
40
+ end
41
+ end
42
+
43
+ describe "#is_valid_expression?" do
44
+ it "should accept parenthesis and spaces, as well as all alphanumeric characters" do
45
+ RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar) OR baz").should == true
46
+ end
47
+ it "should reject when there is a close parethensis but no term after AND/OR/NOT" do
48
+ RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar AND)").should == false
49
+ RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar AND )").should == false
50
+ end
51
+ it "should accept AND/OR/NOT with any begin parenthesis after it, regardless if there's a whitespace" do
52
+ RetrievalLite::BooleanRetrieval.is_valid_expression?("NOT(foo AND bar)").should == true
53
+ RetrievalLite::BooleanRetrieval.is_valid_expression?("NOT (foo AND bar)").should == true
54
+ end
55
+ it "should accept sentences" do
56
+ RetrievalLite::BooleanRetrieval.is_valid_expression?("foo bar.").should == true
57
+ end
58
+ end
59
+
60
+ describe "#strip_query" do
61
+ it "should strip any commas, periods, etc nonalphanumeric characters" do
62
+ RetrievalLite::BooleanRetrieval.strip_query("(This is a cat.) AND (Although, something else!)").should == "(This is a cat ) AND (Although something else )"
63
+ end
64
+ it "should strip any double, triple, etc hyphenated words" do
65
+ RetrievalLite::BooleanRetrieval.strip_query("This is it--hooray!").should == "This is it hooray "
66
+ end
67
+ it "should leave hyphenated words alone" do
68
+ RetrievalLite::BooleanRetrieval.strip_query("This is foo-bar").should == "This is foo-bar"
69
+ end
70
+ it "should remove lone hyphens" do
71
+ RetrievalLite::BooleanRetrieval.strip_query("This - is foo-bar").should == "This is foo-bar"
72
+ end
73
+ end
74
+
75
+ describe "invalid boolean" do
76
+ it "should error on unclosed parenthesis" do
77
+ expect { RetrievalLite::BooleanRetrieval.evaluate(corpus, "(lorem AND ipsum") }.to raise_error
78
+ end
79
+ it "should error on when not enough arguments are provided" do
80
+ expect { RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem AND ipsum OR") }.to raise_error
81
+ end
82
+ end
83
+
36
84
  describe "one-term retrieval" do
37
85
  it "should return array of all documents with that term" do
38
- RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem") == [document, document_replicated, document_with_duplicates]
86
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem").should == [document, document_replicated, document_with_duplicates, document_one_term]
39
87
  end
40
88
  it "should ignore case" do
41
- RetrievalLite::BooleanRetrieval.evaluate(corpus, "LOREM") == [document, document_replicated, document_with_duplicates]
89
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "LOreM").should == [document, document_replicated, document_with_duplicates, document_one_term]
90
+ end
91
+ it "should work for hyphenated words" do
92
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "no-match").should == [document_no_match]
93
+ end
94
+ end
95
+
96
+ describe "valid boolean retrieval" do
97
+ it "should work for simple two term AND" do
98
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem AND ipsum").should == [document, document_replicated, document_with_duplicates]
99
+ end
100
+ it "should work for simple two term OR" do
101
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem OR foo").should == all_normal_documents
102
+ end
103
+ it "should work for simple one term NOT" do
104
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "NOT lorem").should == [document_strange, document_no_match]
105
+ end
106
+ it "should work for more complex retrievals with parenthesis" do
107
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "foo OR (dolor AND sit)").should == [document, document_replicated, document_with_duplicates, document_strange]
108
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "(lorem OR foo) AND NOT ipsum").should == [document_one_term, document_strange]
42
109
  end
43
110
  end
44
111
  end
@@ -80,6 +80,12 @@ describe RetrievalLite::Document do
80
80
  end
81
81
  end
82
82
 
83
+ describe "total terms" do
84
+ it "should have the right number of terms" do
85
+ document.total_terms.should == 5
86
+ end
87
+ end
88
+
83
89
  describe "for blank document" do
84
90
  it "should not raise error on initialization" do
85
91
  expect { RetrievalLite::Document.new("") }.to_not raise_error
@@ -1,3 +1,68 @@
1
1
  require 'spec_helper'
2
+
2
3
  describe RetrievalLite do
4
+ include RetrievalLite
5
+
6
+ let (:document_one_term) do
7
+ new_document("lorem")
8
+ end
9
+ let (:document) do
10
+ new_document("lorem ipsum dolor sit amet")
11
+ end
12
+ let (:document_with_duplicates) do
13
+ new_document("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
14
+ end
15
+ let (:document_doubled) do
16
+ new_document("lorem ipsum dolor sit amet lorem ipsum dolor sit amet")
17
+ end
18
+ let (:document_both_terms) do
19
+ new_document("lorem ipsum")
20
+ end
21
+ let (:document_with_unique) do
22
+ new_document("lorem unique")
23
+ end
24
+ let (:document_no_match) do
25
+ new_document("no-match")
26
+ end
27
+ let (:all_documents) do
28
+ [document, document_with_duplicates, document_doubled, document_one_term, document_both_terms, document_with_unique, document_no_match]
29
+ end
30
+ let (:corpus) do
31
+ new_corpus(all_documents)
32
+ end
33
+ let (:corpus_different) do
34
+ new_corpus([document_one_term, document, document_with_duplicates])
35
+ end
36
+ let(:corpus_small) do
37
+ new_corpus([document_one_term, document, document_no_match])
38
+ end
39
+
40
+ describe "when no options are passed" do
41
+ it "should default to basic tf-idf" do
42
+ scores = evaluate_query_with_scores(corpus_different, "lorem dolor sit")
43
+ scores[document].should be_within(0.001).of(1.0)
44
+ scores[document_with_duplicates].should be_within(0.001).of(0.953)
45
+ scores[document_one_term].should be_within(0.001).of(0.0)
46
+
47
+ evaluate_query(corpus_different, "lorem dolor sit").should == [document, document_with_duplicates, document_one_term]
48
+ end
49
+ end
50
+
51
+ describe "when boolean operators are present" do
52
+ it "should first filter through boolean" do
53
+ evaluate_query(corpus, "lorem AND NOT dolor").should == [document_one_term, document_both_terms, document_with_unique]
54
+ evaluate_query(corpus, "(lorem AND unique) OR no-match").should == [document_with_unique, document_no_match]
55
+ evaluate_query(corpus, "lorem AND ipsum AND dolor AND sit AND amet").should == [document_doubled, document, document_with_duplicates]
56
+ evaluate_query(corpus, "lorem AND no-match").should == []
57
+ end
58
+ end
59
+
60
+ describe "with punctuation" do
61
+ it "should retrieve it as normal" do
62
+ evaluate_query(corpus, "lorem. AND NOT dolor!").should == [document_one_term, document_both_terms, document_with_unique]
63
+ evaluate_query(corpus, "(lorem-- AND !unique) OR no-match").should == [document_with_unique, document_no_match]
64
+ evaluate_query(corpus, "@lorem AND @ipsum AND @dolor AND @sit AND @amet").should == [document_doubled, document, document_with_duplicates]
65
+ evaluate_query(corpus, "||lorem AND no-match").should == []
66
+ end
67
+ end
3
68
  end
data/spec/spec_helper.rb CHANGED
@@ -1,3 +1,6 @@
1
+ require 'simplecov'
2
+ SimpleCov.start
3
+
1
4
  require "retrieval_lite"
2
5
  require "spec_helpers/file_helpers"
3
6
 
@@ -1,6 +1,9 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe RetrievalLite::TfIdfRetrieval do
4
+ let (:document_no_match) do
5
+ RetrievalLite::Document.new("no-match")
6
+ end
4
7
  let (:document_one_term) do
5
8
  RetrievalLite::Document.new("lorem")
6
9
  end
@@ -29,6 +32,9 @@ describe RetrievalLite::TfIdfRetrieval do
29
32
  let (:corpus_different) do
30
33
  RetrievalLite::Corpus.new([document_one_term, document, document_with_duplicates])
31
34
  end
35
+ let(:corpus_small) do
36
+ RetrievalLite::Corpus.new([document_one_term, document, document_no_match])
37
+ end
32
38
 
33
39
  describe "calculating tf-idf scores" do
34
40
  describe "term that all documents have" do
@@ -55,10 +61,10 @@ describe RetrievalLite::TfIdfRetrieval do
55
61
  describe "calculating normalized tf-idf scores" do
56
62
  describe "term that a few documents have" do
57
63
  it "should have correct tf-idf" do
58
- RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.316)
59
- RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.632)
60
- RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.632)
61
- RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(0.316)
64
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.320)
65
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.163)
66
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.320)
67
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(1.0)
62
68
  end
63
69
  end
64
70
  end
@@ -84,9 +90,9 @@ describe RetrievalLite::TfIdfRetrieval do
84
90
  it "should return the correct score" do
85
91
  scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "ipsum")
86
92
  scores.size.should == 4
87
- scores[document].should be_within(0.001).of(1.0)
88
- scores[document_with_duplicates].should be_within(0.001).of(1.0)
89
- scores[document_doubled].should be_within(0.001).of(1.0)
93
+ scores[document].should be_within(0.001).of(0.320)
94
+ scores[document_with_duplicates].should be_within(0.001).of(0.163)
95
+ scores[document_doubled].should be_within(0.001).of(0.320)
90
96
  scores[document_both_terms].should be_within(0.001).of(1.0)
91
97
  end
92
98
  end
@@ -122,9 +128,19 @@ describe RetrievalLite::TfIdfRetrieval do
122
128
  end
123
129
  it "should have the correct scores" do
124
130
  scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_different, "lorem dolor sit")
125
- scores[document].should be_within(0.001).of(0.816)
126
- scores[document_with_duplicates].should be_within(0.001).of(0.808)
131
+ scores[document].should be_within(0.001).of(1.0)
132
+ scores[document_with_duplicates].should be_within(0.001).of(0.953)
127
133
  scores[document_one_term].should be_within(0.001).of(0.0)
128
134
  end
129
135
  end
136
+
137
+ describe "documents with same frequency but longer lengths" do
138
+ it "order should favor shorter documents" do
139
+ RetrievalLite::TfIdfRetrieval.evaluate(corpus_small, "lorem").should == [document_one_term, document]
140
+ end
141
+ it "shorter documents should rank higher" do
142
+ scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_small, "lorem")
143
+ scores[document_one_term].should > scores[document]
144
+ end
145
+ end
130
146
  end
data/spec/vector_spec.rb CHANGED
@@ -24,4 +24,16 @@ describe RetrievalLite::Vector do
24
24
  RetrievalLite::Vector.euclidean_length([3, 4]).should == 5
25
25
  end
26
26
  end
27
+
28
+ describe "cosine similarity" do
29
+ it "should compute correctly for vectors length 1" do
30
+ RetrievalLite::Vector.cosine_similarity([3], [5]).should == 1
31
+ end
32
+ it "should compute correctly for longer vectors" do
33
+ RetrievalLite::Vector.cosine_similarity([2, 3], [4, 5]).should be_within(0.001).of(0.996)
34
+ end
35
+ it "should raise error for unequal sized arrays" do
36
+ expect { RetrievalLite::Vector.cosine_similarity([2, 3], [4]) }.to raise_error
37
+ end
38
+ end
27
39
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: retrieval_lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Irvin Zhan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-12 00:00:00.000000000 Z
11
+ date: 2014-05-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec