RubyGems - retrieval_lite - Versions diffs - 1.0.0 → 1.1.0 - Mend

retrieval_lite 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

checksums.yaml +4 -4
data/Gemfile +2 -0
data/lib/retrieval_lite.rb +57 -1
data/lib/retrieval_lite/boolean_retrieval.rb +55 -10
data/lib/retrieval_lite/corpus.rb +5 -0
data/lib/retrieval_lite/document.rb +9 -8
data/lib/retrieval_lite/tfidf_retrieval.rb +42 -20
data/lib/retrieval_lite/tokenizer.rb +3 -1
data/lib/retrieval_lite/vector.rb +1 -0
data/lib/version.rb +2 -1
data/spec/boolean_retrieval_spec.rb +84 -17
data/spec/document_spec.rb +6 -0
data/spec/retrieval_lite_spec.rb +65 -0
data/spec/spec_helper.rb +3 -0
data/spec/tfidf_retrieval_spec.rb +25 -9
data/spec/vector_spec.rb +12 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8e080b5b0c600b330f5d07827b5c2e576132306b
-  data.tar.gz: f0fa5049a6f5e9df0a1fbc761107d28dc1efe857
+  metadata.gz: 4e4e7dd3aca0e8ccc8ff59095e5e223b6f8a1f04
+  data.tar.gz: 84517e16668414f490e3ce284fccf53f55ebdd20
 SHA512:
-  metadata.gz: b1827e3f90edbc3ee841fbe25526c748ac82ad1aae6f42455605e3199dffe7c5cd309c2684741d41f695c88285152c1bf83558beb87668ec9eebb4973f7ff0a3
-  data.tar.gz: 5d2009f175da5ce2cecb277061e9592093a14435d5012094c461afb1ac626b1d72705edf560cece8f53d4a02623f7ebe2fd3adc0f0555b0934cd5b878f6c12b2
+  metadata.gz: 67e4537932eea13f79d4009ff9c48dc2a9f7e05ca3ca4c6b4e31564c057d9c87e228de0cca0713649a6d5607d712b19f9febfc31c4fffdc087bbe04a14da348a
+  data.tar.gz: 65676b034f4cbbf7757c5b58b99ce07a453154d0230e063b6ce2a76f5fa11b200ec927a97c864872a9202cbcf48fac8b71543a9d62b0c26ea1372a893c7afc87

data/Gemfile CHANGED Viewed

@@ -2,3 +2,5 @@ source 'https://rubygems.org'
 # gem's dependencies specified in retrieval_lite.gemspec
 gemspec
+gem 'simplecov', :require => false, :group => :test

data/lib/retrieval_lite.rb CHANGED Viewed

@@ -1,7 +1,63 @@
 require "version"
+require "set"
+# Offers simple document retrieval from a corpus with a query
 module RetrievalLite
+  # Queries a corpus first by filtering it using a boolean evaluator and then
+  # using the tf-idf ranking algorithm and cosine similarity.
+  # Returns documents ordered by tf-idf score.
+  #
+  # @param corpus [Corpus] the collection of documents
+  # @param query [String] the boolean query to be evaluated
+  # @option opts [Boolean] :no_bool prevent the boolean filter
+  # @return [Array<Document>] ordered array of documents that satisfy the query
+  def evaluate_query(corpus, query, opts = {})
+    evaluate_query_with_scores(corpus, query, opts).keys
+  end
+  # Queries a corpus first by filtering it using a boolean evaluator and then
+  # using the tf-idf ranking algorithm and cosine similarity.
+  # Returns Hash of documents to their respective TF-IDF scores
+  # @see evaluate_query
+  #
+  # @param corpus [Corpus] the collection of documents
+  # @param query [String] the boolean query to be evaluated
+  # @option opts [Boolean] :no_bool prevent the boolean filter
+  # @return [Hash<Document, Integer>] hash mapping each document that satisfies the query to its TF-IDF score
+  def evaluate_query_with_scores(corpus, query, opts = {})
+    evaluator_options = {}
+    # evaluate like normal if it is not a boolean expression
+    if opts[:no_bool] || !RetrievalLite::BooleanRetrieval.has_boolean_operators?(query)
+      RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, query)
+    else
+      documents = RetrievalLite::BooleanRetrieval.evaluate(corpus, query)
+      RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, query, { document_set: documents })
+    end
+  end
+  # Creates a new Retrieval Lite document.  Upon initialization, the content
+  # is parsed into individual tokens, and its term frequencies are recorded.
+  #
+  # @param content [String] the text of the document
+  # @param opts [Hash] optional arguments to initializer
+  # @option opts [String] :id the id of the document.  Defaults to object_id assigned by ruby
+  # @return a new document containing the input text
+  def new_document(content, opts = {})
+    RetrievalLite::Document.new(content, opts)
+  end
+  # Creates a new Retrieval Lite corpus, a collection of documents.  Corpuses
+  # do not modify nor own the documents in them, meaning documents must
+  # be created first before adding them to the corpus.
+  #
+  # @param documents [Array<Document>] the documents of the corpus
+  # @param opts [Hash] optional arguments to initializer
+  # @option opts [Array<String>] :stop_words the words to ignore when creating tokens
+  # @return [Corpus] either a new empty corpus or one with those documents
+  def new_corpus(documents = [], opts = {})
+    RetrievalLite::Corpus.new(documents, opts)
+  end
 end
 require 'retrieval_lite/document'

data/lib/retrieval_lite/boolean_retrieval.rb CHANGED Viewed

@@ -1,21 +1,66 @@
+# Gathers documents that satisfy boolean expression
 module RetrievalLite::BooleanRetrieval
-  # Queries a corpus using a boolean expression with the standard operators,
-  # AND, OR, NOT.  Only returns documents that satisfy the query, and does
-  # not rank the documents in any way.
+  # Gathers up all documents of a corpus that satisfy a boolean expression
+  # with the standard operators: AND, OR, NOT.  Does not order the documents in
+  # any particular way.  Assumes that all boolean operators are separated by
+  # white space on either side.
   #
   # @param corpus [Corpus] the collection of documents
   # @param query [String] the boolean query to be evaluated
   # @return [Array<Document>] unordered array of documents that satisfy the query
   def self.evaluate(corpus, query)
-    if !is_valid?(query)
-      raise "Boolean expression is not valid." # TODO better validation message?
+    if !is_valid_expression?(query)
+      raise "Each boolean operator (AND, OR, NOT) must operate on two terms."
     end
-    corpus.documents_with(query)
-  end
+    # must strip all non alphanumeric characters
+    query = strip_query(query)
+    # must have spaces in front and back for next line
+    query = " " + query + " "
+    # replace the boolean keywords (AND, OR, NOT) with the corresponding Ruby operators (&&, ||, !)
+    query = query.gsub("AND", "\&\&").gsub("OR", "\|\|").gsub("NOT", "!")
+    # replace all terms with corresponding functions
+    query.gsub!(/[a-zA-Z0-9]+(-[a-zA-Z0-9]+)?/) do |q|
+       " document.contains?(\"" + q.downcase + "\") "
+    end
-  private
-    def self.is_valid?(query)
-      true
+    output_documents = []
+    corpus.documents.each do |document|
+      begin
+        if eval(query)
+          output_documents << document
+        end
+      rescue
+        raise "The boolean expression is not valid.  Please check all parethensis and operators."
+      end
     end
+    return output_documents
+  end
+  # @param query [String] the boolean query to be evaluated
+  # @return [Boolean] whether query contains any boolean operators
+  def self.has_boolean_operators?(query)
+    /AND|OR|NOT/ === query
+  end
+  # @note all other invalid expressions should be caught later on
+  # @param query [String] the boolean query to be evaluated
+  # @return [Boolean] false when a boolean operator (AND, OR, NOT) is directly followed by a closing parenthesis, true otherwise
+  def self.is_valid_expression?(query)
+    !(/(AND|OR|NOT)\s*\)/ === query)
+  end
+  # @param query [String] the boolean query to be evaluated
+  # @return [String] the query with every character other than alphanumerics, parentheses, whitespace and hyphens replaced by a space, and with repeated or free-standing hyphens replaced by a space as well
+  def self.strip_query(query)
+    # remove non-alphanumeric
+    query = query.gsub(/[^a-zA-Z0-9\s\(\)\-]/, " ")
+    # getting rid of stray hyphens
+    query = query.gsub(/\-\-+/, " ").gsub(/\s+\-\s+/, " ")
+  end
 end

data/lib/retrieval_lite/corpus.rb CHANGED Viewed

@@ -1,9 +1,14 @@
+# A collection of documents
 class RetrievalLite::Corpus
   # the documents within the corpus
   attr_reader :documents
   # hash of a term to the array of documents that contain the particular term
   attr_reader :term_occurrences
+  # Creates a new Retrieval Lite corpus, a collection of documents.  Corpuses
+  # do not modify nor own the documents in them, meaning documents must
+  # be created first before adding them to the corpus.
+  #
   # @param documents [Array<Document>] the documents of the corpus
   # @param opts [Hash] optional arguments to initializer
   # @option opts [Array<String>] :stop_words the words to ignore when creating tokens

data/lib/retrieval_lite/document.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# Representation of document using content as a string and term frequencies as a hash
 class RetrievalLite::Document
   # the text of the document
   attr_reader :content
@@ -6,7 +7,8 @@ class RetrievalLite::Document
   # the id of the document
   attr_reader :id
-  # splits the text of the document into an array of tokens
+  # Creates a new Retrieval Lite document.  Upon initialization, the content
+  # is parsed into individual tokens, and its term frequencies are recorded.
   #
   # @param content [String] the text of the document
   # @param opts [Hash] optional arguments to initializer
@@ -17,13 +19,6 @@ class RetrievalLite::Document
     @term_frequencies = RetrievalLite::Tokenizer.parse_content(content)
   end
-  # for debugging
-  def print_tokens
-    @term_frequencies.each do |key, value|
-      puts "#{key}: #{value}"
-    end
-  end
   # @return [Integer] the total number of unique terms in the document
   def term_count
     @term_frequencies.size
@@ -44,6 +39,12 @@ class RetrievalLite::Document
     end
   end
+  # @param term [String]
+  # @return [Boolean] whether a term appears in the document
+  def contains?(term)
+    @term_frequencies.has_key?(term)
+  end
   # @return [Integer] the total number of terms (not unique) in the document
   def total_terms
     count = 0

data/lib/retrieval_lite/tfidf_retrieval.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# Scores queries using TF-IDF
 # @see http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf
 module RetrievalLite::TfIdfRetrieval
   # Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
@@ -6,8 +7,8 @@ module RetrievalLite::TfIdfRetrieval
   # @param corpus [Corpus] the collection of documents
   # @param query [String] the boolean query to be evaluated
   # @return [Array<Document>] ordered array of documents that satisfy the query
-  def self.evaluate(corpus, query)
-    evaluate_with_scores(corpus, query).keys
+  def self.evaluate(corpus, query, opts = {})
+    evaluate_with_scores(corpus, query, opts).keys
   end
   # Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
@@ -16,20 +17,25 @@ module RetrievalLite::TfIdfRetrieval
   #
   # @param corpus [Corpus] the collection of documents
   # @param query [String] the boolean query to be evaluated
+  # @option opts [Array<Document>] :document_set limiting the documents to search in the corpus to only these documents
   # @return [Hash<Document, Integer>] hash mapping each document that satisfies the query to its score
-  def self.evaluate_with_scores(corpus, query)
+  def self.evaluate_with_scores(corpus, query, opts = {})
     query_document = RetrievalLite::Document.new(query)
     terms = query_document.term_frequencies.keys
     query_vector = query_document.term_frequencies.values # should be in same order as keys
-    documents = Set.new # ordering of documents doesn't matter right now
-    # gathering only the documents that contain at least one of those terms
-    terms.each do |t|
-      docs_with_term = corpus.documents_with(t)
-      if docs_with_term
-        docs_with_term.each do |d|
-          if !documents.include?(d)
-            documents << d
+    if opts[:document_set]
+      documents = opts[:document_set]
+    else
+      documents = Set.new # ordering of documents doesn't matter right now
+      # gathering only the documents that contain at least one of those terms
+      terms.each do |t|
+        docs_with_term = corpus.documents_with(t)
+        if docs_with_term
+          docs_with_term.each do |d|
+            if !documents.include?(d)
+              documents << d
+            end
           end
         end
       end
@@ -37,11 +43,16 @@ module RetrievalLite::TfIdfRetrieval
     scores = {}
     documents.each do |document|
-      document_vector = Array.new(terms.size)
-      terms.each_with_index do |term, index|
-        document_vector[index] = tfidf_weight(corpus, document, term)
+      vector_length = tfidf_weight_length(corpus, document)
+      if vector_length == 0
+        scores[document] = 0
+      else
+        document_vector = Array.new(terms.size)
+        terms.each_with_index do |term, index|
+          document_vector[index] = tfidf_weight(corpus, document, term)
+        end
+        scores[document] = RetrievalLite::Vector.dot_product(query_vector, document_vector) / vector_length
       end
-      scores[document] = RetrievalLite::Vector.cosine_similarity(query_vector, document_vector)
     end
     # order it by score in descending order
@@ -72,13 +83,24 @@ module RetrievalLite::TfIdfRetrieval
   # @param term [String]
   # @return [Float] the normalized tfidf weight of the term in the document
   def self.normalized_tfidf_weight(corpus, document, term)
-    length_of_vector = 0
+    tfidf_weight(corpus, document, term) / tfidf_weight_length(corpus, document)
+  end
+  # Computes the length of a document vector of tf-idf weights.  This is
+  # used for normalization
+  #
+  # @param corpus [Corpus]
+  # @param document [Document]
+  # @return [Float] the length of the document vector of tf-idf weights
+  def self.tfidf_weight_length(corpus, document)
+    normalize = 0
-    corpus.documents_with(term).each do |d|
-      weight = tfidf_weight(corpus, d, term)
-      length_of_vector += weight * weight
+    document.terms.each do |t|
+      weight = tfidf_weight(corpus, document, t)
+      normalize += weight * weight
     end
-    tfidf_weight(corpus, document, term) / Math.sqrt(length_of_vector)
+    return Math.sqrt(normalize)
   end
 end

data/lib/retrieval_lite/tokenizer.rb CHANGED Viewed

@@ -1,4 +1,6 @@
+# Separates text into tokens used for IR
 module RetrievalLite::Tokenizer
+  # Punctuation that is to be ignored when parsing.  Does not contain the hyphen
   SPECIAL_SEPARATERS = ['[', ']', '\\', ';', '\'', ',', '.', '/', '!', '@', '#', '%', '&', '*', '(', ')', '_', '{', '}', ':', '"', '?', '=', '`', '~', '$', '^', '+', '|', '<', '>']
   # @param content [String] the text of the document
@@ -24,7 +26,7 @@ module RetrievalLite::Tokenizer
       end
     end
-    tokens
+    return tokens
   end
   private

data/lib/retrieval_lite/vector.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# Offers mathematical operations for vectors
 module RetrievalLite::Vector
   # @param scores1 [Array<Integer>] each term and its corresponding score in the first document
   # @param scores2 [Array<Integer>] each term and its corresponding score in the second document

data/lib/version.rb CHANGED Viewed

@@ -1,3 +1,4 @@
 module RetrievalLite
-  VERSION = "1.0.0"
+  # current version of the Ruby gem
+  VERSION = "1.1.0"
 end

data/spec/boolean_retrieval_spec.rb CHANGED Viewed

@@ -7,38 +7,105 @@ describe RetrievalLite::BooleanRetrieval do
   let (:document_replicated) do
     RetrievalLite::Document.new("lorem ipsum dolor sit amet")
   end
+  let (:document_one_term) do
+    RetrievalLite::Document.new("lorem")
+  end
   let (:document_with_duplicates) do
     RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
   end
-  let (:document_two) do
-    RetrievalLite::Document.new("Mauris ullamcorper, tortor et consequat sagittis.")
+  let (:document_strange) do
+    RetrievalLite::Document.new("foo bar")
   end
-  let (:document_three) do
-    RetrievalLite::Document.new("Pellentesque felis lectus, lacinia nec mauris non.")
+  let (:document_no_match) do
+    RetrievalLite::Document.new("no-match")
   end
-  let (:document_paragraph) do
-    RetrievalLite::Document.new("In semper enim non ullamcorper venenatis.
-      Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
-      Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
-      consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
-      vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
-      euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
-      a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
-      In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
-      )
+  let (:all_normal_documents) do
+    [document, document_replicated, document_with_duplicates, document_one_term, document_strange]
   end
   let (:all_documents) do
-    [document, document_replicated, document_with_duplicates, document_two, document_three, document_paragraph]
+    [document, document_replicated, document_with_duplicates, document_one_term, document_strange, document_no_match]
   end
   let (:corpus) do
     RetrievalLite::Corpus.new(all_documents)
   end
+  describe "#has_boolean_operators?" do
+    it "should accept any uses of AND OR NOT" do
+      RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo AND bar").should == true
+      RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo OR bar").should == true
+      RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo NOT bar").should == true
+    end
+    it "should reject any regular non-boolean queries" do
+      RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo bar").should == false
+    end
+  end
+  describe "#is_valid_expression?" do
+    it "should accept parenthesis and spaces, as well as all alphanumeric characters" do
+      RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar) OR baz").should == true
+    end
+    it "should reject when there is a close parethensis but no term after AND/OR/NOT" do
+      RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar AND)").should == false
+      RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar AND )").should == false
+    end
+    it "should accept AND/OR/NOT with any begin parenthesis after it, regardless if there's a whitespace" do
+      RetrievalLite::BooleanRetrieval.is_valid_expression?("NOT(foo AND bar)").should == true
+      RetrievalLite::BooleanRetrieval.is_valid_expression?("NOT (foo AND bar)").should == true
+    end
+    it "should accept sentences" do
+      RetrievalLite::BooleanRetrieval.is_valid_expression?("foo bar.").should == true
+    end
+  end
+  describe "#strip_query" do
+    it "should strip any commas, periods, etc nonalphanumeric characters" do
+      RetrievalLite::BooleanRetrieval.strip_query("(This is a cat.) AND (Although, something else!)").should == "(This is a cat ) AND (Although  something else )"
+    end
+    it "should strip any double, triple, etc hyphenated words" do
+      RetrievalLite::BooleanRetrieval.strip_query("This is it--hooray!").should == "This is it hooray "
+    end
+    it "should leave hyphenated words alone" do
+      RetrievalLite::BooleanRetrieval.strip_query("This is foo-bar").should == "This is foo-bar"
+    end
+    it "should remove lone hyphens" do
+      RetrievalLite::BooleanRetrieval.strip_query("This - is foo-bar").should == "This is foo-bar"
+    end
+  end
+  describe "invalid boolean" do
+    it "should error on unclosed parenthesis" do
+      expect { RetrievalLite::BooleanRetrieval.evaluate(corpus, "(lorem AND ipsum") }.to raise_error
+    end
+    it "should error on when not enough arguments are provided" do
+      expect { RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem AND ipsum OR") }.to raise_error
+    end
+  end
   describe "one-term retrieval" do
     it "should return array of all documents with that term" do
-      RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem") == [document, document_replicated, document_with_duplicates]
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem").should == [document, document_replicated, document_with_duplicates, document_one_term]
     end
     it "should ignore case" do
-      RetrievalLite::BooleanRetrieval.evaluate(corpus, "LOREM") == [document, document_replicated, document_with_duplicates]
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "LOreM").should == [document, document_replicated, document_with_duplicates, document_one_term]
+    end
+    it "should work for hyphenated words" do
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "no-match").should == [document_no_match]
+    end
+  end
+  describe "valid boolean retrieval" do
+    it "should work for simple two term AND" do
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem AND ipsum").should == [document, document_replicated, document_with_duplicates]
+    end
+    it "should work for simple two term OR" do
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem OR foo").should == all_normal_documents
+    end
+    it "should work for simple one term NOT" do
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "NOT lorem").should == [document_strange, document_no_match]
+    end
+    it "should work for more complex retrievals with parenthesis" do
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "foo OR (dolor AND sit)").should == [document, document_replicated, document_with_duplicates, document_strange]
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "(lorem OR foo) AND NOT ipsum").should == [document_one_term, document_strange]
     end
   end
 end

data/spec/document_spec.rb CHANGED Viewed

@@ -80,6 +80,12 @@ describe RetrievalLite::Document do
       end
     end
+    describe "total terms" do
+      it "should have the right number of terms" do
+        document.total_terms.should == 5
+      end
+    end
     describe "for blank document" do
       it "should not raise error on initialization" do
         expect { RetrievalLite::Document.new("") }.to_not raise_error

data/spec/retrieval_lite_spec.rb CHANGED Viewed

@@ -1,3 +1,68 @@
 require 'spec_helper'
 describe RetrievalLite do
+  include RetrievalLite
+  let (:document_one_term) do
+    new_document("lorem")
+  end
+  let (:document) do
+    new_document("lorem ipsum dolor sit amet")
+  end
+  let (:document_with_duplicates) do
+    new_document("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
+  end
+  let (:document_doubled) do
+    new_document("lorem ipsum dolor sit amet lorem ipsum dolor sit amet")
+  end
+  let (:document_both_terms) do
+    new_document("lorem ipsum")
+  end
+  let (:document_with_unique) do
+    new_document("lorem unique")
+  end
+  let (:document_no_match) do
+    new_document("no-match")
+  end
+  let (:all_documents) do
+    [document, document_with_duplicates, document_doubled, document_one_term, document_both_terms, document_with_unique, document_no_match]
+  end
+  let (:corpus) do
+    new_corpus(all_documents)
+  end
+  let (:corpus_different) do
+    new_corpus([document_one_term, document, document_with_duplicates])
+  end
+  let(:corpus_small) do
+    new_corpus([document_one_term, document, document_no_match])
+  end
+  describe "when no options are passed" do
+    it "should default to basic tf-idf" do
+      scores = evaluate_query_with_scores(corpus_different, "lorem dolor sit")
+      scores[document].should be_within(0.001).of(1.0)
+      scores[document_with_duplicates].should be_within(0.001).of(0.953)
+      scores[document_one_term].should be_within(0.001).of(0.0)
+      evaluate_query(corpus_different, "lorem dolor sit").should == [document, document_with_duplicates, document_one_term]
+    end
+  end
+  describe "when boolean operators are present" do
+    it "should first filter through boolean" do
+      evaluate_query(corpus, "lorem AND NOT dolor").should == [document_one_term, document_both_terms, document_with_unique]
+      evaluate_query(corpus, "(lorem AND unique) OR no-match").should == [document_with_unique, document_no_match]
+      evaluate_query(corpus, "lorem AND ipsum AND dolor AND sit AND amet").should == [document_doubled, document, document_with_duplicates]
+      evaluate_query(corpus, "lorem AND no-match").should == []
+    end
+  end
+  describe "with punctuation" do
+    it "should retrieve it as normal" do
+      evaluate_query(corpus, "lorem. AND NOT dolor!").should == [document_one_term, document_both_terms, document_with_unique]
+      evaluate_query(corpus, "(lorem-- AND !unique) OR no-match").should == [document_with_unique, document_no_match]
+      evaluate_query(corpus, "@lorem AND @ipsum AND @dolor AND @sit AND @amet").should == [document_doubled, document, document_with_duplicates]
+      evaluate_query(corpus, "||lorem AND no-match").should == []
+    end
+  end
 end

data/spec/spec_helper.rb CHANGED Viewed

@@ -1,3 +1,6 @@
+require 'simplecov'
+SimpleCov.start
 require "retrieval_lite"
 require "spec_helpers/file_helpers"

data/spec/tfidf_retrieval_spec.rb CHANGED Viewed

@@ -1,6 +1,9 @@
 require 'spec_helper'
 describe RetrievalLite::TfIdfRetrieval do
+  let (:document_no_match) do
+    RetrievalLite::Document.new("no-match")
+  end
   let (:document_one_term) do
     RetrievalLite::Document.new("lorem")
   end
@@ -29,6 +32,9 @@ describe RetrievalLite::TfIdfRetrieval do
   let (:corpus_different) do
     RetrievalLite::Corpus.new([document_one_term, document, document_with_duplicates])
   end
+  let(:corpus_small) do
+    RetrievalLite::Corpus.new([document_one_term, document, document_no_match])
+  end
   describe "calculating tf-idf scores" do
     describe "term that all documents have" do
@@ -55,10 +61,10 @@ describe RetrievalLite::TfIdfRetrieval do
   describe "calculating normalized tf-idf scores" do
     describe "term that a few documents have" do
       it "should have correct tf-idf" do
-        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.316)
-        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.632)
-        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.632)
-        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(0.316)
+        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.320)
+        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.163)
+        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.320)
+        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(1.0)
       end
     end
   end
@@ -84,9 +90,9 @@ describe RetrievalLite::TfIdfRetrieval do
       it "should return the correct score" do
         scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "ipsum")
         scores.size.should == 4
-        scores[document].should be_within(0.001).of(1.0)
-        scores[document_with_duplicates].should be_within(0.001).of(1.0)
-        scores[document_doubled].should be_within(0.001).of(1.0)
+        scores[document].should be_within(0.001).of(0.320)
+        scores[document_with_duplicates].should be_within(0.001).of(0.163)
+        scores[document_doubled].should be_within(0.001).of(0.320)
         scores[document_both_terms].should be_within(0.001).of(1.0)
       end
     end
@@ -122,9 +128,19 @@ describe RetrievalLite::TfIdfRetrieval do
     end
     it "should have the correct scores" do
       scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_different, "lorem dolor sit")
-      scores[document].should be_within(0.001).of(0.816)
-      scores[document_with_duplicates].should be_within(0.001).of(0.808)
+      scores[document].should be_within(0.001).of(1.0)
+      scores[document_with_duplicates].should be_within(0.001).of(0.953)
       scores[document_one_term].should be_within(0.001).of(0.0)
     end
   end
+  describe "documents with same frequency but longer lengths" do
+    it "order should favor shorter documents" do
+      RetrievalLite::TfIdfRetrieval.evaluate(corpus_small, "lorem").should == [document_one_term, document]
+    end
+    it "shorter documents should rank higher" do
+      scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_small, "lorem")
+      scores[document_one_term].should > scores[document]
+    end
+  end
 end

data/spec/vector_spec.rb CHANGED Viewed

@@ -24,4 +24,16 @@ describe RetrievalLite::Vector do
       RetrievalLite::Vector.euclidean_length([3, 4]).should == 5
     end
   end
+  describe "cosine similarity" do
+    it "should compute correctly for vectors length 1" do
+      RetrievalLite::Vector.cosine_similarity([3], [5]).should == 1
+    end
+    it "should compute correctly for longer vectors" do
+      RetrievalLite::Vector.cosine_similarity([2, 3], [4, 5]).should be_within(0.001).of(0.996)
+    end
+    it "should raise error for unequal sized arrays" do
+      expect { RetrievalLite::Vector.cosine_similarity([2, 3], [4]) }.to raise_error
+    end
+  end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: retrieval_lite
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.1.0
 platform: ruby
 authors:
 - Irvin Zhan
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-05-12 00:00:00.000000000 Z
+date: 2014-05-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec