RubyGems - retrieval_lite - Versions diffs - 1.0.0 → 1.1.0 - Mend

retrieval_lite 1.0.0 → 1.1.0

Files changed (17) hide show

checksums.yaml +4 -4
data/Gemfile +2 -0
data/lib/retrieval_lite.rb +57 -1
data/lib/retrieval_lite/boolean_retrieval.rb +55 -10
data/lib/retrieval_lite/corpus.rb +5 -0
data/lib/retrieval_lite/document.rb +9 -8
data/lib/retrieval_lite/tfidf_retrieval.rb +42 -20
data/lib/retrieval_lite/tokenizer.rb +3 -1
data/lib/retrieval_lite/vector.rb +1 -0
data/lib/version.rb +2 -1
data/spec/boolean_retrieval_spec.rb +84 -17
data/spec/document_spec.rb +6 -0
data/spec/retrieval_lite_spec.rb +65 -0
data/spec/spec_helper.rb +3 -0
data/spec/tfidf_retrieval_spec.rb +25 -9
data/spec/vector_spec.rb +12 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8e080b5b0c600b330f5d07827b5c2e576132306b
-  data.tar.gz: f0fa5049a6f5e9df0a1fbc761107d28dc1efe857
+  metadata.gz: 4e4e7dd3aca0e8ccc8ff59095e5e223b6f8a1f04
+  data.tar.gz: 84517e16668414f490e3ce284fccf53f55ebdd20
 SHA512:
-  metadata.gz: b1827e3f90edbc3ee841fbe25526c748ac82ad1aae6f42455605e3199dffe7c5cd309c2684741d41f695c88285152c1bf83558beb87668ec9eebb4973f7ff0a3
-  data.tar.gz: 5d2009f175da5ce2cecb277061e9592093a14435d5012094c461afb1ac626b1d72705edf560cece8f53d4a02623f7ebe2fd3adc0f0555b0934cd5b878f6c12b2
+  metadata.gz: 67e4537932eea13f79d4009ff9c48dc2a9f7e05ca3ca4c6b4e31564c057d9c87e228de0cca0713649a6d5607d712b19f9febfc31c4fffdc087bbe04a14da348a
+  data.tar.gz: 65676b034f4cbbf7757c5b58b99ce07a453154d0230e063b6ce2a76f5fa11b200ec927a97c864872a9202cbcf48fac8b71543a9d62b0c26ea1372a893c7afc87

data/Gemfile CHANGED Viewed

@@ -2,3 +2,5 @@ source 'https://rubygems.org'
 # gem's dependencies specified in retrieval_lite.gemspec
 gemspec
+gem 'simplecov', :require => false, :group => :test

data/lib/retrieval_lite.rb CHANGED Viewed

@@ -1,7 +1,63 @@
 require "version"
+require "set"
+# Offers simple document retrieval from a corpus with a query
 module RetrievalLite
+  # Queries a corpus first by filtering it using a boolean evaluator and then
+  # using the tf-idf ranking algorithm and cosine similarity.
+  # Returns documents ordered by tf-idf score.
+  #
+  # @param corpus [Corpus] the collection of documents
+  # @param query [String] the boolean query to be evaluated
+  # @option opts [Boolean] :no_bool prevent the boolean filter
+  # @return [Array<Document>] ordered array of documents that satisfy the query
+  def evaluate_query(corpus, query, opts = {})
+    evaluate_query_with_scores(corpus, query, opts).keys
+  end
+  # Queries a corpus first by filtering it using a boolean evaluator and then
+  # using the tf-idf ranking algorithm and cosine similarity.
+  # Returns Hash of documents to their respective TF-IDF scores
+  # @see evaluate_query
+  #
+  # @param corpus [Corpus] the collection of documents
+  # @param query [String] the boolean query to be evaluated
+  # @option opts [Boolean] :no_bool prevent the boolean filter
+  # @return [Hash<Document, Numeric>] each document that satisfies the query mapped to its TF-IDF score, in descending score order
+  def evaluate_query_with_scores(corpus, query, opts = {})
+    evaluator_options = {}
+    # evaluate like normal if it is not a boolean expression
+    if opts[:no_bool] || !RetrievalLite::BooleanRetrieval.has_boolean_operators?(query)
+      RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, query)
+    else
+      documents = RetrievalLite::BooleanRetrieval.evaluate(corpus, query)
+      RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, query, { document_set: documents })
+    end
+  end
+  # Creates a new Retrieval Lite document.  Upon initialization, the content
+  # is parsed into individual tokens, and its term frequencies are recorded.
+  #
+  # @param content [String] the text of the document
+  # @param opts [Hash] optional arguments to initializer
+  # @option opts [String] :id the id of the document.  Defaults to object_id assigned by ruby
+  # @return [Document] a new document containing the input text
+  def new_document(content, opts = {})
+    RetrievalLite::Document.new(content, opts)
+  end
+  # Creates a new Retrieval Lite corpus, a collection of documents.  Corpuses
+  # do not modify nor own the documents in them, meaning documents must
+  # be created first before adding them to the corpus.
+  #
+  # @param documents [Array<Document>] the documents of the corpus
+  # @param opts [Hash] optional arguments to initializer
+  # @option opts [Array<String>] :stop_words the words to ignore when creating tokens
+  # @return [Corpus] either a new empty corpus or one with those documents
+  def new_corpus(documents = [], opts = {})
+    RetrievalLite::Corpus.new(documents, opts)
+  end
 end
 require 'retrieval_lite/document'

data/lib/retrieval_lite/boolean_retrieval.rb CHANGED Viewed

@@ -1,21 +1,66 @@
+# Gathers documents that satisfy boolean expression
 module RetrievalLite::BooleanRetrieval
-  # Queries a corpus using a boolean expression with the standard operators,
-  # AND, OR, NOT.  Only returns documents that satisfy the query, and does
-  # not rank the documents in any way.
+  # Gathers up all documents of a corpus that satisfy a boolean expression
+  # with the standard operators: AND, OR, NOT.  Does not order the documents in
+  # any particular way.  Assumes that all boolean operators are separated by
+  # white space on either side.
   #
   # @param corpus [Corpus] the collection of documents
   # @param query [String] the boolean query to be evaluated
   # @return [Array<Document>] unordered array of documents that satisfy the query
   def self.evaluate(corpus, query)
-    if !is_valid?(query)
-      raise "Boolean expression is not valid." # TODO better validation message?
+    if !is_valid_expression?(query)
+      raise "Each boolean operator (AND, OR, NOT) must operate on two terms."
     end
-    corpus.documents_with(query)
-  end
+    # must strip all non alphanumeric characters
+    query = strip_query(query)
+    # must have spaces in front and back for next line
+    query = " " + query + " "
+    # replace the boolean keywords (AND, OR, NOT) with the corresponding Ruby operators (&&, ||, !)
+    query = query.gsub("AND", "\&\&").gsub("OR", "\|\|").gsub("NOT", "!")
+    # replace all terms with corresponding functions
+    query.gsub!(/[a-zA-Z0-9]+(-[a-zA-Z0-9]+)?/) do |q|
+       " document.contains?(\"" + q.downcase + "\") "
+    end
-  private
-    def self.is_valid?(query)
-      true
+    output_documents = []
+    corpus.documents.each do |document|
+      begin
+        if eval(query)
+          output_documents << document
+        end
+      rescue
+        raise "The boolean expression is not valid.  Please check all parethensis and operators."
+      end
     end
+    return output_documents
+  end
+  # @param query [String] the boolean query to be evaluated
+  # @return [Boolean] whether query contains any boolean operators
+  def self.has_boolean_operators?(query)
+    /AND|OR|NOT/ === query
+  end
+  # @note all other invalid expressions should be caught later on
+  # @param query [String] the boolean query to be evaluated
+  # @return [Boolean] false if an operator (AND, OR, NOT) is directly followed by a closing parenthesis, true otherwise
+  def self.is_valid_expression?(query)
+    !(/(AND|OR|NOT)\s*\)/ === query)
+  end
+  # @param query [String] the boolean query to be evaluated
+  # @return [String] the query with every character other than alphanumerics, whitespace, parentheses and hyphens replaced by a space, and with stray hyphens removed
+  def self.strip_query(query)
+    # remove non-alphanumeric
+    query = query.gsub(/[^a-zA-Z0-9\s\(\)\-]/, " ")
+    # getting rid of stray hyphens
+    query = query.gsub(/\-\-+/, " ").gsub(/\s+\-\s+/, " ")
+  end
 end

data/lib/retrieval_lite/corpus.rb CHANGED Viewed

@@ -1,9 +1,14 @@
+# A collection of documents
 class RetrievalLite::Corpus
   # the documents within the corpus
   attr_reader :documents
   # hash of a term to the array of documents that contain the particular term
   attr_reader :term_occurrences
+  # Creates a new Retrieval Lite corpus, a collection of documents.  Corpuses
+  # do not modify nor own the documents in them, meaning documents must
+  # be created first before adding them to the corpus.
+  #
   # @param documents [Array<Document>] the documents of the corpus
   # @param opts [Hash] optional arguments to initializer
   # @option opts [Array<String>] :stop_words the words to ignore when creating tokens

data/lib/retrieval_lite/document.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# Representation of document using content as a string and term frequencies as a hash
 class RetrievalLite::Document
   # the text of the document
   attr_reader :content
@@ -6,7 +7,8 @@ class RetrievalLite::Document
   # the id of the document
   attr_reader :id
-  # splits the text of the document into an array of tokens
+  # Creates a new Retrieval Lite document.  Upon initialization, the content
+  # is parsed into individual tokens, and its term frequencies are recorded.
   #
   # @param content [String] the text of the document
   # @param opts [Hash] optional arguments to initializer
@@ -17,13 +19,6 @@ class RetrievalLite::Document
     @term_frequencies = RetrievalLite::Tokenizer.parse_content(content)
   end
-  # for debugging
-  def print_tokens
-    @term_frequencies.each do |key, value|
-      puts "#{key}: #{value}"
-    end
-  end
   # @return [Integer] the total number of unique terms in the document
   def term_count
     @term_frequencies.size
@@ -44,6 +39,12 @@ class RetrievalLite::Document
     end
   end
+  # @param term [String]
+  # @return [Boolean] whether a term appears in the document
+  def contains?(term)
+    @term_frequencies.has_key?(term)
+  end
   # @return [Integer] the total number of terms (not unique) in the document
   def total_terms
     count = 0

data/lib/retrieval_lite/tfidf_retrieval.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# Scores queries using TF-IDF
 # @see http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf
 module RetrievalLite::TfIdfRetrieval
   # Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
@@ -6,8 +7,8 @@ module RetrievalLite::TfIdfRetrieval
   # @param corpus [Corpus] the collection of documents
   # @param query [String] the boolean query to be evaluated
   # @return [Array<Document>] ordered array of documents that satisfy the query
-  def self.evaluate(corpus, query)
-    evaluate_with_scores(corpus, query).keys
+  def self.evaluate(corpus, query, opts = {})
+    evaluate_with_scores(corpus, query, opts).keys
   end
   # Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
@@ -16,20 +17,25 @@ module RetrievalLite::TfIdfRetrieval
   #
   # @param corpus [Corpus] the collection of documents
   # @param query [String] the boolean query to be evaluated
+  # @option opts [Array<Document>] :document_set limiting the documents to search in the corpus to only these documents
   # @return [Hash<Document, Integer>] ordered array of documents that satisfy the query
-  def self.evaluate_with_scores(corpus, query)
+  def self.evaluate_with_scores(corpus, query, opts = {})
     query_document = RetrievalLite::Document.new(query)
     terms = query_document.term_frequencies.keys
     query_vector = query_document.term_frequencies.values # should be in same order as keys
-    documents = Set.new # ordering of documents doesn't matter right now
-    # gathering only the documents that contain at least one of those terms
-    terms.each do |t|
-      docs_with_term = corpus.documents_with(t)
-      if docs_with_term
-        docs_with_term.each do |d|
-          if !documents.include?(d)
-            documents << d
+    if opts[:document_set]
+      documents = opts[:document_set]
+    else
+      documents = Set.new # ordering of documents doesn't matter right now
+      # gathering only the documents that contain at least one of those terms
+      terms.each do |t|
+        docs_with_term = corpus.documents_with(t)
+        if docs_with_term
+          docs_with_term.each do |d|
+            if !documents.include?(d)
+              documents << d
+            end
           end
         end
       end
@@ -37,11 +43,16 @@ module RetrievalLite::TfIdfRetrieval
     scores = {}
     documents.each do |document|
-      document_vector = Array.new(terms.size)
-      terms.each_with_index do |term, index|
-        document_vector[index] = tfidf_weight(corpus, document, term)
+      vector_length = tfidf_weight_length(corpus, document)
+      if vector_length == 0
+        scores[document] = 0
+      else
+        document_vector = Array.new(terms.size)
+        terms.each_with_index do |term, index|
+          document_vector[index] = tfidf_weight(corpus, document, term)
+        end
+        scores[document] = RetrievalLite::Vector.dot_product(query_vector, document_vector) / vector_length
       end
-      scores[document] = RetrievalLite::Vector.cosine_similarity(query_vector, document_vector)
     end
     # order it by score in descending order
@@ -72,13 +83,24 @@ module RetrievalLite::TfIdfRetrieval
   # @param term [String]
   # @return [Float] the normalized tfidf weight of the term in the document
   def self.normalized_tfidf_weight(corpus, document, term)
-    length_of_vector = 0
+    tfidf_weight(corpus, document, term) / tfidf_weight_length(corpus, document)
+  end
+  # Computes the length of a document vector of tf-idf weights.  This is
+  # used for normalization
+  #
+  # @param corpus [Corpus]
+  # @param document [Document]
+  # @return [Float] the length of the document vector of tf-idf weights
+  def self.tfidf_weight_length(corpus, document)
+    normalize = 0
-    corpus.documents_with(term).each do |d|
-      weight = tfidf_weight(corpus, d, term)
-      length_of_vector += weight * weight
+    document.terms.each do |t|
+      weight = tfidf_weight(corpus, document, t)
+      normalize += weight * weight
     end
-    tfidf_weight(corpus, document, term) / Math.sqrt(length_of_vector)
+    return Math.sqrt(normalize)
   end
 end

data/lib/retrieval_lite/tokenizer.rb CHANGED Viewed

@@ -1,4 +1,6 @@
+# Separates text into tokens used for IR
 module RetrievalLite::Tokenizer
+  # Punctuation that is to be ignored when parsing.  Does not contain the hyphen
   SPECIAL_SEPARATERS = ['[', ']', '\\', ';', '\'', ',', '.', '/', '!', '@', '#', '%', '&', '*', '(', ')', '_', '{', '}', ':', '"', '?', '=', '`', '~', '$', '^', '+', '|', '<', '>']
   # @param content [String] the text of the document
@@ -24,7 +26,7 @@ module RetrievalLite::Tokenizer
       end
     end
-    tokens
+    return tokens
   end
   private

data/lib/retrieval_lite/vector.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# Offers mathematical operations for vectors
 module RetrievalLite::Vector
   # @param scores1 [Array<Integer>] each term and its corresponding score in the first document
   # @param scores2 [Array<Integer>] each term and its corresponding score in the second document

data/lib/version.rb CHANGED Viewed

@@ -1,3 +1,4 @@
 module RetrievalLite
-  VERSION = "1.0.0"
+  # current version of the Ruby gem
+  VERSION = "1.1.0"
 end

data/spec/boolean_retrieval_spec.rb CHANGED Viewed

@@ -7,38 +7,105 @@ describe RetrievalLite::BooleanRetrieval do
   let (:document_replicated) do
     RetrievalLite::Document.new("lorem ipsum dolor sit amet")
   end
+  let (:document_one_term) do
+    RetrievalLite::Document.new("lorem")
+  end
   let (:document_with_duplicates) do
     RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
   end
-  let (:document_two) do
-    RetrievalLite::Document.new("Mauris ullamcorper, tortor et consequat sagittis.")
+  let (:document_strange) do
+    RetrievalLite::Document.new("foo bar")
   end
-  let (:document_three) do
-    RetrievalLite::Document.new("Pellentesque felis lectus, lacinia nec mauris non.")
+  let (:document_no_match) do
+    RetrievalLite::Document.new("no-match")
   end
-  let (:document_paragraph) do
-    RetrievalLite::Document.new("In semper enim non ullamcorper venenatis.
-      Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
-      Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
-      consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
-      vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
-      euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
-      a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
-      In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
-      )
+  let (:all_normal_documents) do
+    [document, document_replicated, document_with_duplicates, document_one_term, document_strange]
   end
   let (:all_documents) do
-    [document, document_replicated, document_with_duplicates, document_two, document_three, document_paragraph]
+    [document, document_replicated, document_with_duplicates, document_one_term, document_strange, document_no_match]
   end
   let (:corpus) do
     RetrievalLite::Corpus.new(all_documents)
   end
+  describe "#has_boolean_operators?" do
+    it "should accept any uses of AND OR NOT" do
+      RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo AND bar").should == true
+      RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo OR bar").should == true
+      RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo NOT bar").should == true
+    end
+    it "should reject any regular non-boolean queries" do
+      RetrievalLite::BooleanRetrieval.has_boolean_operators?("foo bar").should == false
+    end
+  end
+  describe "#is_valid_expression?" do
+    it "should accept parenthesis and spaces, as well as all alphanumeric characters" do
+      RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar) OR baz").should == true
+    end
+    it "should reject when there is a close parethensis but no term after AND/OR/NOT" do
+      RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar AND)").should == false
+      RetrievalLite::BooleanRetrieval.is_valid_expression?("(foo AND bar AND )").should == false
+    end
+    it "should accept AND/OR/NOT with any begin parenthesis after it, regardless if there's a whitespace" do
+      RetrievalLite::BooleanRetrieval.is_valid_expression?("NOT(foo AND bar)").should == true
+      RetrievalLite::BooleanRetrieval.is_valid_expression?("NOT (foo AND bar)").should == true
+    end
+    it "should accept sentences" do
+      RetrievalLite::BooleanRetrieval.is_valid_expression?("foo bar.").should == true
+    end
+  end
+  describe "#strip_query" do
+    it "should strip any commas, periods, etc nonalphanumeric characters" do
+      RetrievalLite::BooleanRetrieval.strip_query("(This is a cat.) AND (Although, something else!)").should == "(This is a cat ) AND (Although  something else )"
+    end
+    it "should strip any double, triple, etc hyphenated words" do
+      RetrievalLite::BooleanRetrieval.strip_query("This is it--hooray!").should == "This is it hooray "
+    end
+    it "should leave hyphenated words alone" do
+      RetrievalLite::BooleanRetrieval.strip_query("This is foo-bar").should == "This is foo-bar"
+    end
+    it "should remove lone hyphens" do
+      RetrievalLite::BooleanRetrieval.strip_query("This - is foo-bar").should == "This is foo-bar"
+    end
+  end
+  describe "invalid boolean" do
+    it "should error on unclosed parenthesis" do
+      expect { RetrievalLite::BooleanRetrieval.evaluate(corpus, "(lorem AND ipsum") }.to raise_error
+    end
+    it "should error on when not enough arguments are provided" do
+      expect { RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem AND ipsum OR") }.to raise_error
+    end
+  end
   describe "one-term retrieval" do
     it "should return array of all documents with that term" do
-      RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem") == [document, document_replicated, document_with_duplicates]
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem").should == [document, document_replicated, document_with_duplicates, document_one_term]
     end
     it "should ignore case" do
-      RetrievalLite::BooleanRetrieval.evaluate(corpus, "LOREM") == [document, document_replicated, document_with_duplicates]
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "LOreM").should == [document, document_replicated, document_with_duplicates, document_one_term]
+    end
+    it "should work for hyphenated words" do
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "no-match").should == [document_no_match]
+    end
+  end
+  describe "valid boolean retrieval" do
+    it "should work for simple two term AND" do
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem AND ipsum").should == [document, document_replicated, document_with_duplicates]
+    end
+    it "should work for simple two term OR" do
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem OR foo").should == all_normal_documents
+    end
+    it "should work for simple one term NOT" do
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "NOT lorem").should == [document_strange, document_no_match]
+    end
+    it "should work for more complex retrievals with parenthesis" do
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "foo OR (dolor AND sit)").should == [document, document_replicated, document_with_duplicates, document_strange]
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "(lorem OR foo) AND NOT ipsum").should == [document_one_term, document_strange]
     end
   end
 end

data/spec/document_spec.rb CHANGED Viewed

@@ -80,6 +80,12 @@ describe RetrievalLite::Document do
       end
     end
+    describe "total terms" do
+      it "should have the right number of terms" do
+        document.total_terms.should == 5
+      end
+    end
     describe "for blank document" do
       it "should not raise error on initialization" do
         expect { RetrievalLite::Document.new("") }.to_not raise_error

data/spec/retrieval_lite_spec.rb CHANGED Viewed

@@ -1,3 +1,68 @@
 require 'spec_helper'
 describe RetrievalLite do
+  include RetrievalLite
+  let (:document_one_term) do
+    new_document("lorem")
+  end
+  let (:document) do
+    new_document("lorem ipsum dolor sit amet")
+  end
+  let (:document_with_duplicates) do
+    new_document("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
+  end
+  let (:document_doubled) do
+    new_document("lorem ipsum dolor sit amet lorem ipsum dolor sit amet")
+  end
+  let (:document_both_terms) do
+    new_document("lorem ipsum")
+  end
+  let (:document_with_unique) do
+    new_document("lorem unique")
+  end
+  let (:document_no_match) do
+    new_document("no-match")
+  end
+  let (:all_documents) do
+    [document, document_with_duplicates, document_doubled, document_one_term, document_both_terms, document_with_unique, document_no_match]
+  end
+  let (:corpus) do
+    new_corpus(all_documents)
+  end
+  let (:corpus_different) do
+    new_corpus([document_one_term, document, document_with_duplicates])
+  end
+  let(:corpus_small) do
+    new_corpus([document_one_term, document, document_no_match])
+  end
+  describe "when no options are passed" do
+    it "should default to basic tf-idf" do
+      scores = evaluate_query_with_scores(corpus_different, "lorem dolor sit")
+      scores[document].should be_within(0.001).of(1.0)
+      scores[document_with_duplicates].should be_within(0.001).of(0.953)
+      scores[document_one_term].should be_within(0.001).of(0.0)
+      evaluate_query(corpus_different, "lorem dolor sit").should == [document, document_with_duplicates, document_one_term]
+    end
+  end
+  describe "when boolean operators are present" do
+    it "should first filter through boolean" do
+      evaluate_query(corpus, "lorem AND NOT dolor").should == [document_one_term, document_both_terms, document_with_unique]
+      evaluate_query(corpus, "(lorem AND unique) OR no-match").should == [document_with_unique, document_no_match]
+      evaluate_query(corpus, "lorem AND ipsum AND dolor AND sit AND amet").should == [document_doubled, document, document_with_duplicates]
+      evaluate_query(corpus, "lorem AND no-match").should == []
+    end
+  end
+  describe "with punctuation" do
+    it "should retrieve it as normal" do
+      evaluate_query(corpus, "lorem. AND NOT dolor!").should == [document_one_term, document_both_terms, document_with_unique]
+      evaluate_query(corpus, "(lorem-- AND !unique) OR no-match").should == [document_with_unique, document_no_match]
+      evaluate_query(corpus, "@lorem AND @ipsum AND @dolor AND @sit AND @amet").should == [document_doubled, document, document_with_duplicates]
+      evaluate_query(corpus, "||lorem AND no-match").should == []
+    end
+  end
 end

data/spec/spec_helper.rb CHANGED Viewed

@@ -1,3 +1,6 @@
+require 'simplecov'
+SimpleCov.start
 require "retrieval_lite"
 require "spec_helpers/file_helpers"

data/spec/tfidf_retrieval_spec.rb CHANGED Viewed

@@ -1,6 +1,9 @@
 require 'spec_helper'
 describe RetrievalLite::TfIdfRetrieval do
+  let (:document_no_match) do
+    RetrievalLite::Document.new("no-match")
+  end
   let (:document_one_term) do
     RetrievalLite::Document.new("lorem")
   end
@@ -29,6 +32,9 @@ describe RetrievalLite::TfIdfRetrieval do
   let (:corpus_different) do
     RetrievalLite::Corpus.new([document_one_term, document, document_with_duplicates])
   end
+  let(:corpus_small) do
+    RetrievalLite::Corpus.new([document_one_term, document, document_no_match])
+  end
   describe "calculating tf-idf scores" do
     describe "term that all documents have" do
@@ -55,10 +61,10 @@ describe RetrievalLite::TfIdfRetrieval do
   describe "calculating normalized tf-idf scores" do
     describe "term that a few documents have" do
       it "should have correct tf-idf" do
-        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.316)
-        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.632)
-        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.632)
-        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(0.316)
+        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.320)
+        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.163)
+        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.320)
+        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(1.0)
       end
     end
   end
@@ -84,9 +90,9 @@ describe RetrievalLite::TfIdfRetrieval do
       it "should return the correct score" do
         scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "ipsum")
         scores.size.should == 4
-        scores[document].should be_within(0.001).of(1.0)
-        scores[document_with_duplicates].should be_within(0.001).of(1.0)
-        scores[document_doubled].should be_within(0.001).of(1.0)
+        scores[document].should be_within(0.001).of(0.320)
+        scores[document_with_duplicates].should be_within(0.001).of(0.163)
+        scores[document_doubled].should be_within(0.001).of(0.320)
         scores[document_both_terms].should be_within(0.001).of(1.0)
       end
     end
@@ -122,9 +128,19 @@ describe RetrievalLite::TfIdfRetrieval do
     end
     it "should have the correct scores" do
       scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_different, "lorem dolor sit")
-      scores[document].should be_within(0.001).of(0.816)
-      scores[document_with_duplicates].should be_within(0.001).of(0.808)
+      scores[document].should be_within(0.001).of(1.0)
+      scores[document_with_duplicates].should be_within(0.001).of(0.953)
       scores[document_one_term].should be_within(0.001).of(0.0)
     end
   end
+  describe "documents with same frequency but longer lengths" do
+    it "order should favor shorter documents" do
+      RetrievalLite::TfIdfRetrieval.evaluate(corpus_small, "lorem").should == [document_one_term, document]
+    end
+    it "shorter documents should rank higher" do
+      scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_small, "lorem")
+      scores[document_one_term].should > scores[document]
+    end
+  end
 end

data/spec/vector_spec.rb CHANGED Viewed

@@ -24,4 +24,16 @@ describe RetrievalLite::Vector do
       RetrievalLite::Vector.euclidean_length([3, 4]).should == 5
     end
   end
+  describe "cosine similarity" do
+    it "should compute correctly for vectors length 1" do
+      RetrievalLite::Vector.cosine_similarity([3], [5]).should == 1
+    end
+    it "should compute correctly for longer vectors" do
+      RetrievalLite::Vector.cosine_similarity([2, 3], [4, 5]).should be_within(0.001).of(0.996)
+    end
+    it "should raise error for unequal sized arrays" do
+      expect { RetrievalLite::Vector.cosine_similarity([2, 3], [4]) }.to raise_error
+    end
+  end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: retrieval_lite
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.1.0
 platform: ruby
 authors:
 - Irvin Zhan
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-05-12 00:00:00.000000000 Z
+date: 2014-05-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec