RubyGems - retrieval_lite - Versions diffs - 1.0.0 - Mend

retrieval_lite 1.0.0

Files changed (27) hide show

checksums.yaml +7 -0
data/.gitignore +18 -0
data/.rspec +2 -0
data/Gemfile +4 -0
data/LICENSE +21 -0
data/LICENSE.txt +22 -0
data/README.md +29 -0
data/Rakefile +8 -0
data/lib/retrieval_lite/boolean_retrieval.rb +21 -0
data/lib/retrieval_lite/corpus.rb +64 -0
data/lib/retrieval_lite/document.rb +55 -0
data/lib/retrieval_lite/tfidf_retrieval.rb +84 -0
data/lib/retrieval_lite/tokenizer.rb +44 -0
data/lib/retrieval_lite/vector.rb +39 -0
data/lib/retrieval_lite.rb +12 -0
data/lib/version.rb +3 -0
data/retrieval_lite.gemspec +24 -0
data/spec/boolean_retrieval_spec.rb +44 -0
data/spec/corpus_spec.rb +132 -0
data/spec/document_spec.rb +96 -0
data/spec/retrieval_lite_spec.rb +3 -0
data/spec/spec_helper.rb +16 -0
data/spec/spec_helpers/file_helpers.rb +9 -0
data/spec/tfidf_retrieval_spec.rb +130 -0
data/spec/tokenizer_spec.rb +114 -0
data/spec/vector_spec.rb +27 -0
metadata +122 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 8e080b5b0c600b330f5d07827b5c2e576132306b
+  data.tar.gz: f0fa5049a6f5e9df0a1fbc761107d28dc1efe857
+SHA512:
+  metadata.gz: b1827e3f90edbc3ee841fbe25526c748ac82ad1aae6f42455605e3199dffe7c5cd309c2684741d41f695c88285152c1bf83558beb87668ec9eebb4973f7ff0a3
+  data.tar.gz: 5d2009f175da5ce2cecb277061e9592093a14435d5012094c461afb1ac626b1d72705edf560cece8f53d4a02623f7ebe2fd3adc0f0555b0934cd5b878f6c12b2

data/.gitignore ADDED Viewed

@@ -0,0 +1,18 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+.DS_STORE

data/.rspec ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ --color
2	+ --format progress

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# The gem's dependencies are declared in retrieval_lite.gemspec
+gemspec

data/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+Copyright (c) 2014 Irvin Zhan
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2014 Irvin Zhan
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,29 @@
+# Retrieval Lite Gem
+Lightweight gem for document retrieval using tf-idf based algorithms for Ruby
+## Installation
+Add this line to your application's Gemfile:
+    gem 'retrieval_lite'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install retrieval_lite
+## Usage
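+Build documents, collect them in a corpus, then query the corpus:
+    require 'retrieval_lite'
+    documents = [
+      RetrievalLite::Document.new("lorem ipsum dolor sit amet"),
+      RetrievalLite::Document.new("lorem unique", id: "second")
+    ]
+    corpus = RetrievalLite::Corpus.new(documents, stop_words: ["sit"])
+    # documents ordered by tf-idf cosine similarity, best match first
+    RetrievalLite::TfIdfRetrieval.evaluate(corpus, "unique ipsum")
+    # same ordering, as a hash of document => score
+    RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "unique ipsum")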
+## Contributing
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request

data/Rakefile ADDED Viewed

@@ -0,0 +1,8 @@
+require "bundler/gem_tasks"
+# Runs the specs under spec/ by shelling out to the `rspec` executable.
+desc "run tests"
+task :spec do
+  sh "rspec spec"
+end
+# A bare `rake` runs the spec task.
+task :default => :spec

data/lib/retrieval_lite/boolean_retrieval.rb ADDED Viewed

@@ -0,0 +1,21 @@
+module RetrievalLite::BooleanRetrieval
+  # Queries a corpus using a boolean expression with the standard operators,
+  # AND, OR, NOT.  Only returns documents that satisfy the query, and does
+  # not rank the documents in any way.
+  #
+  # @param corpus [Corpus] the collection of documents
+  # @param query [String] the boolean query to be evaluated
+  # @return [Array<Document>] unordered array of documents that satisfy the query
+  def self.evaluate(corpus, query)
+    if !is_valid?(query)
+      raise "Boolean expression is not valid." # TODO better validation message?
+    end
+    corpus.documents_with(query)
+  end
+  private
+    def self.is_valid?(query)
+      true
+    end
+end

data/lib/retrieval_lite/corpus.rb ADDED Viewed

@@ -0,0 +1,64 @@
+class RetrievalLite::Corpus
+  # the documents within the corpus
+  attr_reader :documents
+  # hash of a term to the array of documents that contain the particular term
+  attr_reader :term_occurrences
+  # @param documents [Array<Document>] the documents of the corpus
+  # @param opts [Hash] optional arguments to initializer
+  # @option opts [Array<String>] :stop_words the words to ignore when creating tokens
+  def initialize(documents = [], opts = {})
+    @documents = documents
+    @term_occurrences = {}
+    @stop_words = opts[:stop_words] || []
+    # stop_words should be lowercased since tokens are in lowercase
+    @stop_words.each do |w|
+      w.downcase!
+    end
+    @stop_words = Set.new @stop_words # faster .include?
+    documents.each do |d|
+      update_term_occurrences(d)
+    end
+  end
+  # Adds a document to the corpus
+  # @param document [Document] the document to be added
+  def add(document)
+    @documents << document
+    update_term_occurrences(document)
+  end
+  # @return [Integer] the number documents in the corpus
+  def size
+    documents.size
+  end
+  # @param term [String] the term to retrieve the documents for
+  # @return [Array<Document>] the array of documents containing the particular term or nil if no such occurence
+  def documents_with(term)
+    term_occurrences[term]
+  end
+  # @param term [String] the query term for the documents
+  # @return [Integer] the number of documents that contain the particular term
+  def document_frequency(term)
+    if term_occurrences[term]
+      return term_occurrences[term].size
+    else
+      return 0
+    end
+  end
+  private
+    # adds each term of the document to the term_occurence hash
+    def update_term_occurrences(document)
+      document.terms.each do |term|
+        if @term_occurrences.has_key?(term)
+          @term_occurrences[term] << document
+        elsif !@stop_words.include?(term)
+          @term_occurrences[term] = [document]
+        end
+      end
+    end
+end

data/lib/retrieval_lite/document.rb ADDED Viewed

@@ -0,0 +1,55 @@
+class RetrievalLite::Document
+  # the text of the document
+  attr_reader :content
+  # a Hash<String, Integer> of all terms of the documents to the frequency of each term
+  attr_reader :term_frequencies
+  # the id of the document
+  attr_reader :id
+  # splits the text of the document into an array of tokens
+  #
+  # @param content [String] the text of the document
+  # @param opts [Hash] optional arguments to initializer
+  # @option opts [String] :id the id of the document.  Defaults to object_id assigned by ruby
+  def initialize(content, opts = {})
+    @content = content
+    @id = opts[:id] || object_id
+    @term_frequencies = RetrievalLite::Tokenizer.parse_content(content)
+  end
+  # for debugging
+  def print_tokens
+    @term_frequencies.each do |key, value|
+      puts "#{key}: #{value}"
+    end
+  end
+  # @return [Integer] the total number of unique terms in the document
+  def term_count
+    @term_frequencies.size
+  end
+  # @return [Array<String>] the unique terms of the document
+  def terms
+    @term_frequencies.keys
+  end
+  # @param term [String]
+  # @return [Integer] the number of times a term appears in the document
+  def frequency_of(term)
+    if @term_frequencies.has_key?(term)
+      return @term_frequencies[term]
+    else
+      return 0
+    end
+  end
+  # @return [Integer] the total number of terms (not unique) in the document
+  def total_terms
+    count = 0
+    @term_frequencies.each do |key, value|
+      count += value
+    end
+    return count
+  end
+end

data/lib/retrieval_lite/tfidf_retrieval.rb ADDED Viewed

@@ -0,0 +1,84 @@
+# @see http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf
+module RetrievalLite::TfIdfRetrieval
+  # Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
+  # Returns documents ordered by tf-idf score.
+  #
+  # @param corpus [Corpus] the collection of documents
+  # @param query [String] the boolean query to be evaluated
+  # @return [Array<Document>] ordered array of documents that satisfy the query
+  def self.evaluate(corpus, query)
+    evaluate_with_scores(corpus, query).keys
+  end
+  # Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
+  # Same as #evaluate but returns a hash whose keys are documents and values
+  # are the tf-idf score.
+  #
+  # @param corpus [Corpus] the collection of documents
+  # @param query [String] the boolean query to be evaluated
+  # @return [Hash<Document, Integer>] ordered array of documents that satisfy the query
+  def self.evaluate_with_scores(corpus, query)
+    query_document = RetrievalLite::Document.new(query)
+    terms = query_document.term_frequencies.keys
+    query_vector = query_document.term_frequencies.values # should be in same order as keys
+    documents = Set.new # ordering of documents doesn't matter right now
+    # gathering only the documents that contain at least one of those terms
+    terms.each do |t|
+      docs_with_term = corpus.documents_with(t)
+      if docs_with_term
+        docs_with_term.each do |d|
+          if !documents.include?(d)
+            documents << d
+          end
+        end
+      end
+    end
+    scores = {}
+    documents.each do |document|
+      document_vector = Array.new(terms.size)
+      terms.each_with_index do |term, index|
+        document_vector[index] = tfidf_weight(corpus, document, term)
+      end
+      scores[document] = RetrievalLite::Vector.cosine_similarity(query_vector, document_vector)
+    end
+    # order it by score in descending order
+    return Hash[scores.sort_by{|key, value| value}.reverse]
+  end
+  # Ranks a document in corpus using the tf-idf scoring.
+  #
+  # @note tf-idf is slightly modified.  n_j (# of docs containing term j) is replaced with n_j + 1 to avoid divide by zero
+  #
+  # @param corpus [Corpus]
+  # @param document [Document]
+  # @param term [String]
+  # @return [Float] the tfidf weight of the term in the document
+  def self.tfidf_weight(corpus, document, term)
+    if corpus.document_frequency(term) == 0
+      return 0
+    else
+      return document.frequency_of(term) * Math.log(1.0 * corpus.size/(corpus.document_frequency(term)))
+    end
+  end
+  # Ranks a document in corpus using the normalized tf-idf scoring.
+  # @see #tfidf_weight
+  #
+  # @param corpus [Corpus]
+  # @param document [Document]
+  # @param term [String]
+  # @return [Float] the normalized tfidf weight of the term in the document
+  def self.normalized_tfidf_weight(corpus, document, term)
+    length_of_vector = 0
+    corpus.documents_with(term).each do |d|
+      weight = tfidf_weight(corpus, d, term)
+      length_of_vector += weight * weight
+    end
+    tfidf_weight(corpus, document, term) / Math.sqrt(length_of_vector)
+  end
+end

data/lib/retrieval_lite/tokenizer.rb ADDED Viewed

@@ -0,0 +1,44 @@
+module RetrievalLite::Tokenizer
+  SPECIAL_SEPARATERS = ['[', ']', '\\', ';', '\'', ',', '.', '/', '!', '@', '#', '%', '&', '*', '(', ')', '_', '{', '}', ':', '"', '?', '=', '`', '~', '$', '^', '+', '|', '<', '>']
+  # @param content [String] the text of the document
+  # @return [Hash<String, Integer>] a hash that gives term frequency of content
+  def self.parse_content(content)
+    tokens = Hash.new(0) # initialize to 0
+    # removes everything BUT the letters
+    token_text = content.strip.downcase.split(/#{separaters_regex}/)
+    token_text.each do |t|
+      # also validates whether there are no other special characters left in there
+      if has_hyphen?(t)
+        tokens[t] += 1
+      else
+        # get rid of any extra symbols we might have forgotten.
+        term = t.gsub(/[^a-z]/, '')
+        # just in case the entire string was just non-characters
+        if term != ''
+          tokens[term] += 1
+        end
+      end
+    end
+    tokens
+  end
+  private
+    # separates by whitespace and any special characters
+    def self.separaters_regex
+      regex = "\s+" # captures all white spaces
+      SPECIAL_SEPARATERS.each do |s|
+        regex = regex + '|' + Regexp.quote(s)
+      end
+      return Regexp.new(regex)
+    end
+    # detects whether term is hyphenated
+    def self.has_hyphen?(term)
+      term =~ /\A[a-z]+\-[a-z]+\Z/
+    end
+end

data/lib/retrieval_lite/vector.rb ADDED Viewed

@@ -0,0 +1,39 @@
+module RetrievalLite::Vector
+  # @param scores1 [Array<Integer>] each term and its corresponding score in the first document
+  # @param scores2 [Array<Integer>] each term and its corresponding score in the second document
+  # @return [Float] the cosine similarity of the two vectors representing the score of the documents
+  def self.cosine_similarity(scores1, scores2)
+    length = (euclidean_length(scores1) * euclidean_length(scores2))
+    if length == 0
+      return 0
+    else
+      dot_product(scores1, scores2) / length
+    end
+  end
+  # @param scores1 [Array<Integer>] each term and its corresponding score in the first document
+  # @param scores2 [Array<Integer>] each term and its corresponding score in the second document
+  # @return [Float] the dot product of the two vectors representing the score of the documents
+  def self.dot_product(scores1, scores2)
+    raise "document vectors are not of same length" if scores1.size != scores2.size
+    sum = 0
+    for i in 0...scores1.size
+      sum += scores1[i]*scores2[i]
+    end
+    return sum
+  end
+  # @param scores [Array<Integer>] each term and its corresponding score in the document
+  # @return [Float] the euclidean length of the vectors representing the score of the document
+  def self.euclidean_length(scores)
+    sum = 0
+    for i in 0...scores.size
+      sum += scores[i] * scores[i]
+    end
+    Math.sqrt(sum)
+  end
+end

data/lib/retrieval_lite.rb ADDED Viewed

@@ -0,0 +1,12 @@
+require "version"
+module RetrievalLite
+end
+require 'retrieval_lite/document'
+require 'retrieval_lite/corpus'
+require 'retrieval_lite/tokenizer'
+require 'retrieval_lite/boolean_retrieval'
+require 'retrieval_lite/tfidf_retrieval'
+require 'retrieval_lite/vector'

data/lib/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module RetrievalLite
+  # Version of the gem; read by retrieval_lite.gemspec.
+  VERSION = "1.0.0"
+end

data/retrieval_lite.gemspec ADDED Viewed

@@ -0,0 +1,24 @@
+# coding: utf-8
+# Gem specification for retrieval_lite.
+# lib/ is put on the load path so that lib/version.rb can be required below.
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'version'
+Gem::Specification.new do |spec|
+  spec.name          = "retrieval_lite"
+  spec.version       = RetrievalLite::VERSION
+  spec.authors       = ["Irvin Zhan"]
+  spec.email         = ["izhan@princeton.edu"]
+  spec.description   = %q{Lightweight gem for document retrieval using tf-idf based algorithms for Ruby}
+  spec.summary       = %q{Please see associated GitHub page for usage.}
+  spec.homepage      = "https://github.com/izhan/retrieval_lite"
+  spec.license       = "MIT"
+  # The file list is taken from `git ls-files`, so building the gem has to
+  # happen inside a git checkout.
+  spec.files         = `git ls-files`.split($/)
+  # Files under bin/ become executables; files under test/, spec/ or
+  # features/ are listed as test files.
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+  # Only development dependencies are declared; no runtime dependency is added here.
+  spec.add_development_dependency "rspec"
+  spec.add_development_dependency "bundler", "~> 1.3"
+  spec.add_development_dependency "rake"
+end

data/spec/boolean_retrieval_spec.rb ADDED Viewed

@@ -0,0 +1,44 @@
+require 'spec_helper'
+describe RetrievalLite::BooleanRetrieval do
+  let (:document) do
+    RetrievalLite::Document.new("lorem ipsum dolor sit amet")
+  end
+  let (:document_replicated) do
+    RetrievalLite::Document.new("lorem ipsum dolor sit amet")
+  end
+  let (:document_with_duplicates) do
+    RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
+  end
+  let (:document_two) do
+    RetrievalLite::Document.new("Mauris ullamcorper, tortor et consequat sagittis.")
+  end
+  let (:document_three) do
+    RetrievalLite::Document.new("Pellentesque felis lectus, lacinia nec mauris non.")
+  end
+  let (:document_paragraph) do
+    RetrievalLite::Document.new("In semper enim non ullamcorper venenatis.
+      Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
+      Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
+      consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
+      vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
+      euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
+      a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
+      In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
+      )
+  end
+  let (:all_documents) do
+    [document, document_replicated, document_with_duplicates, document_two, document_three, document_paragraph]
+  end
+  let (:corpus) do
+    RetrievalLite::Corpus.new(all_documents)
+  end
+  describe "one-term retrieval" do
+    it "should return array of all documents with that term" do
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem") == [document, document_replicated, document_with_duplicates]
+    end
+    it "should ignore case" do
+      RetrievalLite::BooleanRetrieval.evaluate(corpus, "LOREM") == [document, document_replicated, document_with_duplicates]
+    end
+  end
+end

data/spec/corpus_spec.rb ADDED Viewed

@@ -0,0 +1,132 @@
+require 'spec_helper'
+# Specs for RetrievalLite::Corpus: size, per-term document lookup, document
+# frequency, incremental adds and the :stop_words option.
+describe RetrievalLite::Corpus do
+  let (:document) do
+    RetrievalLite::Document.new("lorem ipsum dolor sit amet")
+  end
+  # same text as document, but a distinct object
+  let (:document_replicated) do
+    RetrievalLite::Document.new("lorem ipsum dolor sit amet")
+  end
+  let (:document_with_duplicates) do
+    RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
+  end
+  let (:document_two) do
+    RetrievalLite::Document.new("Mauris ullamcorper, tortor et consequat sagittis.")
+  end
+  let (:document_three) do
+    RetrievalLite::Document.new("Pellentesque felis lectus, lacinia nec mauris non.")
+  end
+  let (:document_paragraph) do
+    RetrievalLite::Document.new("In semper enim non ullamcorper venenatis.
+      Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
+      Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
+      consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
+      vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
+      euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
+      a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
+      In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
+      )
+  end
+  let (:all_documents) do
+    [document, document_replicated, document_with_duplicates, document_two, document_three, document_paragraph]
+  end
+  describe "for empty corpus" do
+    let (:corpus) do
+      RetrievalLite::Corpus.new
+    end
+    it "should have size of zero" do
+      corpus.size.should == 0
+    end
+    it "should not error when querying terms" do
+      expect { corpus.documents_with("foo") }.to_not raise_error
+      expect { corpus.document_frequency("foo") }.to_not raise_error
+    end
+  end
+  describe "for basic one-document corpus" do
+    let (:corpus) do
+      RetrievalLite::Corpus.new([document])
+    end
+    it "should have size of one" do
+      corpus.size.should == 1
+    end
+    it "should give us correct document frequencies" do
+      terms = ["lorem", "ipsum", "dolor", "sit", "amet"]
+      terms.each do |t|
+        corpus.document_frequency(t).should == 1
+      end
+      corpus.document_frequency("foo").should == 0
+    end
+    it "should return document when queried" do
+      terms = ["lorem", "ipsum", "dolor", "sit", "amet"]
+      terms.each do |t|
+        corpus.documents_with(t).should == [document]
+      end
+      # an unknown term yields nil rather than an empty array
+      corpus.documents_with("foo").should == nil
+    end
+  end
+  describe "for two-identical-document corpus" do
+    let (:corpus) do
+      RetrievalLite::Corpus.new([document, document_replicated])
+    end
+    it "should give us correct document frequencies" do
+      terms = ["lorem", "ipsum", "dolor", "sit", "amet"]
+      terms.each do |t|
+        corpus.document_frequency(t).should == 2
+      end
+      corpus.document_frequency("foo").should == 0
+    end
+    it "should return document when queried" do
+      terms = ["lorem", "ipsum", "dolor", "sit", "amet"]
+      terms.each do |t|
+        corpus.documents_with(t).should == [document, document_replicated]
+      end
+      corpus.documents_with("foo").should == nil
+    end
+  end
+  describe "for multiple-document corpus" do
+    let (:corpus) do
+      RetrievalLite::Corpus.new(all_documents)
+    end
+    it "should have the correct size" do
+      corpus.size.should == 6
+    end
+    # TODO are more comprehensive tests needed....?
+    it "should give us correct document frequencies" do
+      # "lorem" is in document, document_replicated and document_with_duplicates
+      corpus.document_frequency("lorem").should == 3
+      corpus.document_frequency("semper").should == 1
+    end
+  end
+  describe "adding in documents one at a time" do
+    let (:correct_corpus) do
+      RetrievalLite::Corpus.new(all_documents)
+    end
+    let (:corpus) do
+      RetrievalLite::Corpus.new
+    end
+    it "should be same as initializing corpus with all documents" do
+      all_documents.each do |d|
+        corpus.add(d)
+      end
+      # only the document lists are compared here, not the term index
+      corpus.documents.should == correct_corpus.documents
+    end
+  end
+  describe "with optional parameters" do
+    it "should ignore any stopwords (not case sensitive)" do
+      stop_words = ["lorem", "IPSum"]
+      corpus = RetrievalLite::Corpus.new([document], stop_words: stop_words)
+      corpus.documents_with("lorem").should == nil
+      corpus.documents_with("ipsum").should == nil
+    end
+  end
+end

data/spec/document_spec.rb ADDED Viewed

@@ -0,0 +1,96 @@
+require 'spec_helper'
+describe RetrievalLite::Document do
+  describe "for a basic document" do
+    let (:document) do
+      RetrievalLite::Document.new("lorem ipsum dolor sit amet")
+    end
+    let (:capitalized_document) do
+      RetrievalLite::Document.new("LorEM iPSUM DOLOR sit ameT")
+    end
+    let (:document_with_duplicates) do
+      RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
+    end
+    let (:basic_tf) do
+      {
+        "lorem" => 1,
+        "ipsum" => 1,
+        "dolor" => 1,
+        "sit" => 1,
+        "amet" => 1
+      }
+    end
+    let (:multiple_tf) do
+      {
+        "lorem" => 1,
+        "ipsum" => 2,
+        "dolor" => 3,
+        "sit" => 4,
+        "amet" => 5
+      }
+    end
+    describe "content of the document" do
+      it "should have original content" do
+        document.content.should == "lorem ipsum dolor sit amet"
+        capitalized_document.content.should == "LorEM iPSUM DOLOR sit ameT"
+        document_with_duplicates.content.should == "lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet"
+      end
+    end
+    describe "the number of terms of the document" do
+      it "should be correct for singleton terms" do
+        document.term_count.should == 5
+      end
+      it "should not care about capitalization" do
+        capitalized_document.term_count.should == 5
+      end
+      it "should be correct for duplicate terms" do
+        document_with_duplicates.term_count.should == 5
+      end
+    end
+    describe "id of the document" do
+      it "should default to ruby's object_id" do
+        document.id.should == document.object_id
+      end
+    end
+    describe "term frequencies of the document" do
+      it "should be correct for singleton terms" do
+        document.term_frequencies.should == basic_tf
+      end
+      it "should be correct for capitalization" do
+        capitalized_document.term_frequencies.should == basic_tf
+      end
+      it "should be correct for capitalization" do
+        document_with_duplicates.term_frequencies.should == multiple_tf
+      end
+    end
+    describe "frequencies of a term" do
+      it "should be correct for term in document" do
+        document.frequency_of("lorem").should == 1
+        document_with_duplicates.frequency_of("ipsum").should == 2
+      end
+      it "should be zero for term not in document" do
+        document.frequency_of("foo").should == 0
+        document_with_duplicates.frequency_of("foo").should == 0
+      end
+    end
+    describe "for blank document" do
+      it "should not raise error on initialization" do
+        expect { RetrievalLite::Document.new("") }.to_not raise_error
+      end
+    end
+  end
+  describe "optional parameters" do
+    it "should allow for customized id" do
+      doc = RetrievalLite::Document.new("lorem ipsum dolor sit amet", id: "foo")
+      doc.id.should == "foo"
+    end
+  end
+end

data/spec/retrieval_lite_spec.rb ADDED Viewed

@@ -0,0 +1,3 @@
+require 'spec_helper'
+# Placeholder example group for the top-level module; it has no examples yet.
+describe RetrievalLite do
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,16 @@
+require "retrieval_lite"
+require "spec_helpers/file_helpers"
+# Shared RSpec configuration for the spec suite.
+RSpec.configure do |config|
+  config.treat_symbols_as_metadata_keys_with_true_values = true
+  # Filter the run to examples tagged :focus; when the filter matches
+  # nothing, run everything.
+  config.run_all_when_everything_filtered = true
+  config.filter_run :focus
+  # Mixes the FileHelpers module into the example groups.
+  config.include RetrievalLite::SpecHelpers::FileHelpers
+  # Run specs in random order to surface order dependencies. If you find an
+  # order dependency and want to debug it, you can fix the order by providing
+  # the seed, which is printed after each run.
+  #     --seed 1234
+  config.order = 'random'
+end

data/spec/spec_helpers/file_helpers.rb ADDED Viewed

@@ -0,0 +1,9 @@
+module RetrievalLite
+  module SpecHelpers
+    # Helper module included into the specs by spec_helper.rb; it does not
+    # define any helpers yet.
+    module FileHelpers
+    end
+  end
+end

data/spec/tfidf_retrieval_spec.rb ADDED Viewed

@@ -0,0 +1,130 @@
+require 'spec_helper'
+# Specs for RetrievalLite::TfIdfRetrieval: raw and normalized tf-idf weights,
+# cosine-similarity scores and the ordering of the returned documents.
+describe RetrievalLite::TfIdfRetrieval do
+  let (:document_one_term) do
+    RetrievalLite::Document.new("lorem")
+  end
+  let (:document) do
+    RetrievalLite::Document.new("lorem ipsum dolor sit amet")
+  end
+  let (:document_with_duplicates) do
+    RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
+  end
+  let (:document_doubled) do
+    RetrievalLite::Document.new("lorem ipsum dolor sit amet lorem ipsum dolor sit amet")
+  end
+  let (:document_both_terms) do
+    RetrievalLite::Document.new("lorem ipsum")
+  end
+  let (:document_with_unique) do
+    RetrievalLite::Document.new("lorem unique")
+  end
+  # sorted by lorem order
+  let (:all_documents) do
+    [document, document_with_duplicates, document_doubled, document_one_term, document_both_terms, document_with_unique]
+  end
+  # six documents, every one of them containing "lorem"
+  let (:corpus) do
+    RetrievalLite::Corpus.new(all_documents)
+  end
+  # smaller three-document corpus used by the multiple-term examples
+  let (:corpus_different) do
+    RetrievalLite::Corpus.new([document_one_term, document, document_with_duplicates])
+  end
+  describe "calculating tf-idf scores" do
+    describe "term that all documents have" do
+      it "should have correct tf-idf" do
+        # all 6 documents contain "lorem", so the idf factor is ln(6/6) = 0
+        RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_one_term, "lorem").should be_within(0.001).of(0)
+        RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document, "lorem").should be_within(0.001).of(0)
+        RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_with_duplicates, "lorem").should be_within(0.001).of(0)
+        RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_doubled, "lorem").should be_within(0.001).of(0)
+        RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_both_terms, "lorem").should be_within(0.001).of(0)
+        RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_with_unique, "lorem").should be_within(0.001).of(0)
+      end
+    end
+    describe "term that a few documents have" do
+      it "should have correct tf-idf" do
+        # 4 of the 6 documents contain "ipsum": ln(6/4) is about 0.405, and a
+        # document containing it twice gets twice that weight
+        RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.405)
+        RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.811)
+        RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.811)
+        RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(0.405)
+      end
+    end
+  end
+  describe "calculating normalized tf-idf scores" do
+    describe "term that a few documents have" do
+      it "should have correct tf-idf" do
+        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.316)
+        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.632)
+        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.632)
+        RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(0.316)
+      end
+    end
+  end
+  describe "calculating total tf-idf scores" do
+    describe "for when all documents of corpus has a term" do
+      it "should have score of zero for each document" do
+        scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "lorem")
+        scores.size.should == all_documents.size
+        scores.values.each do |v|
+          v.should == 0
+        end
+      end
+    end
+    describe "term that only one document has" do
+      it "should return the correct score" do
+        scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "unique")
+        scores.size.should == 1
+        scores[document_with_unique].should be_within(0.001).of(1.0)
+      end
+    end
+    describe "term that a few documents have" do
+      it "should return the correct score" do
+        # with a one-term query both vectors have a single component, so the
+        # cosine similarity is 1.0 whatever the weight
+        scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "ipsum")
+        scores.size.should == 4
+        scores[document].should be_within(0.001).of(1.0)
+        scores[document_with_duplicates].should be_within(0.001).of(1.0)
+        scores[document_doubled].should be_within(0.001).of(1.0)
+        scores[document_both_terms].should be_within(0.001).of(1.0)
+      end
+    end
+  end
+  describe "one-term retrieval" do
+    it "should return array with that term" do
+      RetrievalLite::TfIdfRetrieval.evaluate(corpus, "lorem").should =~ all_documents
+    end
+    it "should ignore case" do
+      RetrievalLite::TfIdfRetrieval.evaluate(corpus, "LOREM").should =~ all_documents
+    end
+  end
+  describe "when corpus has only one document containing term" do
+    it "should return array with only that document" do
+      RetrievalLite::TfIdfRetrieval.evaluate(corpus, "unique").should == [document_with_unique]
+    end
+  end
+  describe "for no matches" do
+    it "should return empty array for term not in any documents" do
+      RetrievalLite::TfIdfRetrieval.evaluate(corpus, "foobar").should == []
+    end
+    it "should return empty array for empty string" do
+      RetrievalLite::TfIdfRetrieval.evaluate(corpus, "").should == []
+    end
+  end
+  describe "multiple-term retrieval" do
+    it "should order documents correctly" do
+      RetrievalLite::TfIdfRetrieval.evaluate(corpus_different, "lorem dolor sit").should == [document, document_with_duplicates, document_one_term]
+    end
+    it "should have the correct scores" do
+      scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_different, "lorem dolor sit")
+      scores[document].should be_within(0.001).of(0.816)
+      scores[document_with_duplicates].should be_within(0.001).of(0.808)
+      scores[document_one_term].should be_within(0.001).of(0.0)
+    end
+  end
+end

data/spec/tokenizer_spec.rb ADDED Viewed

@@ -0,0 +1,114 @@
+require 'spec_helper'
+describe RetrievalLite::Tokenizer do
+  describe "parse_content" do
+    describe "for basic terms" do
+      let (:basic_tf) do
+        {
+          "lorem" => 1,
+          "ipsum" => 1,
+          "dolor" => 1,
+          "sit" => 1,
+          "amet" => 1
+        }
+      end
+      it "should split the content" do
+        content = "lorem ipsum dolor sit amet"
+        RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
+      end
+      it "should ignore extra white spaces" do
+        content = "lorem    ipsum  dolor \n sit   amet"
+        RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
+      end
+      it "should ignore punctuation" do
+        content = "lorem! @ #ipsum (dolor) sit * \  amet"
+        RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
+      end
+      it "should ignore capitalization" do
+        content = "LOREM iPSuM dOLOR sit amet"
+        RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
+      end
+      it "should ignore any whitespaces in front and back of content" do
+        content = "     lorem ipsum dolor sit amet    \n"
+        RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
+      end
+    end
+    describe "for content with multiple terms" do
+      let (:multiple_tf) do
+        {
+          "lorem" => 1,
+          "ipsum" => 2,
+          "dolor" => 3,
+          "sit" => 4,
+          "amet" => 5
+        }
+      end
+      it "should not care about order" do
+        content = "lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet"
+        RetrievalLite::Tokenizer.parse_content(content).should == multiple_tf
+        content = "amet amet lorem dolor dolor sit sit ipsum ipsum sit amet dolor sit amet amet"
+        RetrievalLite::Tokenizer.parse_content(content).should == multiple_tf
+      end
+      it "should consider capitalized terms to be the same" do
+        content = "lorem IPSUM ipsum doLOR doLOR dolor SIT SIT SIT SIT amet ameT amET aMET AMET"
+        RetrievalLite::Tokenizer.parse_content(content).should == multiple_tf
+      end
+    end
+    describe "for special cases" do
+      let(:foo_bar_hash) do
+        {
+          "foo" => 1,
+          "bar" => 1
+        }
+      end
+      let(:foo_bar_baz_hash) do
+        {
+          "foo" => 1,
+          "bar" => 1,
+          "baz" => 1
+        }
+      end
+      it "should return empty hash if there are no terms" do
+        RetrievalLite::Tokenizer.parse_content("").should == Hash.new
+      end
+      it "should ignore numbers" do
+        RetrievalLite::Tokenizer.parse_content("1 2 3.14159").should == Hash.new
+      end
+      it "should ignore control characters" do
+        RetrievalLite::Tokenizer.parse_content("\a\e\f\n\r\t\v").should == Hash.new
+        RetrievalLite::Tokenizer.parse_content("\x07\x1B\f\n\r\t\v").should == Hash.new
+      end
+      it "should split words connected by special characters" do
+        RetrievalLite::Tokenizer.parse_content("foo/bar").should == foo_bar_hash
+        RetrievalLite::Tokenizer.parse_content("foo,bar").should == foo_bar_hash
+        RetrievalLite::Tokenizer.parse_content("foo,:bar").should == foo_bar_hash
+        RetrievalLite::Tokenizer.parse_content("foo   ,:bar").should == foo_bar_hash
+        RetrievalLite::Tokenizer.parse_content("!@foo  ,:bar#").should == foo_bar_hash
+        RetrievalLite::Tokenizer.parse_content("foo:bar baz").should == foo_bar_baz_hash
+      end
+      it "should not split words connected by only one hyphen" do
+        RetrievalLite::Tokenizer.parse_content("foo-bar").should == { "foo-bar" => 1 }
+        RetrievalLite::Tokenizer.parse_content("foo - bar").should == foo_bar_hash
+        RetrievalLite::Tokenizer.parse_content("foo --bar").should == foo_bar_hash
+        #RetrievalLite::Tokenizer.parse_content("foo--bar").should == foo_bar_hash # TODO is this worth it?
+        RetrievalLite::Tokenizer.parse_content("foo-bar-baz").should_not == foo_bar_baz_hash
+      end
+    end
+  end
+end

data/spec/vector_spec.rb ADDED Viewed

@@ -0,0 +1,27 @@
+require 'spec_helper'
+describe RetrievalLite::Vector do # specs for the Vector helpers dot_product and euclidean_length, called with plain arrays of numbers
+  describe "dot product" do
+    it "should compute correctly for vectors length 1" do
+      RetrievalLite::Vector.dot_product([3], [5]).should == 15 # 3 * 5
+    end
+    it "should compute correctly for longer vectors" do
+      RetrievalLite::Vector.dot_product([2, 3], [4, 5]).should == 23 # 2 * 4 + 3 * 5
+    end
+    it "should raise error for unequal sized arrays" do
+      expect { RetrievalLite::Vector.dot_product([2, 3], [4]) }.to raise_error # NOTE(review): bare raise_error passes on ANY exception (e.g. NoMethodError); pin the error class once confirmed
+    end
+  end
+  describe "euclidean length" do
+    it "should calculate it for vectors length 1" do
+      RetrievalLite::Vector.euclidean_length([1]).should == 1
+    end
+    it "should calculate it for zero vectors" do
+      RetrievalLite::Vector.euclidean_length([0, 0, 0]).should == 0
+    end
+    it "should calculate it for longer vectors" do
+      RetrievalLite::Vector.euclidean_length([3, 4]).should == 5 # sqrt(3**2 + 4**2); `==` also holds if a Float 5.0 is returned
+    end
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,122 @@
+--- !ruby/object:Gem::Specification
+name: retrieval_lite
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+platform: ruby
+authors:
+- Irvin Zhan
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-05-12 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Lightweight gem for document retrieval using tf-idf based algorithms
+  for Ruby
+email:
+- izhan@princeton.edu
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- .rspec
+- Gemfile
+- LICENSE
+- LICENSE.txt
+- README.md
+- Rakefile
+- lib/retrieval_lite.rb
+- lib/retrieval_lite/boolean_retrieval.rb
+- lib/retrieval_lite/corpus.rb
+- lib/retrieval_lite/document.rb
+- lib/retrieval_lite/tfidf_retrieval.rb
+- lib/retrieval_lite/tokenizer.rb
+- lib/retrieval_lite/vector.rb
+- lib/version.rb
+- retrieval_lite.gemspec
+- spec/boolean_retrieval_spec.rb
+- spec/corpus_spec.rb
+- spec/document_spec.rb
+- spec/retrieval_lite_spec.rb
+- spec/spec_helper.rb
+- spec/spec_helpers/file_helpers.rb
+- spec/tfidf_retrieval_spec.rb
+- spec/tokenizer_spec.rb
+- spec/vector_spec.rb
+homepage: https://github.com/izhan/retrieval_lite
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.1.11
+signing_key:
+specification_version: 4
+summary: Please see associated GitHub page for usage.
+test_files:
+- spec/boolean_retrieval_spec.rb
+- spec/corpus_spec.rb
+- spec/document_spec.rb
+- spec/retrieval_lite_spec.rb
+- spec/spec_helper.rb
+- spec/spec_helpers/file_helpers.rb
+- spec/tfidf_retrieval_spec.rb
+- spec/tokenizer_spec.rb
+- spec/vector_spec.rb
+has_rdoc: