retrieval_lite 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8e080b5b0c600b330f5d07827b5c2e576132306b
4
+ data.tar.gz: f0fa5049a6f5e9df0a1fbc761107d28dc1efe857
5
+ SHA512:
6
+ metadata.gz: b1827e3f90edbc3ee841fbe25526c748ac82ad1aae6f42455605e3199dffe7c5cd309c2684741d41f695c88285152c1bf83558beb87668ec9eebb4973f7ff0a3
7
+ data.tar.gz: 5d2009f175da5ce2cecb277061e9592093a14435d5012094c461afb1ac626b1d72705edf560cece8f53d4a02623f7ebe2fd3adc0f0555b0934cd5b878f6c12b2
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .DS_STORE
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify this gem's dependencies in retrieval_lite.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 Irvin Zhan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Irvin Zhan
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Retrieval Lite Gem
2
+
3
+ A lightweight Ruby gem for document retrieval using tf-idf-based algorithms.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'retrieval_lite'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install retrieval_lite
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
require 'bundler/gem_tasks'

# `rake spec` shells out to the rspec command over the spec/ directory.
desc 'run tests'
task :spec do
  sh 'rspec spec'
end

# A bare `rake` runs the specs.
task default: :spec
@@ -0,0 +1,21 @@
1
module RetrievalLite
  # Unranked retrieval of the documents that match a query.
  #
  # NOTE: the boolean operators AND, OR and NOT are not implemented yet. The
  # whole query is currently looked up in the corpus as one single term.
  module BooleanRetrieval
    # Returns the documents of the corpus that satisfy the query, without
    # ranking them in any way.
    #
    # The query is stripped and lowercased before the lookup so that "LOREM"
    # and "lorem" find the same documents.
    #
    # @param corpus [Corpus] the collection of documents
    # @param query [String] the query to be evaluated
    # @return [Array<Document>] unordered array of documents that satisfy the
    #   query; empty when nothing matches
    # @raise [RuntimeError] if the query fails validation
    def self.evaluate(corpus, query)
      unless is_valid?(query)
        raise "Boolean expression is not valid." # TODO better validation message?
      end

      term = query.to_s.strip.downcase
      # documents_with answers nil for an unknown term; callers are promised an Array.
      corpus.documents_with(term) || []
    end

    # Internal helper. A bare `private` has no effect on `def self.` methods,
    # so this stays publicly callable for backward compatibility.
    #
    # @param query [String] the query to validate
    # @return [Boolean] always true for now; validation is not implemented
    def self.is_valid?(query)
      true
    end
  end
end
@@ -0,0 +1,64 @@
1
require 'set'

module RetrievalLite
  # A collection of documents together with an index from each term to the
  # documents that contain it.
  class Corpus
    # the documents within the corpus
    attr_reader :documents
    # hash of a term to the array of documents that contain the particular term
    attr_reader :term_occurrences

    # @param documents [Array<Document>] the documents of the corpus
    # @param opts [Hash] optional arguments to initializer
    # @option opts [Array<String>] :stop_words the words to ignore when creating tokens
    def initialize(documents = [], opts = {})
      # Work on a copy so that #add never grows the array the caller passed in.
      @documents = documents.dup
      @term_occurrences = {}
      # Stop words are compared against lowercase tokens. Build lowercased
      # copies instead of calling downcase! so the caller's (possibly frozen)
      # strings are left untouched. A Set gives a fast include?.
      @stop_words = Set.new((opts[:stop_words] || []).map { |word| word.downcase })

      @documents.each { |document| update_term_occurrences(document) }
    end

    # Adds a document to the corpus and indexes its terms.
    #
    # @param document [Document] the document to be added
    def add(document)
      @documents << document
      update_term_occurrences(document)
    end

    # @return [Integer] the number of documents in the corpus
    def size
      documents.size
    end

    # @param term [String] the term to retrieve the documents for
    # @return [Array<Document>, nil] the documents containing the particular
    #   term, or nil if there is no such occurrence
    def documents_with(term)
      term_occurrences[term]
    end

    # @param term [String] the query term for the documents
    # @return [Integer] the number of documents that contain the particular term
    def document_frequency(term)
      occurrences = term_occurrences[term]
      occurrences ? occurrences.size : 0
    end

    private

    # Adds each non-stop-word term of the document to the term_occurrences hash.
    def update_term_occurrences(document)
      document.terms.each do |term|
        next if @stop_words.include?(term)

        (@term_occurrences[term] ||= []) << document
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
module RetrievalLite
  # A piece of text together with the frequency of each term it contains.
  class Document
    # the text of the document
    attr_reader :content
    # a Hash<String, Integer> of all terms of the documents to the frequency of each term
    attr_reader :term_frequencies
    # the id of the document
    attr_reader :id

    # Stores the text and asks the tokenizer for its term-frequency hash.
    #
    # @param content [String] the text of the document
    # @param opts [Hash] optional arguments to initializer
    # @option opts [String] :id the id of the document. Defaults to object_id assigned by ruby
    def initialize(content, opts = {})
      @content = content
      @id = opts[:id] || object_id
      @term_frequencies = RetrievalLite::Tokenizer.parse_content(content)
    end

    # for debugging: prints one "term: frequency" line per unique term
    def print_tokens
      @term_frequencies.each { |term, frequency| puts "#{term}: #{frequency}" }
    end

    # @return [Integer] the total number of unique terms in the document
    def term_count
      @term_frequencies.size
    end

    # @return [Array<String>] the unique terms of the document
    def terms
      @term_frequencies.keys
    end

    # @param term [String]
    # @return [Integer] the number of times a term appears in the document
    #   (0 when the term is absent)
    def frequency_of(term)
      @term_frequencies.fetch(term, 0)
    end

    # @return [Integer] the total number of terms (not unique) in the document
    def total_terms
      @term_frequencies.values.inject(0, :+)
    end
  end
end
@@ -0,0 +1,84 @@
1
require 'set'

module RetrievalLite
  # Ranked retrieval using tf-idf weights and cosine similarity.
  #
  # @see http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf
  module TfIdfRetrieval
    # Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
    # Returns documents ordered by score, highest first.
    #
    # @param corpus [Corpus] the collection of documents
    # @param query [String] free-text query; it is tokenized like a document
    # @return [Array<Document>] the documents containing at least one query
    #   term, ordered by descending score
    def self.evaluate(corpus, query)
      evaluate_with_scores(corpus, query).keys
    end

    # Same as #evaluate but returns a hash whose keys are documents and whose
    # values are the cosine similarity between the query's term-frequency
    # vector and the document's tf-idf vector.
    #
    # @param corpus [Corpus] the collection of documents
    # @param query [String] free-text query; it is tokenized like a document
    # @return [Hash<Document, Numeric>] documents mapped to their score,
    #   inserted in descending score order
    def self.evaluate_with_scores(corpus, query)
      query_document = RetrievalLite::Document.new(query)
      terms = query_document.term_frequencies.keys
      query_vector = query_document.term_frequencies.values # same order as keys

      # Gather only the documents that contain at least one of the terms. A Set
      # drops the duplicates produced by documents matching several terms.
      candidates = Set.new
      terms.each do |term|
        matches = corpus.documents_with(term)
        candidates.merge(matches) if matches
      end

      scores = {}
      candidates.each do |document|
        document_vector = terms.map { |term| tfidf_weight(corpus, document, term) }
        scores[document] = RetrievalLite::Vector.cosine_similarity(query_vector, document_vector)
      end

      # order it by score in descending order
      Hash[scores.sort_by { |_document, score| score }.reverse]
    end

    # Weighs a term of a document using tf-idf:
    # (frequency of the term in the document) * ln(corpus size / documents containing the term).
    #
    # @note No smoothing is applied to the document frequency. A term that no
    #   document contains gets a weight of zero instead of dividing by zero.
    #
    # @param corpus [Corpus]
    # @param document [Document]
    # @param term [String]
    # @return [Float] the tfidf weight of the term in the document
    def self.tfidf_weight(corpus, document, term)
      document_frequency = corpus.document_frequency(term)
      return 0.0 if document_frequency == 0

      document.frequency_of(term) * Math.log(corpus.size.to_f / document_frequency)
    end

    # Weighs a term of a document using tf-idf, divided by the euclidean length
    # of the vector of that term's tf-idf weights over every document that
    # contains it.
    # @see #tfidf_weight
    #
    # @param corpus [Corpus]
    # @param document [Document]
    # @param term [String]
    # @return [Float] the normalized tfidf weight of the term in the document;
    #   0.0 when the term is unknown to the corpus or all of its weights are zero
    def self.normalized_tfidf_weight(corpus, document, term)
      # documents_with answers nil for a term the corpus has never seen.
      containing = corpus.documents_with(term) || []

      squared_length = containing.inject(0.0) do |sum, d|
        weight = tfidf_weight(corpus, d, term)
        sum + weight * weight
      end
      # Avoid 0/0 (NaN), e.g. when every document contains the term (idf = 0).
      return 0.0 if squared_length == 0

      tfidf_weight(corpus, document, term) / Math.sqrt(squared_length)
    end
  end
end
+ end
@@ -0,0 +1,44 @@
1
module RetrievalLite
  # Turns raw text into a term-frequency hash.
  module Tokenizer
    SPECIAL_SEPARATERS = ['[', ']', '\\', ';', '\'', ',', '.', '/', '!', '@', '#', '%', '&', '*', '(', ')', '_', '{', '}', ':', '"', '?', '=', '`', '~', '$', '^', '+', '|', '<', '>'].freeze

    # Splits on any run of whitespace or on any single special character.
    # Built once: the separators never change. The whitespace part must be a
    # Regexp literal, because "\s" inside a double-quoted String is a plain
    # space and would leave tabs and newlines glued to the words around them.
    SEPARATERS_REGEX = Regexp.union(/\s+/, *SPECIAL_SEPARATERS)

    # Exactly two lowercase words joined by one hyphen, e.g. "well-known".
    HYPHENATED_TERM = /\A[a-z]+-[a-z]+\z/

    # Lowercases the content, splits it on whitespace and special characters,
    # and counts the resulting terms. A term made of two words joined by a
    # single hyphen is kept as is; every other token has all characters
    # outside a-z removed, and tokens left empty are dropped.
    #
    # @param content [String] the text of the document
    # @return [Hash<String, Integer>] a hash that gives term frequency of content
    def self.parse_content(content)
      tokens = Hash.new(0) # unseen terms count from 0

      content.strip.downcase.split(separaters_regex).each do |raw|
        # get rid of any extra symbols the separators did not cover
        term = has_hyphen?(raw) ? raw : raw.gsub(/[^a-z]/, '')

        # the entire token may have been non-letters
        tokens[term] += 1 unless term.empty?
      end

      tokens
    end

    # The helpers below are internal. A bare `private` has no effect on
    # `def self.` methods, so they stay publicly callable for backward
    # compatibility.

    # @return [Regexp] matches whitespace runs and any special character
    def self.separaters_regex
      SEPARATERS_REGEX
    end

    # @param term [String] a lowercased token
    # @return [Boolean] whether the term is two words joined by one hyphen
    def self.has_hyphen?(term)
      !!(term =~ HYPHENATED_TERM)
    end
  end
end
@@ -0,0 +1,39 @@
1
module RetrievalLite
  # Vector arithmetic on arrays of term scores.
  module Vector
    # @param scores1 [Array<Numeric>] each term and its corresponding score in the first document
    # @param scores2 [Array<Numeric>] each term and its corresponding score in the second document
    # @return [Float] the cosine similarity of the two vectors representing
    #   the score of the documents; 0.0 when either vector has zero length
    # @raise [RuntimeError] if both vectors are non-zero but differ in size
    def self.cosine_similarity(scores1, scores2)
      length = euclidean_length(scores1) * euclidean_length(scores2)
      # A zero vector has no direction; avoid dividing by zero.
      return 0.0 if length == 0

      dot_product(scores1, scores2) / length
    end

    # @param scores1 [Array<Numeric>] each term and its corresponding score in the first document
    # @param scores2 [Array<Numeric>] each term and its corresponding score in the second document
    # @return [Numeric] the dot product of the two vectors representing the score of the documents
    # @raise [RuntimeError] if the vectors differ in size
    def self.dot_product(scores1, scores2)
      raise "document vectors are not of same length" if scores1.size != scores2.size

      scores1.zip(scores2).inject(0) { |sum, (a, b)| sum + a * b }
    end

    # @param scores [Array<Numeric>] each term and its corresponding score in the document
    # @return [Float] the euclidean length of the vectors representing the score of the document
    def self.euclidean_length(scores)
      Math.sqrt(scores.inject(0) { |sum, score| sum + score * score })
    end
  end
end
+ end
@@ -0,0 +1,12 @@
1
require 'set'

require "version"

module RetrievalLite

end

require 'retrieval_lite/document'
require 'retrieval_lite/corpus'
require 'retrieval_lite/tokenizer'
require 'retrieval_lite/boolean_retrieval'
require 'retrieval_lite/tfidf_retrieval'
require 'retrieval_lite/vector'
data/lib/version.rb ADDED
@@ -0,0 +1,3 @@
1
module RetrievalLite
  # The gem's version string; frozen so it cannot be modified in place.
  VERSION = "1.0.0".freeze
end
@@ -0,0 +1,24 @@
1
# coding: utf-8
# Put lib/ on the load path so the version constant can be read below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'version'

Gem::Specification.new do |spec|
  spec.name          = "retrieval_lite"
  spec.version       = RetrievalLite::VERSION
  spec.authors       = ["Irvin Zhan"]
  spec.email         = ["izhan@princeton.edu"]
  # summary is the short one-liner; description carries the longer text.
  spec.summary       = %q{Lightweight gem for document retrieval using tf-idf based algorithms for Ruby}
  spec.description   = %q{Lightweight gem for document retrieval using tf-idf based algorithms for Ruby. Please see associated GitHub page for usage.}
  spec.homepage      = "https://github.com/izhan/retrieval_lite"
  spec.license       = "MIT"

  # NUL-separated output keeps file names that contain whitespace or
  # newlines intact and does not depend on the $/ global.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "rspec"
  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
end
@@ -0,0 +1,44 @@
1
+ require 'spec_helper'
2
+
3
+ describe RetrievalLite::BooleanRetrieval do
4
+ let (:document) do
5
+ RetrievalLite::Document.new("lorem ipsum dolor sit amet")
6
+ end
7
+ let (:document_replicated) do
8
+ RetrievalLite::Document.new("lorem ipsum dolor sit amet")
9
+ end
10
+ let (:document_with_duplicates) do
11
+ RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
12
+ end
13
+ let (:document_two) do
14
+ RetrievalLite::Document.new("Mauris ullamcorper, tortor et consequat sagittis.")
15
+ end
16
+ let (:document_three) do
17
+ RetrievalLite::Document.new("Pellentesque felis lectus, lacinia nec mauris non.")
18
+ end
19
+ let (:document_paragraph) do
20
+ RetrievalLite::Document.new("In semper enim non ullamcorper venenatis.
21
+ Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
22
+ Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
23
+ consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
24
+ vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
25
+ euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
26
+ a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
27
+ In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
28
+ )
29
+ end
30
+ let (:all_documents) do
31
+ [document, document_replicated, document_with_duplicates, document_two, document_three, document_paragraph]
32
+ end
33
+ let (:corpus) do
34
+ RetrievalLite::Corpus.new(all_documents)
35
+ end
36
describe "one-term retrieval" do
  # A bare `==` only computes a boolean that is thrown away; the expectation
  # has to go through `.should` to be asserted. `=~` compares the arrays
  # regardless of order, since evaluate promises no ordering.
  it "should return array of all documents with that term" do
    RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem").should =~ [document, document_replicated, document_with_duplicates]
  end
  # Passes only when evaluate matches the query case-insensitively.
  it "should ignore case" do
    RetrievalLite::BooleanRetrieval.evaluate(corpus, "LOREM").should =~ [document, document_replicated, document_with_duplicates]
  end
end
44
+ end
@@ -0,0 +1,132 @@
1
+ require 'spec_helper'
2
+
3
+ describe RetrievalLite::Corpus do
4
+ let (:document) do
5
+ RetrievalLite::Document.new("lorem ipsum dolor sit amet")
6
+ end
7
+ let (:document_replicated) do
8
+ RetrievalLite::Document.new("lorem ipsum dolor sit amet")
9
+ end
10
+ let (:document_with_duplicates) do
11
+ RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
12
+ end
13
+ let (:document_two) do
14
+ RetrievalLite::Document.new("Mauris ullamcorper, tortor et consequat sagittis.")
15
+ end
16
+ let (:document_three) do
17
+ RetrievalLite::Document.new("Pellentesque felis lectus, lacinia nec mauris non.")
18
+ end
19
+ let (:document_paragraph) do
20
+ RetrievalLite::Document.new("In semper enim non ullamcorper venenatis.
21
+ Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
22
+ Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
23
+ consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
24
+ vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
25
+ euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
26
+ a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
27
+ In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
28
+ )
29
+ end
30
+ let (:all_documents) do
31
+ [document, document_replicated, document_with_duplicates, document_two, document_three, document_paragraph]
32
+ end
33
+
34
+ describe "for empty corpus" do
35
+ let (:corpus) do
36
+ RetrievalLite::Corpus.new
37
+ end
38
+
39
+ it "should have size of zero" do
40
+ corpus.size.should == 0
41
+ end
42
+ it "should not error when querying terms" do
43
+ expect { corpus.documents_with("foo") }.to_not raise_error
44
+ expect { corpus.document_frequency("foo") }.to_not raise_error
45
+ end
46
+ end
47
+
48
+ describe "for basic one-document corpus" do
49
+ let (:corpus) do
50
+ RetrievalLite::Corpus.new([document])
51
+ end
52
+
53
+ it "should have size of one" do
54
+ corpus.size.should == 1
55
+ end
56
+ it "should give us correct document frequencies" do
57
+ terms = ["lorem", "ipsum", "dolor", "sit", "amet"]
58
+ terms.each do |t|
59
+ corpus.document_frequency(t).should == 1
60
+ end
61
+ corpus.document_frequency("foo").should == 0
62
+ end
63
+ it "should return document when queried" do
64
+ terms = ["lorem", "ipsum", "dolor", "sit", "amet"]
65
+ terms.each do |t|
66
+ corpus.documents_with(t).should == [document]
67
+ end
68
+ corpus.documents_with("foo").should == nil
69
+ end
70
+ end
71
+
72
+ describe "for two-identical-document corpus" do
73
+ let (:corpus) do
74
+ RetrievalLite::Corpus.new([document, document_replicated])
75
+ end
76
+ it "should give us correct document frequencies" do
77
+ terms = ["lorem", "ipsum", "dolor", "sit", "amet"]
78
+ terms.each do |t|
79
+ corpus.document_frequency(t).should == 2
80
+ end
81
+ corpus.document_frequency("foo").should == 0
82
+ end
83
+ it "should return document when queried" do
84
+ terms = ["lorem", "ipsum", "dolor", "sit", "amet"]
85
+ terms.each do |t|
86
+ corpus.documents_with(t).should == [document, document_replicated]
87
+ end
88
+ corpus.documents_with("foo").should == nil
89
+ end
90
+ end
91
+
92
+ describe "for multiple-document corpus" do
93
+ let (:corpus) do
94
+ RetrievalLite::Corpus.new(all_documents)
95
+ end
96
+
97
+ it "should have the correct size" do
98
+ corpus.size.should == 6
99
+ end
100
+
101
+ # TODO are more comprehensive tests needed....?
102
+ it "should give us correct document frequencies" do
103
+ corpus.document_frequency("lorem").should == 3
104
+ corpus.document_frequency("semper").should == 1
105
+ end
106
+ end
107
+
108
+ describe "adding in documents one at a time" do
109
+ let (:correct_corpus) do
110
+ RetrievalLite::Corpus.new(all_documents)
111
+ end
112
+ let (:corpus) do
113
+ RetrievalLite::Corpus.new
114
+ end
115
+
116
+ it "should be same as initializing corpus with all documents" do
117
+ all_documents.each do |d|
118
+ corpus.add(d)
119
+ end
120
+ corpus.documents.should == correct_corpus.documents
121
+ end
122
+ end
123
+
124
+ describe "with optional parameters" do
125
+ it "should ignore any stopwords (not case sensitive)" do
126
+ stop_words = ["lorem", "IPSum"]
127
+ corpus = RetrievalLite::Corpus.new([document], stop_words: stop_words)
128
+ corpus.documents_with("lorem").should == nil
129
+ corpus.documents_with("ipsum").should == nil
130
+ end
131
+ end
132
+ end
@@ -0,0 +1,96 @@
1
+ require 'spec_helper'
2
+
3
+ describe RetrievalLite::Document do
4
+ describe "for a basic document" do
5
+ let (:document) do
6
+ RetrievalLite::Document.new("lorem ipsum dolor sit amet")
7
+ end
8
+ let (:capitalized_document) do
9
+ RetrievalLite::Document.new("LorEM iPSUM DOLOR sit ameT")
10
+ end
11
+ let (:document_with_duplicates) do
12
+ RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
13
+ end
14
+ let (:basic_tf) do
15
+ {
16
+ "lorem" => 1,
17
+ "ipsum" => 1,
18
+ "dolor" => 1,
19
+ "sit" => 1,
20
+ "amet" => 1
21
+ }
22
+ end
23
+ let (:multiple_tf) do
24
+ {
25
+ "lorem" => 1,
26
+ "ipsum" => 2,
27
+ "dolor" => 3,
28
+ "sit" => 4,
29
+ "amet" => 5
30
+ }
31
+ end
32
+
33
+ describe "content of the document" do
34
+ it "should have original content" do
35
+ document.content.should == "lorem ipsum dolor sit amet"
36
+ capitalized_document.content.should == "LorEM iPSUM DOLOR sit ameT"
37
+ document_with_duplicates.content.should == "lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet"
38
+ end
39
+ end
40
+
41
+ describe "the number of terms of the document" do
42
+ it "should be correct for singleton terms" do
43
+ document.term_count.should == 5
44
+ end
45
+ it "should not care about capitalization" do
46
+ capitalized_document.term_count.should == 5
47
+ end
48
+ it "should be correct for duplicate terms" do
49
+ document_with_duplicates.term_count.should == 5
50
+ end
51
+ end
52
+
53
+ describe "id of the document" do
54
+ it "should default to ruby's object_id" do
55
+ document.id.should == document.object_id
56
+ end
57
+ end
58
+
59
describe "term frequencies of the document" do
  it "should be correct for singleton terms" do
    document.term_frequencies.should == basic_tf
  end
  it "should be correct for capitalization" do
    capitalized_document.term_frequencies.should == basic_tf
  end
  # Previously shared its description with the capitalization example above.
  it "should be correct for duplicate terms" do
    document_with_duplicates.term_frequencies.should == multiple_tf
  end
end
70
+
71
+
72
+ describe "frequencies of a term" do
73
+ it "should be correct for term in document" do
74
+ document.frequency_of("lorem").should == 1
75
+ document_with_duplicates.frequency_of("ipsum").should == 2
76
+ end
77
+ it "should be zero for term not in document" do
78
+ document.frequency_of("foo").should == 0
79
+ document_with_duplicates.frequency_of("foo").should == 0
80
+ end
81
+ end
82
+
83
+ describe "for blank document" do
84
+ it "should not raise error on initialization" do
85
+ expect { RetrievalLite::Document.new("") }.to_not raise_error
86
+ end
87
+ end
88
+ end
89
+
90
+ describe "optional parameters" do
91
+ it "should allow for customized id" do
92
+ doc = RetrievalLite::Document.new("lorem ipsum dolor sit amet", id: "foo")
93
+ doc.id.should == "foo"
94
+ end
95
+ end
96
+ end
@@ -0,0 +1,3 @@
1
+ require 'spec_helper'
2
+ describe RetrievalLite do
3
+ end
@@ -0,0 +1,16 @@
1
+ require "retrieval_lite"
2
+ require "spec_helpers/file_helpers"
3
+
4
+ RSpec.configure do |config|
5
+ config.treat_symbols_as_metadata_keys_with_true_values = true
6
+ config.run_all_when_everything_filtered = true
7
+ config.filter_run :focus
8
+
9
+ config.include RetrievalLite::SpecHelpers::FileHelpers
10
+
11
+ # Run specs in random order to surface order dependencies. If you find an
12
+ # order dependency and want to debug it, you can fix the order by providing
13
+ # the seed, which is printed after each run.
14
+ # --seed 1234
15
+ config.order = 'random'
16
+ end
@@ -0,0 +1,9 @@
1
+ module RetrievalLite
2
+ module SpecHelpers
3
+
4
+ module FileHelpers
5
+
6
+ end
7
+
8
+ end
9
+ end
@@ -0,0 +1,130 @@
1
+ require 'spec_helper'
2
+
3
+ describe RetrievalLite::TfIdfRetrieval do
4
+ let (:document_one_term) do
5
+ RetrievalLite::Document.new("lorem")
6
+ end
7
+ let (:document) do
8
+ RetrievalLite::Document.new("lorem ipsum dolor sit amet")
9
+ end
10
+ let (:document_with_duplicates) do
11
+ RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
12
+ end
13
+ let (:document_doubled) do
14
+ RetrievalLite::Document.new("lorem ipsum dolor sit amet lorem ipsum dolor sit amet")
15
+ end
16
+ let (:document_both_terms) do
17
+ RetrievalLite::Document.new("lorem ipsum")
18
+ end
19
+ let (:document_with_unique) do
20
+ RetrievalLite::Document.new("lorem unique")
21
+ end
22
+ # sorted by lorem order
23
+ let (:all_documents) do
24
+ [document, document_with_duplicates, document_doubled, document_one_term, document_both_terms, document_with_unique]
25
+ end
26
+ let (:corpus) do
27
+ RetrievalLite::Corpus.new(all_documents)
28
+ end
29
+ let (:corpus_different) do
30
+ RetrievalLite::Corpus.new([document_one_term, document, document_with_duplicates])
31
+ end
32
+
33
+ describe "calculating tf-idf scores" do
34
+ describe "term that all documents have" do
35
+ it "should have correct tf-idf" do
36
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_one_term, "lorem").should be_within(0.001).of(0)
37
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document, "lorem").should be_within(0.001).of(0)
38
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_with_duplicates, "lorem").should be_within(0.001).of(0)
39
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_doubled, "lorem").should be_within(0.001).of(0)
40
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_both_terms, "lorem").should be_within(0.001).of(0)
41
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_with_unique, "lorem").should be_within(0.001).of(0)
42
+ end
43
+ end
44
+
45
+ describe "term that a few documents have" do
46
+ it "should have correct tf-idf" do
47
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.405)
48
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.811)
49
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.811)
50
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(0.405)
51
+ end
52
+ end
53
+ end
54
+
55
+ describe "calculating normalized tf-idf scores" do
56
+ describe "term that a few documents have" do
57
+ it "should have correct tf-idf" do
58
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.316)
59
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.632)
60
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.632)
61
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(0.316)
62
+ end
63
+ end
64
+ end
65
+
66
+ describe "calculating total tf-idf scores" do
67
+ describe "for when all documents of corpus has a term" do
68
+ it "should have score of zero for each document" do
69
+ scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "lorem")
70
+ scores.size.should == all_documents.size
71
+ scores.values.each do |v|
72
+ v.should == 0
73
+ end
74
+ end
75
+ end
76
+ describe "term that only one document has" do
77
+ it "should return the correct score" do
78
+ scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "unique")
79
+ scores.size.should == 1
80
+ scores[document_with_unique].should be_within(0.001).of(1.0)
81
+ end
82
+ end
83
+ describe "term that a few documents have" do
84
+ it "should return the correct score" do
85
+ scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "ipsum")
86
+ scores.size.should == 4
87
+ scores[document].should be_within(0.001).of(1.0)
88
+ scores[document_with_duplicates].should be_within(0.001).of(1.0)
89
+ scores[document_doubled].should be_within(0.001).of(1.0)
90
+ scores[document_both_terms].should be_within(0.001).of(1.0)
91
+ end
92
+ end
93
+ end
94
+
95
+ describe "one-term retrieval" do
96
+ it "should return array with that term" do
97
+ RetrievalLite::TfIdfRetrieval.evaluate(corpus, "lorem").should =~ all_documents
98
+ end
99
+ it "should ignore case" do
100
+ RetrievalLite::TfIdfRetrieval.evaluate(corpus, "LOREM").should =~ all_documents
101
+ end
102
+ end
103
+
104
+ describe "when corpus has only one document containing term" do
105
+ it "should return array with only that document" do
106
+ RetrievalLite::TfIdfRetrieval.evaluate(corpus, "unique").should == [document_with_unique]
107
+ end
108
+ end
109
+
110
+ describe "for no matches" do
111
+ it "should return empty array for term not in any documents" do
112
+ RetrievalLite::TfIdfRetrieval.evaluate(corpus, "foobar").should == []
113
+ end
114
+ it "should return empty array for empty string" do
115
+ RetrievalLite::TfIdfRetrieval.evaluate(corpus, "").should == []
116
+ end
117
+ end
118
+
119
+ describe "multiple-term retrieval" do
120
+ it "should order documents correctly" do
121
+ RetrievalLite::TfIdfRetrieval.evaluate(corpus_different, "lorem dolor sit").should == [document, document_with_duplicates, document_one_term]
122
+ end
123
+ it "should have the correct scores" do
124
+ scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_different, "lorem dolor sit")
125
+ scores[document].should be_within(0.001).of(0.816)
126
+ scores[document_with_duplicates].should be_within(0.001).of(0.808)
127
+ scores[document_one_term].should be_within(0.001).of(0.0)
128
+ end
129
+ end
130
+ end
@@ -0,0 +1,114 @@
1
+ require 'spec_helper'
2
+
3
# Specs for RetrievalLite::Tokenizer.parse_content. Every example compares the
# returned value against a Hash of lower-case term => occurrence count.
describe RetrievalLite::Tokenizer do
  describe "parse_content" do
    # Shorthand for the call under test, shared by all nested groups.
    def term_counts(content)
      RetrievalLite::Tokenizer.parse_content(content)
    end

    describe "for basic terms" do
      let(:basic_tf) do
        {
          "lorem" => 1,
          "ipsum" => 1,
          "dolor" => 1,
          "sit" => 1,
          "amet" => 1
        }
      end

      it "should split the content" do
        content = "lorem ipsum dolor sit amet"
        term_counts(content).should == basic_tf
      end

      it "should ignore extra white spaces" do
        content = "lorem ipsum dolor \n sit amet"
        term_counts(content).should == basic_tf
      end

      it "should ignore punctuation" do
        # Single-quoted with an escaped backslash so a literal "\" really
        # reaches the tokenizer; the former double-quoted "\ " was just an
        # escaped space and silently dropped the backslash from the input.
        content = 'lorem! @ #ipsum (dolor) sit * \\ amet'
        term_counts(content).should == basic_tf
      end

      it "should ignore capitalization" do
        content = "LOREM iPSuM dOLOR sit amet"
        term_counts(content).should == basic_tf
      end

      it "should ignore any whitespaces in front and back of content" do
        content = " lorem ipsum dolor sit amet \n"
        term_counts(content).should == basic_tf
      end
    end

    describe "for content with multiple terms" do
      let(:multiple_tf) do
        {
          "lorem" => 1,
          "ipsum" => 2,
          "dolor" => 3,
          "sit" => 4,
          "amet" => 5
        }
      end

      it "should not care about order" do
        content = "lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet"
        term_counts(content).should == multiple_tf

        # Same multiset of words, shuffled.
        content = "amet amet lorem dolor dolor sit sit ipsum ipsum sit amet dolor sit amet amet"
        term_counts(content).should == multiple_tf
      end

      it "should consider capitalized terms to be the same" do
        content = "lorem IPSUM ipsum doLOR doLOR dolor SIT SIT SIT SIT amet ameT amET aMET AMET"
        term_counts(content).should == multiple_tf
      end
    end

    describe "for special cases" do
      let(:foo_bar_hash) do
        {
          "foo" => 1,
          "bar" => 1
        }
      end

      let(:foo_bar_baz_hash) do
        {
          "foo" => 1,
          "bar" => 1,
          "baz" => 1
        }
      end

      it "should return empty hash if there are no terms" do
        term_counts("").should == {}
      end

      it "should ignore numbers" do
        term_counts("1 2 3.14159").should == {}
      end

      it "should ignore control characters" do
        term_counts("\a\e\f\n\r\t\v").should == {}
        # Same bytes as above, with bell and escape spelled as hex escapes.
        term_counts("\x07\x1B\f\n\r\t\v").should == {}
      end

      it "should split words connected by special characters" do
        term_counts("foo/bar").should == foo_bar_hash
        term_counts("foo,bar").should == foo_bar_hash
        term_counts("foo,:bar").should == foo_bar_hash
        term_counts("foo ,:bar").should == foo_bar_hash
        term_counts("!@foo ,:bar#").should == foo_bar_hash

        term_counts("foo:bar baz").should == foo_bar_baz_hash
      end

      it "should not split words connected by only one hyphen" do
        term_counts("foo-bar").should == { "foo-bar" => 1 }
        term_counts("foo - bar").should == foo_bar_hash
        term_counts("foo --bar").should == foo_bar_hash
        # TODO: is this worth it?
        # term_counts("foo--bar").should == foo_bar_hash

        # Only asserts that a doubly-hyphenated word is NOT split into three terms.
        term_counts("foo-bar-baz").should_not == foo_bar_baz_hash
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'spec_helper'
2
+
3
# Specs for the RetrievalLite::Vector helpers, called here with plain Arrays
# of integers.
describe RetrievalLite::Vector do
  describe "dot product" do
    it "should compute correctly for vectors length 1" do
      expect(described_class.dot_product([3], [5])).to eq(15)
    end

    it "should compute correctly for longer vectors" do
      expect(described_class.dot_product([2, 3], [4, 5])).to eq(23)
    end

    it "should raise error for unequal sized arrays" do
      # NOTE(review): a bare raise_error passes on any exception; consider
      # naming the expected error class once it is confirmed.
      mismatched = lambda { described_class.dot_product([2, 3], [4]) }
      expect(&mismatched).to raise_error
    end
  end

  describe "euclidean length" do
    it "should calculate it for vectors length 1" do
      expect(described_class.euclidean_length([1])).to eq(1)
    end

    it "should calculate it for zero vectors" do
      expect(described_class.euclidean_length([0, 0, 0])).to eq(0)
    end

    it "should calculate it for longer vectors" do
      expect(described_class.euclidean_length([3, 4])).to eq(5)
    end
  end
end
metadata ADDED
@@ -0,0 +1,122 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: retrieval_lite
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Irvin Zhan
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-05-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '1.3'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '1.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Lightweight gem for document retrieval using tf-idf based algorithms
56
+ for Ruby
57
+ email:
58
+ - izhan@princeton.edu
59
+ executables: []
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - .gitignore
64
+ - .rspec
65
+ - Gemfile
66
+ - LICENSE
67
+ - LICENSE.txt
68
+ - README.md
69
+ - Rakefile
70
+ - lib/retrieval_lite.rb
71
+ - lib/retrieval_lite/boolean_retrieval.rb
72
+ - lib/retrieval_lite/corpus.rb
73
+ - lib/retrieval_lite/document.rb
74
+ - lib/retrieval_lite/tfidf_retrieval.rb
75
+ - lib/retrieval_lite/tokenizer.rb
76
+ - lib/retrieval_lite/vector.rb
77
+ - lib/version.rb
78
+ - retrieval_lite.gemspec
79
+ - spec/boolean_retrieval_spec.rb
80
+ - spec/corpus_spec.rb
81
+ - spec/document_spec.rb
82
+ - spec/retrieval_lite_spec.rb
83
+ - spec/spec_helper.rb
84
+ - spec/spec_helpers/file_helpers.rb
85
+ - spec/tfidf_retrieval_spec.rb
86
+ - spec/tokenizer_spec.rb
87
+ - spec/vector_spec.rb
88
+ homepage: https://github.com/izhan/retrieval_lite
89
+ licenses:
90
+ - MIT
91
+ metadata: {}
92
+ post_install_message:
93
+ rdoc_options: []
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - '>='
99
+ - !ruby/object:Gem::Version
100
+ version: '0'
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - '>='
104
+ - !ruby/object:Gem::Version
105
+ version: '0'
106
+ requirements: []
107
+ rubyforge_project:
108
+ rubygems_version: 2.1.11
109
+ signing_key:
110
+ specification_version: 4
111
+ summary: Please see associated GitHub page for usage.
112
+ test_files:
113
+ - spec/boolean_retrieval_spec.rb
114
+ - spec/corpus_spec.rb
115
+ - spec/document_spec.rb
116
+ - spec/retrieval_lite_spec.rb
117
+ - spec/spec_helper.rb
118
+ - spec/spec_helpers/file_helpers.rb
119
+ - spec/tfidf_retrieval_spec.rb
120
+ - spec/tokenizer_spec.rb
121
+ - spec/vector_spec.rb
122
+ has_rdoc: