retrieval_lite 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8e080b5b0c600b330f5d07827b5c2e576132306b
4
+ data.tar.gz: f0fa5049a6f5e9df0a1fbc761107d28dc1efe857
5
+ SHA512:
6
+ metadata.gz: b1827e3f90edbc3ee841fbe25526c748ac82ad1aae6f42455605e3199dffe7c5cd309c2684741d41f695c88285152c1bf83558beb87668ec9eebb4973f7ff0a3
7
+ data.tar.gz: 5d2009f175da5ce2cecb277061e9592093a14435d5012094c461afb1ac626b1d72705edf560cece8f53d4a02623f7ebe2fd3adc0f0555b0934cd5b878f6c12b2
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .DS_Store
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # gem's dependencies specified in retrieval_lite.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 Irvin Zhan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Irvin Zhan
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Retrieval Lite Gem
2
+
3
+ Lightweight Ruby gem for document retrieval using tf-idf based algorithms.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'retrieval_lite'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install retrieval_lite
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ desc "run tests"
4
+ task :spec do
5
+ sh "rspec spec"
6
+ end
7
+
8
+ task :default => :spec
@@ -0,0 +1,21 @@
1
module RetrievalLite
  # Unranked retrieval: returns the documents that satisfy a query without
  # scoring or ordering them.
  module BooleanRetrieval
    # Queries a corpus using a boolean expression with the standard operators,
    # AND, OR, NOT. Only returns documents that satisfy the query, and does
    # not rank the documents in any way.
    #
    # @note Operators are not implemented yet: the whole (trimmed, lower-cased)
    #   query is looked up as a single term.
    #
    # @param corpus [Corpus] the collection of documents
    # @param query [String] the boolean query to be evaluated
    # @return [Array<Document>] unordered array of documents that satisfy the
    #   query; empty when nothing matches
    # @raise [RuntimeError] if the query is not a valid boolean expression
    def self.evaluate(corpus, query)
      # TODO better validation message?
      raise "Boolean expression is not valid." unless is_valid?(query)

      # Indexed terms are lower-cased by the tokenizer, so the query has to be
      # normalised the same way or mixed-case queries never match.
      term = query.to_s.strip.downcase

      # The corpus answers nil for unknown terms; callers are promised an Array.
      corpus.documents_with(term) || []
    end

    # Internal helper. It stays publicly callable because a bare `private`
    # never applied to `def self.` methods in the first place.
    #
    # @param query [String] the boolean query to validate
    # @return [Boolean] currently always true (validation is not implemented)
    def self.is_valid?(query)
      true
    end
  end
end
@@ -0,0 +1,64 @@
1
require 'set'

module RetrievalLite
  # A collection of documents plus an inverted index from each term to the
  # documents that contain it.
  class Corpus
    # the documents within the corpus
    attr_reader :documents
    # hash of a term to the array of documents that contain the particular term
    attr_reader :term_occurrences

    # @param documents [Array<Document>] the documents of the corpus
    # @param opts [Hash] optional arguments to initializer
    # @option opts [Array<String>] :stop_words the words to ignore when creating tokens
    def initialize(documents = [], opts = {})
      # Copy the array so that #add never mutates the caller's collection.
      @documents = documents.dup
      @term_occurrences = {}
      # Stop words are compared against lower-cased tokens. Build lower-cased
      # copies instead of using downcase!, which would alter (or fail on
      # frozen) strings owned by the caller. A Set gives fast .include?.
      @stop_words = Set.new((opts[:stop_words] || []).map(&:downcase))

      @documents.each do |d|
        update_term_occurrences(d)
      end
    end

    # Adds a document to the corpus
    # @param document [Document] the document to be added
    def add(document)
      @documents << document
      update_term_occurrences(document)
    end

    # @return [Integer] the number of documents in the corpus
    def size
      documents.size
    end

    # @param term [String] the term to retrieve the documents for
    # @return [Array<Document>, nil] the array of documents containing the
    #   particular term, or nil if there is no such occurrence
    def documents_with(term)
      term_occurrences[term]
    end

    # @param term [String] the query term for the documents
    # @return [Integer] the number of documents that contain the particular term
    def document_frequency(term)
      matches = term_occurrences[term]
      matches ? matches.size : 0
    end

    private

    # Adds each non-stop-word term of the document to the term_occurrences hash.
    def update_term_occurrences(document)
      document.terms.each do |term|
        next if @stop_words.include?(term)

        (@term_occurrences[term] ||= []) << document
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
module RetrievalLite
  # A single piece of text together with the frequency of each of its terms.
  class Document
    # the text of the document
    attr_reader :content
    # a Hash<String, Integer> of all terms of the documents to the frequency of each term
    attr_reader :term_frequencies
    # the id of the document
    attr_reader :id

    # Stores the text and builds its term-frequency table with the tokenizer.
    #
    # @param content [String] the text of the document
    # @param opts [Hash] optional arguments to initializer
    # @option opts [String] :id the id of the document. Defaults to object_id assigned by ruby
    def initialize(content, opts = {})
      @content = content
      @id = opts[:id] || object_id
      @term_frequencies = RetrievalLite::Tokenizer.parse_content(content)
    end

    # for debugging: prints one "term: frequency" line per unique term
    def print_tokens
      @term_frequencies.each { |term, frequency| puts "#{term}: #{frequency}" }
    end

    # @return [Integer] the total number of unique terms in the document
    def term_count
      @term_frequencies.size
    end

    # @return [Array<String>] the unique terms of the document
    def terms
      @term_frequencies.keys
    end

    # @param term [String]
    # @return [Integer] the number of times a term appears in the document
    #   (0 when the term is absent)
    def frequency_of(term)
      @term_frequencies.fetch(term, 0)
    end

    # @return [Integer] the total number of terms (not unique) in the document
    def total_terms
      @term_frequencies.values.inject(0, :+)
    end
  end
end
@@ -0,0 +1,84 @@
1
require 'set'

# @see http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf
module RetrievalLite
  # Ranked retrieval using tf-idf weights and cosine similarity.
  module TfIdfRetrieval
    # Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
    # Returns documents ordered by tf-idf score.
    #
    # @param corpus [Corpus] the collection of documents
    # @param query [String] free-text query; it is tokenized like a document
    # @return [Array<Document>] documents containing at least one query term,
    #   ordered by descending score
    def self.evaluate(corpus, query)
      evaluate_with_scores(corpus, query).keys
    end

    # Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
    # Same as #evaluate but returns a hash whose keys are documents and values
    # are the cosine similarity between the query and the document.
    #
    # @param corpus [Corpus] the collection of documents
    # @param query [String] free-text query; it is tokenized like a document
    # @return [Hash<Document, Float>] documents containing at least one query
    #   term mapped to their score, ordered by descending score
    def self.evaluate_with_scores(corpus, query)
      query_document = RetrievalLite::Document.new(query)
      terms = query_document.terms
      # Hash#values and Hash#keys share the same order, so index i of the
      # query vector corresponds to terms[i].
      query_vector = query_document.term_frequencies.values

      # Gather only the documents that contain at least one of the query
      # terms. A Set de-duplicates documents that match several terms.
      candidates = Set.new
      terms.each do |term|
        matches = corpus.documents_with(term)
        candidates.merge(matches) if matches
      end

      scores = {}
      candidates.each do |document|
        document_vector = terms.map { |term| tfidf_weight(corpus, document, term) }
        scores[document] = RetrievalLite::Vector.cosine_similarity(query_vector, document_vector)
      end

      # order it by score in descending order
      Hash[scores.sort_by { |_document, score| score }.reverse]
    end

    # Weighs a term of a document using tf-idf scoring:
    # tf * ln(N / n_j), where N is the corpus size and n_j is the number of
    # documents containing the term.
    #
    # @note Returns 0.0 when no document of the corpus contains the term,
    #   which avoids a division by zero.
    #
    # @param corpus [Corpus]
    # @param document [Document]
    # @param term [String]
    # @return [Float] the tfidf weight of the term in the document
    def self.tfidf_weight(corpus, document, term)
      document_frequency = corpus.document_frequency(term)
      return 0.0 if document_frequency == 0

      document.frequency_of(term) * Math.log(corpus.size.to_f / document_frequency)
    end

    # Weighs a term of a document using tf-idf scoring, normalized by the
    # euclidean length of the tf-idf weights of every document containing
    # the term.
    # @see #tfidf_weight
    #
    # @note Returns 0.0 when the term is unknown to the corpus or when every
    #   weight is zero (e.g. the term occurs in all documents), instead of
    #   raising or producing NaN.
    #
    # @param corpus [Corpus]
    # @param document [Document]
    # @param term [String]
    # @return [Float] the normalized tfidf weight of the term in the document
    def self.normalized_tfidf_weight(corpus, document, term)
      # documents_with answers nil for unknown terms.
      containing = corpus.documents_with(term) || []

      sum_of_squares = containing.inject(0.0) do |sum, d|
        weight = tfidf_weight(corpus, d, term)
        sum + weight * weight
      end
      return 0.0 if sum_of_squares == 0

      tfidf_weight(corpus, document, term) / Math.sqrt(sum_of_squares)
    end
  end
end
@@ -0,0 +1,44 @@
1
module RetrievalLite
  # Splits raw text into lower-cased terms and counts them.
  module Tokenizer
    SPECIAL_SEPARATERS = ['[', ']', '\\', ';', '\'', ',', '.', '/', '!', '@', '#', '%', '&', '*', '(', ')', '_', '{', '}', ':', '"', '?', '=', '`', '~', '$', '^', '+', '|', '<', '>'].freeze

    # Matches any run of whitespace or any single special separator.
    # Built once: a Regexp literal is used for the whitespace part because
    # "\s" inside a double-quoted String is just a space character, which
    # would leave tabs and newlines unsplit. Regexp.union escapes the strings.
    SEPARATOR_PATTERN = Regexp.union(/\s+/, *SPECIAL_SEPARATERS)

    # A term made of two lower-case words joined by one hyphen. \z (not \Z)
    # so that a trailing newline can never be part of an accepted term.
    HYPHENATED_TERM = /\A[a-z]+-[a-z]+\z/

    # @param content [String] the text of the document
    # @return [Hash<String, Integer>] a hash that gives term frequency of content
    def self.parse_content(content)
      tokens = Hash.new(0) # initialize to 0

      content.strip.downcase.split(separaters_regex).each do |fragment|
        # Hyphenated words are kept verbatim; anything else is stripped of
        # every character that is not a lower-case letter.
        term = has_hyphen?(fragment) ? fragment : fragment.gsub(/[^a-z]/, '')

        # skip fragments that consisted of non-letters only
        tokens[term] += 1 unless term.empty?
      end

      tokens
    end

    # Internal helpers. They stay publicly callable because a bare `private`
    # never applied to `def self.` methods in the first place.

    # separates by whitespace and any special characters
    #
    # @return [Regexp] the pattern used to split content into fragments
    def self.separaters_regex
      SEPARATOR_PATTERN
    end

    # detects whether term is hyphenated
    #
    # @param term [String]
    # @return [Integer, nil] 0 (truthy) when the term is hyphenated, nil otherwise
    def self.has_hyphen?(term)
      term =~ HYPHENATED_TERM
    end
  end
end
@@ -0,0 +1,39 @@
1
module RetrievalLite
  # Small vector helpers operating on plain arrays of numeric scores.
  module Vector
    # @param scores1 [Array<Numeric>] each term and its corresponding score in the first document
    # @param scores2 [Array<Numeric>] each term and its corresponding score in the second document
    # @return [Float] the cosine similarity of the two vectors representing the score of the documents;
    #   0.0 when either vector has zero length
    # @raise [RuntimeError] if both vectors are non-zero but differ in size
    def self.cosine_similarity(scores1, scores2)
      magnitude = euclidean_length(scores1) * euclidean_length(scores2)
      # A zero vector has no direction; report no similarity rather than dividing by zero.
      return 0.0 if magnitude == 0

      dot_product(scores1, scores2) / magnitude
    end

    # @param scores1 [Array<Numeric>] each term and its corresponding score in the first document
    # @param scores2 [Array<Numeric>] each term and its corresponding score in the second document
    # @return [Numeric] the dot product of the two vectors representing the score of the documents
    # @raise [RuntimeError] if the vectors differ in size
    def self.dot_product(scores1, scores2)
      raise "document vectors are not of same length" if scores1.size != scores2.size

      scores1.zip(scores2).inject(0) { |sum, (a, b)| sum + a * b }
    end

    # @param scores [Array<Numeric>] each term and its corresponding score in the document
    # @return [Float] the euclidean length of the vectors representing the score of the document
    def self.euclidean_length(scores)
      Math.sqrt(scores.inject(0) { |sum, score| sum + score * score })
    end
  end
end
@@ -0,0 +1,12 @@
1
+ require "version"
2
+
3
+ module RetrievalLite
4
+
5
+ end
6
+
7
+ require 'retrieval_lite/document'
8
+ require 'retrieval_lite/corpus'
9
+ require 'retrieval_lite/tokenizer'
10
+ require 'retrieval_lite/boolean_retrieval'
11
+ require 'retrieval_lite/tfidf_retrieval'
12
+ require 'retrieval_lite/vector'
data/lib/version.rb ADDED
@@ -0,0 +1,3 @@
1
module RetrievalLite
  # Version of the gem, read by the gemspec. Frozen so that it cannot be
  # modified in place at runtime.
  VERSION = "1.0.0".freeze
end
@@ -0,0 +1,24 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "retrieval_lite"
8
+ spec.version = RetrievalLite::VERSION
9
+ spec.authors = ["Irvin Zhan"]
10
+ spec.email = ["izhan@princeton.edu"]
11
+ spec.description = %q{Lightweight gem for document retrieval using tf-idf based algorithms for Ruby}
12
+ spec.summary = %q{Please see associated GitHub page for usage.}
13
+ spec.homepage = "https://github.com/izhan/retrieval_lite"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "rspec"
22
+ spec.add_development_dependency "bundler", "~> 1.3"
23
+ spec.add_development_dependency "rake"
24
+ end
@@ -0,0 +1,44 @@
1
+ require 'spec_helper'
2
+
3
+ describe RetrievalLite::BooleanRetrieval do
4
+ let (:document) do
5
+ RetrievalLite::Document.new("lorem ipsum dolor sit amet")
6
+ end
7
+ let (:document_replicated) do
8
+ RetrievalLite::Document.new("lorem ipsum dolor sit amet")
9
+ end
10
+ let (:document_with_duplicates) do
11
+ RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
12
+ end
13
+ let (:document_two) do
14
+ RetrievalLite::Document.new("Mauris ullamcorper, tortor et consequat sagittis.")
15
+ end
16
+ let (:document_three) do
17
+ RetrievalLite::Document.new("Pellentesque felis lectus, lacinia nec mauris non.")
18
+ end
19
+ let (:document_paragraph) do
20
+ RetrievalLite::Document.new("In semper enim non ullamcorper venenatis.
21
+ Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
22
+ Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
23
+ consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
24
+ vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
25
+ euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
26
+ a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
27
+ In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
28
+ )
29
+ end
30
+ let (:all_documents) do
31
+ [document, document_replicated, document_with_duplicates, document_two, document_three, document_paragraph]
32
+ end
33
+ let (:corpus) do
34
+ RetrievalLite::Corpus.new(all_documents)
35
+ end
36
+ describe "one-term retrieval" do
37
+ it "should return array of all documents with that term" do
38
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem") == [document, document_replicated, document_with_duplicates]
39
+ end
40
+ it "should ignore case" do
41
+ RetrievalLite::BooleanRetrieval.evaluate(corpus, "LOREM") == [document, document_replicated, document_with_duplicates]
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,132 @@
1
+ require 'spec_helper'
2
+
3
+ describe RetrievalLite::Corpus do
4
+ let (:document) do
5
+ RetrievalLite::Document.new("lorem ipsum dolor sit amet")
6
+ end
7
+ let (:document_replicated) do
8
+ RetrievalLite::Document.new("lorem ipsum dolor sit amet")
9
+ end
10
+ let (:document_with_duplicates) do
11
+ RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
12
+ end
13
+ let (:document_two) do
14
+ RetrievalLite::Document.new("Mauris ullamcorper, tortor et consequat sagittis.")
15
+ end
16
+ let (:document_three) do
17
+ RetrievalLite::Document.new("Pellentesque felis lectus, lacinia nec mauris non.")
18
+ end
19
+ let (:document_paragraph) do
20
+ RetrievalLite::Document.new("In semper enim non ullamcorper venenatis.
21
+ Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
22
+ Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
23
+ consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
24
+ vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
25
+ euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
26
+ a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
27
+ In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
28
+ )
29
+ end
30
+ let (:all_documents) do
31
+ [document, document_replicated, document_with_duplicates, document_two, document_three, document_paragraph]
32
+ end
33
+
34
+ describe "for empty corpus" do
35
+ let (:corpus) do
36
+ RetrievalLite::Corpus.new
37
+ end
38
+
39
+ it "should have size of zero" do
40
+ corpus.size.should == 0
41
+ end
42
+ it "should not error when querying terms" do
43
+ expect { corpus.documents_with("foo") }.to_not raise_error
44
+ expect { corpus.document_frequency("foo") }.to_not raise_error
45
+ end
46
+ end
47
+
48
+ describe "for basic one-document corpus" do
49
+ let (:corpus) do
50
+ RetrievalLite::Corpus.new([document])
51
+ end
52
+
53
+ it "should have size of one" do
54
+ corpus.size.should == 1
55
+ end
56
+ it "should give us correct document frequencies" do
57
+ terms = ["lorem", "ipsum", "dolor", "sit", "amet"]
58
+ terms.each do |t|
59
+ corpus.document_frequency(t).should == 1
60
+ end
61
+ corpus.document_frequency("foo").should == 0
62
+ end
63
+ it "should return document when queried" do
64
+ terms = ["lorem", "ipsum", "dolor", "sit", "amet"]
65
+ terms.each do |t|
66
+ corpus.documents_with(t).should == [document]
67
+ end
68
+ corpus.documents_with("foo").should == nil
69
+ end
70
+ end
71
+
72
+ describe "for two-identical-document corpus" do
73
+ let (:corpus) do
74
+ RetrievalLite::Corpus.new([document, document_replicated])
75
+ end
76
+ it "should give us correct document frequencies" do
77
+ terms = ["lorem", "ipsum", "dolor", "sit", "amet"]
78
+ terms.each do |t|
79
+ corpus.document_frequency(t).should == 2
80
+ end
81
+ corpus.document_frequency("foo").should == 0
82
+ end
83
+ it "should return document when queried" do
84
+ terms = ["lorem", "ipsum", "dolor", "sit", "amet"]
85
+ terms.each do |t|
86
+ corpus.documents_with(t).should == [document, document_replicated]
87
+ end
88
+ corpus.documents_with("foo").should == nil
89
+ end
90
+ end
91
+
92
+ describe "for multiple-document corpus" do
93
+ let (:corpus) do
94
+ RetrievalLite::Corpus.new(all_documents)
95
+ end
96
+
97
+ it "should have the correct size" do
98
+ corpus.size.should == 6
99
+ end
100
+
101
+ # TODO are more comprehensive tests needed....?
102
+ it "should give us correct document frequencies" do
103
+ corpus.document_frequency("lorem").should == 3
104
+ corpus.document_frequency("semper").should == 1
105
+ end
106
+ end
107
+
108
+ describe "adding in documents one at a time" do
109
+ let (:correct_corpus) do
110
+ RetrievalLite::Corpus.new(all_documents)
111
+ end
112
+ let (:corpus) do
113
+ RetrievalLite::Corpus.new
114
+ end
115
+
116
+ it "should be same as initializing corpus with all documents" do
117
+ all_documents.each do |d|
118
+ corpus.add(d)
119
+ end
120
+ corpus.documents.should == correct_corpus.documents
121
+ end
122
+ end
123
+
124
+ describe "with optional parameters" do
125
+ it "should ignore any stopwords (not case sensitive)" do
126
+ stop_words = ["lorem", "IPSum"]
127
+ corpus = RetrievalLite::Corpus.new([document], stop_words: stop_words)
128
+ corpus.documents_with("lorem").should == nil
129
+ corpus.documents_with("ipsum").should == nil
130
+ end
131
+ end
132
+ end
@@ -0,0 +1,96 @@
1
+ require 'spec_helper'
2
+
3
+ describe RetrievalLite::Document do
4
+ describe "for a basic document" do
5
+ let (:document) do
6
+ RetrievalLite::Document.new("lorem ipsum dolor sit amet")
7
+ end
8
+ let (:capitalized_document) do
9
+ RetrievalLite::Document.new("LorEM iPSUM DOLOR sit ameT")
10
+ end
11
+ let (:document_with_duplicates) do
12
+ RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
13
+ end
14
+ let (:basic_tf) do
15
+ {
16
+ "lorem" => 1,
17
+ "ipsum" => 1,
18
+ "dolor" => 1,
19
+ "sit" => 1,
20
+ "amet" => 1
21
+ }
22
+ end
23
+ let (:multiple_tf) do
24
+ {
25
+ "lorem" => 1,
26
+ "ipsum" => 2,
27
+ "dolor" => 3,
28
+ "sit" => 4,
29
+ "amet" => 5
30
+ }
31
+ end
32
+
33
+ describe "content of the document" do
34
+ it "should have original content" do
35
+ document.content.should == "lorem ipsum dolor sit amet"
36
+ capitalized_document.content.should == "LorEM iPSUM DOLOR sit ameT"
37
+ document_with_duplicates.content.should == "lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet"
38
+ end
39
+ end
40
+
41
+ describe "the number of terms of the document" do
42
+ it "should be correct for singleton terms" do
43
+ document.term_count.should == 5
44
+ end
45
+ it "should not care about capitalization" do
46
+ capitalized_document.term_count.should == 5
47
+ end
48
+ it "should be correct for duplicate terms" do
49
+ document_with_duplicates.term_count.should == 5
50
+ end
51
+ end
52
+
53
+ describe "id of the document" do
54
+ it "should default to ruby's object_id" do
55
+ document.id.should == document.object_id
56
+ end
57
+ end
58
+
59
+ describe "term frequencies of the document" do
60
+ it "should be correct for singleton terms" do
61
+ document.term_frequencies.should == basic_tf
62
+ end
63
+ it "should be correct for capitalization" do
64
+ capitalized_document.term_frequencies.should == basic_tf
65
+ end
66
+ it "should be correct for capitalization" do
67
+ document_with_duplicates.term_frequencies.should == multiple_tf
68
+ end
69
+ end
70
+
71
+
72
+ describe "frequencies of a term" do
73
+ it "should be correct for term in document" do
74
+ document.frequency_of("lorem").should == 1
75
+ document_with_duplicates.frequency_of("ipsum").should == 2
76
+ end
77
+ it "should be zero for term not in document" do
78
+ document.frequency_of("foo").should == 0
79
+ document_with_duplicates.frequency_of("foo").should == 0
80
+ end
81
+ end
82
+
83
+ describe "for blank document" do
84
+ it "should not raise error on initialization" do
85
+ expect { RetrievalLite::Document.new("") }.to_not raise_error
86
+ end
87
+ end
88
+ end
89
+
90
+ describe "optional parameters" do
91
+ it "should allow for customized id" do
92
+ doc = RetrievalLite::Document.new("lorem ipsum dolor sit amet", id: "foo")
93
+ doc.id.should == "foo"
94
+ end
95
+ end
96
+ end
@@ -0,0 +1,3 @@
1
+ require 'spec_helper'
2
+ describe RetrievalLite do
3
+ end
@@ -0,0 +1,16 @@
1
+ require "retrieval_lite"
2
+ require "spec_helpers/file_helpers"
3
+
4
+ RSpec.configure do |config|
5
+ config.treat_symbols_as_metadata_keys_with_true_values = true
6
+ config.run_all_when_everything_filtered = true
7
+ config.filter_run :focus
8
+
9
+ config.include RetrievalLite::SpecHelpers::FileHelpers
10
+
11
+ # Run specs in random order to surface order dependencies. If you find an
12
+ # order dependency and want to debug it, you can fix the order by providing
13
+ # the seed, which is printed after each run.
14
+ # --seed 1234
15
+ config.order = 'random'
16
+ end
@@ -0,0 +1,9 @@
1
+ module RetrievalLite
2
+ module SpecHelpers
3
+
4
+ module FileHelpers
5
+
6
+ end
7
+
8
+ end
9
+ end
@@ -0,0 +1,130 @@
1
+ require 'spec_helper'
2
+
3
+ describe RetrievalLite::TfIdfRetrieval do
4
+ let (:document_one_term) do
5
+ RetrievalLite::Document.new("lorem")
6
+ end
7
+ let (:document) do
8
+ RetrievalLite::Document.new("lorem ipsum dolor sit amet")
9
+ end
10
+ let (:document_with_duplicates) do
11
+ RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
12
+ end
13
+ let (:document_doubled) do
14
+ RetrievalLite::Document.new("lorem ipsum dolor sit amet lorem ipsum dolor sit amet")
15
+ end
16
+ let (:document_both_terms) do
17
+ RetrievalLite::Document.new("lorem ipsum")
18
+ end
19
+ let (:document_with_unique) do
20
+ RetrievalLite::Document.new("lorem unique")
21
+ end
22
+ # sorted by lorem order
23
+ let (:all_documents) do
24
+ [document, document_with_duplicates, document_doubled, document_one_term, document_both_terms, document_with_unique]
25
+ end
26
+ let (:corpus) do
27
+ RetrievalLite::Corpus.new(all_documents)
28
+ end
29
+ let (:corpus_different) do
30
+ RetrievalLite::Corpus.new([document_one_term, document, document_with_duplicates])
31
+ end
32
+
33
+ describe "calculating tf-idf scores" do
34
+ describe "term that all documents have" do
35
+ it "should have correct tf-idf" do
36
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_one_term, "lorem").should be_within(0.001).of(0)
37
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document, "lorem").should be_within(0.001).of(0)
38
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_with_duplicates, "lorem").should be_within(0.001).of(0)
39
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_doubled, "lorem").should be_within(0.001).of(0)
40
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_both_terms, "lorem").should be_within(0.001).of(0)
41
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_with_unique, "lorem").should be_within(0.001).of(0)
42
+ end
43
+ end
44
+
45
+ describe "term that a few documents have" do
46
+ it "should have correct tf-idf" do
47
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.405)
48
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.811)
49
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.811)
50
+ RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(0.405)
51
+ end
52
+ end
53
+ end
54
+
55
+ describe "calculating normalized tf-idf scores" do
56
+ describe "term that a few documents have" do
57
+ it "should have correct tf-idf" do
58
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.316)
59
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.632)
60
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.632)
61
+ RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(0.316)
62
+ end
63
+ end
64
+ end
65
+
66
+ describe "calculating total tf-idf scores" do
67
+ describe "for when all documents of corpus has a term" do
68
+ it "should have score of zero for each document" do
69
+ scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "lorem")
70
+ scores.size.should == all_documents.size
71
+ scores.values.each do |v|
72
+ v.should == 0
73
+ end
74
+ end
75
+ end
76
+ describe "term that only one document has" do
77
+ it "should return the correct score" do
78
+ scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "unique")
79
+ scores.size.should == 1
80
+ scores[document_with_unique].should be_within(0.001).of(1.0)
81
+ end
82
+ end
83
+ describe "term that a few documents have" do
84
+ it "should return the correct score" do
85
+ scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "ipsum")
86
+ scores.size.should == 4
87
+ scores[document].should be_within(0.001).of(1.0)
88
+ scores[document_with_duplicates].should be_within(0.001).of(1.0)
89
+ scores[document_doubled].should be_within(0.001).of(1.0)
90
+ scores[document_both_terms].should be_within(0.001).of(1.0)
91
+ end
92
+ end
93
+ end
94
+
95
+ describe "one-term retrieval" do
96
+ it "should return array with that term" do
97
+ RetrievalLite::TfIdfRetrieval.evaluate(corpus, "lorem").should =~ all_documents
98
+ end
99
+ it "should ignore case" do
100
+ RetrievalLite::TfIdfRetrieval.evaluate(corpus, "LOREM").should =~ all_documents
101
+ end
102
+ end
103
+
104
+ describe "when corpus has only one document containing term" do
105
+ it "should return array with only that document" do
106
+ RetrievalLite::TfIdfRetrieval.evaluate(corpus, "unique").should == [document_with_unique]
107
+ end
108
+ end
109
+
110
+ describe "for no matches" do
111
+ it "should return empty array for term not in any documents" do
112
+ RetrievalLite::TfIdfRetrieval.evaluate(corpus, "foobar").should == []
113
+ end
114
+ it "should return empty array for empty string" do
115
+ RetrievalLite::TfIdfRetrieval.evaluate(corpus, "").should == []
116
+ end
117
+ end
118
+
119
+ describe "multiple-term retrieval" do
120
+ it "should order documents correctly" do
121
+ RetrievalLite::TfIdfRetrieval.evaluate(corpus_different, "lorem dolor sit").should == [document, document_with_duplicates, document_one_term]
122
+ end
123
+ it "should have the correct scores" do
124
+ scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_different, "lorem dolor sit")
125
+ scores[document].should be_within(0.001).of(0.816)
126
+ scores[document_with_duplicates].should be_within(0.001).of(0.808)
127
+ scores[document_one_term].should be_within(0.001).of(0.0)
128
+ end
129
+ end
130
+ end
@@ -0,0 +1,114 @@
1
+ require 'spec_helper'
2
+
3
+ describe RetrievalLite::Tokenizer do
4
+ describe "parse_content" do
5
+ describe "for basic terms" do
6
+ let (:basic_tf) do
7
+ {
8
+ "lorem" => 1,
9
+ "ipsum" => 1,
10
+ "dolor" => 1,
11
+ "sit" => 1,
12
+ "amet" => 1
13
+ }
14
+ end
15
+
16
+ it "should split the content" do
17
+ content = "lorem ipsum dolor sit amet"
18
+ RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
19
+ end
20
+
21
+ it "should ignore extra white spaces" do
22
+ content = "lorem ipsum dolor \n sit amet"
23
+ RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
24
+ end
25
+
26
+ it "should ignore punctuation" do
27
+ content = "lorem! @ #ipsum (dolor) sit * \ amet"
28
+ RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
29
+ end
30
+
31
+ it "should ignore capitalization" do
32
+ content = "LOREM iPSuM dOLOR sit amet"
33
+ RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
34
+ end
35
+
36
+ it "should ignore any whitespaces in front and back of content" do
37
+ content = " lorem ipsum dolor sit amet \n"
38
+ RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
39
+ end
40
+ end
41
+
42
+ describe "for content with multiple terms" do
43
+ let (:multiple_tf) do
44
+ {
45
+ "lorem" => 1,
46
+ "ipsum" => 2,
47
+ "dolor" => 3,
48
+ "sit" => 4,
49
+ "amet" => 5
50
+ }
51
+ end
52
+ it "should not care about order" do
53
+ content = "lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet"
54
+ RetrievalLite::Tokenizer.parse_content(content).should == multiple_tf
55
+
56
+ content = "amet amet lorem dolor dolor sit sit ipsum ipsum sit amet dolor sit amet amet"
57
+ RetrievalLite::Tokenizer.parse_content(content).should == multiple_tf
58
+ end
59
+
60
+ it "should consider capitalized terms to be the same" do
61
+ content = "lorem IPSUM ipsum doLOR doLOR dolor SIT SIT SIT SIT amet ameT amET aMET AMET"
62
+ RetrievalLite::Tokenizer.parse_content(content).should == multiple_tf
63
+ end
64
+ end
65
+
66
+ describe "for special cases" do
67
+ let(:foo_bar_hash) do
68
+ {
69
+ "foo" => 1,
70
+ "bar" => 1
71
+ }
72
+ end
73
+ let(:foo_bar_baz_hash) do
74
+ {
75
+ "foo" => 1,
76
+ "bar" => 1,
77
+ "baz" => 1
78
+ }
79
+ end
80
+
81
+ it "should return empty hash if there are no terms" do
82
+ RetrievalLite::Tokenizer.parse_content("").should == Hash.new
83
+ end
84
+
85
+ it "should ignore numbers" do
86
+ RetrievalLite::Tokenizer.parse_content("1 2 3.14159").should == Hash.new
87
+ end
88
+
89
+ it "should ignore control characters" do
90
+ RetrievalLite::Tokenizer.parse_content("\a\e\f\n\r\t\v").should == Hash.new
91
+ RetrievalLite::Tokenizer.parse_content("\x07\x1B\f\n\r\t\v").should == Hash.new
92
+ end
93
+
94
+ it "should split words connected by special characters" do
95
+ RetrievalLite::Tokenizer.parse_content("foo/bar").should == foo_bar_hash
96
+ RetrievalLite::Tokenizer.parse_content("foo,bar").should == foo_bar_hash
97
+ RetrievalLite::Tokenizer.parse_content("foo,:bar").should == foo_bar_hash
98
+ RetrievalLite::Tokenizer.parse_content("foo ,:bar").should == foo_bar_hash
99
+ RetrievalLite::Tokenizer.parse_content("!@foo ,:bar#").should == foo_bar_hash
100
+
101
+ RetrievalLite::Tokenizer.parse_content("foo:bar baz").should == foo_bar_baz_hash
102
+ end
103
+
104
+ it "should not split words connected by only one hyphen" do
105
+ RetrievalLite::Tokenizer.parse_content("foo-bar").should == { "foo-bar" => 1 }
106
+ RetrievalLite::Tokenizer.parse_content("foo - bar").should == foo_bar_hash
107
+ RetrievalLite::Tokenizer.parse_content("foo --bar").should == foo_bar_hash
108
+ #RetrievalLite::Tokenizer.parse_content("foo--bar").should == foo_bar_hash # TODO is this worth it?
109
+
110
+ RetrievalLite::Tokenizer.parse_content("foo-bar-baz").should_not == foo_bar_baz_hash
111
+ end
112
+ end
113
+ end
114
+ end
@@ -0,0 +1,27 @@
1
+ require 'spec_helper'
2
+
3
+ describe RetrievalLite::Vector do
4
+ describe "dot product" do
5
+ it "should compute correctly for vectors length 1" do
6
+ RetrievalLite::Vector.dot_product([3], [5]).should == 15
7
+ end
8
+ it "should compute correctly for longer vectors" do
9
+ RetrievalLite::Vector.dot_product([2, 3], [4, 5]).should == 23
10
+ end
11
+ it "should raise error for unequal sized arrays" do
12
+ expect { RetrievalLite::Vector.dot_product([2, 3], [4]) }.to raise_error
13
+ end
14
+ end
15
+
16
+ describe "euclidean length" do
17
+ it "should calculate it for vectors length 1" do
18
+ RetrievalLite::Vector.euclidean_length([1]).should == 1
19
+ end
20
+ it "should calculate it for zero vectors" do
21
+ RetrievalLite::Vector.euclidean_length([0, 0, 0]).should == 0
22
+ end
23
+ it "should calculate it for longer vectors" do
24
+ RetrievalLite::Vector.euclidean_length([3, 4]).should == 5
25
+ end
26
+ end
27
+ end
metadata ADDED
@@ -0,0 +1,122 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: retrieval_lite
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Irvin Zhan
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-05-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '1.3'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '1.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Lightweight gem for document retrieval using tf-idf based algorithms
56
+ for Ruby
57
+ email:
58
+ - izhan@princeton.edu
59
+ executables: []
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - .gitignore
64
+ - .rspec
65
+ - Gemfile
66
+ - LICENSE
67
+ - LICENSE.txt
68
+ - README.md
69
+ - Rakefile
70
+ - lib/retrieval_lite.rb
71
+ - lib/retrieval_lite/boolean_retrieval.rb
72
+ - lib/retrieval_lite/corpus.rb
73
+ - lib/retrieval_lite/document.rb
74
+ - lib/retrieval_lite/tfidf_retrieval.rb
75
+ - lib/retrieval_lite/tokenizer.rb
76
+ - lib/retrieval_lite/vector.rb
77
+ - lib/version.rb
78
+ - retrieval_lite.gemspec
79
+ - spec/boolean_retrieval_spec.rb
80
+ - spec/corpus_spec.rb
81
+ - spec/document_spec.rb
82
+ - spec/retrieval_lite_spec.rb
83
+ - spec/spec_helper.rb
84
+ - spec/spec_helpers/file_helpers.rb
85
+ - spec/tfidf_retrieval_spec.rb
86
+ - spec/tokenizer_spec.rb
87
+ - spec/vector_spec.rb
88
+ homepage: https://github.com/izhan/retrieval_lite
89
+ licenses:
90
+ - MIT
91
+ metadata: {}
92
+ post_install_message:
93
+ rdoc_options: []
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - '>='
99
+ - !ruby/object:Gem::Version
100
+ version: '0'
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - '>='
104
+ - !ruby/object:Gem::Version
105
+ version: '0'
106
+ requirements: []
107
+ rubyforge_project:
108
+ rubygems_version: 2.1.11
109
+ signing_key:
110
+ specification_version: 4
111
+ summary: Please see associated GitHub page for usage.
112
+ test_files:
113
+ - spec/boolean_retrieval_spec.rb
114
+ - spec/corpus_spec.rb
115
+ - spec/document_spec.rb
116
+ - spec/retrieval_lite_spec.rb
117
+ - spec/spec_helper.rb
118
+ - spec/spec_helpers/file_helpers.rb
119
+ - spec/tfidf_retrieval_spec.rb
120
+ - spec/tokenizer_spec.rb
121
+ - spec/vector_spec.rb
122
+ has_rdoc: