retrieval_lite 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE +21 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +8 -0
- data/lib/retrieval_lite/boolean_retrieval.rb +21 -0
- data/lib/retrieval_lite/corpus.rb +64 -0
- data/lib/retrieval_lite/document.rb +55 -0
- data/lib/retrieval_lite/tfidf_retrieval.rb +84 -0
- data/lib/retrieval_lite/tokenizer.rb +44 -0
- data/lib/retrieval_lite/vector.rb +39 -0
- data/lib/retrieval_lite.rb +12 -0
- data/lib/version.rb +3 -0
- data/retrieval_lite.gemspec +24 -0
- data/spec/boolean_retrieval_spec.rb +44 -0
- data/spec/corpus_spec.rb +132 -0
- data/spec/document_spec.rb +96 -0
- data/spec/retrieval_lite_spec.rb +3 -0
- data/spec/spec_helper.rb +16 -0
- data/spec/spec_helpers/file_helpers.rb +9 -0
- data/spec/tfidf_retrieval_spec.rb +130 -0
- data/spec/tokenizer_spec.rb +114 -0
- data/spec/vector_spec.rb +27 -0
- metadata +122 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8e080b5b0c600b330f5d07827b5c2e576132306b
|
4
|
+
data.tar.gz: f0fa5049a6f5e9df0a1fbc761107d28dc1efe857
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b1827e3f90edbc3ee841fbe25526c748ac82ad1aae6f42455605e3199dffe7c5cd309c2684741d41f695c88285152c1bf83558beb87668ec9eebb4973f7ff0a3
|
7
|
+
data.tar.gz: 5d2009f175da5ce2cecb277061e9592093a14435d5012094c461afb1ac626b1d72705edf560cece8f53d4a02623f7ebe2fd3adc0f0555b0934cd5b878f6c12b2
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2014 Irvin Zhan
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Irvin Zhan
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Retrieval Lite Gem
|
2
|
+
|
3
|
+
A lightweight Ruby gem for document retrieval using tf-idf based ranking algorithms.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'retrieval_lite'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install retrieval_lite
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
TODO: Write usage instructions here
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
module RetrievalLite
  # Unranked term lookup against a corpus.
  #
  # NOTE: the query is currently looked up as a single term; the boolean
  # operators AND, OR and NOT are not parsed yet.
  module BooleanRetrieval
    # Returns the documents of the corpus that contain the query term. The
    # result is not ranked in any way.
    #
    # The query is stripped and lowercased before the lookup because the
    # tokenizer stores every term in lowercase.
    #
    # @param corpus [Corpus] the collection of documents
    # @param query [String] the term to look up (case-insensitive)
    # @return [Array<Document>] unordered array of documents that satisfy the
    #   query; empty when no document matches
    # @raise [RuntimeError] if the query is not a valid expression
    def self.evaluate(corpus, query)
      # TODO better validation message?
      raise "Boolean expression is not valid." unless is_valid?(query)

      # Corpus#documents_with returns nil for an unknown term; normalise that
      # to an empty array so the documented return type always holds.
      corpus.documents_with(query.strip.downcase) || []
    end

    # Placeholder validation: every query is accepted for now.
    #
    # @param query [String] the query to validate
    # @return [Boolean] always true
    def self.is_valid?(query)
      true
    end
    # A bare `private` does not apply to `def self.` methods, so the helper is
    # hidden explicitly.
    private_class_method :is_valid?
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'set'

module RetrievalLite
  # A collection of documents together with an index from each term to the
  # documents that contain it. Stop words are never indexed.
  class Corpus
    # the documents within the corpus
    attr_reader :documents
    # hash of a term to the array of documents that contain the particular term
    attr_reader :term_occurrences

    # @param documents [Array<Document>] the documents of the corpus
    # @param opts [Hash] optional arguments to initializer
    # @option opts [Array<String>] :stop_words the words to ignore when creating tokens
    def initialize(documents = [], opts = {})
      # Copy the array so that #add never mutates the caller's collection.
      @documents = documents.dup
      @term_occurrences = {}
      # Tokens are lowercase, so stop words are compared in lowercase too.
      # `map(&:downcase)` builds new strings instead of mutating (or failing
      # on frozen) caller-owned ones; the Set gives a fast #include?.
      @stop_words = Set.new((opts[:stop_words] || []).map(&:downcase))

      @documents.each { |document| update_term_occurrences(document) }
    end

    # Adds a document to the corpus and indexes its terms.
    #
    # @param document [Document] the document to be added
    def add(document)
      @documents << document
      update_term_occurrences(document)
    end

    # @return [Integer] the number of documents in the corpus
    def size
      documents.size
    end

    # @param term [String] the term to retrieve the documents for
    # @return [Array<Document>, nil] the documents containing the term, or nil
    #   if the term does not occur in the corpus
    def documents_with(term)
      term_occurrences[term]
    end

    # @param term [String] the query term for the documents
    # @return [Integer] the number of documents that contain the particular term
    def document_frequency(term)
      occurrences = term_occurrences[term]
      occurrences ? occurrences.size : 0
    end

    private

    # Adds each non-stop-word term of the document to the term_occurrences hash.
    def update_term_occurrences(document)
      document.terms.each do |term|
        next if @stop_words.include?(term)

        (@term_occurrences[term] ||= []) << document
      end
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# A single piece of text together with the frequency of each of its terms.
class RetrievalLite::Document
  # the raw text the document was created from
  attr_reader :content
  # Hash<String, Integer> mapping each term of the document to how often it occurs
  attr_reader :term_frequencies
  # identifier of the document
  attr_reader :id

  # Tokenizes the text and records the frequency of every term.
  #
  # @param content [String] the text of the document
  # @param opts [Hash] optional arguments to initializer
  # @option opts [String] :id the id of the document. Defaults to object_id assigned by ruby
  def initialize(content, opts = {})
    @content = content
    @term_frequencies = RetrievalLite::Tokenizer.parse_content(content)
    @id = opts[:id] || object_id
  end

  # Debugging aid: prints one "term: frequency" line per term.
  def print_tokens
    @term_frequencies.each { |term, frequency| puts "#{term}: #{frequency}" }
  end

  # @return [Integer] how many distinct terms the document has
  def term_count
    @term_frequencies.length
  end

  # @return [Array<String>] the distinct terms of the document
  def terms
    @term_frequencies.keys
  end

  # @param term [String]
  # @return [Integer] how often the term occurs in the document (0 if absent)
  def frequency_of(term)
    @term_frequencies.fetch(term, 0)
  end

  # @return [Integer] the number of terms in the document, counting repeats
  def total_terms
    @term_frequencies.values.inject(0) { |running_total, frequency| running_total + frequency }
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'set'

module RetrievalLite
  # Ranked retrieval using tf-idf weights and cosine similarity.
  #
  # @see http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf
  module TfIdfRetrieval
    # Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
    # Returns documents ordered by score, best match first.
    #
    # @param corpus [Corpus] the collection of documents
    # @param query [String] free-text query; it is tokenized like a document
    # @return [Array<Document>] documents containing at least one query term,
    #   ordered by descending score
    def self.evaluate(corpus, query)
      evaluate_with_scores(corpus, query).keys
    end

    # Same as .evaluate but returns a hash whose keys are the documents and
    # whose values are their cosine-similarity scores.
    #
    # The query vector holds the raw term counts of the query; each document
    # vector holds the tf-idf weight of the same terms in that document.
    #
    # @param corpus [Corpus] the collection of documents
    # @param query [String] free-text query; it is tokenized like a document
    # @return [Hash<Document, Numeric>] documents mapped to their score, in
    #   descending score order
    def self.evaluate_with_scores(corpus, query)
      query_document = RetrievalLite::Document.new(query)
      terms = query_document.term_frequencies.keys
      query_vector = query_document.term_frequencies.values # same order as keys

      # Only documents containing at least one query term are scored; the Set
      # removes duplicates (ordering does not matter at this point).
      candidates = Set.new
      terms.each do |term|
        docs_with_term = corpus.documents_with(term)
        candidates.merge(docs_with_term) if docs_with_term
      end

      scores = {}
      candidates.each do |document|
        document_vector = terms.map { |term| tfidf_weight(corpus, document, term) }
        scores[document] = RetrievalLite::Vector.cosine_similarity(query_vector, document_vector)
      end

      # order it by score in descending order
      Hash[scores.sort_by { |_document, score| score }.reverse]
    end

    # Weighs a term of a document using tf-idf:
    # frequency of the term in the document * ln(corpus size / document frequency).
    #
    # @note a term that occurs in no document of the corpus gets weight 0,
    #   which avoids a division by zero.
    #
    # @param corpus [Corpus]
    # @param document [Document]
    # @param term [String]
    # @return [Numeric] the tfidf weight of the term in the document
    def self.tfidf_weight(corpus, document, term)
      document_frequency = corpus.document_frequency(term)
      return 0 if document_frequency == 0

      document.frequency_of(term) * Math.log(1.0 * corpus.size / document_frequency)
    end

    # Weighs a term of a document using tf-idf, divided by the euclidean
    # length of the term's weights across all documents that contain it.
    # @see .tfidf_weight
    #
    # @param corpus [Corpus]
    # @param document [Document]
    # @param term [String]
    # @return [Numeric] the normalized tfidf weight of the term in the
    #   document; 0 when the term is unknown or every weight is zero
    def self.normalized_tfidf_weight(corpus, document, term)
      # documents_with returns nil for a term the corpus has never seen
      docs_with_term = corpus.documents_with(term) || []
      weights = docs_with_term.map { |d| tfidf_weight(corpus, d, term) }
      sum_of_squares = weights.inject(0) { |sum, weight| sum + weight * weight }

      # A term found in every document has idf ln(1) = 0, so all weights are
      # zero; return 0 rather than computing 0.0 / 0.0 (NaN).
      return 0 if sum_of_squares == 0

      tfidf_weight(corpus, document, term) / Math.sqrt(sum_of_squares)
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module RetrievalLite
  # Splits text into lowercase terms and counts how often each one occurs.
  module Tokenizer
    # Punctuation characters that separate tokens. The hyphen is deliberately
    # absent so that hyphenated words can be kept as one term.
    SPECIAL_SEPARATERS = ['[', ']', '\\', ';', '\'', ',', '.', '/', '!', '@', '#', '%', '&', '*', '(', ')', '_', '{', '}', ':', '"', '?', '=', '`', '~', '$', '^', '+', '|', '<', '>']

    # Lowercases the text, splits it on whitespace and special characters and
    # counts the resulting terms. A token of the form "word-word" is kept
    # as is; from any other token every character outside a-z is removed.
    #
    # @param content [String] the text of the document
    # @return [Hash<String, Integer>] a hash that gives term frequency of content
    def self.parse_content(content)
      tokens = Hash.new(0) # unseen terms start at 0

      content.strip.downcase.split(separaters_regex).each do |token|
        if has_hyphen?(token)
          tokens[token] += 1
        else
          # get rid of any extra symbols the separators did not cover
          term = token.gsub(/[^a-z]/, '')

          # the token may have consisted of non-letters only
          tokens[term] += 1 unless term.empty?
        end
      end

      tokens
    end

    # NOTE: the two helpers below were written under a bare `private`, which
    # has no effect on `def self.` methods; they stay callable from outside so
    # that existing callers keep working.

    # Builds the pattern that separates tokens: any run of whitespace or any
    # single special character.
    #
    # @return [Regexp]
    def self.separaters_regex
      # The pattern source must be single-quoted: in a double-quoted string
      # "\s" is a literal space, so tabs and newlines would not split tokens.
      alternatives = ['\s+'] + SPECIAL_SEPARATERS.map { |s| Regexp.quote(s) }
      Regexp.new(alternatives.join('|'))
    end

    # Detects whether term consists of exactly two lowercase words joined by
    # one hyphen.
    #
    # @param term [String]
    # @return [Integer, nil] 0 if the term is hyphenated, nil otherwise
    def self.has_hyphen?(term)
      # \z (not \Z) so that a trailing newline can never be part of a match
      term =~ /\A[a-z]+\-[a-z]+\z/
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module RetrievalLite
  # Vector arithmetic on arrays of term scores.
  module Vector
    # @param scores1 [Array<Numeric>] the score of each term in the first document
    # @param scores2 [Array<Numeric>] the score of each term in the second document
    # @return [Numeric] the cosine similarity of the two score vectors; 0 when
    #   either vector has zero length
    # @raise [RuntimeError] if both vectors are non-zero but differ in size
    def self.cosine_similarity(scores1, scores2)
      length = euclidean_length(scores1) * euclidean_length(scores2)
      # avoid dividing by zero for an all-zero (or empty) vector
      return 0 if length == 0

      dot_product(scores1, scores2) / length
    end

    # @param scores1 [Array<Numeric>] the score of each term in the first document
    # @param scores2 [Array<Numeric>] the score of each term in the second document
    # @return [Numeric] the dot product of the two score vectors
    # @raise [RuntimeError] if the vectors differ in size
    def self.dot_product(scores1, scores2)
      raise "document vectors are not of same length" if scores1.size != scores2.size

      scores1.zip(scores2).inject(0) { |sum, (a, b)| sum + a * b }
    end

    # @param scores [Array<Numeric>] the score of each term in the document
    # @return [Float] the euclidean length of the score vector
    def self.euclidean_length(scores)
      Math.sqrt(scores.inject(0) { |sum, score| sum + score * score })
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require "version"
|
2
|
+
|
3
|
+
module RetrievalLite
|
4
|
+
|
5
|
+
end
|
6
|
+
|
7
|
+
require 'retrieval_lite/document'
|
8
|
+
require 'retrieval_lite/corpus'
|
9
|
+
require 'retrieval_lite/tokenizer'
|
10
|
+
require 'retrieval_lite/boolean_retrieval'
|
11
|
+
require 'retrieval_lite/tfidf_retrieval'
|
12
|
+
require 'retrieval_lite/vector'
|
data/lib/version.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for retrieval_lite.

# Put the gem's lib directory on the load path so that 'version' can be required.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'version'

Gem::Specification.new do |gem|
  gem.name          = "retrieval_lite"
  gem.version       = RetrievalLite::VERSION
  gem.authors       = ["Irvin Zhan"]
  gem.email         = ["izhan@princeton.edu"]
  gem.description   = "Lightweight gem for document retrieval using tf-idf based algorithms for Ruby"
  gem.summary       = "Please see associated GitHub page for usage."
  gem.homepage      = "https://github.com/izhan/retrieval_lite"
  gem.license       = "MIT"

  # Package every file tracked by git; executables and test files are picked
  # out of that list by their directory.
  tracked_files = `git ls-files`.split($/)
  gem.files         = tracked_files
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency "rspec"
  gem.add_development_dependency "bundler", "~> 1.3"
  gem.add_development_dependency "rake"
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'spec_helper'

# Specs for single-term lookups through RetrievalLite::BooleanRetrieval.
describe RetrievalLite::BooleanRetrieval do
  let(:document) do
    RetrievalLite::Document.new("lorem ipsum dolor sit amet")
  end
  let(:document_replicated) do
    RetrievalLite::Document.new("lorem ipsum dolor sit amet")
  end
  let(:document_with_duplicates) do
    RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
  end
  let(:document_two) do
    RetrievalLite::Document.new("Mauris ullamcorper, tortor et consequat sagittis.")
  end
  let(:document_three) do
    RetrievalLite::Document.new("Pellentesque felis lectus, lacinia nec mauris non.")
  end
  let(:document_paragraph) do
    RetrievalLite::Document.new("In semper enim non ullamcorper venenatis.
      Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
      Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
      consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
      vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
      euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
      a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
      In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
    )
  end
  let(:all_documents) do
    [document, document_replicated, document_with_duplicates, document_two, document_three, document_paragraph]
  end
  let(:corpus) do
    RetrievalLite::Corpus.new(all_documents)
  end
  # the three documents whose text contains "lorem"
  let(:documents_with_lorem) do
    [document, document_replicated, document_with_duplicates]
  end

  describe "one-term retrieval" do
    # The original examples evaluated `==` without an expectation, so they
    # could never fail. match_array is used because the result is documented
    # as unordered.
    it "should return array of all documents with that term" do
      RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem").should match_array(documents_with_lorem)
    end
    # NOTE(review): indexed terms are lowercase, so this example only passes
    # when evaluate normalises the case of the query before the lookup.
    it "should ignore case" do
      RetrievalLite::BooleanRetrieval.evaluate(corpus, "LOREM").should match_array(documents_with_lorem)
    end
  end
end
|
data/spec/corpus_spec.rb
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
require 'spec_helper'

# Specs for RetrievalLite::Corpus: size, per-term document lookup, document
# frequencies, adding documents one by one and stop-word filtering.
describe RetrievalLite::Corpus do
  let(:document) { RetrievalLite::Document.new("lorem ipsum dolor sit amet") }
  let(:document_replicated) { RetrievalLite::Document.new("lorem ipsum dolor sit amet") }
  let(:document_with_duplicates) { RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet") }
  let(:document_two) { RetrievalLite::Document.new("Mauris ullamcorper, tortor et consequat sagittis.") }
  let(:document_three) { RetrievalLite::Document.new("Pellentesque felis lectus, lacinia nec mauris non.") }
  let(:document_paragraph) do
    RetrievalLite::Document.new("In semper enim non ullamcorper venenatis.
      Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
      Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
      consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
      vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
      euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
      a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
      In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
    )
  end
  let(:all_documents) do
    [document, document_replicated, document_with_duplicates, document_two, document_three, document_paragraph]
  end
  # the five terms of the basic "lorem ipsum" document
  let(:basic_terms) { ["lorem", "ipsum", "dolor", "sit", "amet"] }

  describe "for empty corpus" do
    let(:corpus) { RetrievalLite::Corpus.new }

    it "should have size of zero" do
      corpus.size.should eq(0)
    end
    it "should not error when querying terms" do
      expect { corpus.documents_with("foo") }.not_to raise_error
      expect { corpus.document_frequency("foo") }.not_to raise_error
    end
  end

  describe "for basic one-document corpus" do
    let(:corpus) { RetrievalLite::Corpus.new([document]) }

    it "should have size of one" do
      corpus.size.should eq(1)
    end
    it "should give us correct document frequencies" do
      basic_terms.each { |term| corpus.document_frequency(term).should eq(1) }
      corpus.document_frequency("foo").should eq(0)
    end
    it "should return document when queried" do
      basic_terms.each { |term| corpus.documents_with(term).should eq([document]) }
      corpus.documents_with("foo").should be_nil
    end
  end

  describe "for two-identical-document corpus" do
    let(:corpus) { RetrievalLite::Corpus.new([document, document_replicated]) }

    it "should give us correct document frequencies" do
      basic_terms.each { |term| corpus.document_frequency(term).should eq(2) }
      corpus.document_frequency("foo").should eq(0)
    end
    it "should return document when queried" do
      basic_terms.each { |term| corpus.documents_with(term).should eq([document, document_replicated]) }
      corpus.documents_with("foo").should be_nil
    end
  end

  describe "for multiple-document corpus" do
    let(:corpus) { RetrievalLite::Corpus.new(all_documents) }

    it "should have the correct size" do
      corpus.size.should eq(6)
    end

    # TODO are more comprehensive tests needed....?
    it "should give us correct document frequencies" do
      corpus.document_frequency("lorem").should eq(3)
      corpus.document_frequency("semper").should eq(1)
    end
  end

  describe "adding in documents one at a time" do
    let(:correct_corpus) { RetrievalLite::Corpus.new(all_documents) }
    let(:corpus) { RetrievalLite::Corpus.new }

    it "should be same as initializing corpus with all documents" do
      all_documents.each { |d| corpus.add(d) }
      corpus.documents.should eq(correct_corpus.documents)
    end
  end

  describe "with optional parameters" do
    it "should ignore any stopwords (not case sensitive)" do
      ignored = ["lorem", "IPSum"]
      corpus = RetrievalLite::Corpus.new([document], stop_words: ignored)
      corpus.documents_with("lorem").should be_nil
      corpus.documents_with("ipsum").should be_nil
    end
  end
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'spec_helper'

# Specs for RetrievalLite::Document: stored content, term counts, term
# frequencies and the optional custom id.
describe RetrievalLite::Document do
  describe "for a basic document" do
    let(:document) do
      RetrievalLite::Document.new("lorem ipsum dolor sit amet")
    end
    let(:capitalized_document) do
      RetrievalLite::Document.new("LorEM iPSUM DOLOR sit ameT")
    end
    let(:document_with_duplicates) do
      RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
    end
    # expected frequencies when every term occurs once
    let(:basic_tf) do
      {
        "lorem" => 1,
        "ipsum" => 1,
        "dolor" => 1,
        "sit" => 1,
        "amet" => 1
      }
    end
    # expected frequencies for document_with_duplicates
    let(:multiple_tf) do
      {
        "lorem" => 1,
        "ipsum" => 2,
        "dolor" => 3,
        "sit" => 4,
        "amet" => 5
      }
    end

    describe "content of the document" do
      it "should have original content" do
        document.content.should == "lorem ipsum dolor sit amet"
        capitalized_document.content.should == "LorEM iPSUM DOLOR sit ameT"
        document_with_duplicates.content.should == "lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet"
      end
    end

    describe "the number of terms of the document" do
      it "should be correct for singleton terms" do
        document.term_count.should == 5
      end
      it "should not care about capitalization" do
        capitalized_document.term_count.should == 5
      end
      it "should be correct for duplicate terms" do
        document_with_duplicates.term_count.should == 5
      end
    end

    describe "id of the document" do
      it "should default to ruby's object_id" do
        document.id.should == document.object_id
      end
    end

    describe "term frequencies of the document" do
      it "should be correct for singleton terms" do
        document.term_frequencies.should == basic_tf
      end
      it "should be correct for capitalization" do
        capitalized_document.term_frequencies.should == basic_tf
      end
      # This example used to repeat the "capitalization" description although
      # it checks the document with duplicated terms.
      it "should be correct for duplicate terms" do
        document_with_duplicates.term_frequencies.should == multiple_tf
      end
    end

    describe "frequencies of a term" do
      it "should be correct for term in document" do
        document.frequency_of("lorem").should == 1
        document_with_duplicates.frequency_of("ipsum").should == 2
      end
      it "should be zero for term not in document" do
        document.frequency_of("foo").should == 0
        document_with_duplicates.frequency_of("foo").should == 0
      end
    end

    describe "for blank document" do
      it "should not raise error on initialization" do
        expect { RetrievalLite::Document.new("") }.to_not raise_error
      end
    end
  end

  describe "optional parameters" do
    it "should allow for customized id" do
      doc = RetrievalLite::Document.new("lorem ipsum dolor sit amet", id: "foo")
      doc.id.should == "foo"
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require "retrieval_lite"
require "spec_helpers/file_helpers"

# Shared RSpec configuration for the whole suite.
RSpec.configure do |rspec|
  # make the file helpers available inside every example
  rspec.include RetrievalLite::SpecHelpers::FileHelpers

  rspec.treat_symbols_as_metadata_keys_with_true_values = true
  rspec.filter_run :focus
  rspec.run_all_when_everything_filtered = true

  # Specs run in random order so that order dependencies surface. To debug
  # one, repeat the order of a run by passing the seed printed after that run:
  #     --seed 1234
  rspec.order = 'random'
end
|
@@ -0,0 +1,130 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe RetrievalLite::TfIdfRetrieval do
|
4
|
+
# Fixture documents; every one of them contains the term "lorem".
let(:document_one_term) { RetrievalLite::Document.new("lorem") }
let(:document) { RetrievalLite::Document.new("lorem ipsum dolor sit amet") }
let(:document_with_duplicates) do
  RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
end
let(:document_doubled) do
  RetrievalLite::Document.new("lorem ipsum dolor sit amet lorem ipsum dolor sit amet")
end
let(:document_both_terms) { RetrievalLite::Document.new("lorem ipsum") }
let(:document_with_unique) { RetrievalLite::Document.new("lorem unique") }

# All six fixtures in one list.
# NOTE(review): this list was originally annotated "sorted by lorem order";
# the ordering criterion is not evident from the fixtures -- confirm.
let(:all_documents) do
  [
    document,
    document_with_duplicates,
    document_doubled,
    document_one_term,
    document_both_terms,
    document_with_unique
  ]
end

# Corpus over every fixture.
let(:corpus) { RetrievalLite::Corpus.new(all_documents) }
# Smaller corpus of three documents, used by the multiple-term examples.
let(:corpus_different) do
  RetrievalLite::Corpus.new([document_one_term, document, document_with_duplicates])
end
|
32
|
+
|
33
|
+
# Expectations for TfIdfRetrieval.tfidf_weight(corpus, document, term).
describe "calculating tf-idf scores" do
  describe "term that all documents have" do
    it "should have correct tf-idf" do
      # "lorem" occurs in every document of the corpus; each weight is expected to be 0.
      [
        document_one_term,
        document,
        document_with_duplicates,
        document_doubled,
        document_both_terms,
        document_with_unique
      ].each do |doc|
        weight = RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, doc, "lorem")
        weight.should be_within(0.001).of(0)
      end
    end
  end

  describe "term that a few documents have" do
    it "should have correct tf-idf" do
      # [document, expected weight of "ipsum"] pairs, checked in this order.
      [
        [document, 0.405],
        [document_with_duplicates, 0.811],
        [document_doubled, 0.811],
        [document_both_terms, 0.405]
      ].each do |doc, expected|
        weight = RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, doc, "ipsum")
        weight.should be_within(0.001).of(expected)
      end
    end
  end
end
|
54
|
+
|
55
|
+
# Expectations for TfIdfRetrieval.normalized_tfidf_weight(corpus, document, term).
describe "calculating normalized tf-idf scores" do
  describe "term that a few documents have" do
    it "should have correct tf-idf" do
      # [document, expected normalized weight of "ipsum"] pairs, checked in this order.
      [
        [document, 0.316],
        [document_with_duplicates, 0.632],
        [document_doubled, 0.632],
        [document_both_terms, 0.316]
      ].each do |doc, expected|
        weight = RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, doc, "ipsum")
        weight.should be_within(0.001).of(expected)
      end
    end
  end
end
|
65
|
+
|
66
|
+
# Expectations for TfIdfRetrieval.evaluate_with_scores(corpus, query).
# These examples index the result by document and call #size / #values on it.
describe "calculating total tf-idf scores" do
  describe "for when all documents of corpus has a term" do
    it "should have score of zero for each document" do
      scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "lorem")
      scores.size.should == all_documents.size
      scores.values.each do |score|
        # Scores are computed floats: compare with the same tolerance as the
        # other score assertions in this spec rather than with exact equality.
        score.should be_within(0.001).of(0)
      end
    end
  end

  describe "term that only one document has" do
    it "should return the correct score" do
      scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "unique")
      # Only the document containing "unique" is expected in the result.
      scores.size.should == 1
      scores[document_with_unique].should be_within(0.001).of(1.0)
    end
  end

  describe "term that a few documents have" do
    it "should return the correct score" do
      scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "ipsum")
      # Four of the six fixtures contain "ipsum".
      scores.size.should == 4
      scores[document].should be_within(0.001).of(1.0)
      scores[document_with_duplicates].should be_within(0.001).of(1.0)
      scores[document_doubled].should be_within(0.001).of(1.0)
      scores[document_both_terms].should be_within(0.001).of(1.0)
    end
  end
end
|
94
|
+
|
95
|
+
# Single-term queries through TfIdfRetrieval.evaluate.
describe "one-term retrieval" do
  it "should return array with that term" do
    matches = RetrievalLite::TfIdfRetrieval.evaluate(corpus, "lorem")
    # `=~` compares the arrays irrespective of element order.
    matches.should =~ all_documents
  end

  it "should ignore case" do
    matches = RetrievalLite::TfIdfRetrieval.evaluate(corpus, "LOREM")
    matches.should =~ all_documents
  end
end
|
103
|
+
|
104
|
+
# A term held by a single document should yield exactly that document.
describe "when corpus has only one document containing term" do
  it "should return array with only that document" do
    matches = RetrievalLite::TfIdfRetrieval.evaluate(corpus, "unique")
    matches.should == [document_with_unique]
  end
end
|
109
|
+
|
110
|
+
# Queries that match nothing are expected to produce an empty array.
describe "for no matches" do
  it "should return empty array for term not in any documents" do
    matches = RetrievalLite::TfIdfRetrieval.evaluate(corpus, "foobar")
    matches.should == []
  end

  it "should return empty array for empty string" do
    matches = RetrievalLite::TfIdfRetrieval.evaluate(corpus, "")
    matches.should == []
  end
end
|
118
|
+
|
119
|
+
# Multi-term queries against the three-document corpus.
describe "multiple-term retrieval" do
  it "should order documents correctly" do
    ranked = RetrievalLite::TfIdfRetrieval.evaluate(corpus_different, "lorem dolor sit")
    ranked.should == [document, document_with_duplicates, document_one_term]
  end

  it "should have the correct scores" do
    scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_different, "lorem dolor sit")
    # [document, expected score] pairs, checked in this order.
    [
      [document, 0.816],
      [document_with_duplicates, 0.808],
      [document_one_term, 0.0]
    ].each do |doc, expected|
      scores[doc].should be_within(0.001).of(expected)
    end
  end
end
|
130
|
+
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe RetrievalLite::Tokenizer do
|
4
|
+
describe "parse_content" do
|
5
|
+
# Tokenizer.parse_content on inputs that should all reduce to the same five
# terms, each counted once.
describe "for basic terms" do
  # Expected term => frequency hash shared by every example in this group.
  let(:basic_tf) do
    {
      "lorem" => 1,
      "ipsum" => 1,
      "dolor" => 1,
      "sit" => 1,
      "amet" => 1
    }
  end

  it "should split the content" do
    content = "lorem ipsum dolor sit amet"
    RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
  end

  it "should ignore extra white spaces" do
    content = "lorem ipsum dolor \n sit amet"
    RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
  end

  it "should ignore punctuation" do
    # The backslash is written as "\\": inside a double-quoted literal a lone
    # "\ " is only an escaped space, so no backslash would reach the tokenizer.
    content = "lorem! @ #ipsum (dolor) sit * \\ amet"
    RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
  end

  it "should ignore capitalization" do
    content = "LOREM iPSuM dOLOR sit amet"
    RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
  end

  it "should ignore any whitespaces in front and back of content" do
    content = " lorem ipsum dolor sit amet \n"
    RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
  end
end
|
41
|
+
|
42
|
+
# Tokenizer.parse_content on inputs where terms repeat.
describe "for content with multiple terms" do
  # Expected counts: lorem x1, ipsum x2, dolor x3, sit x4, amet x5.
  let(:multiple_tf) do
    { "lorem" => 1, "ipsum" => 2, "dolor" => 3, "sit" => 4, "amet" => 5 }
  end

  it "should not care about order" do
    # Same multiset of words, first grouped by term and then shuffled.
    [
      "lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet",
      "amet amet lorem dolor dolor sit sit ipsum ipsum sit amet dolor sit amet amet"
    ].each do |content|
      RetrievalLite::Tokenizer.parse_content(content).should == multiple_tf
    end
  end

  it "should consider capitalized terms to be the same" do
    mixed_case = "lorem IPSUM ipsum doLOR doLOR dolor SIT SIT SIT SIT amet ameT amET aMET AMET"
    RetrievalLite::Tokenizer.parse_content(mixed_case).should == multiple_tf
  end
end
|
65
|
+
|
66
|
+
# Tokenizer.parse_content edge cases: empty input, numbers, control
# characters, separators and hyphens.
describe "for special cases" do
  let(:foo_bar_hash) { { "foo" => 1, "bar" => 1 } }
  let(:foo_bar_baz_hash) { { "foo" => 1, "bar" => 1, "baz" => 1 } }

  it "should return empty hash if there are no terms" do
    RetrievalLite::Tokenizer.parse_content("").should == {}
  end

  it "should ignore numbers" do
    RetrievalLite::Tokenizer.parse_content("1 2 3.14159").should == {}
  end

  it "should ignore control characters" do
    # Both literals spell the same bytes (\a is \x07, \e is \x1B).
    RetrievalLite::Tokenizer.parse_content("\a\e\f\n\r\t\v").should == {}
    RetrievalLite::Tokenizer.parse_content("\x07\x1B\f\n\r\t\v").should == {}
  end

  it "should split words connected by special characters" do
    ["foo/bar", "foo,bar", "foo,:bar", "foo ,:bar", "!@foo ,:bar#"].each do |content|
      RetrievalLite::Tokenizer.parse_content(content).should == foo_bar_hash
    end

    RetrievalLite::Tokenizer.parse_content("foo:bar baz").should == foo_bar_baz_hash
  end

  it "should not split words connected by only one hyphen" do
    # A single hyphen directly between two words keeps them as one term.
    RetrievalLite::Tokenizer.parse_content("foo-bar").should == { "foo-bar" => 1 }

    # A spaced or doubled hyphen is expected to separate the words.
    ["foo - bar", "foo --bar"].each do |content|
      RetrievalLite::Tokenizer.parse_content(content).should == foo_bar_hash
    end
    #RetrievalLite::Tokenizer.parse_content("foo--bar").should == foo_bar_hash # TODO is this worth it?

    RetrievalLite::Tokenizer.parse_content("foo-bar-baz").should_not == foo_bar_baz_hash
  end
end
|
113
|
+
end
|
114
|
+
end
|
data/spec/vector_spec.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the array-based helpers Vector.dot_product and Vector.euclidean_length.
describe RetrievalLite::Vector do
  describe "dot product" do
    it "should compute correctly for vectors length 1" do
      product = RetrievalLite::Vector.dot_product([3], [5])
      product.should == 15
    end

    it "should compute correctly for longer vectors" do
      # 2*4 + 3*5
      product = RetrievalLite::Vector.dot_product([2, 3], [4, 5])
      product.should == 23
    end

    it "should raise error for unequal sized arrays" do
      # NOTE(review): a bare raise_error accepts any exception; consider naming
      # the expected error class once it is confirmed from the implementation.
      expect do
        RetrievalLite::Vector.dot_product([2, 3], [4])
      end.to raise_error
    end
  end

  describe "euclidean length" do
    it "should calculate it for vectors length 1" do
      length = RetrievalLite::Vector.euclidean_length([1])
      length.should == 1
    end

    it "should calculate it for zero vectors" do
      length = RetrievalLite::Vector.euclidean_length([0, 0, 0])
      length.should == 0
    end

    it "should calculate it for longer vectors" do
      # 3-4-5 right triangle
      length = RetrievalLite::Vector.euclidean_length([3, 4])
      length.should == 5
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: retrieval_lite
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Irvin Zhan
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-05-12 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.3'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ~>
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.3'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: Lightweight gem for document retrieval using tf-idf based algorithms
|
56
|
+
for Ruby
|
57
|
+
email:
|
58
|
+
- izhan@princeton.edu
|
59
|
+
executables: []
|
60
|
+
extensions: []
|
61
|
+
extra_rdoc_files: []
|
62
|
+
files:
|
63
|
+
- .gitignore
|
64
|
+
- .rspec
|
65
|
+
- Gemfile
|
66
|
+
- LICENSE
|
67
|
+
- LICENSE.txt
|
68
|
+
- README.md
|
69
|
+
- Rakefile
|
70
|
+
- lib/retrieval_lite.rb
|
71
|
+
- lib/retrieval_lite/boolean_retrieval.rb
|
72
|
+
- lib/retrieval_lite/corpus.rb
|
73
|
+
- lib/retrieval_lite/document.rb
|
74
|
+
- lib/retrieval_lite/tfidf_retrieval.rb
|
75
|
+
- lib/retrieval_lite/tokenizer.rb
|
76
|
+
- lib/retrieval_lite/vector.rb
|
77
|
+
- lib/version.rb
|
78
|
+
- retrieval_lite.gemspec
|
79
|
+
- spec/boolean_retrieval_spec.rb
|
80
|
+
- spec/corpus_spec.rb
|
81
|
+
- spec/document_spec.rb
|
82
|
+
- spec/retrieval_lite_spec.rb
|
83
|
+
- spec/spec_helper.rb
|
84
|
+
- spec/spec_helpers/file_helpers.rb
|
85
|
+
- spec/tfidf_retrieval_spec.rb
|
86
|
+
- spec/tokenizer_spec.rb
|
87
|
+
- spec/vector_spec.rb
|
88
|
+
homepage: https://github.com/izhan/retrieval_lite
|
89
|
+
licenses:
|
90
|
+
- MIT
|
91
|
+
metadata: {}
|
92
|
+
post_install_message:
|
93
|
+
rdoc_options: []
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - '>='
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - '>='
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '0'
|
106
|
+
requirements: []
|
107
|
+
rubyforge_project:
|
108
|
+
rubygems_version: 2.1.11
|
109
|
+
signing_key:
|
110
|
+
specification_version: 4
|
111
|
+
summary: Please see associated GitHub page for usage.
|
112
|
+
test_files:
|
113
|
+
- spec/boolean_retrieval_spec.rb
|
114
|
+
- spec/corpus_spec.rb
|
115
|
+
- spec/document_spec.rb
|
116
|
+
- spec/retrieval_lite_spec.rb
|
117
|
+
- spec/spec_helper.rb
|
118
|
+
- spec/spec_helpers/file_helpers.rb
|
119
|
+
- spec/tfidf_retrieval_spec.rb
|
120
|
+
- spec/tokenizer_spec.rb
|
121
|
+
- spec/vector_spec.rb
|
122
|
+
has_rdoc:
|