retrieval_lite 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE +21 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +8 -0
- data/lib/retrieval_lite/boolean_retrieval.rb +21 -0
- data/lib/retrieval_lite/corpus.rb +64 -0
- data/lib/retrieval_lite/document.rb +55 -0
- data/lib/retrieval_lite/tfidf_retrieval.rb +84 -0
- data/lib/retrieval_lite/tokenizer.rb +44 -0
- data/lib/retrieval_lite/vector.rb +39 -0
- data/lib/retrieval_lite.rb +12 -0
- data/lib/version.rb +3 -0
- data/retrieval_lite.gemspec +24 -0
- data/spec/boolean_retrieval_spec.rb +44 -0
- data/spec/corpus_spec.rb +132 -0
- data/spec/document_spec.rb +96 -0
- data/spec/retrieval_lite_spec.rb +3 -0
- data/spec/spec_helper.rb +16 -0
- data/spec/spec_helpers/file_helpers.rb +9 -0
- data/spec/tfidf_retrieval_spec.rb +130 -0
- data/spec/tokenizer_spec.rb +114 -0
- data/spec/vector_spec.rb +27 -0
- metadata +122 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8e080b5b0c600b330f5d07827b5c2e576132306b
|
4
|
+
data.tar.gz: f0fa5049a6f5e9df0a1fbc761107d28dc1efe857
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b1827e3f90edbc3ee841fbe25526c748ac82ad1aae6f42455605e3199dffe7c5cd309c2684741d41f695c88285152c1bf83558beb87668ec9eebb4973f7ff0a3
|
7
|
+
data.tar.gz: 5d2009f175da5ce2cecb277061e9592093a14435d5012094c461afb1ac626b1d72705edf560cece8f53d4a02623f7ebe2fd3adc0f0555b0934cd5b878f6c12b2
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2014 Irvin Zhan
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Irvin Zhan
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Retrieval Lite Gem
|
2
|
+
|
3
|
+
A lightweight Ruby gem for document retrieval using tf-idf based algorithms.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'retrieval_lite'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install retrieval_lite
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
TODO: Write usage instructions here
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
module RetrievalLite
  # Unranked retrieval: returns the documents of a corpus that contain a term.
  #
  # NOTE: the query is currently looked up as ONE term. The boolean operators
  # AND, OR and NOT are not parsed yet; {is_valid?} accepts every query.
  module BooleanRetrieval
    # Returns the documents of the corpus that contain the query term.
    # The result is not ranked in any way.
    #
    # The corpus index stores lowercase terms, so the query is downcased
    # before the lookup to make matching case-insensitive.
    #
    # @param corpus [Corpus] the collection of documents
    # @param query [String] the term to look up
    # @return [Array<Document>] the documents containing the term; an empty
    #   array when no document contains it
    # @raise [RuntimeError] if the query is rejected by {is_valid?}
    def self.evaluate(corpus, query)
      raise "Boolean expression is not valid." unless is_valid?(query) # TODO better validation message?

      # documents_with returns nil for an unknown term; callers are promised an Array.
      corpus.documents_with(query.downcase) || []
    end

    # Placeholder validation hook; every query is currently accepted.
    #
    # This method stays publicly callable: a bare `private` keyword does not
    # apply to methods defined with `def self.`, so it was never private.
    #
    # @param query [String] the query to validate
    # @return [Boolean] always true
    def self.is_valid?(query)
      true
    end
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'set'

module RetrievalLite
  # A collection of documents plus an inverted index from each term to the
  # documents that contain it.
  class Corpus
    # the documents within the corpus
    attr_reader :documents
    # hash of a term to the array of documents that contain the particular term
    attr_reader :term_occurrences

    # @param documents [Array<Document>] the documents of the corpus
    # @param opts [Hash] optional arguments to initializer
    # @option opts [Array<String>] :stop_words the words to ignore when indexing terms
    def initialize(documents = [], opts = {})
      # Copy the array so that #add never mutates the caller's array.
      @documents = documents.dup
      @term_occurrences = {}
      # Stop words are compared against lowercase tokens. Use the non-destructive
      # downcase so the caller's (possibly frozen) strings are left untouched.
      # A Set gives a faster include? than an Array.
      @stop_words = Set.new((opts[:stop_words] || []).map(&:downcase))

      @documents.each { |document| update_term_occurrences(document) }
    end

    # Adds a document to the corpus and indexes its terms.
    # @param document [Document] the document to be added
    def add(document)
      @documents << document
      update_term_occurrences(document)
    end

    # @return [Integer] the number of documents in the corpus
    def size
      documents.size
    end

    # @param term [String] the term to retrieve the documents for
    # @return [Array<Document>, nil] the documents containing the term, or nil
    #   if no document contains it
    def documents_with(term)
      term_occurrences[term]
    end

    # @param term [String] the query term for the documents
    # @return [Integer] the number of documents that contain the particular term
    def document_frequency(term)
      occurrences = term_occurrences[term]
      occurrences ? occurrences.size : 0
    end

    private

    # Adds the document under each of its terms in the term_occurrences hash,
    # skipping stop words.
    def update_term_occurrences(document)
      document.terms.each do |term|
        next if @stop_words.include?(term)

        (@term_occurrences[term] ||= []) << document
      end
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module RetrievalLite
  # A piece of text together with the frequency of each term found in it.
  class Document
    # the text of the document
    attr_reader :content
    # a Hash<String, Integer> of all terms of the documents to the frequency of each term
    attr_reader :term_frequencies
    # the id of the document
    attr_reader :id

    # Stores the text and computes its term frequencies with
    # RetrievalLite::Tokenizer.parse_content.
    #
    # @param content [String] the text of the document
    # @param opts [Hash] optional arguments to initializer
    # @option opts [String] :id the id of the document. Defaults to object_id assigned by ruby
    def initialize(content, opts = {})
      @content = content
      @id = opts[:id] || object_id
      @term_frequencies = RetrievalLite::Tokenizer.parse_content(content)
    end

    # Prints every term and its frequency to standard output (for debugging).
    def print_tokens
      @term_frequencies.each { |term, count| puts "#{term}: #{count}" }
    end

    # @return [Integer] the total number of unique terms in the document
    def term_count
      @term_frequencies.size
    end

    # @return [Array<String>] the unique terms of the document
    def terms
      @term_frequencies.keys
    end

    # @param term [String]
    # @return [Integer] the number of times a term appears in the document,
    #   0 if it does not appear
    def frequency_of(term)
      @term_frequencies.fetch(term, 0)
    end

    # @return [Integer] the total number of terms (not unique) in the document
    def total_terms
      @term_frequencies.values.inject(0, :+)
    end
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'set'

# @see http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf
module RetrievalLite
  # Ranked retrieval using tf-idf weights and cosine similarity.
  module TfIdfRetrieval
    # Queries a corpus using the tf-idf ranking algorithm and cosine similarity.
    # Returns documents ordered by score, highest first.
    #
    # @param corpus [Corpus] the collection of documents
    # @param query [String] free text; it is tokenized like a document
    # @return [Array<Document>] the documents containing at least one query
    #   term, ordered by descending score
    def self.evaluate(corpus, query)
      evaluate_with_scores(corpus, query).keys
    end

    # Same as {evaluate} but returns a hash whose keys are the documents and
    # whose values are their scores.
    #
    # The query vector holds the raw term counts of the query; each document
    # vector holds the tf-idf weight of every query term in that document. The
    # score is the cosine similarity of the two vectors.
    #
    # @param corpus [Corpus] the collection of documents
    # @param query [String] free text; it is tokenized like a document
    # @return [Hash<Document, Float>] documents mapped to their score, ordered
    #   by descending score
    def self.evaluate_with_scores(corpus, query)
      query_document = RetrievalLite::Document.new(query)
      terms = query_document.term_frequencies.keys
      query_vector = query_document.term_frequencies.values # same order as keys

      # Only documents containing at least one query term are scored. The Set
      # de-duplicates documents that match several terms.
      documents = Set.new
      terms.each do |term|
        matches = corpus.documents_with(term)
        documents.merge(matches) if matches
      end

      scores = {}
      documents.each do |document|
        document_vector = terms.map { |term| tfidf_weight(corpus, document, term) }
        scores[document] = RetrievalLite::Vector.cosine_similarity(query_vector, document_vector)
      end

      # order it by score in descending order
      Hash[scores.sort_by { |_document, score| score }.reverse]
    end

    # Computes the tf-idf weight of a term in a document:
    # (frequency of the term in the document) * ln(corpus size / document frequency).
    #
    # @note Returns 0 when no document in the corpus contains the term, which
    #   avoids a division by zero.
    #
    # @param corpus [Corpus]
    # @param document [Document]
    # @param term [String]
    # @return [Float] the tfidf weight of the term in the document
    def self.tfidf_weight(corpus, document, term)
      document_frequency = corpus.document_frequency(term)
      return 0 if document_frequency == 0

      document.frequency_of(term) * Math.log(1.0 * corpus.size / document_frequency)
    end

    # Computes the tf-idf weight of a term in a document, divided by the
    # euclidean length of the vector of that term's weights over every document
    # that contains it.
    # @see tfidf_weight
    #
    # @param corpus [Corpus]
    # @param document [Document]
    # @param term [String]
    # @return [Float] the normalized tfidf weight of the term in the document;
    #   0.0 when the term is absent from the corpus or all its weights are zero
    def self.normalized_tfidf_weight(corpus, document, term)
      documents = corpus.documents_with(term)
      # documents_with returns nil for a term that no document contains.
      return 0.0 unless documents

      sum_of_squares = documents.inject(0) do |sum, d|
        weight = tfidf_weight(corpus, d, term)
        sum + weight * weight
      end
      # A term present in every document has idf ln(1) = 0, so every weight is
      # zero; dividing would yield NaN.
      return 0.0 if sum_of_squares == 0

      tfidf_weight(corpus, document, term) / Math.sqrt(sum_of_squares)
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module RetrievalLite
  # Splits text into lowercase terms and counts how often each one occurs.
  module Tokenizer
    SPECIAL_SEPARATERS = ['[', ']', '\\', ';', '\'', ',', '.', '/', '!', '@', '#', '%', '&', '*', '(', ')', '_', '{', '}', ':', '"', '?', '=', '`', '~', '$', '^', '+', '|', '<', '>']

    # Counts the terms of the given text. The text is stripped and downcased,
    # then split on whitespace and on every character in SPECIAL_SEPARATERS.
    # A token of the form "letters-letters" is kept as one hyphenated term;
    # any other token has everything except a-z removed.
    #
    # @param content [String] the text of the document
    # @return [Hash<String, Integer>] a hash that gives term frequency of content;
    #   looking up a missing term yields 0
    def self.parse_content(content)
      tokens = Hash.new(0) # initialize to 0

      content.strip.downcase.split(separaters_regex).each do |token|
        # get rid of any extra symbols the separators did not cover
        term = has_hyphen?(token) ? token : token.gsub(/[^a-z]/, '')

        # skip tokens that consisted only of non-letters
        tokens[term] += 1 unless term.empty?
      end

      tokens
    end

    # NOTE: the helpers below remain publicly callable. The bare `private`
    # keyword that used to precede them does not apply to methods defined with
    # `def self.`, so they were never private.

    # Builds the separator pattern: any run of whitespace, or any single
    # character listed in SPECIAL_SEPARATERS.
    #
    # The whitespace part is a regexp literal on purpose: in a double-quoted
    # string "\s" is a plain space, which would leave tabs and newlines
    # unsplit and merge words across line breaks.
    #
    # @return [Regexp] the pattern used to split content into tokens
    def self.separaters_regex
      Regexp.union(/\s+/, *SPECIAL_SEPARATERS)
    end

    # Detects whether term is exactly two runs of letters joined by one hyphen.
    # \z (not \Z) is used so that a trailing newline does not match.
    #
    # @param term [String]
    # @return [Integer, nil] 0 when the term is hyphenated, nil otherwise
    def self.has_hyphen?(term)
      term =~ /\A[a-z]+\-[a-z]+\z/
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module RetrievalLite
  # Vector arithmetic on arrays of numeric scores.
  module Vector
    # @param scores1 [Array<Numeric>] the score of each term in the first document
    # @param scores2 [Array<Numeric>] the score of each term in the second document
    # @return [Float] the cosine similarity of the two score vectors; 0 when
    #   either vector has zero length
    # @raise [RuntimeError] if both vectors are non-zero but differ in size
    def self.cosine_similarity(scores1, scores2)
      length = euclidean_length(scores1) * euclidean_length(scores2)
      # A zero vector has no direction; report no similarity instead of dividing by zero.
      return 0 if length == 0

      dot_product(scores1, scores2) / length
    end

    # @param scores1 [Array<Numeric>] the score of each term in the first document
    # @param scores2 [Array<Numeric>] the score of each term in the second document
    # @return [Numeric] the dot product of the two score vectors
    # @raise [RuntimeError] if the vectors differ in size
    def self.dot_product(scores1, scores2)
      raise "document vectors are not of same length" if scores1.size != scores2.size

      scores1.zip(scores2).inject(0) { |sum, (a, b)| sum + a * b }
    end

    # @param scores [Array<Numeric>] the score of each term in the document
    # @return [Float] the euclidean length of the score vector
    def self.euclidean_length(scores)
      Math.sqrt(scores.inject(0) { |sum, score| sum + score * score })
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require "version"
|
2
|
+
|
3
|
+
module RetrievalLite
|
4
|
+
|
5
|
+
end
|
6
|
+
|
7
|
+
require 'retrieval_lite/document'
|
8
|
+
require 'retrieval_lite/corpus'
|
9
|
+
require 'retrieval_lite/tokenizer'
|
10
|
+
require 'retrieval_lite/boolean_retrieval'
|
11
|
+
require 'retrieval_lite/tfidf_retrieval'
|
12
|
+
require 'retrieval_lite/vector'
|
data/lib/version.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# coding: utf-8
# Put this gem's lib directory on the load path so that 'version' resolves.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'version'

Gem::Specification.new do |gem|
  # Identity and metadata
  gem.name          = "retrieval_lite"
  gem.version       = RetrievalLite::VERSION
  gem.authors       = ["Irvin Zhan"]
  gem.email         = ["izhan@princeton.edu"]
  gem.description   = %q{Lightweight gem for document retrieval using tf-idf based algorithms for Ruby}
  gem.summary       = %q{Please see associated GitHub page for usage.}
  gem.homepage      = "https://github.com/izhan/retrieval_lite"
  gem.license       = "MIT"

  # Packaged files: everything tracked by git, with executables and test
  # files selected from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Development-only dependencies
  gem.add_development_dependency "rspec"
  gem.add_development_dependency "bundler", "~> 1.3"
  gem.add_development_dependency "rake"
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'spec_helper'

describe RetrievalLite::BooleanRetrieval do
  let (:document) do
    RetrievalLite::Document.new("lorem ipsum dolor sit amet")
  end
  let (:document_replicated) do
    RetrievalLite::Document.new("lorem ipsum dolor sit amet")
  end
  let (:document_with_duplicates) do
    RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
  end
  let (:document_two) do
    RetrievalLite::Document.new("Mauris ullamcorper, tortor et consequat sagittis.")
  end
  let (:document_three) do
    RetrievalLite::Document.new("Pellentesque felis lectus, lacinia nec mauris non.")
  end
  let (:document_paragraph) do
    RetrievalLite::Document.new("In semper enim non ullamcorper venenatis.
      Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
      Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
      consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
      vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
      euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
      a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
      In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
    )
  end
  let (:all_documents) do
    [document, document_replicated, document_with_duplicates, document_two, document_three, document_paragraph]
  end
  let (:corpus) do
    RetrievalLite::Corpus.new(all_documents)
  end

  describe "one-term retrieval" do
    # The documents containing "lorem", in the order they were added to the corpus.
    let (:documents_with_lorem) do
      [document, document_replicated, document_with_duplicates]
    end

    # Each example needs an explicit expectation: a bare `==` comparison only
    # computes a boolean and discards it, so the example could never fail.
    it "should return array of all documents with that term" do
      RetrievalLite::BooleanRetrieval.evaluate(corpus, "lorem").should == documents_with_lorem
    end
    it "should ignore case" do
      RetrievalLite::BooleanRetrieval.evaluate(corpus, "LOREM").should == documents_with_lorem
    end
  end
end
|
data/spec/corpus_spec.rb
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
require 'spec_helper'

describe RetrievalLite::Corpus do
  # Fixture documents shared by the examples below.
  let(:document) { RetrievalLite::Document.new("lorem ipsum dolor sit amet") }
  let(:document_replicated) { RetrievalLite::Document.new("lorem ipsum dolor sit amet") }
  let(:document_with_duplicates) do
    RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
  end
  let(:document_two) { RetrievalLite::Document.new("Mauris ullamcorper, tortor et consequat sagittis.") }
  let(:document_three) { RetrievalLite::Document.new("Pellentesque felis lectus, lacinia nec mauris non.") }
  let(:document_paragraph) do
    RetrievalLite::Document.new("In semper enim non ullamcorper venenatis.
      Sed dictum metus condimentum libero ullamcorper, eget scelerisque risus congue.
      Morbi tempus rhoncus ante, at varius sem adipiscing eu. Sed ut purus pretium,
      consequat velit et, ultricies magna. Etiam sit amet elit mi. Sed et nibh non nibh
      vestibulum hendrerit vitae dapibus lectus. Aenean eget odio vitae tortor elementum
      euismod non nec eros. Nunc id convallis magna. Aliquam ultrices dignissim ipsum,
      a accumsan enim faucibus non. Pellentesque a felis quis diam blandit tempor.
      In aliquet laoreet tortor, at adipiscing diam scelerisque ut."
    )
  end
  let(:all_documents) do
    [document, document_replicated, document_with_duplicates, document_two, document_three, document_paragraph]
  end

  describe "for empty corpus" do
    let(:corpus) { RetrievalLite::Corpus.new }

    it "should have size of zero" do
      corpus.size.should == 0
    end

    it "should not error when querying terms" do
      expect { corpus.documents_with("foo") }.to_not raise_error
      expect { corpus.document_frequency("foo") }.to_not raise_error
    end
  end

  describe "for basic one-document corpus" do
    let(:corpus) { RetrievalLite::Corpus.new([document]) }

    it "should have size of one" do
      corpus.size.should == 1
    end

    it "should give us correct document frequencies" do
      %w[lorem ipsum dolor sit amet].each do |term|
        corpus.document_frequency(term).should == 1
      end
      corpus.document_frequency("foo").should == 0
    end

    it "should return document when queried" do
      %w[lorem ipsum dolor sit amet].each do |term|
        corpus.documents_with(term).should == [document]
      end
      corpus.documents_with("foo").should == nil
    end
  end

  describe "for two-identical-document corpus" do
    let(:corpus) { RetrievalLite::Corpus.new([document, document_replicated]) }

    it "should give us correct document frequencies" do
      %w[lorem ipsum dolor sit amet].each do |term|
        corpus.document_frequency(term).should == 2
      end
      corpus.document_frequency("foo").should == 0
    end

    it "should return document when queried" do
      %w[lorem ipsum dolor sit amet].each do |term|
        corpus.documents_with(term).should == [document, document_replicated]
      end
      corpus.documents_with("foo").should == nil
    end
  end

  describe "for multiple-document corpus" do
    let(:corpus) { RetrievalLite::Corpus.new(all_documents) }

    it "should have the correct size" do
      corpus.size.should == 6
    end

    # TODO are more comprehensive tests needed....?
    it "should give us correct document frequencies" do
      corpus.document_frequency("lorem").should == 3
      corpus.document_frequency("semper").should == 1
    end
  end

  describe "adding in documents one at a time" do
    let(:correct_corpus) { RetrievalLite::Corpus.new(all_documents) }
    let(:corpus) { RetrievalLite::Corpus.new }

    it "should be same as initializing corpus with all documents" do
      all_documents.each { |doc| corpus.add(doc) }
      corpus.documents.should == correct_corpus.documents
    end
  end

  describe "with optional parameters" do
    it "should ignore any stopwords (not case sensitive)" do
      ignored = ["lorem", "IPSum"]
      corpus = RetrievalLite::Corpus.new([document], stop_words: ignored)
      corpus.documents_with("lorem").should == nil
      corpus.documents_with("ipsum").should == nil
    end
  end
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'spec_helper'

describe RetrievalLite::Document do
  describe "for a basic document" do
    let (:document) do
      RetrievalLite::Document.new("lorem ipsum dolor sit amet")
    end
    let (:capitalized_document) do
      RetrievalLite::Document.new("LorEM iPSUM DOLOR sit ameT")
    end
    let (:document_with_duplicates) do
      RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
    end
    # expected term frequencies when every term appears once
    let (:basic_tf) do
      {
        "lorem" => 1,
        "ipsum" => 1,
        "dolor" => 1,
        "sit" => 1,
        "amet" => 1
      }
    end
    # expected term frequencies for document_with_duplicates
    let (:multiple_tf) do
      {
        "lorem" => 1,
        "ipsum" => 2,
        "dolor" => 3,
        "sit" => 4,
        "amet" => 5
      }
    end

    describe "content of the document" do
      it "should have original content" do
        document.content.should == "lorem ipsum dolor sit amet"
        capitalized_document.content.should == "LorEM iPSUM DOLOR sit ameT"
        document_with_duplicates.content.should == "lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet"
      end
    end

    describe "the number of terms of the document" do
      it "should be correct for singleton terms" do
        document.term_count.should == 5
      end
      it "should not care about capitalization" do
        capitalized_document.term_count.should == 5
      end
      it "should be correct for duplicate terms" do
        document_with_duplicates.term_count.should == 5
      end
    end

    describe "id of the document" do
      it "should default to ruby's object_id" do
        document.id.should == document.object_id
      end
    end

    describe "term frequencies of the document" do
      it "should be correct for singleton terms" do
        document.term_frequencies.should == basic_tf
      end
      it "should be correct for capitalization" do
        capitalized_document.term_frequencies.should == basic_tf
      end
      # This example covers repeated terms; it previously reused the
      # "capitalization" description, which made failure output ambiguous.
      it "should be correct for duplicate terms" do
        document_with_duplicates.term_frequencies.should == multiple_tf
      end
    end


    describe "frequencies of a term" do
      it "should be correct for term in document" do
        document.frequency_of("lorem").should == 1
        document_with_duplicates.frequency_of("ipsum").should == 2
      end
      it "should be zero for term not in document" do
        document.frequency_of("foo").should == 0
        document_with_duplicates.frequency_of("foo").should == 0
      end
    end

    describe "for blank document" do
      it "should not raise error on initialization" do
        expect { RetrievalLite::Document.new("") }.to_not raise_error
      end
    end
  end

  describe "optional parameters" do
    it "should allow for customized id" do
      doc = RetrievalLite::Document.new("lorem ipsum dolor sit amet", id: "foo")
      doc.id.should == "foo"
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require "retrieval_lite"
require "spec_helpers/file_helpers"

# Suite-wide RSpec settings.
RSpec.configure do |rspec|
  # Make the project's file helpers available inside every example.
  rspec.include RetrievalLite::SpecHelpers::FileHelpers

  # Allow bare symbols such as :focus to be used as metadata, run only the
  # focused examples when any exist, and fall back to the whole suite otherwise.
  rspec.treat_symbols_as_metadata_keys_with_true_values = true
  rspec.filter_run :focus
  rspec.run_all_when_everything_filtered = true

  # Run specs in random order to surface order dependencies. To debug an
  # order dependency, re-run with the seed printed after each run:
  #     --seed 1234
  rspec.order = 'random'
end
|
@@ -0,0 +1,130 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe RetrievalLite::TfIdfRetrieval do
|
4
|
+
let (:document_one_term) do
|
5
|
+
RetrievalLite::Document.new("lorem")
|
6
|
+
end
|
7
|
+
let (:document) do
|
8
|
+
RetrievalLite::Document.new("lorem ipsum dolor sit amet")
|
9
|
+
end
|
10
|
+
let (:document_with_duplicates) do
|
11
|
+
RetrievalLite::Document.new("lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet")
|
12
|
+
end
|
13
|
+
let (:document_doubled) do
|
14
|
+
RetrievalLite::Document.new("lorem ipsum dolor sit amet lorem ipsum dolor sit amet")
|
15
|
+
end
|
16
|
+
let (:document_both_terms) do
|
17
|
+
RetrievalLite::Document.new("lorem ipsum")
|
18
|
+
end
|
19
|
+
let (:document_with_unique) do
|
20
|
+
RetrievalLite::Document.new("lorem unique")
|
21
|
+
end
|
22
|
+
# sorted by lorem order
|
23
|
+
let (:all_documents) do
|
24
|
+
[document, document_with_duplicates, document_doubled, document_one_term, document_both_terms, document_with_unique]
|
25
|
+
end
|
26
|
+
let (:corpus) do
|
27
|
+
RetrievalLite::Corpus.new(all_documents)
|
28
|
+
end
|
29
|
+
let (:corpus_different) do
|
30
|
+
RetrievalLite::Corpus.new([document_one_term, document, document_with_duplicates])
|
31
|
+
end
|
32
|
+
|
33
|
+
describe "calculating tf-idf scores" do
|
34
|
+
describe "term that all documents have" do
|
35
|
+
it "should have correct tf-idf" do
|
36
|
+
RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_one_term, "lorem").should be_within(0.001).of(0)
|
37
|
+
RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document, "lorem").should be_within(0.001).of(0)
|
38
|
+
RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_with_duplicates, "lorem").should be_within(0.001).of(0)
|
39
|
+
RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_doubled, "lorem").should be_within(0.001).of(0)
|
40
|
+
RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_both_terms, "lorem").should be_within(0.001).of(0)
|
41
|
+
RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_with_unique, "lorem").should be_within(0.001).of(0)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
describe "term that a few documents have" do
|
46
|
+
it "should have correct tf-idf" do
|
47
|
+
RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.405)
|
48
|
+
RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.811)
|
49
|
+
RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.811)
|
50
|
+
RetrievalLite::TfIdfRetrieval.tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(0.405)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
describe "calculating normalized tf-idf scores" do
|
56
|
+
describe "term that a few documents have" do
|
57
|
+
it "should have correct tf-idf" do
|
58
|
+
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document, "ipsum").should be_within(0.001).of(0.316)
|
59
|
+
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_with_duplicates, "ipsum").should be_within(0.001).of(0.632)
|
60
|
+
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_doubled, "ipsum").should be_within(0.001).of(0.632)
|
61
|
+
RetrievalLite::TfIdfRetrieval.normalized_tfidf_weight(corpus, document_both_terms, "ipsum").should be_within(0.001).of(0.316)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
describe "calculating total tf-idf scores" do
|
67
|
+
describe "for when all documents of corpus has a term" do
|
68
|
+
it "should have score of zero for each document" do
|
69
|
+
scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "lorem")
|
70
|
+
scores.size.should == all_documents.size
|
71
|
+
scores.values.each do |v|
|
72
|
+
v.should == 0
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
describe "term that only one document has" do
|
77
|
+
it "should return the correct score" do
|
78
|
+
scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "unique")
|
79
|
+
scores.size.should == 1
|
80
|
+
scores[document_with_unique].should be_within(0.001).of(1.0)
|
81
|
+
end
|
82
|
+
end
|
83
|
+
describe "term that a few documents have" do
|
84
|
+
it "should return the correct score" do
|
85
|
+
scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus, "ipsum")
|
86
|
+
scores.size.should == 4
|
87
|
+
scores[document].should be_within(0.001).of(1.0)
|
88
|
+
scores[document_with_duplicates].should be_within(0.001).of(1.0)
|
89
|
+
scores[document_doubled].should be_within(0.001).of(1.0)
|
90
|
+
scores[document_both_terms].should be_within(0.001).of(1.0)
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
describe "one-term retrieval" do
|
96
|
+
it "should return array with that term" do
|
97
|
+
RetrievalLite::TfIdfRetrieval.evaluate(corpus, "lorem").should =~ all_documents
|
98
|
+
end
|
99
|
+
it "should ignore case" do
|
100
|
+
RetrievalLite::TfIdfRetrieval.evaluate(corpus, "LOREM").should =~ all_documents
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
describe "when corpus has only one document containing term" do
|
105
|
+
it "should return array with only that document" do
|
106
|
+
RetrievalLite::TfIdfRetrieval.evaluate(corpus, "unique").should == [document_with_unique]
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
describe "for no matches" do
|
111
|
+
it "should return empty array for term not in any documents" do
|
112
|
+
RetrievalLite::TfIdfRetrieval.evaluate(corpus, "foobar").should == []
|
113
|
+
end
|
114
|
+
it "should return empty array for empty string" do
|
115
|
+
RetrievalLite::TfIdfRetrieval.evaluate(corpus, "").should == []
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
describe "multiple-term retrieval" do
|
120
|
+
it "should order documents correctly" do
|
121
|
+
RetrievalLite::TfIdfRetrieval.evaluate(corpus_different, "lorem dolor sit").should == [document, document_with_duplicates, document_one_term]
|
122
|
+
end
|
123
|
+
it "should have the correct scores" do
|
124
|
+
scores = RetrievalLite::TfIdfRetrieval.evaluate_with_scores(corpus_different, "lorem dolor sit")
|
125
|
+
scores[document].should be_within(0.001).of(0.816)
|
126
|
+
scores[document_with_duplicates].should be_within(0.001).of(0.808)
|
127
|
+
scores[document_one_term].should be_within(0.001).of(0.0)
|
128
|
+
end
|
129
|
+
end
|
130
|
+
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe RetrievalLite::Tokenizer do
|
4
|
+
describe "parse_content" do
|
5
|
+
describe "for basic terms" do
|
6
|
+
let (:basic_tf) do
|
7
|
+
{
|
8
|
+
"lorem" => 1,
|
9
|
+
"ipsum" => 1,
|
10
|
+
"dolor" => 1,
|
11
|
+
"sit" => 1,
|
12
|
+
"amet" => 1
|
13
|
+
}
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should split the content" do
|
17
|
+
content = "lorem ipsum dolor sit amet"
|
18
|
+
RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should ignore extra white spaces" do
|
22
|
+
content = "lorem ipsum dolor \n sit amet"
|
23
|
+
RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
|
24
|
+
end
|
25
|
+
|
26
|
+
it "should ignore punctuation" do
|
27
|
+
content = "lorem! @ #ipsum (dolor) sit * \\ amet" # "\\" is a literal backslash; the former "\ " was only an escaped space, so no backslash was ever tested
|
28
|
+
RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should ignore capitalization" do
|
32
|
+
content = "LOREM iPSuM dOLOR sit amet"
|
33
|
+
RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should ignore any whitespaces in front and back of content" do
|
37
|
+
content = " lorem ipsum dolor sit amet \n"
|
38
|
+
RetrievalLite::Tokenizer.parse_content(content).should == basic_tf
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
describe "for content with multiple terms" do
|
43
|
+
let (:multiple_tf) do
|
44
|
+
{
|
45
|
+
"lorem" => 1,
|
46
|
+
"ipsum" => 2,
|
47
|
+
"dolor" => 3,
|
48
|
+
"sit" => 4,
|
49
|
+
"amet" => 5
|
50
|
+
}
|
51
|
+
end
|
52
|
+
it "should not care about order" do
|
53
|
+
content = "lorem ipsum ipsum dolor dolor dolor sit sit sit sit amet amet amet amet amet"
|
54
|
+
RetrievalLite::Tokenizer.parse_content(content).should == multiple_tf
|
55
|
+
|
56
|
+
content = "amet amet lorem dolor dolor sit sit ipsum ipsum sit amet dolor sit amet amet"
|
57
|
+
RetrievalLite::Tokenizer.parse_content(content).should == multiple_tf
|
58
|
+
end
|
59
|
+
|
60
|
+
it "should consider capitalized terms to be the same" do
|
61
|
+
content = "lorem IPSUM ipsum doLOR doLOR dolor SIT SIT SIT SIT amet ameT amET aMET AMET"
|
62
|
+
RetrievalLite::Tokenizer.parse_content(content).should == multiple_tf
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
describe "for special cases" do
|
67
|
+
let(:foo_bar_hash) do
|
68
|
+
{
|
69
|
+
"foo" => 1,
|
70
|
+
"bar" => 1
|
71
|
+
}
|
72
|
+
end
|
73
|
+
let(:foo_bar_baz_hash) do
|
74
|
+
{
|
75
|
+
"foo" => 1,
|
76
|
+
"bar" => 1,
|
77
|
+
"baz" => 1
|
78
|
+
}
|
79
|
+
end
|
80
|
+
|
81
|
+
it "should return empty hash if there are no terms" do
|
82
|
+
RetrievalLite::Tokenizer.parse_content("").should == Hash.new
|
83
|
+
end
|
84
|
+
|
85
|
+
it "should ignore numbers" do
|
86
|
+
RetrievalLite::Tokenizer.parse_content("1 2 3.14159").should == Hash.new
|
87
|
+
end
|
88
|
+
|
89
|
+
it "should ignore control characters" do
|
90
|
+
RetrievalLite::Tokenizer.parse_content("\a\e\f\n\r\t\v").should == Hash.new
|
91
|
+
RetrievalLite::Tokenizer.parse_content("\x07\x1B\f\n\r\t\v").should == Hash.new
|
92
|
+
end
|
93
|
+
|
94
|
+
it "should split words connected by special characters" do
|
95
|
+
RetrievalLite::Tokenizer.parse_content("foo/bar").should == foo_bar_hash
|
96
|
+
RetrievalLite::Tokenizer.parse_content("foo,bar").should == foo_bar_hash
|
97
|
+
RetrievalLite::Tokenizer.parse_content("foo,:bar").should == foo_bar_hash
|
98
|
+
RetrievalLite::Tokenizer.parse_content("foo ,:bar").should == foo_bar_hash
|
99
|
+
RetrievalLite::Tokenizer.parse_content("!@foo ,:bar#").should == foo_bar_hash
|
100
|
+
|
101
|
+
RetrievalLite::Tokenizer.parse_content("foo:bar baz").should == foo_bar_baz_hash
|
102
|
+
end
|
103
|
+
|
104
|
+
it "should not split words connected by only one hyphen" do
|
105
|
+
RetrievalLite::Tokenizer.parse_content("foo-bar").should == { "foo-bar" => 1 }
|
106
|
+
RetrievalLite::Tokenizer.parse_content("foo - bar").should == foo_bar_hash
|
107
|
+
RetrievalLite::Tokenizer.parse_content("foo --bar").should == foo_bar_hash
|
108
|
+
#RetrievalLite::Tokenizer.parse_content("foo--bar").should == foo_bar_hash # TODO is this worth it?
|
109
|
+
|
110
|
+
RetrievalLite::Tokenizer.parse_content("foo-bar-baz").should_not == foo_bar_baz_hash
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
data/spec/vector_spec.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe RetrievalLite::Vector do
|
4
|
+
describe "dot product" do
|
5
|
+
it "should compute correctly for vectors length 1" do
|
6
|
+
RetrievalLite::Vector.dot_product([3], [5]).should == 15
|
7
|
+
end
|
8
|
+
it "should compute correctly for longer vectors" do
|
9
|
+
RetrievalLite::Vector.dot_product([2, 3], [4, 5]).should == 23
|
10
|
+
end
|
11
|
+
it "should raise error for unequal sized arrays" do
|
12
|
+
expect { RetrievalLite::Vector.dot_product([2, 3], [4]) }.to raise_error
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
describe "euclidean length" do
|
17
|
+
it "should calculate it for vectors length 1" do
|
18
|
+
RetrievalLite::Vector.euclidean_length([1]).should == 1
|
19
|
+
end
|
20
|
+
it "should calculate it for zero vectors" do
|
21
|
+
RetrievalLite::Vector.euclidean_length([0, 0, 0]).should == 0
|
22
|
+
end
|
23
|
+
it "should calculate it for longer vectors" do
|
24
|
+
RetrievalLite::Vector.euclidean_length([3, 4]).should == 5
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
metadata
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: retrieval_lite
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Irvin Zhan
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-05-12 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.3'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ~>
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.3'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: Lightweight gem for document retrieval using tf-idf based algorithms
|
56
|
+
for Ruby
|
57
|
+
email:
|
58
|
+
- izhan@princeton.edu
|
59
|
+
executables: []
|
60
|
+
extensions: []
|
61
|
+
extra_rdoc_files: []
|
62
|
+
files:
|
63
|
+
- .gitignore
|
64
|
+
- .rspec
|
65
|
+
- Gemfile
|
66
|
+
- LICENSE
|
67
|
+
- LICENSE.txt
|
68
|
+
- README.md
|
69
|
+
- Rakefile
|
70
|
+
- lib/retrieval_lite.rb
|
71
|
+
- lib/retrieval_lite/boolean_retrieval.rb
|
72
|
+
- lib/retrieval_lite/corpus.rb
|
73
|
+
- lib/retrieval_lite/document.rb
|
74
|
+
- lib/retrieval_lite/tfidf_retrieval.rb
|
75
|
+
- lib/retrieval_lite/tokenizer.rb
|
76
|
+
- lib/retrieval_lite/vector.rb
|
77
|
+
- lib/version.rb
|
78
|
+
- retrieval_lite.gemspec
|
79
|
+
- spec/boolean_retrieval_spec.rb
|
80
|
+
- spec/corpus_spec.rb
|
81
|
+
- spec/document_spec.rb
|
82
|
+
- spec/retrieval_lite_spec.rb
|
83
|
+
- spec/spec_helper.rb
|
84
|
+
- spec/spec_helpers/file_helpers.rb
|
85
|
+
- spec/tfidf_retrieval_spec.rb
|
86
|
+
- spec/tokenizer_spec.rb
|
87
|
+
- spec/vector_spec.rb
|
88
|
+
homepage: https://github.com/izhan/retrieval_lite
|
89
|
+
licenses:
|
90
|
+
- MIT
|
91
|
+
metadata: {}
|
92
|
+
post_install_message:
|
93
|
+
rdoc_options: []
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - '>='
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - '>='
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '0'
|
106
|
+
requirements: []
|
107
|
+
rubyforge_project:
|
108
|
+
rubygems_version: 2.1.11
|
109
|
+
signing_key:
|
110
|
+
specification_version: 4
|
111
|
+
summary: Please see associated GitHub page for usage.
|
112
|
+
test_files:
|
113
|
+
- spec/boolean_retrieval_spec.rb
|
114
|
+
- spec/corpus_spec.rb
|
115
|
+
- spec/document_spec.rb
|
116
|
+
- spec/retrieval_lite_spec.rb
|
117
|
+
- spec/spec_helper.rb
|
118
|
+
- spec/spec_helpers/file_helpers.rb
|
119
|
+
- spec/tfidf_retrieval_spec.rb
|
120
|
+
- spec/tokenizer_spec.rb
|
121
|
+
- spec/vector_spec.rb
|
122
|
+
has_rdoc:
|