tf-idf 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +5 -0
- data/LICENSE +20 -0
- data/README.markdown +26 -0
- data/Rakefile +58 -0
- data/VERSION +1 -0
- data/features/step_definitions/tf-idf_steps.rb +49 -0
- data/features/support/env.rb +4 -0
- data/features/tf-idf.feature +79 -0
- data/lib/tf-idf.rb +199 -0
- data/spec/fixtures/tfidf_testcorpus.txt +7 -0
- data/spec/fixtures/tfidf_teststopwords.txt +1 -0
- data/spec/spec_helper.rb +13 -0
- data/spec/tf-idf_spec.rb +104 -0
- metadata +99 -0
data/.document
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Marc Chung
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
tf-idf
|
2
|
+
------
|
3
|
+
|
4
|
+
This is a simple Tf-idf library. The algorithm is described in:
|
5
|
+
|
6
|
+
[http://en.wikipedia.org/wiki/Tf-idf][wiki].
|
7
|
+
|
8
|
+
A port of Niniane Wang's [tfidf][tfidf] library.
|
9
|
+
|
10
|
+
Note on Patches/Pull Requests
|
11
|
+
-----------------------------
|
12
|
+
|
13
|
+
* Fork the project.
|
14
|
+
* Make your feature addition or bug fix.
|
15
|
+
* Add tests for it. This is important so I don't break it in a future version unintentionally.
|
16
|
+
* Commit, but do not change the Rakefile, version, or history. (If you want your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
|
17
|
+
* Send me a pull request. Bonus points for topic branches.
|
18
|
+
|
19
|
+
Copyright
|
20
|
+
--------
|
21
|
+
|
22
|
+
Copyright (c) 2009 Marc Chung. See LICENSE for details.
|
23
|
+
|
24
|
+
[tfidf]: http://code.google.com/p/tfidf
|
25
|
+
|
26
|
+
[wiki]: http://en.wikipedia.org/wiki/Tf-idf
|
data/Rakefile
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'jeweler'
|
6
|
+
Jeweler::Tasks.new do |gem|
|
7
|
+
gem.name = "tf-idf"
|
8
|
+
gem.summary = %Q{A rubygem that implements the Tf-Idf algorithm}
|
9
|
+
gem.description = %Q{Computes IDF for a specified term based on the corpus, or generates keywords ordered by tf-idf for a specified document.}
|
10
|
+
gem.email = "mchung@gmail.com"
|
11
|
+
gem.homepage = "http://github.com/mchung/tf-idf"
|
12
|
+
gem.authors = ["Marc Chung"]
|
13
|
+
gem.add_development_dependency "rspec"
|
14
|
+
gem.add_development_dependency "yard"
|
15
|
+
gem.add_development_dependency "cucumber"
|
16
|
+
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
17
|
+
end
|
18
|
+
Jeweler::GemcutterTasks.new
|
19
|
+
rescue LoadError
|
20
|
+
puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
|
21
|
+
end
|
22
|
+
|
23
|
+
require 'spec/rake/spectask'
|
24
|
+
Spec::Rake::SpecTask.new(:spec) do |spec|
|
25
|
+
spec.libs << 'lib' << 'spec'
|
26
|
+
spec.spec_files = FileList['spec/**/*_spec.rb']
|
27
|
+
end
|
28
|
+
|
29
|
+
Spec::Rake::SpecTask.new(:rcov) do |spec|
|
30
|
+
spec.libs << 'lib' << 'spec'
|
31
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
32
|
+
spec.rcov = true
|
33
|
+
spec.rcov_dir = "doc/coverage"
|
34
|
+
end
|
35
|
+
|
36
|
+
task :spec => :check_dependencies
|
37
|
+
|
38
|
+
begin
|
39
|
+
require 'cucumber/rake/task'
|
40
|
+
Cucumber::Rake::Task.new(:features)
|
41
|
+
|
42
|
+
task :features => :check_dependencies
|
43
|
+
rescue LoadError
|
44
|
+
task :features do
|
45
|
+
abort "Cucumber is not available. In order to run features, you must: sudo gem install cucumber"
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
task :default => [:spec, :features]
|
50
|
+
|
51
|
+
begin
|
52
|
+
require 'yard'
|
53
|
+
YARD::Rake::YardocTask.new
|
54
|
+
rescue LoadError
|
55
|
+
task :yardoc do
|
56
|
+
abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
|
57
|
+
end
|
58
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.0.0
|
@@ -0,0 +1,49 @@
|
|
1
|
+
Before do
|
2
|
+
@default_idf_unittest = 1.0
|
3
|
+
end
|
4
|
+
|
5
|
+
Given /^the default IDF is set to "([^\"]*)"$/ do |val|
|
6
|
+
@default_idf_unittest = val.to_f
|
7
|
+
end
|
8
|
+
|
9
|
+
Given /^I have loaded the sample corpus data set "([^\"]*)"$/ do |corpus_filename|
|
10
|
+
@corpus_filename = File.expand_path(File.dirname(__FILE__) + "/../../spec/fixtures/#{corpus_filename}")
|
11
|
+
@my_tfidf = TfIdf.from_corpus(@corpus_filename, @default_idf_unittest)
|
12
|
+
end
|
13
|
+
|
14
|
+
Then /^I should have a total of "([^\"]*)" documents$/ do |total_doc_count|
|
15
|
+
@my_tfidf.num_docs.should == total_doc_count.to_i
|
16
|
+
end
|
17
|
+
|
18
|
+
Then /^I should have a total of "([^\"]*)" term\/num_doc pairs$/ do |total_term_num_doc_count|
|
19
|
+
@my_tfidf.term_num_docs.size.should == total_term_num_doc_count.to_i
|
20
|
+
end
|
21
|
+
|
22
|
+
Then /^I should get the default IDF for "([^\"]*)"$/ do |term|
|
23
|
+
@my_tfidf.idf(term).should == @default_idf_unittest
|
24
|
+
end
|
25
|
+
|
26
|
+
Then /^the IDF for "([^\"]*)" should be greater than the IDF for "([^\"]*)"$/ do |term1, term2|
|
27
|
+
@my_tfidf.idf(term1).should > @my_tfidf.idf(term2)
|
28
|
+
end
|
29
|
+
|
30
|
+
Then /^the IDF for "([^\"]*)" should be equal to the IDF for "([^\"]*)"$/ do |term1, term2|
|
31
|
+
@my_tfidf.idf(term1).should == @my_tfidf.idf(term2)
|
32
|
+
end
|
33
|
+
|
34
|
+
Given /^the keywords "([^\"]*)"$/ do |keywords|
|
35
|
+
@keywords = @my_tfidf.doc_keywords(keywords)
|
36
|
+
end
|
37
|
+
|
38
|
+
Then /^"([^\"]*)" should be located at "([^\"]*)", "([^\"]*)"$/ do |term, x, y|
|
39
|
+
@keywords[x.to_i][y.to_i].should == term
|
40
|
+
end
|
41
|
+
|
42
|
+
|
43
|
+
# Then /^I should get the expected IDF for "([^\"]*)"$/ do |arg1|
|
44
|
+
# pending
|
45
|
+
# end
|
46
|
+
|
47
|
+
# Computes the smoothed IDF the library is expected to report for a term:
# the natural log of (1 + total documents) / (1 + documents containing the term).
#
# @param [Integer] num_docs_total Total number of documents in the corpus.
# @param [Integer] num_docs_term Number of documents containing the term.
# @return [Float] The expected IDF value.
def get_expected_idf(num_docs_total, num_docs_term)
  # Convert to Float before dividing so the ratio is not truncated.
  Math.log((1 + num_docs_total).to_f / (1 + num_docs_term))
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
Feature: Tf-Idf
|
2
|
+
A user should be able to construct an IDF corpus
|
3
|
+
|
4
|
+
Scenario: A corpus should contain the total number of documents
|
5
|
+
Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
|
6
|
+
Then I should have a total of "50" documents
|
7
|
+
|
8
|
+
Scenario: A corpus should contain terms and the number of documents they are found in.
|
9
|
+
Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
|
10
|
+
Then I should have a total of "6" term/num_doc pairs
|
11
|
+
|
12
|
+
Scenario: A corpus should query IDF for nonexistent terms
|
13
|
+
Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
|
14
|
+
Then I should get the default IDF for "nonexistent"
|
15
|
+
Then I should get the default IDF for "THE"
|
16
|
+
|
17
|
+
Scenario: A corpus should query IDF for existent terms
|
18
|
+
Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
|
19
|
+
Then the IDF for "a" should be greater than the IDF for "the"
|
20
|
+
Then the IDF for "girl" should be equal to the IDF for "moon"
|
21
|
+
|
22
|
+
Scenario: A corpus should retrieve keywords from a document, ordered by tf-idf
|
23
|
+
Given the default IDF is set to "0.01"
|
24
|
+
And I have loaded the sample corpus data set "tfidf_testcorpus.txt"
|
25
|
+
Given the keywords "the spoon and the fork"
|
26
|
+
Then "the" should be located at "0", "0"
|
27
|
+
Given the keywords "the girl said hello over the phone"
|
28
|
+
Then "girl" should be located at "0", "0"
|
29
|
+
Then "phone" should be located at "1", "0"
|
30
|
+
Then "said" should be located at "2", "0"
|
31
|
+
Then "the" should be located at "3", "0"
|
32
|
+
|
33
|
+
# Scenario: A corpus should add input documents to an existing corpus"
|
34
|
+
# Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
|
35
|
+
# Then I should get the default IDF for "water"
|
36
|
+
# Then I should get the expected IDF for "moon" when it has "1" occurrence
|
37
|
+
# Then I should get the expected IDF for "said" when it has "5" occurrences
|
38
|
+
|
39
|
+
|
40
|
+
# it "should add input documents to an existing corpus" do
|
41
|
+
# my_tfidf = TfIdf.new(@test_corpus, nil, @default_idf_unittest)
|
42
|
+
# my_tfidf.idf("water").should == @default_idf_unittest
|
43
|
+
# my_tfidf.idf("moon").should == get_expected_idf(my_tfidf.num_docs, 1)
|
44
|
+
# my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
|
45
|
+
#
|
46
|
+
# my_tfidf.add_input_document("water moon") # doesn't support commas
|
47
|
+
#
|
48
|
+
# my_tfidf.idf("water").should == get_expected_idf(my_tfidf.num_docs, 1)
|
49
|
+
# my_tfidf.idf("moon").should == get_expected_idf(my_tfidf.num_docs, 2)
|
50
|
+
# my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
|
51
|
+
# end
|
52
|
+
#
|
53
|
+
# it "should add input documents to an empty corpus" do
|
54
|
+
# my_tfidf = TfIdf.new(nil, nil, @default_idf_unittest)
|
55
|
+
# my_tfidf.idf("moon").should == @default_idf_unittest
|
56
|
+
# my_tfidf.idf("water").should == @default_idf_unittest
|
57
|
+
# my_tfidf.idf("said").should == @default_idf_unittest
|
58
|
+
#
|
59
|
+
# my_tfidf.add_input_document("moon")
|
60
|
+
# my_tfidf.add_input_document("moon said hello")
|
61
|
+
#
|
62
|
+
# my_tfidf.idf("water").should == @default_idf_unittest
|
63
|
+
# my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 1)
|
64
|
+
# my_tfidf.idf("moon").should == get_expected_idf(my_tfidf.num_docs, 2)
|
65
|
+
# end
|
66
|
+
#
|
67
|
+
# it "should observe stopwords list" do
|
68
|
+
# my_tfidf = TfIdf.new(@test_corpus, @test_stopwords, @default_idf_unittest)
|
69
|
+
# my_tfidf.idf("water").should == @default_idf_unittest
|
70
|
+
# my_tfidf.idf("moon").should == 0 # ignored
|
71
|
+
# my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
|
72
|
+
#
|
73
|
+
# my_tfidf.add_input_document("moon")
|
74
|
+
# my_tfidf.add_input_document("moon and water")
|
75
|
+
#
|
76
|
+
# my_tfidf.idf("water").should == get_expected_idf(my_tfidf.num_docs, 1)
|
77
|
+
# my_tfidf.idf("moon").should == 0
|
78
|
+
# my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
|
79
|
+
# end
|
data/lib/tf-idf.rb
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
# Tf-idf class implementing http://en.wikipedia.org/wiki/Tf-idf.
#
# The library constructs an IDF corpus and stopword list either from
# documents specified by the client, or by reading from input files. It
# computes IDF for a specified term based on the corpus, or generates
# keywords ordered by tf-idf for a specified document.
#
# @author Marc Chung <mchung@gmail.com>
# @see http://en.wikipedia.org/wiki/Tf-idf Term frequency-inverse document frequency
class TfIdf

  # @return [Integer] The total number of documents in the tf-idf corpus.
  attr_accessor :num_docs

  # @return [Hash<String, Integer>] Maps each term to the number of corpus
  #   documents that contain it (its document frequency).
  attr_accessor :term_num_docs

  # @return [Array<String>] An array of stopwords.
  attr_accessor :stopwords

  # @return [Float] The default value returned when a term is not found in the tf-idf corpus.
  attr_accessor :idf_default

  DEFAULT_IDF = 1.5

  # A whitespace-delimited chunk is kept as a token when any part of it
  # matches this pattern (an anchor element, any tag, or a run of word
  # characters, apostrophes, '@' or '#').
  TOKEN_PATTERN = /<a.*?\/a>|<[^\>]*>|[\w'@#]+/

  ##
  # Initialize the tf-idf dictionary.
  #
  # If a corpus file is supplied, reads the idf dictionary from it, in the
  # format of:
  #   # of total documents
  #   term: # of documents containing the term
  #
  # If a stopword file is specified, reads the stopword list from it, in
  # the format of one stopword per line.
  #
  # The default_idf value is returned when a query term is not found in the
  # IDF corpus.
  #
  # @param [String] corpus_filename The disk location of the IDF corpus.
  # @param [String] stopword_filename The disk location of the stopword list.
  # @param [Float] default_idf The value returned when a term is not found in the IDF corpus.
  # @raise [RuntimeError] "Corpus not found" when the corpus file does not exist.
  # @raise [RuntimeError] "Stopwords not found" when the stopword file does not exist.
  # @raise [RuntimeError] When a non-blank corpus line has no ":" separator.
  # @return [TfIdf] A TfIdf instance loaded with the corpus.
  def initialize(corpus_filename = nil, stopword_filename = nil, default_idf = DEFAULT_IDF)
    self.num_docs = 0
    self.term_num_docs = {}
    self.stopwords = []
    self.idf_default = default_idf

    # The corpus is validated and loaded before the stopword file is checked.
    load_corpus(corpus_filename) if corpus_filename
    load_stopwords(stopword_filename) if stopword_filename
  end

  ##
  # Convenience method for creating a TfIdf instance without a stopword list.
  #
  # @param [String] corpus_filename The disk location of the IDF corpus.
  # @param [Float] default_idf The value returned when a term is not found in the IDF corpus.
  # @return [TfIdf] A TfIdf instance loaded with the corpus.
  def self.from_corpus(corpus_filename, default_idf = DEFAULT_IDF)
    new(corpus_filename, nil, default_idf)
  end

  ##
  # Breaks a string into tokens. The input is split on whitespace and every
  # chunk containing a match for TOKEN_PATTERN is kept unchanged, so
  # punctuation attached to a word stays part of the token and case is
  # preserved. Clients may wish to override this behaviour with their own
  # tokenization strategy.
  #
  # @param [String] input String representation of a document
  # @return [Array<String>] A list of tokens
  def get_tokens(input)
    input.split.select { |chunk| chunk =~ TOKEN_PATTERN }
  end

  ##
  # Add terms in the specified document to the IDF corpus.
  #
  # Increments the document total by one and the document frequency of
  # every distinct token in the input by one.
  #
  # @param [String] input String representation of a document.
  # @return [Array<String>] The distinct tokens found in the document.
  def add_input_document(input)
    self.num_docs += 1
    token_set = get_tokens(input).uniq
    token_set.each do |term|
      self.term_num_docs[term] = (self.term_num_docs[term] || 0) + 1
    end
  end

  ##
  # Saves the tf-idf corpus and a derived stopword list to the specified files.
  #
  # A word is written to the stopword file if it occurs in more than
  # stopword_percentage_threshold * num_docs documents. A threshold of 0.4
  # means that the word must occur in more than 40% of the documents.
  # Stopwords are written in order of decreasing document frequency.
  #
  # @param [String] idf_filename File that receives the corpus.
  # @param [String] stopword_filename File that receives the stopword list.
  # @param [Float] stopword_percentage_threshold Fraction of documents a term
  #   must exceed to be written as a stopword; lower values admit more terms.
  def save_corpus_to_file(idf_filename, stopword_filename, stopword_percentage_threshold = 0.01)
    File.open(idf_filename, "w") do |file|
      file.write("#{self.num_docs}\n")
      self.term_num_docs.each do |term, doc_count|
        file.write("#{term}: #{doc_count}\n")
      end
    end

    cutoff = stopword_percentage_threshold * self.num_docs
    File.open(stopword_filename, "w") do |file|
      sort_by_tfidf(self.term_num_docs).each do |term, doc_count|
        file.write("#{term}\n") if doc_count > cutoff
      end
    end
  end

  ##
  # Retrieves the IDF for the specified term.
  #
  # Stopwords always score 0 and terms missing from the corpus score
  # idf_default. Otherwise this is computed with add-one smoothing:
  #   logarithm of ((1 + number of documents in corpus) divided by
  #   (1 + number of documents containing this term)).
  #
  # @param [String] term A term in the IDF corpus.
  # @return [Numeric] The IDF for the specified term.
  def idf(term)
    return 0 if self.stopwords.include?(term)

    doc_count = self.term_num_docs[term]
    return self.idf_default if doc_count.nil?

    Math.log((1 + self.num_docs).to_f / (1 + doc_count))
  end

  ##
  # Retrieve terms and corresponding tf-idf for the specified document.
  #
  # The returned terms are ordered by decreasing tf-idf. Term frequency is
  # the number of occurrences of the term divided by the number of distinct
  # tokens in the document.
  #
  # @param [String] curr_doc String representation of an existing document.
  # @return [Array<Array<String, Float>>] Term/tf-idf pairs ordered by decreasing tf-idf.
  def doc_keywords(curr_doc)
    tokens = get_tokens(curr_doc)

    # Count every token in one pass instead of rescanning the list per term.
    occurrences = Hash.new(0)
    tokens.each { |token| occurrences[token] += 1 }

    token_set = tokens.uniq
    token_set_sz = token_set.size

    tfidf = {}
    token_set.each do |term|
      tf = occurrences[term].to_f / token_set_sz
      tfidf[term] = tf * idf(term)
    end

    sort_by_tfidf(tfidf)
  end

  ##
  # Returns a string representation of the tf-idf corpus.
  #
  # @return [String] Contains the number of documents and the number of distinct terms.
  def to_s
    {:num_docs => self.num_docs, :term_num_docs => self.term_num_docs.size}.inspect
  end

  ##
  # Sorts term/score pairs by decreasing score.
  #
  # @example Sort by tf-idf
  #   "{'and'=>0.0025, 'fork'=>0.0025, 'the'=>0.37688590118819, 'spoon'=>1.0025}" #=>
  #   "[['spoon', 1.0025], ['the', 0.37688590118819], ['fork', 0.0025], ['and', 0.0025]]"
  # @param [Hash<String, Numeric>] tfidf Terms mapped to their scores.
  # @return [Array<Array<String, Numeric>>] An array of term/score array pairs.
  def sort_by_tfidf(tfidf)
    tfidf.sort { |a, b| b[1] <=> a[1] }
  end

  private

  # Reads the document total and the term/document-frequency pairs from the corpus file.
  def load_corpus(corpus_filename)
    raise "Corpus not found" unless File.exist?(corpus_filename)

    lines = File.readlines(corpus_filename)
    # An empty file yields a corpus of zero documents.
    self.num_docs = lines.shift.to_s.strip.to_i

    lines.each do |line|
      next if line.strip.empty?

      # Split on the LAST colon so that terms which themselves contain ':'
      # survive a save_corpus_to_file / load round trip.
      term, separator, frequency = line.rpartition(":")
      raise "Malformed corpus line: #{line.strip}" if separator.empty?

      self.term_num_docs[term.strip] = frequency.strip.to_i
    end
  end

  # Reads one stopword per line from the stopword file, ignoring blank lines.
  def load_stopwords(stopword_filename)
    raise "Stopwords not found" unless File.exist?(stopword_filename)

    self.stopwords = File.readlines(stopword_filename).map { |word| word.strip }.reject { |word| word.empty? }
  end

end
|
@@ -0,0 +1 @@
|
|
1
|
+
moon
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
2
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
3
|
+
require 'tf-idf'
|
4
|
+
require 'spec'
|
5
|
+
require 'spec/autorun'
|
6
|
+
require 'pp'
|
7
|
+
|
8
|
+
Spec::Runner.configure do |config|
|
9
|
+
end
|
10
|
+
|
11
|
+
# Computes the smoothed IDF the library is expected to report for a term:
# the natural log of (1 + total documents) / (1 + documents containing the term).
#
# @param [Integer] num_docs_total Total number of documents in the corpus.
# @param [Integer] num_docs_term Number of documents containing the term.
# @return [Float] The expected IDF value.
def get_expected_idf(num_docs_total, num_docs_term)
  # Convert to Float before dividing so the ratio is not truncated.
  Math.log((1 + num_docs_total).to_f / (1 + num_docs_term))
end
|
data/spec/tf-idf_spec.rb
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
describe "TF-IDF library" do
|
4
|
+
|
5
|
+
before(:all) do
|
6
|
+
@test_corpus = File.expand_path(File.dirname(__FILE__) + '/fixtures/tfidf_testcorpus.txt')
|
7
|
+
@test_stopwords = File.expand_path(File.dirname(__FILE__) + '/fixtures/tfidf_teststopwords.txt')
|
8
|
+
@default_idf_unittest = 1.0
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should instantiate without args" do
|
12
|
+
TfIdf.new
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should report the correct number of documents" do
|
16
|
+
my_tfidf = TfIdf.from_corpus(@test_corpus, @default_idf_unittest)
|
17
|
+
my_tfidf.num_docs.should == 50
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should report the correct number of terms" do
|
21
|
+
my_tfidf = TfIdf.from_corpus(@test_corpus, @default_idf_unittest)
|
22
|
+
my_tfidf.term_num_docs.size.should == 6
|
23
|
+
end
|
24
|
+
|
25
|
+
it "should query IDF for nonexistent terms" do
|
26
|
+
my_tfidf = TfIdf.from_corpus(@test_corpus, @default_idf_unittest)
|
27
|
+
my_tfidf.idf("nonexistent").should == @default_idf_unittest
|
28
|
+
my_tfidf.idf("THE").should == @default_idf_unittest
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should query IDF for existent terms" do
|
32
|
+
my_tfidf = TfIdf.from_corpus(@test_corpus, @default_idf_unittest)
|
33
|
+
my_tfidf.idf("a").should > my_tfidf.idf("the")
|
34
|
+
my_tfidf.idf("girl").should == my_tfidf.idf("moon")
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should retrieve keywords from a document, ordered by tf-idf" do
|
38
|
+
my_tfidf = TfIdf.from_corpus(@test_corpus, 0.01)
|
39
|
+
|
40
|
+
# Test retrieving keywords when there is only one keyword.
|
41
|
+
keywords = my_tfidf.doc_keywords("the spoon and the fork")
|
42
|
+
keywords[0][0].should == "the"
|
43
|
+
|
44
|
+
# Test retrieving multiple keywords.
|
45
|
+
keywords = my_tfidf.doc_keywords("the girl said hello over the phone")
|
46
|
+
keywords[0][0].should == "girl"
|
47
|
+
keywords[1][0].should == "phone"
|
48
|
+
keywords[2][0].should == "said"
|
49
|
+
keywords[3][0].should == "the"
|
50
|
+
end
|
51
|
+
|
52
|
+
it "should add input documents to an existing corpus" do
|
53
|
+
my_tfidf = TfIdf.new(@test_corpus, nil, @default_idf_unittest)
|
54
|
+
my_tfidf.idf("water").should == @default_idf_unittest
|
55
|
+
my_tfidf.idf("moon").should == get_expected_idf(my_tfidf.num_docs, 1)
|
56
|
+
my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
|
57
|
+
|
58
|
+
my_tfidf.add_input_document("water moon") # doesn't support commas
|
59
|
+
|
60
|
+
my_tfidf.idf("water").should == get_expected_idf(my_tfidf.num_docs, 1)
|
61
|
+
my_tfidf.idf("moon").should == get_expected_idf(my_tfidf.num_docs, 2)
|
62
|
+
my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
|
63
|
+
end
|
64
|
+
|
65
|
+
it "should add input documents to an empty corpus" do
|
66
|
+
my_tfidf = TfIdf.new(nil, nil, @default_idf_unittest)
|
67
|
+
my_tfidf.idf("moon").should == @default_idf_unittest
|
68
|
+
my_tfidf.idf("water").should == @default_idf_unittest
|
69
|
+
my_tfidf.idf("said").should == @default_idf_unittest
|
70
|
+
|
71
|
+
my_tfidf.add_input_document("moon")
|
72
|
+
my_tfidf.add_input_document("moon said hello")
|
73
|
+
|
74
|
+
my_tfidf.idf("water").should == @default_idf_unittest
|
75
|
+
my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 1)
|
76
|
+
my_tfidf.idf("moon").should == get_expected_idf(my_tfidf.num_docs, 2)
|
77
|
+
end
|
78
|
+
|
79
|
+
it "should observe stopwords list" do
|
80
|
+
my_tfidf = TfIdf.new(@test_corpus, @test_stopwords, @default_idf_unittest)
|
81
|
+
my_tfidf.idf("water").should == @default_idf_unittest
|
82
|
+
my_tfidf.idf("moon").should == 0 # ignored
|
83
|
+
my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
|
84
|
+
|
85
|
+
my_tfidf.add_input_document("moon")
|
86
|
+
my_tfidf.add_input_document("moon and water")
|
87
|
+
|
88
|
+
my_tfidf.idf("water").should == get_expected_idf(my_tfidf.num_docs, 1)
|
89
|
+
my_tfidf.idf("moon").should == 0
|
90
|
+
my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
|
91
|
+
end
|
92
|
+
|
93
|
+
# Abstract out File IO
|
94
|
+
# it "should write the contents of the TF/IDF corpus to disk" do
|
95
|
+
# my_tfidf = TfIdf.new(@test_corpus, @test_stopwords, @default_idf_unittest)
|
96
|
+
# my_tfidf.save_corpus_to_file("foo.txt", "bar.txt", 0.3)
|
97
|
+
# stopwords = File.read("bar.txt").split
|
98
|
+
#
|
99
|
+
# stopwords.size.should == 2
|
100
|
+
# stopwords.should include("a")
|
101
|
+
# stopwords.should include("the")
|
102
|
+
# end
|
103
|
+
|
104
|
+
end
|
metadata
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tf-idf
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Marc Chung
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-11-29 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rspec
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: yard
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: cucumber
|
37
|
+
type: :development
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: "0"
|
44
|
+
version:
|
45
|
+
description: Computes IDF for a specified term based on the corpus, or generates keywords ordered by tf-idf for a specified document.
|
46
|
+
email: mchung@gmail.com
|
47
|
+
executables: []
|
48
|
+
|
49
|
+
extensions: []
|
50
|
+
|
51
|
+
extra_rdoc_files:
|
52
|
+
- LICENSE
|
53
|
+
- README.markdown
|
54
|
+
files:
|
55
|
+
- .document
|
56
|
+
- .gitignore
|
57
|
+
- LICENSE
|
58
|
+
- README.markdown
|
59
|
+
- Rakefile
|
60
|
+
- VERSION
|
61
|
+
- features/step_definitions/tf-idf_steps.rb
|
62
|
+
- features/support/env.rb
|
63
|
+
- features/tf-idf.feature
|
64
|
+
- lib/tf-idf.rb
|
65
|
+
- spec/fixtures/tfidf_testcorpus.txt
|
66
|
+
- spec/fixtures/tfidf_teststopwords.txt
|
67
|
+
- spec/spec_helper.rb
|
68
|
+
- spec/tf-idf_spec.rb
|
69
|
+
has_rdoc: true
|
70
|
+
homepage: http://github.com/mchung/tf-idf
|
71
|
+
licenses: []
|
72
|
+
|
73
|
+
post_install_message:
|
74
|
+
rdoc_options:
|
75
|
+
- --charset=UTF-8
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: "0"
|
83
|
+
version:
|
84
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: "0"
|
89
|
+
version:
|
90
|
+
requirements: []
|
91
|
+
|
92
|
+
rubyforge_project:
|
93
|
+
rubygems_version: 1.3.5
|
94
|
+
signing_key:
|
95
|
+
specification_version: 3
|
96
|
+
summary: A rubygem that implements the Tf-Idf algorithm
|
97
|
+
test_files:
|
98
|
+
- spec/spec_helper.rb
|
99
|
+
- spec/tf-idf_spec.rb
|