tf-idf 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +5 -0
- data/LICENSE +20 -0
- data/README.markdown +26 -0
- data/Rakefile +58 -0
- data/VERSION +1 -0
- data/features/step_definitions/tf-idf_steps.rb +49 -0
- data/features/support/env.rb +4 -0
- data/features/tf-idf.feature +79 -0
- data/lib/tf-idf.rb +199 -0
- data/spec/fixtures/tfidf_testcorpus.txt +7 -0
- data/spec/fixtures/tfidf_teststopwords.txt +1 -0
- data/spec/spec_helper.rb +13 -0
- data/spec/tf-idf_spec.rb +104 -0
- metadata +99 -0
data/.document
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Marc Chung
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
tf-idf
|
2
|
+
------
|
3
|
+
|
4
|
+
This is a simple Tf-idf library. The algorithm is described in:
|
5
|
+
|
6
|
+
[http://en.wikipedia.org/wiki/Tf-idf][wiki].
|
7
|
+
|
8
|
+
A port of Niniane Wang's [tfidf][tfidf] library.
|
9
|
+
|
10
|
+
Note on Patches/Pull Requests
|
11
|
+
-----------------------------
|
12
|
+
|
13
|
+
* Fork the project.
|
14
|
+
* Make your feature addition or bug fix.
|
15
|
+
* Add tests for it. This is important so I don't break it in a future version unintentionally.
|
16
|
+
* Commit, but do not modify the Rakefile, VERSION, or history. (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
|
17
|
+
* Send me a pull request. Bonus points for topic branches.
|
18
|
+
|
19
|
+
Copyright
|
20
|
+
--------
|
21
|
+
|
22
|
+
Copyright (c) 2009 Marc Chung. See LICENSE for details.
|
23
|
+
|
24
|
+
[tfidf]: http://code.google.com/p/tfidf
|
25
|
+
|
26
|
+
[wiki]: http://en.wikipedia.org/wiki/Tf-idf
|
data/Rakefile
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Gem packaging and release tasks. These are only defined when Jeweler can
# be loaded; otherwise a hint is printed and the remaining tasks still load.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "tf-idf"
    gem.summary = %Q{A rubygem that implements the Tf-Idf algorithm}
    gem.description = %Q{Computes IDF for a specified term based on the corpus, or generates keywords ordered by tf-idf for a specified document.}
    gem.email = "mchung@gmail.com"
    gem.homepage = "http://github.com/mchung/tf-idf"
    gem.authors = ["Marc Chung"]
    gem.add_development_dependency "rspec"
    gem.add_development_dependency "yard"
    gem.add_development_dependency "cucumber"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# :check_dependencies is not defined anywhere in this Rakefile (presumably it
# comes from Jeweler -- confirm). Only wire it in as a prerequisite when it
# exists, so `rake spec` / `rake features` keep working without Jeweler.
has_check_dependencies = Rake::Task.task_defined?(:check_dependencies)

# Spec tasks. Guarded like the Cucumber and YARD sections below so that a
# missing RSpec does not prevent the whole Rakefile from loading.
begin
  require 'spec/rake/spectask'

  Spec::Rake::SpecTask.new(:spec) do |spec|
    spec.libs << 'lib' << 'spec'
    spec.spec_files = FileList['spec/**/*_spec.rb']
  end

  Spec::Rake::SpecTask.new(:rcov) do |spec|
    spec.libs << 'lib' << 'spec'
    spec.pattern = 'spec/**/*_spec.rb'
    spec.rcov = true
    spec.rcov_dir = "doc/coverage"
  end

  task :spec => :check_dependencies if has_check_dependencies
rescue LoadError
  task :spec do
    abort "RSpec (spec/rake/spectask) is not available. In order to run specs, you must install RSpec 1.x"
  end

  task :rcov do
    abort "RSpec (spec/rake/spectask) is not available. In order to run rcov, you must install RSpec 1.x"
  end
end

# Cucumber feature task, with a fallback that explains how to install it.
begin
  require 'cucumber/rake/task'
  Cucumber::Rake::Task.new(:features)

  task :features => :check_dependencies if has_check_dependencies
rescue LoadError
  task :features do
    abort "Cucumber is not available. In order to run features, you must: sudo gem install cucumber"
  end
end

task :default => [:spec, :features]

# API documentation task, with a fallback that explains how to install YARD.
begin
  require 'yard'
  YARD::Rake::YardocTask.new
rescue LoadError
  task :yardoc do
    abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
  end
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.0.0
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# Every scenario starts with the same fallback IDF unless a step overrides it.
Before { @default_idf_unittest = 1.0 }

# Overrides the fallback IDF used when the corpus is loaded by a later step.
Given(/^the default IDF is set to "([^\"]*)"$/) do |raw_value|
  @default_idf_unittest = raw_value.to_f
end

# Loads a fixture corpus from spec/fixtures using the current fallback IDF.
Given(/^I have loaded the sample corpus data set "([^\"]*)"$/) do |fixture_name|
  fixtures_dir = File.dirname(__FILE__) + "/../../spec/fixtures"
  @corpus_filename = File.expand_path("#{fixtures_dir}/#{fixture_name}")
  @my_tfidf = TfIdf.from_corpus(@corpus_filename, @default_idf_unittest)
end

Then(/^I should have a total of "([^\"]*)" documents$/) do |expected_docs|
  expected = expected_docs.to_i
  @my_tfidf.num_docs.should == expected
end

Then(/^I should have a total of "([^\"]*)" term\/num_doc pairs$/) do |expected_pairs|
  expected = expected_pairs.to_i
  @my_tfidf.term_num_docs.size.should == expected
end

Then(/^I should get the default IDF for "([^\"]*)"$/) do |word|
  actual = @my_tfidf.idf(word)
  actual.should == @default_idf_unittest
end

Then(/^the IDF for "([^\"]*)" should be greater than the IDF for "([^\"]*)"$/) do |rarer, commoner|
  baseline = @my_tfidf.idf(commoner)
  @my_tfidf.idf(rarer).should > baseline
end

Then(/^the IDF for "([^\"]*)" should be equal to the IDF for "([^\"]*)"$/) do |left, right|
  expected = @my_tfidf.idf(right)
  @my_tfidf.idf(left).should == expected
end

# Ranks the words of the given text; the result is an array of [term, score] pairs.
Given(/^the keywords "([^\"]*)"$/) do |text|
  @keywords = @my_tfidf.doc_keywords(text)
end

# Indexes into the ranked pairs: first index is the rank, second picks term (0) or score (1).
Then(/^"([^\"]*)" should be located at "([^\"]*)", "([^\"]*)"$/) do |expected_term, x, y|
  row, column = x.to_i, y.to_i
  @keywords[row][column].should == expected_term
end
|
41
|
+
|
42
|
+
|
43
|
+
# Then /^I should get the expected IDF for "([^\"]*)"$/ do |arg1|
|
44
|
+
# pending
|
45
|
+
# end
|
46
|
+
|
47
|
+
# Reference IDF used by the tests: the natural log of
# (1 + total documents) / (1 + documents containing the term).
#
# @param [Integer] num_docs_total Number of documents in the corpus.
# @param [Integer] num_docs_term Number of documents containing the term.
# @return [Float] The expected IDF.
def get_expected_idf(num_docs_total, num_docs_term)
  smoothed_total = (1 + num_docs_total).to_f
  smoothed_term = 1 + num_docs_term
  Math.log(smoothed_total / smoothed_term)
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
Feature: Tf-Idf
|
2
|
+
A user should be able to construct an IDF corpus
|
3
|
+
|
4
|
+
Scenario: A corpus should contain the total number of documents
|
5
|
+
Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
|
6
|
+
Then I should have a total of "50" documents
|
7
|
+
|
8
|
+
Scenario: A corpus should contain terms and the number of documents they are found in.
|
9
|
+
Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
|
10
|
+
Then I should have a total of "6" term/num_doc pairs
|
11
|
+
|
12
|
+
Scenario: A corpus should query IDF for nonexistent terms
|
13
|
+
Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
|
14
|
+
Then I should get the default IDF for "nonexistent"
|
15
|
+
Then I should get the default IDF for "THE"
|
16
|
+
|
17
|
+
Scenario: A corpus should query IDF for existent terms
|
18
|
+
Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
|
19
|
+
Then the IDF for "a" should be greater than the IDF for "the"
|
20
|
+
Then the IDF for "girl" should be equal to the IDF for "moon"
|
21
|
+
|
22
|
+
Scenario: A corpus should retrieve keywords from a document, ordered by tf-idf
|
23
|
+
Given the default IDF is set to "0.01"
|
24
|
+
And I have loaded the sample corpus data set "tfidf_testcorpus.txt"
|
25
|
+
Given the keywords "the spoon and the fork"
|
26
|
+
Then "the" should be located at "0", "0"
|
27
|
+
Given the keywords "the girl said hello over the phone"
|
28
|
+
Then "girl" should be located at "0", "0"
|
29
|
+
Then "phone" should be located at "1", "0"
|
30
|
+
Then "said" should be located at "2", "0"
|
31
|
+
Then "the" should be located at "3", "0"
|
32
|
+
|
33
|
+
# Scenario: A corpus should add input documents to an existing corpus"
|
34
|
+
# Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
|
35
|
+
# Then I should get the default IDF for "water"
|
36
|
+
# Then I should get the expected IDF for "moon" when it has "1" occurrence
|
37
|
+
# Then I should get the expected IDF for "said" when it has "5" occurrences
|
38
|
+
|
39
|
+
|
40
|
+
# it "should add input documents to an existing corpus" do
|
41
|
+
# my_tfidf = TfIdf.new(@test_corpus, nil, @default_idf_unittest)
|
42
|
+
# my_tfidf.idf("water").should == @default_idf_unittest
|
43
|
+
# my_tfidf.idf("moon").should == get_expected_idf(my_tfidf.num_docs, 1)
|
44
|
+
# my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
|
45
|
+
#
|
46
|
+
# my_tfidf.add_input_document("water moon") # doesn't support commas
|
47
|
+
#
|
48
|
+
# my_tfidf.idf("water").should == get_expected_idf(my_tfidf.num_docs, 1)
|
49
|
+
# my_tfidf.idf("moon").should == get_expected_idf(my_tfidf.num_docs, 2)
|
50
|
+
# my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
|
51
|
+
# end
|
52
|
+
#
|
53
|
+
# it "should add input documents to an empty corpus" do
|
54
|
+
# my_tfidf = TfIdf.new(nil, nil, @default_idf_unittest)
|
55
|
+
# my_tfidf.idf("moon").should == @default_idf_unittest
|
56
|
+
# my_tfidf.idf("water").should == @default_idf_unittest
|
57
|
+
# my_tfidf.idf("said").should == @default_idf_unittest
|
58
|
+
#
|
59
|
+
# my_tfidf.add_input_document("moon")
|
60
|
+
# my_tfidf.add_input_document("moon said hello")
|
61
|
+
#
|
62
|
+
# my_tfidf.idf("water").should == @default_idf_unittest
|
63
|
+
# my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 1)
|
64
|
+
# my_tfidf.idf("moon").should == get_expected_idf(my_tfidf.num_docs, 2)
|
65
|
+
# end
|
66
|
+
#
|
67
|
+
# it "should observe stopwords list" do
|
68
|
+
# my_tfidf = TfIdf.new(@test_corpus, @test_stopwords, @default_idf_unittest)
|
69
|
+
# my_tfidf.idf("water").should == @default_idf_unittest
|
70
|
+
# my_tfidf.idf("moon").should == 0 # ignored
|
71
|
+
# my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
|
72
|
+
#
|
73
|
+
# my_tfidf.add_input_document("moon")
|
74
|
+
# my_tfidf.add_input_document("moon and water")
|
75
|
+
#
|
76
|
+
# my_tfidf.idf("water").should == get_expected_idf(my_tfidf.num_docs, 1)
|
77
|
+
# my_tfidf.idf("moon").should == 0
|
78
|
+
# my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
|
79
|
+
# end
|
data/lib/tf-idf.rb
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
# Tf-idf class implementing http://en.wikipedia.org/wiki/Tf-idf.
|
2
|
+
#
|
3
|
+
# The library constructs an IDF corpus and stopword list either from
|
4
|
+
# documents specified by the client, or by reading from input files. It
|
5
|
+
# computes IDF for a specified term based on the corpus, or generates
|
6
|
+
# keywords ordered by tf-idf for a specified document.
|
7
|
+
#
|
8
|
+
# @author Marc Chung <mchung@gmail.com>
|
9
|
+
# @see http://en.wikipedia.org/wiki/Tf-idf Term frequency-inverse document frequency
|
10
|
+
class TfIdf

  # @return [Integer] The total number of documents in the tf-idf corpus.
  attr_accessor :num_docs

  # @return [Hash<String, Integer>] Maps each term to the number of documents containing it.
  attr_accessor :term_num_docs

  # @return [Array<String>] An array of stopwords.
  attr_accessor :stopwords

  # @return [Float] The default value returned when a term is not found in the tf-idf corpus.
  attr_accessor :idf_default

  DEFAULT_IDF = 1.5

  ##
  # Initialize the tf-idf dictionary.
  #
  # If a corpus file is supplied, reads the idf dictionary from it, in the
  # format of:
  #   # of total documents
  #   term: # of documents containing the term
  #
  # Blank lines in the corpus file are ignored, and a term may itself contain
  # colons (the count is whatever follows the last colon on the line).
  #
  # If a stopword file is specified, reads the stopword list from it, in
  # the format of one stopword per line. Blank lines are ignored.
  #
  # The default_idf value is returned when a query term is not found in the
  # IDF corpus.
  #
  # @param [String] corpus_filename The disk location of the IDF corpus.
  # @param [String] stopword_filename The disk location of the stopword list.
  # @param [Float] default_idf The value returned when a term is not found in the IDF corpus.
  # @raise [RuntimeError] "Corpus not found" when the corpus file doesn't exist.
  # @raise [RuntimeError] "Stopwords not found" when the stopword file doesn't exist.
  # @raise [RuntimeError] When a non-blank corpus line has no "term: count" separator.
  # @return [TfIdf] A TfIdf instance loaded with the corpus.
  def initialize(corpus_filename = nil, stopword_filename = nil, default_idf = DEFAULT_IDF)
    self.num_docs = 0
    self.term_num_docs = {}
    self.stopwords = []
    self.idf_default = default_idf

    if corpus_filename
      # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
      raise "Corpus not found" unless File.exist?(corpus_filename)
      load_corpus(corpus_filename)
    end

    if stopword_filename
      raise "Stopwords not found" unless File.exist?(stopword_filename)
      self.stopwords = File.readlines(stopword_filename).
        map { |line| line.strip }.
        reject { |word| word.empty? }
    end
  end

  ##
  # Convenience method for creating a TfIdf instance without a stopword list.
  #
  # @param [String] corpus_filename The disk location of the IDF corpus.
  # @param [Float] default_idf The value returned when a term is not found in the IDF corpus.
  # @return [TfIdf] A TfIdf instance loaded with the corpus.
  def self.from_corpus(corpus_filename, default_idf = DEFAULT_IDF)
    self.new(corpus_filename, nil, default_idf)
  end

  ##
  # Breaks a string into tokens. The input is split on whitespace and a chunk
  # is kept when any part of it looks like an anchor element, a tag, or a run
  # of word characters (including ', @ and #). Kept chunks are returned whole,
  # so surrounding punctuation is not stripped.
  # Clients may wish to override this behaviour with their own tokenization
  # strategy.
  #
  # @param [String] input String representation of a document
  # @return [Array<String>] A list of tokens
  def get_tokens(input)
    input.split.select { |chunk| chunk =~ /<a.*?\/a>|<[^\>]*>|[\w'@#]+/ }
  end

  ##
  # Add terms in the specified document to the IDF corpus.
  #
  # The document count is incremented once, and each distinct token in the
  # document increments that term's document count once.
  #
  # @param [String] input String representation of a document.
  def add_input_document(input)
    self.num_docs += 1
    get_tokens(input).uniq.each do |term|
      self.term_num_docs[term] = (self.term_num_docs[term] || 0) + 1
    end
  end

  ##
  # Saves the tf-idf corpus and a derived stopword list to the specified files.
  #
  # A word is written to the stopword file if it occurs in more than
  # stopword_percentage_threshold * num_docs documents.
  # A threshold of 0.4 means that the word must occur in more than 40% of the documents.
  #
  # @param [String] idf_filename Filename.
  # @param [String] stopword_filename Filename.
  # @param [Float] stopword_percentage_threshold Stopword threshold. Lower threshold lower criteria.
  def save_corpus_to_file(idf_filename, stopword_filename, stopword_percentage_threshold = 0.01)
    File.open(idf_filename, "w") do |file|
      file.write("#{self.num_docs}\n")
      self.term_num_docs.each do |term, doc_count|
        file.write("#{term}: #{doc_count}\n")
      end
    end

    # Hoisted out of the loop: the cutoff does not change per term.
    cutoff = stopword_percentage_threshold * self.num_docs
    File.open(stopword_filename, "w") do |file|
      # Most widespread terms first.
      sort_by_tfidf(self.term_num_docs).each do |term, doc_count|
        file.write("#{term}\n") if doc_count > cutoff
      end
    end
  end

  ##
  # Retrieves the IDF for the specified term.
  #
  # This is computed with:
  #   logarithm of ((1 + number of documents in corpus) divided by
  #                 (1 + number of documents containing this term)).
  #
  # Stopwords always score 0; terms missing from the corpus score idf_default.
  #
  # @param [String] term A term in the IDF corpus.
  # @return [Numeric] The IDF for the specified term (Integer 0 for stopwords).
  def idf(term)
    return 0 if self.stopwords.include?(term)

    doc_count = self.term_num_docs[term]
    return self.idf_default if doc_count.nil?

    Math.log((1 + self.num_docs).to_f / (1 + doc_count))
  end

  ##
  # Retrieve terms and corresponding tf-idf for the specified document.
  #
  # The returned terms are ordered by decreasing tf-idf. Term frequency is the
  # number of occurrences divided by the number of *distinct* tokens in the
  # document.
  #
  # @param [String] curr_doc String representation of an existing document.
  # @return [Array<Array<String, Float>>] Term/score pairs ordered by decreasing tf-idf rank.
  def doc_keywords(curr_doc)
    # Single counting pass instead of calling Array#count once per distinct term.
    occurrences = Hash.new(0)
    get_tokens(curr_doc).each { |token| occurrences[token] += 1 }

    distinct_terms = occurrences.size
    tfidf = {}
    occurrences.each do |term, count|
      tfidf[term] = (count.to_f / distinct_terms) * idf(term)
    end

    sort_by_tfidf(tfidf)
  end

  ##
  # Returns a string representation of the tf-idf corpus.
  #
  # @return [String] Contains the number of documents and the number of distinct terms.
  def to_s
    {:num_docs => self.num_docs, :term_num_docs => self.term_num_docs.size}.inspect
  end

  ##
  # Sorts terms by decreasing score.
  #
  # @example Sort by tf-idf
  #   "{'and'=>0.0025, 'fork'=>0.0025, 'the'=>0.37688590118819, 'spoon'=>1.0025}" #=>
  #   "[['spoon', 1.0025], ['the', 0.37688590118819], ['fork', 0.0025], ['and', 0.0025]]"
  # @param [Hash<String, Numeric>] tfidf Terms mapped to their score.
  # @return [Array<Array<String, Numeric>>] An array of term/score array pairs.
  def sort_by_tfidf(tfidf)
    tfidf.sort { |a, b| b[1] <=> a[1] }
  end

  private

  # Reads the corpus file: the first line is the document count, every
  # following non-blank line is "term: count".
  def load_corpus(corpus_filename)
    lines = File.readlines(corpus_filename)
    # An empty file has no first line; to_s keeps that case at zero documents.
    self.num_docs = lines.shift.to_s.strip.to_i

    lines.each do |line|
      next if line.strip.empty?

      # Split on the LAST colon so terms that contain ':' survive a round trip.
      term, separator, count = line.rpartition(":")
      raise "Malformed corpus line: #{line.inspect}" if separator.empty?

      self.term_num_docs[term.strip] = count.strip.to_i
    end
  end

end
|
@@ -0,0 +1 @@
|
|
1
|
+
moon
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
2
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
3
|
+
require 'tf-idf'
|
4
|
+
require 'spec'
|
5
|
+
require 'spec/autorun'
|
6
|
+
require 'pp'
|
7
|
+
|
8
|
+
Spec::Runner.configure do |config|
|
9
|
+
end
|
10
|
+
|
11
|
+
# Reference IDF used by the specs: the natural log of
# (1 + total documents) / (1 + documents containing the term).
#
# @param [Integer] num_docs_total Number of documents in the corpus.
# @param [Integer] num_docs_term Number of documents containing the term.
# @return [Float] The expected IDF.
def get_expected_idf(num_docs_total, num_docs_term)
  smoothed_total = (1 + num_docs_total).to_f
  smoothed_term = 1 + num_docs_term
  Math.log(smoothed_total / smoothed_term)
end
|
data/spec/tf-idf_spec.rb
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
describe "TF-IDF library" do

  # Fixture paths and the fallback IDF are shared by all examples; each
  # example builds its own TfIdf instance from them.
  before(:all) do
    fixtures_dir = File.join(File.dirname(__FILE__), 'fixtures')
    @test_corpus = File.expand_path(File.join(fixtures_dir, 'tfidf_testcorpus.txt'))
    @test_stopwords = File.expand_path(File.join(fixtures_dir, 'tfidf_teststopwords.txt'))
    @default_idf_unittest = 1.0
  end

  it "should instantiate without args" do
    TfIdf.new
  end

  it "should report the correct number of documents" do
    tfidf = TfIdf.from_corpus(@test_corpus, @default_idf_unittest)
    tfidf.num_docs.should == 50
  end

  it "should report the correct number of terms" do
    tfidf = TfIdf.from_corpus(@test_corpus, @default_idf_unittest)
    tfidf.term_num_docs.size.should == 6
  end

  it "should query IDF for nonexistent terms" do
    tfidf = TfIdf.from_corpus(@test_corpus, @default_idf_unittest)
    # Lookups are case-sensitive, so "THE" is expected to miss the corpus too.
    ["nonexistent", "THE"].each do |missing_term|
      tfidf.idf(missing_term).should == @default_idf_unittest
    end
  end

  it "should query IDF for existent terms" do
    tfidf = TfIdf.from_corpus(@test_corpus, @default_idf_unittest)
    idf_of_the = tfidf.idf("the")
    idf_of_moon = tfidf.idf("moon")
    tfidf.idf("a").should > idf_of_the
    tfidf.idf("girl").should == idf_of_moon
  end

  it "should retrieve keywords from a document, ordered by tf-idf" do
    tfidf = TfIdf.from_corpus(@test_corpus, 0.01)

    # Test retrieving keywords when there is only one keyword.
    single = tfidf.doc_keywords("the spoon and the fork")
    single[0][0].should == "the"

    # Test retrieving multiple keywords.
    ranked = tfidf.doc_keywords("the girl said hello over the phone")
    ["girl", "phone", "said", "the"].each_with_index do |expected_term, rank|
      ranked[rank][0].should == expected_term
    end
  end

  it "should add input documents to an existing corpus" do
    tfidf = TfIdf.new(@test_corpus, nil, @default_idf_unittest)
    tfidf.idf("water").should == @default_idf_unittest
    tfidf.idf("moon").should == get_expected_idf(tfidf.num_docs, 1)
    tfidf.idf("said").should == get_expected_idf(tfidf.num_docs, 5)

    tfidf.add_input_document("water moon") # doesn't support commas

    total = tfidf.num_docs
    tfidf.idf("water").should == get_expected_idf(total, 1)
    tfidf.idf("moon").should == get_expected_idf(total, 2)
    tfidf.idf("said").should == get_expected_idf(total, 5)
  end

  it "should add input documents to an empty corpus" do
    tfidf = TfIdf.new(nil, nil, @default_idf_unittest)
    ["moon", "water", "said"].each do |unknown_term|
      tfidf.idf(unknown_term).should == @default_idf_unittest
    end

    tfidf.add_input_document("moon")
    tfidf.add_input_document("moon said hello")

    total = tfidf.num_docs
    tfidf.idf("water").should == @default_idf_unittest
    tfidf.idf("said").should == get_expected_idf(total, 1)
    tfidf.idf("moon").should == get_expected_idf(total, 2)
  end

  it "should observe stopwords list" do
    tfidf = TfIdf.new(@test_corpus, @test_stopwords, @default_idf_unittest)
    tfidf.idf("water").should == @default_idf_unittest
    tfidf.idf("moon").should == 0 # ignored
    tfidf.idf("said").should == get_expected_idf(tfidf.num_docs, 5)

    tfidf.add_input_document("moon")
    tfidf.add_input_document("moon and water")

    total = tfidf.num_docs
    tfidf.idf("water").should == get_expected_idf(total, 1)
    tfidf.idf("moon").should == 0
    tfidf.idf("said").should == get_expected_idf(total, 5)
  end

  # Abstract out File IO
  # it "should write the contents of the TF/IDF corpus to disk" do
  #   my_tfidf = TfIdf.new(@test_corpus, @test_stopwords, @default_idf_unittest)
  #   my_tfidf.save_corpus_to_file("foo.txt", "bar.txt", 0.3)
  #   stopwords = File.read("bar.txt").split
  #
  #   stopwords.size.should == 2
  #   stopwords.should include("a")
  #   stopwords.should include("the")
  # end

end
|
metadata
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tf-idf
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Marc Chung
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-11-29 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rspec
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: yard
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: cucumber
|
37
|
+
type: :development
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: "0"
|
44
|
+
version:
|
45
|
+
description: Computes IDF for a specified term based on the corpus, or generates keywords ordered by tf-idf for a specified document.
|
46
|
+
email: mchung@gmail.com
|
47
|
+
executables: []
|
48
|
+
|
49
|
+
extensions: []
|
50
|
+
|
51
|
+
extra_rdoc_files:
|
52
|
+
- LICENSE
|
53
|
+
- README.markdown
|
54
|
+
files:
|
55
|
+
- .document
|
56
|
+
- .gitignore
|
57
|
+
- LICENSE
|
58
|
+
- README.markdown
|
59
|
+
- Rakefile
|
60
|
+
- VERSION
|
61
|
+
- features/step_definitions/tf-idf_steps.rb
|
62
|
+
- features/support/env.rb
|
63
|
+
- features/tf-idf.feature
|
64
|
+
- lib/tf-idf.rb
|
65
|
+
- spec/fixtures/tfidf_testcorpus.txt
|
66
|
+
- spec/fixtures/tfidf_teststopwords.txt
|
67
|
+
- spec/spec_helper.rb
|
68
|
+
- spec/tf-idf_spec.rb
|
69
|
+
has_rdoc: true
|
70
|
+
homepage: http://github.com/mchung/tf-idf
|
71
|
+
licenses: []
|
72
|
+
|
73
|
+
post_install_message:
|
74
|
+
rdoc_options:
|
75
|
+
- --charset=UTF-8
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: "0"
|
83
|
+
version:
|
84
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: "0"
|
89
|
+
version:
|
90
|
+
requirements: []
|
91
|
+
|
92
|
+
rubyforge_project:
|
93
|
+
rubygems_version: 1.3.5
|
94
|
+
signing_key:
|
95
|
+
specification_version: 3
|
96
|
+
summary: A rubygem that implements the Tf-Idf algorithm
|
97
|
+
test_files:
|
98
|
+
- spec/spec_helper.rb
|
99
|
+
- spec/tf-idf_spec.rb
|