tf-idf 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.markdown
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Marc Chung
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,26 @@
1
+ tf-idf
2
+ ------
3
+
4
+ This is a simple Tf-idf library. The algorithm is described in:
5
+
6
+ [http://en.wikipedia.org/wiki/Tf-idf][wiki].
7
+
8
+ A port of Niniane Wang's [tfidf][tfidf] library.
9
+
10
+ Note on Patches/Pull Requests
11
+ -----------------------------
12
+
13
+ * Fork the project.
14
+ * Make your feature addition or bug fix.
15
+ * Add tests for it. This is important so I don't break it in a future version unintentionally.
16
+ * Commit, but do not mess with the Rakefile, version, or history. (If you want to have your own version, that is fine, but bump the version in a commit by itself so that I can ignore it when I pull.)
17
+ * Send me a pull request. Bonus points for topic branches.
18
+
19
+ Copyright
20
+ --------
21
+
22
+ Copyright (c) 2009 Marc Chung. See LICENSE for details.
23
+
24
+ [tfidf]: http://code.google.com/p/tfidf
25
+
26
+ [wiki]: http://en.wikipedia.org/wiki/Tf-idf
@@ -0,0 +1,58 @@
require 'rubygems'
require 'rake'

# Gem packaging and release tasks. Jeweler is optional: when it cannot be
# loaded, the remaining tasks in this file must still be usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "tf-idf"
    gem.summary = %Q{A rubygem that implements the Tf-Idf algorithm}
    gem.description = %Q{Computes IDF for a specified term based on the corpus, or generates keywords ordered by tf-idf for a specified document.}
    gem.email = "mchung@gmail.com"
    gem.homepage = "http://github.com/mchung/tf-idf"
    gem.authors = ["Marc Chung"]
    gem.add_development_dependency "rspec"
    gem.add_development_dependency "yard"
    gem.add_development_dependency "cucumber"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
  # :spec and :features list :check_dependencies as a prerequisite, but nothing
  # in this Rakefile defines it (it is expected to come from Jeweler's tasks).
  # Declare an empty placeholder so those tasks can still be built.
  task :check_dependencies
end

# Spec tasks. Guarded like the Cucumber and YARD sections below so that a
# missing RSpec installation does not prevent the Rakefile from loading.
begin
  require 'spec/rake/spectask'
  Spec::Rake::SpecTask.new(:spec) do |spec|
    spec.libs << 'lib' << 'spec'
    spec.spec_files = FileList['spec/**/*_spec.rb']
  end

  Spec::Rake::SpecTask.new(:rcov) do |spec|
    spec.libs << 'lib' << 'spec'
    spec.pattern = 'spec/**/*_spec.rb'
    spec.rcov = true
    spec.rcov_dir = "doc/coverage"
  end
rescue LoadError
  [:spec, :rcov].each do |task_name|
    task task_name do
      abort "RSpec is not available. In order to run #{task_name}, you must: sudo gem install rspec"
    end
  end
end

task :spec => :check_dependencies

# Cucumber feature task, with a fallback that explains how to install Cucumber.
begin
  require 'cucumber/rake/task'
  Cucumber::Rake::Task.new(:features)

  task :features => :check_dependencies
rescue LoadError
  task :features do
    abort "Cucumber is not available. In order to run features, you must: sudo gem install cucumber"
  end
end

task :default => [:spec, :features]

# Documentation task, with a fallback that explains how to install YARD.
begin
  require 'yard'
  YARD::Rake::YardocTask.new
rescue LoadError
  task :yardoc do
    abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
  end
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.0.0
@@ -0,0 +1,49 @@
# Step definitions for features/tf-idf.feature.

# Each scenario starts with a default IDF of 1.0; a Given step may override it.
Before do
  @default_idf_unittest = 1.0
end

# Overrides the default IDF used when the corpus is loaded later in the scenario.
Given /^the default IDF is set to "([^\"]*)"$/ do |idf_text|
  @default_idf_unittest = idf_text.to_f
end

# Loads a corpus fixture from spec/fixtures into @my_tfidf.
Given /^I have loaded the sample corpus data set "([^\"]*)"$/ do |corpus_filename|
  steps_dir = File.dirname(__FILE__)
  @corpus_filename = File.expand_path(steps_dir + "/../../spec/fixtures/#{corpus_filename}")
  @my_tfidf = TfIdf.from_corpus(@corpus_filename, @default_idf_unittest)
end

Then /^I should have a total of "([^\"]*)" documents$/ do |expected_docs|
  expected_count = expected_docs.to_i
  @my_tfidf.num_docs.should == expected_count
end

Then /^I should have a total of "([^\"]*)" term\/num_doc pairs$/ do |expected_pairs|
  expected_count = expected_pairs.to_i
  @my_tfidf.term_num_docs.size.should == expected_count
end

Then /^I should get the default IDF for "([^\"]*)"$/ do |word|
  @my_tfidf.idf(word).should == @default_idf_unittest
end

Then /^the IDF for "([^\"]*)" should be greater than the IDF for "([^\"]*)"$/ do |rarer, commoner|
  commoner_idf = @my_tfidf.idf(commoner)
  @my_tfidf.idf(rarer).should > commoner_idf
end

Then /^the IDF for "([^\"]*)" should be equal to the IDF for "([^\"]*)"$/ do |left, right|
  right_idf = @my_tfidf.idf(right)
  @my_tfidf.idf(left).should == right_idf
end

# Despite the wording, the quoted text is a whole document; the ranked
# term/score pairs returned for it are kept in @keywords.
Given /^the keywords "([^\"]*)"$/ do |document|
  @keywords = @my_tfidf.doc_keywords(document)
end

# The first index selects a ranked entry, the second an element of that entry.
Then /^"([^\"]*)" should be located at "([^\"]*)", "([^\"]*)"$/ do |word, x, y|
  row, column = x.to_i, y.to_i
  @keywords[row][column].should == word
end


# Then /^I should get the expected IDF for "([^\"]*)"$/ do |arg1|
#   pending
# end

# Computes the IDF expected for a term, with one added to both counts
# before taking the natural logarithm of their ratio.
#
# @param [Integer] num_docs_total Number of documents in the corpus.
# @param [Integer] num_docs_term Number of documents containing the term.
# @return [Float] log((1 + num_docs_total) / (1 + num_docs_term)).
def get_expected_idf(num_docs_total, num_docs_term)
  numerator = (1 + num_docs_total).to_f
  denominator = 1 + num_docs_term
  Math.log(numerator / denominator)
end
@@ -0,0 +1,4 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
2
+ require 'tf-idf'
3
+
4
+ require 'spec/expectations'
@@ -0,0 +1,79 @@
1
+ Feature: Tf-Idf
2
+ A user should be able to construct an IDF corpus
3
+
4
+ Scenario: A corpus should contain the total number of documents
5
+ Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
6
+ Then I should have a total of "50" documents
7
+
8
+ Scenario: A corpus should contain terms and the number of documents they are found in.
9
+ Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
10
+ Then I should have a total of "6" term/num_doc pairs
11
+
12
+ Scenario: A corpus should query IDF for nonexistent terms
13
+ Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
14
+ Then I should get the default IDF for "nonexistent"
15
+ Then I should get the default IDF for "THE"
16
+
17
+ Scenario: A corpus should query IDF for existent terms
18
+ Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
19
+ Then the IDF for "a" should be greater than the IDF for "the"
20
+ Then the IDF for "girl" should be equal to the IDF for "moon"
21
+
22
+ Scenario: A corpus should retrieve keywords from a document, ordered by tf-idf
23
+ Given the default IDF is set to "0.01"
24
+ And I have loaded the sample corpus data set "tfidf_testcorpus.txt"
25
+ Given the keywords "the spoon and the fork"
26
+ Then "the" should be located at "0", "0"
27
+ Given the keywords "the girl said hello over the phone"
28
+ Then "girl" should be located at "0", "0"
29
+ Then "phone" should be located at "1", "0"
30
+ Then "said" should be located at "2", "0"
31
+ Then "the" should be located at "3", "0"
32
+
33
+ # Scenario: A corpus should add input documents to an existing corpus
34
+ # Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
35
+ # Then I should get the default IDF for "water"
36
+ # Then I should get the expected IDF for "moon" when it has "1" occurrence
37
+ # Then I should get the expected IDF for "said" when it has "5" occurrences
38
+
39
+
40
+ # it "should add input documents to an existing corpus" do
41
+ # my_tfidf = TfIdf.new(@test_corpus, nil, @default_idf_unittest)
42
+ # my_tfidf.idf("water").should == @default_idf_unittest
43
+ # my_tfidf.idf("moon").should == get_expected_idf(my_tfidf.num_docs, 1)
44
+ # my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
45
+ #
46
+ # my_tfidf.add_input_document("water moon") # doesn't support commas
47
+ #
48
+ # my_tfidf.idf("water").should == get_expected_idf(my_tfidf.num_docs, 1)
49
+ # my_tfidf.idf("moon").should == get_expected_idf(my_tfidf.num_docs, 2)
50
+ # my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
51
+ # end
52
+ #
53
+ # it "should add input documents to an empty corpus" do
54
+ # my_tfidf = TfIdf.new(nil, nil, @default_idf_unittest)
55
+ # my_tfidf.idf("moon").should == @default_idf_unittest
56
+ # my_tfidf.idf("water").should == @default_idf_unittest
57
+ # my_tfidf.idf("said").should == @default_idf_unittest
58
+ #
59
+ # my_tfidf.add_input_document("moon")
60
+ # my_tfidf.add_input_document("moon said hello")
61
+ #
62
+ # my_tfidf.idf("water").should == @default_idf_unittest
63
+ # my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 1)
64
+ # my_tfidf.idf("moon").should == get_expected_idf(my_tfidf.num_docs, 2)
65
+ # end
66
+ #
67
+ # it "should observe stopwords list" do
68
+ # my_tfidf = TfIdf.new(@test_corpus, @test_stopwords, @default_idf_unittest)
69
+ # my_tfidf.idf("water").should == @default_idf_unittest
70
+ # my_tfidf.idf("moon").should == 0 # ignored
71
+ # my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
72
+ #
73
+ # my_tfidf.add_input_document("moon")
74
+ # my_tfidf.add_input_document("moon and water")
75
+ #
76
+ # my_tfidf.idf("water").should == get_expected_idf(my_tfidf.num_docs, 1)
77
+ # my_tfidf.idf("moon").should == 0
78
+ # my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
79
+ # end
@@ -0,0 +1,199 @@
# Tf-idf class implementing http://en.wikipedia.org/wiki/Tf-idf.
#
# The library constructs an IDF corpus and stopword list either from
# documents specified by the client, or by reading from input files. It
# computes IDF for a specified term based on the corpus, or generates
# keywords ordered by tf-idf for a specified document.
#
# @author Marc Chung <mchung@gmail.com>
# @see http://en.wikipedia.org/wiki/Tf-idf Term frequency-inverse document frequency
class TfIdf

  # @return [Integer] The total number of documents in the tf-idf corpus.
  attr_accessor :num_docs

  # @return [Hash<String, Integer>] Maps each term to the number of documents containing it.
  attr_accessor :term_num_docs

  # @return [Array<String>] An array of stopwords.
  attr_accessor :stopwords

  # @return [Float] The default value returned when a term is not found in the tf-idf corpus.
  attr_accessor :idf_default

  DEFAULT_IDF = 1.5

  ##
  # Initialize the tf-idf dictionary.
  #
  # If a corpus file is supplied, reads the idf dictionary from it, in the
  # format of:
  #   # of total documents
  #   term: # of documents containing the term
  #
  # If a stopword file is specified, reads the stopword list from it, in
  # the format of one stopword per line.
  #
  # The default_idf value is returned when a query term is not found in the
  # IDF corpus.
  #
  # @param [String] corpus_filename The disk location of the IDF corpus.
  # @param [String] stopword_filename The disk location of the stopword list.
  # @param [Float] default_idf The value returned when a term is not found in the IDF corpus.
  # @raise [RuntimeError] "Corpus not found" when the corpus file does not exist.
  # @raise [RuntimeError] "Stopwords not found" when the stopword file does not exist.
  # @raise [RuntimeError] When a non-blank corpus entry has no ":" separator.
  # @return [TfIdf] A TfIdf instance loaded with the corpus.
  def initialize(corpus_filename = nil, stopword_filename = nil, default_idf = DEFAULT_IDF)
    self.num_docs = 0
    self.term_num_docs = {}
    self.stopwords = []
    self.idf_default = default_idf

    load_corpus(corpus_filename) if corpus_filename
    load_stopwords(stopword_filename) if stopword_filename
  end

  ##
  # Convenience method for creating a TfIdf instance without a stopword list.
  #
  # @param [String] corpus_filename The disk location of the IDF corpus.
  # @param [Float] default_idf The value returned when a term is not found in the IDF corpus.
  # @return [TfIdf] A TfIdf instance loaded with the corpus.
  def self.from_corpus(corpus_filename, default_idf = DEFAULT_IDF)
    new(corpus_filename, nil, default_idf)
  end

  ##
  # Breaks a string into tokens. The input is split on whitespace and every
  # chunk that contains a link, a tag or at least one word character is kept
  # unchanged (attached punctuation is not stripped).
  # Clients may wish to override this behaviour with their own tokenization
  # strategy.
  #
  # @param [String] input String representation of a document
  # @return [Array<String>] A list of tokens
  def get_tokens(input)
    input.split.select { |chunk| chunk =~ /<a.*?\/a>|<[^\>]*>|[\w'@#]+/ }
  end

  ##
  # Add terms in the specified document to the IDF corpus.
  #
  # Each distinct term increases its document count by one, however often
  # it occurs in the document.
  #
  # @param [String] input String representation of a document.
  def add_input_document(input)
    self.num_docs += 1
    get_tokens(input).uniq.each do |term|
      self.term_num_docs[term] = (self.term_num_docs[term] || 0) + 1
    end
  end

  ##
  # Saves the tf-idf corpus and stopword list to the specified files.
  #
  # A word is a stopword if it occurs in more than stopword_threshold% of num_docs.
  # A threshold of 0.4 means that the word must occur in more than 40% of the documents.
  # Stopwords are written in order of decreasing document count.
  #
  # @param [String] idf_filename Filename.
  # @param [String] stopword_filename Filename.
  # @param [Float] stopword_percentage_threshold Stopword threshold. A lower
  #   threshold classifies more terms as stopwords.
  def save_corpus_to_file(idf_filename, stopword_filename, stopword_percentage_threshold = 0.01)
    File.open(idf_filename, "w") do |file|
      file.write("#{self.num_docs}\n")
      self.term_num_docs.each do |term, doc_count|
        file.write("#{term}: #{doc_count}\n")
      end
    end

    cutoff = stopword_percentage_threshold * self.num_docs
    File.open(stopword_filename, "w") do |file|
      # sort_by_tfidf orders pairs by their second element, here the document count.
      sort_by_tfidf(self.term_num_docs).each do |term, doc_count|
        file.write("#{term}\n") if doc_count > cutoff
      end
    end
  end

  ##
  # Retrieves the IDF for the specified term.
  #
  # This is computed with:
  #   logarithm of ((1 + number of documents in corpus) divided by
  #                 (1 + number of documents containing this term)).
  #
  # Stopwords always yield 0; terms missing from the corpus yield idf_default.
  #
  # @param [String] term A term in the IDF corpus.
  # @return [Numeric] The IDF for the specified term.
  def idf(term)
    return 0 if self.stopwords.include?(term)

    doc_count = self.term_num_docs[term]
    return self.idf_default if doc_count.nil?

    Math.log((1 + self.num_docs).to_f / (1 + doc_count))
  end

  ##
  # Retrieve terms and corresponding tf-idf for the specified document.
  #
  # The returned terms are ordered by decreasing tf-idf. Note that the term
  # frequency is the number of occurrences of the term divided by the number
  # of distinct tokens in the document (not by the total token count).
  #
  # @param [String] curr_doc String representation of an existing document.
  # @return [Array<Array<String, Float>>] Term/tf-idf pairs ordered by decreasing tf-idf rank.
  def doc_keywords(curr_doc)
    # Count every token in a single pass instead of rescanning the token list
    # once per distinct term.
    occurrences = Hash.new(0)
    get_tokens(curr_doc).each { |token| occurrences[token] += 1 }
    distinct_terms = occurrences.size

    tfidf = {}
    occurrences.each do |term, count|
      tfidf[term] = (count.to_f / distinct_terms) * idf(term)
    end

    sort_by_tfidf(tfidf)
  end

  ##
  # Returns a string representation of the tf-idf corpus.
  #
  # @return [String] Contains the number of documents and the number of distinct terms.
  def to_s
    {:num_docs => self.num_docs, :term_num_docs => self.term_num_docs.size}.inspect
  end

  ##
  # Sorts term/value pairs by decreasing value.
  #
  # @example Sort by tf-idf
  #   "{'and'=>0.0025, 'fork'=>0.0025, 'the'=>0.37688590118819, 'spoon'=>1.0025}" #=>
  #   "[['spoon', 1.0025], ['the', 0.37688590118819], ['fork', 0.0025], ['and', 0.0025]]"
  # @param [Hash<String, Numeric>] tfidf Terms mapped to the value to rank by.
  # @return [Array<Array<String, Numeric>>] An array of term/value array pairs.
  def sort_by_tfidf(tfidf)
    tfidf.sort { |a, b| b[1] <=> a[1] }
  end

  private

  # Reads the document total and the per-term document counts from a corpus file.
  def load_corpus(corpus_filename)
    raise "Corpus not found" unless File.exist?(corpus_filename)

    lines = File.readlines(corpus_filename)
    # An empty file yields no first line; treat it as a corpus of 0 documents.
    self.num_docs = lines.shift.to_s.strip.to_i
    lines.each do |line|
      next if line.strip.empty?

      # Split on the LAST colon so that terms which themselves contain ":"
      # survive a save_corpus_to_file / reload round trip.
      term, separator, doc_count = line.rpartition(":")
      raise "Malformed corpus entry: #{line.inspect}" if separator.empty?

      self.term_num_docs[term.strip] = doc_count.strip.to_i
    end
  end

  # Reads one stopword per line, ignoring surrounding whitespace and blank lines.
  def load_stopwords(stopword_filename)
    raise "Stopwords not found" unless File.exist?(stopword_filename)

    words = File.readlines(stopword_filename).map { |line| line.strip }
    self.stopwords = words.reject { |word| word.empty? }
  end

end
@@ -0,0 +1,7 @@
1
+ 50
2
+ the: 23
3
+ a: 17
4
+ girl: 1
5
+ moon: 1
6
+ said: 5
7
+ phone: 2
@@ -0,0 +1,13 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'tf-idf'
4
+ require 'spec'
5
+ require 'spec/autorun'
6
+ require 'pp'
7
+
8
+ Spec::Runner.configure do |config|
9
+ end
10
+
# Reference IDF used by the specs: the natural logarithm of the ratio
# between the two counts after adding one to each of them.
#
# @param [Integer] num_docs_total Number of documents in the corpus.
# @param [Integer] num_docs_term Number of documents containing the term.
# @return [Float] The expected IDF value.
def get_expected_idf(num_docs_total, num_docs_term)
  ratio = (1 + num_docs_total).to_f / (1 + num_docs_term)
  Math.log(ratio)
end
@@ -0,0 +1,104 @@
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')

describe "TF-IDF library" do

  # Fixture paths and the default IDF shared by the examples below.
  before(:all) do
    spec_dir = File.dirname(__FILE__)
    @corpus_path = File.expand_path(spec_dir + '/fixtures/tfidf_testcorpus.txt')
    @stopwords_path = File.expand_path(spec_dir + '/fixtures/tfidf_teststopwords.txt')
    @fallback_idf = 1.0
  end

  it "should instantiate without args" do
    TfIdf.new
  end

  it "should report the correct number of documents" do
    tfidf = TfIdf.from_corpus(@corpus_path, @fallback_idf)
    tfidf.num_docs.should == 50
  end

  it "should report the correct number of terms" do
    tfidf = TfIdf.from_corpus(@corpus_path, @fallback_idf)
    tfidf.term_num_docs.size.should == 6
  end

  it "should query IDF for nonexistent terms" do
    tfidf = TfIdf.from_corpus(@corpus_path, @fallback_idf)
    # "THE" is expected to miss as well: only lowercase "the" is in the fixture.
    ["nonexistent", "THE"].each do |term|
      tfidf.idf(term).should == @fallback_idf
    end
  end

  it "should query IDF for existent terms" do
    tfidf = TfIdf.from_corpus(@corpus_path, @fallback_idf)
    tfidf.idf("a").should > tfidf.idf("the")
    tfidf.idf("girl").should == tfidf.idf("moon")
  end

  it "should retrieve keywords from a document, ordered by tf-idf" do
    tfidf = TfIdf.from_corpus(@corpus_path, 0.01)

    # Only one term of this document is present in the corpus.
    ranked = tfidf.doc_keywords("the spoon and the fork")
    ranked[0][0].should == "the"

    # Several corpus terms: they must come back in decreasing tf-idf order.
    ranked = tfidf.doc_keywords("the girl said hello over the phone")
    ranked[0][0].should == "girl"
    ranked[1][0].should == "phone"
    ranked[2][0].should == "said"
    ranked[3][0].should == "the"
  end

  it "should add input documents to an existing corpus" do
    tfidf = TfIdf.new(@corpus_path, nil, @fallback_idf)
    tfidf.idf("water").should == @fallback_idf
    tfidf.idf("moon").should == get_expected_idf(tfidf.num_docs, 1)
    tfidf.idf("said").should == get_expected_idf(tfidf.num_docs, 5)

    tfidf.add_input_document("water moon") # doesn't support commas

    tfidf.idf("water").should == get_expected_idf(tfidf.num_docs, 1)
    tfidf.idf("moon").should == get_expected_idf(tfidf.num_docs, 2)
    tfidf.idf("said").should == get_expected_idf(tfidf.num_docs, 5)
  end

  it "should add input documents to an empty corpus" do
    tfidf = TfIdf.new(nil, nil, @fallback_idf)
    tfidf.idf("moon").should == @fallback_idf
    tfidf.idf("water").should == @fallback_idf
    tfidf.idf("said").should == @fallback_idf

    tfidf.add_input_document("moon")
    tfidf.add_input_document("moon said hello")

    tfidf.idf("water").should == @fallback_idf
    tfidf.idf("said").should == get_expected_idf(tfidf.num_docs, 1)
    tfidf.idf("moon").should == get_expected_idf(tfidf.num_docs, 2)
  end

  it "should observe stopwords list" do
    tfidf = TfIdf.new(@corpus_path, @stopwords_path, @fallback_idf)
    tfidf.idf("water").should == @fallback_idf
    tfidf.idf("moon").should == 0 # ignored
    tfidf.idf("said").should == get_expected_idf(tfidf.num_docs, 5)

    tfidf.add_input_document("moon")
    tfidf.add_input_document("moon and water")

    tfidf.idf("water").should == get_expected_idf(tfidf.num_docs, 1)
    tfidf.idf("moon").should == 0
    tfidf.idf("said").should == get_expected_idf(tfidf.num_docs, 5)
  end

  # Abstract out File IO
  # it "should write the contents of the TF/IDF corpus to disk" do
  #   tfidf = TfIdf.new(@corpus_path, @stopwords_path, @fallback_idf)
  #   tfidf.save_corpus_to_file("foo.txt", "bar.txt", 0.3)
  #   stopwords = File.read("bar.txt").split
  #
  #   stopwords.size.should == 2
  #   stopwords.should include("a")
  #   stopwords.should include("the")
  # end

end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tf-idf
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Marc Chung
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-11-29 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rspec
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: yard
27
+ type: :development
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: cucumber
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ description: Computes IDF for a specified term based on the corpus, or generates keywords ordered by tf-idf for a specified document.
46
+ email: mchung@gmail.com
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ extra_rdoc_files:
52
+ - LICENSE
53
+ - README.markdown
54
+ files:
55
+ - .document
56
+ - .gitignore
57
+ - LICENSE
58
+ - README.markdown
59
+ - Rakefile
60
+ - VERSION
61
+ - features/step_definitions/tf-idf_steps.rb
62
+ - features/support/env.rb
63
+ - features/tf-idf.feature
64
+ - lib/tf-idf.rb
65
+ - spec/fixtures/tfidf_testcorpus.txt
66
+ - spec/fixtures/tfidf_teststopwords.txt
67
+ - spec/spec_helper.rb
68
+ - spec/tf-idf_spec.rb
69
+ has_rdoc: true
70
+ homepage: http://github.com/mchung/tf-idf
71
+ licenses: []
72
+
73
+ post_install_message:
74
+ rdoc_options:
75
+ - --charset=UTF-8
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: "0"
83
+ version:
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: "0"
89
+ version:
90
+ requirements: []
91
+
92
+ rubyforge_project:
93
+ rubygems_version: 1.3.5
94
+ signing_key:
95
+ specification_version: 3
96
+ summary: A rubygem that implements the Tf-Idf algorithm
97
+ test_files:
98
+ - spec/spec_helper.rb
99
+ - spec/tf-idf_spec.rb