tf-idf 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ README.markdown
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Marc Chung
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,26 @@
1
+ tf-idf
2
+ ------
3
+
4
+ This is a simple Tf-idf library. The algorithm is described in:
5
+
6
+ [http://en.wikipedia.org/wiki/Tf-idf][wiki].
7
+
8
+ A port of Niniane Wang's [tfidf][tfidf] library.
9
+
10
+ Note on Patches/Pull Requests
11
+ -----------------------------
12
+
13
+ * Fork the project.
14
+ * Make your feature addition or bug fix.
15
+ * Add tests for it. This is important so I don't break it in a future version unintentionally.
16
+ * Commit, do not mess with rakefile, version, or history. (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
17
+ * Send me a pull request. Bonus points for topic branches.
18
+
19
+ Copyright
20
+ --------
21
+
22
+ Copyright (c) 2009 Marc Chung. See LICENSE for details.
23
+
24
+ [tfidf]: http://code.google.com/p/tfidf
25
+
26
+ [wiki]: http://en.wikipedia.org/wiki/Tf-idf
@@ -0,0 +1,58 @@
1
require 'rubygems'
require 'rake'

# Gem packaging and release tasks are provided by Jeweler. Jeweler is an
# optional dependency: when it cannot be loaded we print a hint and carry on,
# so the remaining tasks stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "tf-idf"
    gem.summary = %Q{A rubygem that implements the Tf-Idf algorithm}
    gem.description = %Q{Computes IDF for a specified term based on the corpus, or generates keywords ordered by tf-idf for a specified document.}
    gem.email = "mchung@gmail.com"
    gem.homepage = "http://github.com/mchung/tf-idf"
    gem.authors = ["Marc Chung"]
    gem.add_development_dependency "rspec"
    gem.add_development_dependency "yard"
    gem.add_development_dependency "cucumber"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# :spec and :features list :check_dependencies as a prerequisite, but this
# Rakefile never defines that task itself (Jeweler's tasks are expected to).
# Provide a no-op stand-in when it is missing so those tasks can still be
# invoked without Jeweler installed.
task :check_dependencies unless Rake::Task.task_defined?(:check_dependencies)

# The spec tasks use the RSpec 1.x rake integration. Guard the require the same
# way as the other optional tools so that a missing RSpec does not prevent the
# whole Rakefile from loading.
begin
  require 'spec/rake/spectask'
  Spec::Rake::SpecTask.new(:spec) do |spec|
    spec.libs << 'lib' << 'spec'
    spec.spec_files = FileList['spec/**/*_spec.rb']
  end

  Spec::Rake::SpecTask.new(:rcov) do |spec|
    spec.libs << 'lib' << 'spec'
    spec.pattern = 'spec/**/*_spec.rb'
    spec.rcov = true
    spec.rcov_dir = "doc/coverage"
  end
rescue LoadError
  [:spec, :rcov].each do |name|
    task name do
      abort "RSpec 1.x is not available. In order to run #{name}, you must: sudo gem install rspec -v '~> 1.3'"
    end
  end
end

task :spec => :check_dependencies

begin
  require 'cucumber/rake/task'
  Cucumber::Rake::Task.new(:features)

  task :features => :check_dependencies
rescue LoadError
  task :features do
    abort "Cucumber is not available. In order to run features, you must: sudo gem install cucumber"
  end
end

task :default => [:spec, :features]

begin
  require 'yard'
  YARD::Rake::YardocTask.new
rescue LoadError
  task :yardoc do
    abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
  end
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.0.0
@@ -0,0 +1,49 @@
1
# Cucumber step definitions for the tf-idf feature.
#
# State shared between steps lives in instance variables:
#   @default_idf_unittest - IDF handed to TfIdf for unknown terms
#   @corpus_filename      - absolute path of the corpus fixture that was loaded
#   @my_tfidf             - the TfIdf instance under test
#   @keywords             - result of the most recent doc_keywords call

# Every scenario starts with a default IDF of 1.0 unless a step overrides it.
Before { @default_idf_unittest = 1.0 }

Given(/^the default IDF is set to "([^\"]*)"$/) do |raw_value|
  @default_idf_unittest = raw_value.to_f
end

# Fixtures are resolved relative to this file, under spec/fixtures.
Given(/^I have loaded the sample corpus data set "([^\"]*)"$/) do |corpus_filename|
  relative_path = File.dirname(__FILE__) + "/../../spec/fixtures/#{corpus_filename}"
  @corpus_filename = File.expand_path(relative_path)
  @my_tfidf = TfIdf.from_corpus(@corpus_filename, @default_idf_unittest)
end

Then(/^I should have a total of "([^\"]*)" documents$/) do |expected|
  @my_tfidf.num_docs.should == expected.to_i
end

Then(/^I should have a total of "([^\"]*)" term\/num_doc pairs$/) do |expected|
  pair_count = @my_tfidf.term_num_docs.size
  pair_count.should == expected.to_i
end

Then(/^I should get the default IDF for "([^\"]*)"$/) do |term|
  actual = @my_tfidf.idf(term)
  actual.should == @default_idf_unittest
end

Then(/^the IDF for "([^\"]*)" should be greater than the IDF for "([^\"]*)"$/) do |rarer, commoner|
  @my_tfidf.idf(rarer).should > @my_tfidf.idf(commoner)
end

Then(/^the IDF for "([^\"]*)" should be equal to the IDF for "([^\"]*)"$/) do |left, right|
  @my_tfidf.idf(left).should == @my_tfidf.idf(right)
end

Given(/^the keywords "([^\"]*)"$/) do |document|
  @keywords = @my_tfidf.doc_keywords(document)
end

# The first index selects the ranked entry, the second selects a slot within
# that entry (0 is the term itself in the scenarios that use this step).
Then(/^"([^\"]*)" should be located at "([^\"]*)", "([^\"]*)"$/) do |term, x, y|
  row, column = x.to_i, y.to_i
  @keywords[row][column].should == term
end
41
+
42
+
43
+ # Then /^I should get the expected IDF for "([^\"]*)"$/ do |arg1|
44
+ # pending
45
+ # end
46
+
47
# Reference IDF used by the tests: the natural log of
# (1 + total documents) / (1 + documents containing the term).
#
# @param [Integer] num_docs_total Total number of documents in the corpus.
# @param [Integer] num_docs_term Number of documents containing the term.
# @return [Float] The expected IDF.
def get_expected_idf(num_docs_total, num_docs_term)
  numerator = (1 + num_docs_total).to_f
  denominator = 1 + num_docs_term
  Math.log(numerator / denominator)
end
@@ -0,0 +1,4 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
2
+ require 'tf-idf'
3
+
4
+ require 'spec/expectations'
@@ -0,0 +1,79 @@
1
+ Feature: Tf-Idf
2
+ A user should be able to construct an IDF corpus
3
+
4
+ Scenario: A corpus should contain the total number of documents
5
+ Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
6
+ Then I should have a total of "50" documents
7
+
8
+ Scenario: A corpus should contain terms and the number of documents they are found in.
9
+ Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
10
+ Then I should have a total of "6" term/num_doc pairs
11
+
12
+ Scenario: A corpus should query IDF for nonexistent terms
13
+ Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
14
+ Then I should get the default IDF for "nonexistent"
15
+ Then I should get the default IDF for "THE"
16
+
17
+ Scenario: A corpus should query IDF for existent terms
18
+ Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
19
+ Then the IDF for "a" should be greater than the IDF for "the"
20
+ Then the IDF for "girl" should be equal to the IDF for "moon"
21
+
22
+ Scenario: A corpus should retrieve keywords from a document, ordered by tf-idf
23
+ Given the default IDF is set to "0.01"
24
+ And I have loaded the sample corpus data set "tfidf_testcorpus.txt"
25
+ Given the keywords "the spoon and the fork"
26
+ Then "the" should be located at "0", "0"
27
+ Given the keywords "the girl said hello over the phone"
28
+ Then "girl" should be located at "0", "0"
29
+ Then "phone" should be located at "1", "0"
30
+ Then "said" should be located at "2", "0"
31
+ Then "the" should be located at "3", "0"
32
+
33
+ # Scenario: A corpus should add input documents to an existing corpus"
34
+ # Given I have loaded the sample corpus data set "tfidf_testcorpus.txt"
35
+ # Then I should get the default IDF for "water"
36
+ # Then I should get the expected IDF for "moon" when it has "1" occurrence
37
+ # Then I should get the expected IDF for "said" when it has "5" occurrences
38
+
39
+
40
+ # it "should add input documents to an existing corpus" do
41
+ # my_tfidf = TfIdf.new(@test_corpus, nil, @default_idf_unittest)
42
+ # my_tfidf.idf("water").should == @default_idf_unittest
43
+ # my_tfidf.idf("moon").should == get_expected_idf(my_tfidf.num_docs, 1)
44
+ # my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
45
+ #
46
+ # my_tfidf.add_input_document("water moon") # doesn't support commas
47
+ #
48
+ # my_tfidf.idf("water").should == get_expected_idf(my_tfidf.num_docs, 1)
49
+ # my_tfidf.idf("moon").should == get_expected_idf(my_tfidf.num_docs, 2)
50
+ # my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
51
+ # end
52
+ #
53
+ # it "should add input documents to an empty corpus" do
54
+ # my_tfidf = TfIdf.new(nil, nil, @default_idf_unittest)
55
+ # my_tfidf.idf("moon").should == @default_idf_unittest
56
+ # my_tfidf.idf("water").should == @default_idf_unittest
57
+ # my_tfidf.idf("said").should == @default_idf_unittest
58
+ #
59
+ # my_tfidf.add_input_document("moon")
60
+ # my_tfidf.add_input_document("moon said hello")
61
+ #
62
+ # my_tfidf.idf("water").should == @default_idf_unittest
63
+ # my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 1)
64
+ # my_tfidf.idf("moon").should == get_expected_idf(my_tfidf.num_docs, 2)
65
+ # end
66
+ #
67
+ # it "should observe stopwords list" do
68
+ # my_tfidf = TfIdf.new(@test_corpus, @test_stopwords, @default_idf_unittest)
69
+ # my_tfidf.idf("water").should == @default_idf_unittest
70
+ # my_tfidf.idf("moon").should == 0 # ignored
71
+ # my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
72
+ #
73
+ # my_tfidf.add_input_document("moon")
74
+ # my_tfidf.add_input_document("moon and water")
75
+ #
76
+ # my_tfidf.idf("water").should == get_expected_idf(my_tfidf.num_docs, 1)
77
+ # my_tfidf.idf("moon").should == 0
78
+ # my_tfidf.idf("said").should == get_expected_idf(my_tfidf.num_docs, 5)
79
+ # end
@@ -0,0 +1,199 @@
1
# Tf-idf class implementing http://en.wikipedia.org/wiki/Tf-idf.
#
# The library constructs an IDF corpus and stopword list either from
# documents specified by the client, or by reading from input files. It
# computes IDF for a specified term based on the corpus, or generates
# keywords ordered by tf-idf for a specified document.
#
# @author Marc Chung <mchung@gmail.com>
# @see http://en.wikipedia.org/wiki/Tf-idf Term frequency-inverse document frequency
class TfIdf

  # @return [Integer] The total number of documents in the tf-idf corpus.
  attr_accessor :num_docs

  # @return [Hash<String, Integer>] Maps each term to the number of documents that contain it.
  attr_accessor :term_num_docs

  # @return [Array<String>] An array of stopwords.
  attr_accessor :stopwords

  # @return [Float] The default value returned when a term is not found in the tf-idf corpus.
  attr_accessor :idf_default

  DEFAULT_IDF = 1.5

  ##
  # Initialize the tf-idf dictionary.
  #
  # If a corpus file is supplied, reads the idf dictionary from it, in the
  # format of:
  #   # of total documents
  #   term: # of documents containing the term
  #
  # If a stopword file is specified, reads the stopword list from it, in
  # the format of one stopword per line.
  #
  # The default_idf value is returned when a query term is not found in the
  # IDF corpus.
  #
  # @param [String] corpus_filename The disk location of the IDF corpus.
  # @param [String] stopword_filename The disk location of the stopword list.
  # @param [Float] default_idf The value returned when a term is not found in the IDF corpus.
  # @raise [RuntimeError] "Corpus not found" when corpus_filename does not exist.
  # @raise [RuntimeError] "Stopwords not found" when stopword_filename does not exist.
  # @return [TfIdf] A TfIdf instance loaded with the corpus.
  def initialize(corpus_filename = nil, stopword_filename = nil, default_idf = DEFAULT_IDF)
    self.num_docs = 0
    self.term_num_docs = {}
    self.stopwords = []
    self.idf_default = default_idf

    # The corpus is checked and loaded before the stopword file is looked at,
    # so a missing corpus is always reported first.
    load_corpus(corpus_filename) if corpus_filename
    load_stopwords(stopword_filename) if stopword_filename
  end

  ##
  # Convenience method for creating a TfIdf instance without a stopword list.
  #
  # @param [String] corpus_filename The disk location of the IDF corpus.
  # @param [Float] default_idf The value returned when a term is not found in the IDF corpus.
  # @return [TfIdf] A TfIdf instance loaded with the corpus.
  def self.from_corpus(corpus_filename, default_idf = DEFAULT_IDF)
    self.new(corpus_filename, nil, default_idf)
  end

  ##
  # Breaks a string into tokens. The input is split on whitespace and a chunk
  # is kept when it contains an HTML-like tag or at least one word character,
  # apostrophe, '@' or '#'. Tokens are case-sensitive. Clients may wish to
  # override this behaviour with their own tokenization strategy.
  #
  # @param [String] input String representation of a document
  # @return [Array<String>] A list of tokens
  def get_tokens(input)
    input.split.select { |chunk| chunk =~ /<a.*?\/a>|<[^\>]*>|[\w'@#]+/ }
  end

  ##
  # Add terms in the specified document to the IDF corpus.
  #
  # Each distinct token of the document raises that term's document count by
  # one, however many times it occurs in the document.
  #
  # @param [String] input String representation of a document.
  def add_input_document(input)
    self.num_docs += 1
    get_tokens(input).uniq.each do |term|
      self.term_num_docs[term] = (self.term_num_docs[term] || 0) + 1
    end
  end

  ##
  # Saves the tf-idf corpus and a derived stopword list to the specified files.
  #
  # A term is written to the stopword file if it occurs in more than
  # stopword_percentage_threshold * num_docs documents. The threshold is a
  # fraction, not a percentage: 0.4 means the term must occur in more than
  # 40% of the documents.
  #
  # @param [String] idf_filename File that receives the corpus (document count, then one "term: count" per line).
  # @param [String] stopword_filename File that receives the stopwords, one per line.
  # @param [Float] stopword_percentage_threshold Stopword threshold. A lower threshold admits more stopwords.
  def save_corpus_to_file(idf_filename, stopword_filename, stopword_percentage_threshold = 0.01)
    File.open(idf_filename, "w") do |file|
      file.write("#{self.num_docs}\n")
      self.term_num_docs.each do |term, doc_count|
        file.write("#{term}: #{doc_count}\n")
      end
    end

    cutoff = stopword_percentage_threshold * self.num_docs
    File.open(stopword_filename, "w") do |file|
      # Most widespread terms first.
      sort_by_tfidf(self.term_num_docs).each do |term, doc_count|
        file.write("#{term}\n") if doc_count > cutoff
      end
    end
  end

  ##
  # Retrieves the IDF for the specified term.
  #
  # Stopwords always score 0. Terms missing from the corpus score
  # idf_default. Otherwise the value is computed with add-one smoothing:
  #   log((1 + number of documents in corpus) /
  #       (1 + number of documents containing this term))
  #
  # @param [String] term A term in the IDF corpus.
  # @return [Numeric] The IDF for the specified term.
  def idf(term)
    return 0 if self.stopwords.include?(term)

    doc_count = self.term_num_docs[term]
    return self.idf_default if doc_count.nil?

    Math.log((1 + self.num_docs).to_f / (1 + doc_count))
  end

  ##
  # Retrieve terms and corresponding tf-idf for the specified document.
  #
  # Term frequency is the number of occurrences of the term divided by the
  # number of distinct terms in the document. The returned terms are ordered
  # by decreasing tf-idf.
  #
  # @param [String] curr_doc String representation of an existing document.
  # @return [Array<Array<String, Float>>] Term/tf-idf pairs ordered by decreasing tf-idf.
  def doc_keywords(curr_doc)
    # Count every token in a single pass instead of rescanning the token list
    # once per distinct term.
    occurrences = Hash.new(0)
    self.get_tokens(curr_doc).each { |token| occurrences[token] += 1 }
    distinct_terms = occurrences.size

    tfidf = {}
    occurrences.each do |term, count|
      tfidf[term] = (count.to_f / distinct_terms) * self.idf(term)
    end

    sort_by_tfidf(tfidf)
  end

  ##
  # Returns a string representation of the tf-idf corpus.
  #
  # @return [String] Contains the number of documents and the number of distinct terms.
  def to_s
    {:num_docs => self.num_docs, :term_num_docs => self.term_num_docs.size}.inspect
  end

  ##
  # Sorts term/score pairs by decreasing score.
  #
  # @example Sort by tf-idf
  #   "{'and'=>0.0025, 'fork'=>0.0025, 'the'=>0.37688590118819, 'spoon'=>1.0025}" #=>
  #   "[['spoon', 1.0025], ['the', 0.37688590118819], ['fork', 0.0025], ['and', 0.0025]]"
  # @param [Hash<String, Numeric>] tfidf Terms mapped to the value to sort on.
  # @return [Array<Array<String, Numeric>>] An array of term/value pairs, largest value first.
  def sort_by_tfidf(tfidf)
    tfidf.sort { |a, b| b[1] <=> a[1] }
  end

  private

  # Reads the document count and the per-term document counts from a corpus file.
  def load_corpus(corpus_filename)
    raise "Corpus not found" unless File.exist?(corpus_filename)

    lines = File.readlines(corpus_filename)
    # The first line holds the total document count; an empty file yields 0.
    self.num_docs = lines.shift.to_s.strip.to_i
    lines.each do |line|
      # Split on the LAST colon so that terms which themselves contain a colon
      # (e.g. URLs) survive a save_corpus_to_file / load round trip.
      term, separator, doc_count = line.rpartition(":")
      term = term.strip
      # Blank or malformed lines carry no term; skip them instead of failing.
      next if separator.empty? || term.empty?
      self.term_num_docs[term] = doc_count.strip.to_i
    end
  end

  # Reads the stopword list (one stopword per line) from a file.
  def load_stopwords(stopword_filename)
    raise "Stopwords not found" unless File.exist?(stopword_filename)

    self.stopwords = File.readlines(stopword_filename).collect { |line| line.strip }
  end

end
@@ -0,0 +1,7 @@
1
+ 50
2
+ the: 23
3
+ a: 17
4
+ girl: 1
5
+ moon: 1
6
+ said: 5
7
+ phone: 2
@@ -0,0 +1,13 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'tf-idf'
4
+ require 'spec'
5
+ require 'spec/autorun'
6
+ require 'pp'
7
+
8
+ Spec::Runner.configure do |config|
9
+ end
10
+
11
# Computes the IDF the specs expect for a term: the natural log of
# (1 + total documents) / (1 + documents containing the term).
#
# @param [Integer] num_docs_total Total number of documents in the corpus.
# @param [Integer] num_docs_term Number of documents containing the term.
# @return [Float] The expected IDF.
def get_expected_idf(num_docs_total, num_docs_term)
  total_plus_one = (1 + num_docs_total).to_f
  term_plus_one = 1 + num_docs_term
  Math.log(total_plus_one / term_plus_one)
end
@@ -0,0 +1,104 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
# Specification for TfIdf: loading a corpus from disk, querying IDF values,
# ranking document keywords, growing the corpus and honouring stopwords.
describe "TF-IDF library" do

  before(:all) do
    spec_dir = File.dirname(__FILE__)
    @test_corpus = File.expand_path(spec_dir + '/fixtures/tfidf_testcorpus.txt')
    @test_stopwords = File.expand_path(spec_dir + '/fixtures/tfidf_teststopwords.txt')
    @default_idf_unittest = 1.0
  end

  # Builds a TfIdf from the fixture corpus without a stopword list.
  def corpus_tfidf(default_idf = @default_idf_unittest)
    TfIdf.from_corpus(@test_corpus, default_idf)
  end

  it "should instantiate without args" do
    TfIdf.new
  end

  it "should report the correct number of documents" do
    corpus_tfidf.num_docs.should == 50
  end

  it "should report the correct number of terms" do
    corpus_tfidf.term_num_docs.size.should == 6
  end

  it "should query IDF for nonexistent terms" do
    tfidf = corpus_tfidf
    # Lookups are case-sensitive, so "THE" is unknown although "the" is not.
    ["nonexistent", "THE"].each do |term|
      tfidf.idf(term).should == @default_idf_unittest
    end
  end

  it "should query IDF for existent terms" do
    tfidf = corpus_tfidf
    idf_a, idf_the = tfidf.idf("a"), tfidf.idf("the")
    idf_a.should > idf_the
    tfidf.idf("girl").should == tfidf.idf("moon")
  end

  it "should retrieve keywords from a document, ordered by tf-idf" do
    tfidf = corpus_tfidf(0.01)

    # Test retrieving keywords when there is only one keyword.
    single = tfidf.doc_keywords("the spoon and the fork")
    single[0][0].should == "the"

    # Test retrieving multiple keywords.
    ranked = tfidf.doc_keywords("the girl said hello over the phone")
    ["girl", "phone", "said", "the"].each_with_index do |term, position|
      ranked[position][0].should == term
    end
  end

  it "should add input documents to an existing corpus" do
    tfidf = TfIdf.new(@test_corpus, nil, @default_idf_unittest)
    tfidf.idf("water").should == @default_idf_unittest
    tfidf.idf("moon").should == get_expected_idf(tfidf.num_docs, 1)
    tfidf.idf("said").should == get_expected_idf(tfidf.num_docs, 5)

    tfidf.add_input_document("water moon") # doesn't support commas

    total = tfidf.num_docs
    tfidf.idf("water").should == get_expected_idf(total, 1)
    tfidf.idf("moon").should == get_expected_idf(total, 2)
    tfidf.idf("said").should == get_expected_idf(total, 5)
  end

  it "should add input documents to an empty corpus" do
    tfidf = TfIdf.new(nil, nil, @default_idf_unittest)
    ["moon", "water", "said"].each do |term|
      tfidf.idf(term).should == @default_idf_unittest
    end

    ["moon", "moon said hello"].each { |document| tfidf.add_input_document(document) }

    total = tfidf.num_docs
    tfidf.idf("water").should == @default_idf_unittest
    tfidf.idf("said").should == get_expected_idf(total, 1)
    tfidf.idf("moon").should == get_expected_idf(total, 2)
  end

  it "should observe stopwords list" do
    tfidf = TfIdf.new(@test_corpus, @test_stopwords, @default_idf_unittest)
    tfidf.idf("water").should == @default_idf_unittest
    tfidf.idf("moon").should == 0 # ignored
    tfidf.idf("said").should == get_expected_idf(tfidf.num_docs, 5)

    ["moon", "moon and water"].each { |document| tfidf.add_input_document(document) }

    total = tfidf.num_docs
    tfidf.idf("water").should == get_expected_idf(total, 1)
    tfidf.idf("moon").should == 0
    tfidf.idf("said").should == get_expected_idf(total, 5)
  end

  # Abstract out File IO
  # it "should write the contents of the TF/IDF corpus to disk" do
  #   my_tfidf = TfIdf.new(@test_corpus, @test_stopwords, @default_idf_unittest)
  #   my_tfidf.save_corpus_to_file("foo.txt", "bar.txt", 0.3)
  #   stopwords = File.read("bar.txt").split
  #
  #   stopwords.size.should == 2
  #   stopwords.should include("a")
  #   stopwords.should include("the")
  # end

end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tf-idf
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Marc Chung
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-11-29 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rspec
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: yard
27
+ type: :development
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: cucumber
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ description: Computes IDF for a specified term based on the corpus, or generates keywords ordered by tf-idf for a specified document.
46
+ email: mchung@gmail.com
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ extra_rdoc_files:
52
+ - LICENSE
53
+ - README.markdown
54
+ files:
55
+ - .document
56
+ - .gitignore
57
+ - LICENSE
58
+ - README.markdown
59
+ - Rakefile
60
+ - VERSION
61
+ - features/step_definitions/tf-idf_steps.rb
62
+ - features/support/env.rb
63
+ - features/tf-idf.feature
64
+ - lib/tf-idf.rb
65
+ - spec/fixtures/tfidf_testcorpus.txt
66
+ - spec/fixtures/tfidf_teststopwords.txt
67
+ - spec/spec_helper.rb
68
+ - spec/tf-idf_spec.rb
69
+ has_rdoc: true
70
+ homepage: http://github.com/mchung/tf-idf
71
+ licenses: []
72
+
73
+ post_install_message:
74
+ rdoc_options:
75
+ - --charset=UTF-8
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: "0"
83
+ version:
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: "0"
89
+ version:
90
+ requirements: []
91
+
92
+ rubyforge_project:
93
+ rubygems_version: 1.3.5
94
+ signing_key:
95
+ specification_version: 3
96
+ summary: A rubygem that implements the Tf-Idf algorithm
97
+ test_files:
98
+ - spec/spec_helper.rb
99
+ - spec/tf-idf_spec.rb