tf-idf_csv 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 Julian Burgess
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,19 @@
1
+ = tf-idf_csv
2
+
3
+ This provides a simple executable which reads a CSV file (one row per document: the first cell is the document identifier and each subsequent cell contains one term) and then produces a CSV of each term's TF-IDF.
4
+
5
+ For more detail about TF-IDF see http://en.wikipedia.org/wiki/Tf%E2%80%93idf
6
+
7
+ == Note on Patches/Pull Requests
8
+
9
+ * Fork the project.
10
+ * Make your feature addition or bug fix.
11
+ * Add tests for it. This is important so I don't break it in a
12
+ future version unintentionally.
13
+ * Commit, do not mess with rakefile, version, or history.
14
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
15
+ * Send me a pull request. Bonus points for topic branches.
16
+
17
+ == Copyright
18
+
19
+ Copyright (c) 2010 Julian Burgess. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,52 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "tf-idf_csv"
8
+ gem.summary = %Q{Calculate the TF-IDF for terms in a CSV file}
9
+ gem.email = "jburgess@ap.org"
10
+ gem.homepage = "http://github.com/aubergene/tf-idf_csv"
11
+ gem.authors = ["Julian Burgess"]
12
+ gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
13
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
14
+ end
15
+ Jeweler::GemcutterTasks.new
16
+ rescue LoadError
17
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
18
+ end
19
+
20
+ require 'rake/testtask'
21
+ Rake::TestTask.new(:test) do |test|
22
+ test.libs << 'lib' << 'test'
23
+ test.pattern = 'test/**/test_*.rb'
24
+ test.verbose = true
25
+ end
26
+
27
+ begin
28
+ require 'rcov/rcovtask'
29
+ Rcov::RcovTask.new do |test|
30
+ test.libs << 'test'
31
+ test.pattern = 'test/**/test_*.rb'
32
+ test.verbose = true
33
+ end
34
+ rescue LoadError
35
+ task :rcov do
36
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
37
+ end
38
+ end
39
+
40
+ task :test => :check_dependencies
41
+
42
+ task :default => :test
43
+
44
+ require 'rake/rdoctask'
45
+ Rake::RDocTask.new do |rdoc|
46
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
47
+
48
+ rdoc.rdoc_dir = 'rdoc'
49
+ rdoc.title = "tf-idf_csv #{version}"
50
+ rdoc.rdoc_files.include('README*')
51
+ rdoc.rdoc_files.include('lib/**/*.rb')
52
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/bin/tf-idf_csv ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
4
+ require 'tf-idf_csv'
5
+
6
+ tf = TfIdf_CSV.new(ARGV[0])
7
+
8
+
data/lib/tf-idf_csv.rb ADDED
@@ -0,0 +1,100 @@
1
+ require 'csv'
2
+ require 'set'
3
+
4
# Computes TF-IDF scores from a CSV file and writes them to a second CSV file.
#
# Input format: one row per document. The first cell is the document
# identifier and each subsequent cell contains one term. Blank rows and
# empty cells are ignored.
#
# The TF-IDF of a term in a document is
#   (occurrences of the term in the document / number of terms in the document)
#   * log10(total number of documents / number of documents containing the term)
#
# Output format: a header row of "term" followed by the document identifiers,
# then one row per term holding that term's TF-IDF for each document. A cell
# is left empty when the term is absent from the document or the score is zero.
class TfIdf_CSV

  DECIMAL_PLACES = 20 # number of decimal places to use in output

  # Reads csv_file, calculates the TF-IDF of every term and saves the result
  # to the path given by output_path_for. Progress is reported on stdout.
  # If csv_file is missing or cannot be read, an error is printed and the
  # process exits with status 1.
  def initialize(csv_file)
    @tf_idf = {}
    @total_number_of_docs = 0
    @doc_count_per_term = Hash.new(0)
    @term_freq_per_doc = {}

    # ARGV[0] is nil when the executable is run without an argument; report
    # that the same way as an unreadable file instead of failing on nil.
    reject_input(csv_file) if csv_file.nil? || csv_file.to_s.empty?
    @output_csv_file = output_path_for(csv_file.to_s)

    add_csv(csv_file)
    calculate_tf_idf
    puts "Finished calculations"

    save_output
    puts "Finished saving to #{@output_csv_file}"
  end

  # Adds every non-blank row of csv_file as a document.
  # Only I/O and CSV parse failures are treated as "invalid file"; any other
  # exception is a programming error and is allowed to propagate.
  def add_csv(csv_file)
    CSV.foreach(csv_file) do |row|
      next if row.empty? # a blank line is parsed as [], it is not a document
      add_doc(row[0], row[1..-1])
      @total_number_of_docs += 1
    end
  rescue SystemCallError, IOError, CSV::MalformedCSVError
    reject_input(csv_file)
  end

  # Records the term frequencies of one document.
  # doc   - the document identifier
  # terms - the terms of the document (nil entries, i.e. empty cells, are ignored)
  def add_doc(doc, terms)
    terms = Array(terms).compact
    term_counts_doc = Hash.new(0.0)

    # Count the number of times each term appears in this document
    terms.each { |term| term_counts_doc[term] += 1.0 }

    # Normalize the count to find term frequency. Divide count by total number of terms in document
    term_counts_doc.each_key do |term|
      term_counts_doc[term] /= terms.size
      @doc_count_per_term[term] += 1.0
    end

    @term_freq_per_doc[doc] = term_counts_doc
  end

  # The identifiers of all documents added so far, in insertion order.
  def docs
    @term_freq_per_doc.keys
  end

  # produces a hash indexed by term, with each value being a hash indexed by document with a value being the TF-IDF
  # (nil where the term does not appear in the document)
  # { "cat" => { "Green Eggs and Ham" => 0.04535, "Dick Wittington" => 0.02343434 }, "Eggs" => { "Green Eggs and Ham" => 0.02764, "Dick Wittington" => nil } }
  def calculate_tf_idf
    @doc_count_per_term.each do |term, count_per_doc|
      # The inverse document frequency is the same for every document
      idf = Math.log10(@total_number_of_docs / count_per_doc.to_f)

      doc_list = {}
      docs.each do |doc|
        frequencies = @term_freq_per_doc[doc]
        # if we have a frequency for this term, we can calculate TF-IDF
        doc_list[doc] = frequencies.key?(term) ? frequencies[term] * idf : nil
      end
      @tf_idf[term] = doc_list
    end
  end

  # Save the results as CSV
  # Term, Doc1, Doc2, Doc3...
  # Eggs, 0.04535,,0.02
  def save_output
    CSV.open(@output_csv_file, "w") do |f|
      f << ["term"] + docs
      @tf_idf.each do |term, values|
        tmp_row = [term]
        docs.each do |doc|
          value = values[doc] ? ("%.#{DECIMAL_PLACES}f" % values[doc]) : nil
          # A score that rounds to zero carries no information; leave the cell empty
          value = nil if value =~ /\A0\.0+\z/
          tmp_row << value
        end
        f << tmp_row
      end
    end
  end

  private

  # Reports an unusable input file on stdout and terminates with status 1.
  def reject_input(csv_file)
    puts "Error opening #{csv_file}. Please specify a valid CSV file"
    Process.exit(1)
  end

  # "data.csv" becomes "data-tf-idf.csv". For any other name the suffix is
  # appended, so the output path can never be the input path (which would
  # overwrite the input with the results).
  def output_path_for(csv_file)
    if csv_file =~ /\.csv\z/
      csv_file.sub(/\.csv\z/, '-tf-idf.csv')
    else
      "#{csv_file}-tf-idf.csv"
    end
  end

end
100
+
data/sample-tf-idf.csv ADDED
@@ -0,0 +1,46 @@
1
+ term,doc1,doc2
2
+ the,,
3
+ limerick,0.01038034467806831646,
4
+ packs,0.01038034467806831646,
5
+ laughs,0.01038034467806831646,
6
+ anatomical,0.01038034467806831646,
7
+ in,0.01038034467806831646,
8
+ space,0.01038034467806831646,
9
+ that,,
10
+ is,0.01038034467806831646,
11
+ quite,0.01038034467806831646,
12
+ economical,0.01038034467806831646,
13
+ but,,
14
+ good,0.01038034467806831646,
15
+ ones,0.02076068935613663291,
16
+ i've,0.01038034467806831646,
17
+ seen,0.01038034467806831646,
18
+ so,0.02076068935613663291,
19
+ seldom,0.02076068935613663291,
20
+ are,0.02076068935613663291,
21
+ clean,0.02076068935613663291,
22
+ and,,
23
+ comical,0.01038034467806831646,
24
+ there,,0.01003433318879937315
25
+ was,,0.01003433318879937315
26
+ a,,0.01003433318879937315
27
+ young,,0.01003433318879937315
28
+ person,,0.01003433318879937315
29
+ of,,0.02006866637759874630
30
+ smyrna,,0.02006866637759874630
31
+ whose,,0.01003433318879937315
32
+ grandmother,,0.01003433318879937315
33
+ threatened,,0.01003433318879937315
34
+ to,,0.01003433318879937315
35
+ burn,,0.02006866637759874630
36
+ her,,0.01003433318879937315
37
+ she,,0.01003433318879937315
38
+ seized,,0.01003433318879937315
39
+ on,,0.01003433318879937315
40
+ cat,,0.01003433318879937315
41
+ said,,0.01003433318879937315
42
+ 'granny,,0.01003433318879937315
43
+ you,,0.01003433318879937315
44
+ incongruous,,0.01003433318879937315
45
+ old,,0.01003433318879937315
46
+ woman,,0.01003433318879937315
data/sample.csv ADDED
@@ -0,0 +1,2 @@
1
+ doc1,the,limerick,packs,laughs,anatomical,in,space,that,is,quite,economical,but,the,good,ones,i've,seen,so,seldom,are,clean,and,the,clean,ones,so,seldom,are,comical
2
+ doc2,there,was,a,young,person,of,smyrna,whose,grandmother,threatened,to,burn,her,but,she,seized,on,the,cat,and,said,'granny,burn,that,you,incongruous,old,woman,of,smyrna
data/test/helper.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'tf-idf_csv'
8
+
9
+ class Test::Unit::TestCase
10
+ end
@@ -0,0 +1,7 @@
1
+ require 'helper'
2
+
3
+ class TestTfIdfCsv < Test::Unit::TestCase
4
+ should "probably rename this file and start testing for real" do
5
+ flunk "hey buddy, you should probably rename this file and start testing for real"
6
+ end
7
+ end
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tf-idf_csv
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Julian Burgess
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-11-02 00:00:00 -04:00
18
+ default_executable: tf-idf_csv
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: thoughtbot-shoulda
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ version: "0"
31
+ type: :development
32
+ version_requirements: *id001
33
+ description:
34
+ email: jburgess@ap.org
35
+ executables:
36
+ - tf-idf_csv
37
+ extensions: []
38
+
39
+ extra_rdoc_files:
40
+ - LICENSE
41
+ - README.rdoc
42
+ files:
43
+ - .document
44
+ - .gitignore
45
+ - LICENSE
46
+ - README.rdoc
47
+ - Rakefile
48
+ - VERSION
49
+ - bin/tf-idf_csv
50
+ - lib/tf-idf_csv.rb
51
+ - sample-tf-idf.csv
52
+ - sample.csv
53
+ - test/helper.rb
54
+ - test/test_tf-idf_csv.rb
55
+ has_rdoc: true
56
+ homepage: http://github.com/aubergene/tf-idf_csv
57
+ licenses: []
58
+
59
+ post_install_message:
60
+ rdoc_options:
61
+ - --charset=UTF-8
62
+ require_paths:
63
+ - lib
64
+ required_ruby_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ segments:
70
+ - 0
71
+ version: "0"
72
+ required_rubygems_version: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ segments:
78
+ - 0
79
+ version: "0"
80
+ requirements: []
81
+
82
+ rubyforge_project:
83
+ rubygems_version: 1.3.7
84
+ signing_key:
85
+ specification_version: 3
86
+ summary: Calculate the TF-IDF for terms in a CSV file
87
+ test_files:
88
+ - test/helper.rb
89
+ - test/test_tf-idf_csv.rb