RubyGems - tf-idf_csv - Versions diffs - 0.2.0 → 0.2.1 - Mend

tf-idf_csv 0.2.0 → 0.2.1

Files changed (7) hide show

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.2.0
1	+ 0.2.1

data/bin/tf-idf_csv CHANGED Viewed

@@ -4,19 +4,17 @@ $LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
 require 'tf-idf_csv'
 require 'csv'
+tf_idf = Tf_Idf_CSV.new()
 begin
   csv_file = ARGV[0]
-  csv = CSV.open(csv_file)
+  tf_idf.add_csv(csv_file)
 rescue
+  puts $!
   puts "Please specify a valid CSV file"
   Process.exit(1)
 end
-tf_idf = Tf_Idf_CSV.new()
-tf_idf.add_csv(csv)
-output_csv_file = csv_file.sub(/\.csv$/,'-tf-idf.csv')
-tf_idf.write(output_csv_file)
+output_csv_file = csv_file.sub(/\.csv$/,'-fast.csv')
+tf_idf.fast_write(output_csv_file)

data/lib/tf-idf_csv.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 require 'csv'
+require 'logger'
 # This class expects a CSV input
 # One row per document,
@@ -7,83 +8,139 @@ require 'csv'
 # TF-IDF will be returned based on the number of times the term appears in each document, relative to the total number of documents it appears in
 class Tf_Idf_CSV
-  def initialize
-    @tf_idf = {}
+  def initialize
+    @logger = Logger.new(STDERR)
+    reset_tf_idf
     @total_number_of_docs = 0
-    @doc_count_per_term = Hash.new(0)
-    @term_freq_per_doc = Hash.new
+    @term_count_per_doc = Hash.new # [document] => { "term1" => count1, "term2" => count2 }
+    @term_freq_per_doc = Hash.new # [document] => { "term1" => frequency1, "term2" => frequency2 }
+    @doc_count_per_term = Hash.new(0) # [term] => num_of_documents_which_contain_this_term
+  end
+  def docs
+    @term_freq_per_doc.keys
+  end
+  def terms
+    @doc_count_per_term.keys
+  end
+  def count(doc, term)
+    return nil unless @term_count_per_doc[doc]
+    @term_count_per_doc[doc][term]
+  end
+  def tf(doc, term)
+    return nil unless @term_freq_per_doc[doc]
+    @term_freq_per_doc[doc][term]
+  end
+  def idf(term)
+    @idf[term] ||= Math.log10(@total_number_of_docs / @doc_count_per_term[term])
+  end
+  def tf_idf(doc,term)
+    return nil unless tf(doc, term)
+    @tf_idf[doc][term] ||= tf(doc, term) * idf(term)
   end
-  def add_csv(csv)
-    csv.each do |row|
-      name = row[0]
-      terms = row[1..-1]
-      add_document(name, terms)
+  def stop_words
+    @doc_count_per_term.select { |term, count| count == @total_number_of_docs }.keys
+  end
+  def add_document(doc, terms)
+    reset_tf_idf
+    @total_number_of_docs += 1.0 # use float as we want divisions later
+    calculate(doc, terms)
+    @logger.debug("Added document '#{doc}'")
+  end
+  def add_csv(file_name)
+    CSV.foreach(file_name) do |row|
+      add_document(row[0],row[1..-1])
     end
-    calculate_tf_idf
   end
+  def fast_write(csv_file_name, options = {})
+    CSV.open(csv_file_name,"w") do |f|
+      f << ["doc","term","count","tf","idf","tf_idf"]
+      docs.each do |doc|
+        @term_freq_per_doc[doc].each do |term,freq|
+          f << [doc,term,count(doc,term),tf(doc,term),idf(term),tf_idf(doc,term)] if tf(doc,term)
+        end
+      end
+    end
+  end
   # Save the results as CSV
   # Term, Doc1, Doc2, Doc3...
   # Eggs, 0.04535,,0.02
-  def write(csv_file_name, options = {})
+  def write_tf_idf(csv_file_name, options = {})
     decimal_places = options[:decimal_places] || 20
     CSV.open(csv_file_name,"w") do |f|
-      f << ["term", docs].flatten
-      @tf_idf.each do |term, values|
-        tmp_row = [term]
+      f << ["terms", docs].flatten
+      terms.each do |term|
+        row = [term]
         docs.each do |doc|
-          value = values[doc] ? ("%.#{decimal_places}f" % values[doc]) : nil
-          value = nil if value =~ /^0\.0+$/
-          tmp_row << value
+          value = tf_idf(doc,term) ? ("%.#{decimal_places}f" % tf_idf(doc,term))  : nil
+          value = nil if value.to_s =~ /^0.0+$/
+          row << value
         end
-        f << tmp_row
+        f << row
       end
     end
   end
+  def write_tf(csv_file_name, options = {})
+    decimal_places = options[:decimal_places] || 20
+    CSV.open(csv_file_name,"w") do |f|
+      f << ["terms", docs].flatten
+      terms.each do |term|
+        row = [term]
+        docs.each do |doc|
+          value = tf(doc,term) ? ("%.#{decimal_places}f" % tf(doc,term))  : nil
+          value = nil if value.to_s =~ /^0.0+$/
+          row << value
+        end
+        f << row
+        # @logger.debug(row)
+      end
+    end
+  end
   private
-  def add_document(doc, terms)
-    @total_number_of_docs += 1
-    term_counts_doc = Hash.new(0.0)
+  def reset_tf_idf
+    @idf = {}
+    @tf_idf = Hash.new { |hash, key| hash[key] = {} }
+    @logger.debug("Reset tf-idf")
+  end
+  def calculate(doc, terms)
+    term_size = terms.size.to_f
+    term_count = Hash.new(0)
+    term_freq = Hash.new
     # Count the number of times each term appears in this document
     terms.each do |term|
-      term_counts_doc[term] += 1.0
+      term_count[term] += 1
     end
     # Normalize the count to find term frequency. Divide count by total number of terms in document
-    term_counts_doc.each_key do |term|
-      term_counts_doc[term] /= terms.size
-      @doc_count_per_term[term] += 1.0
+    term_count.each do |term, count|
+      term_freq[term] = count / term_size
+      @doc_count_per_term[term] += 1
     end
-    @term_freq_per_doc[doc] = term_counts_doc
-  end
-  def docs
-    @term_freq_per_doc.keys
-  end
-  # produces a hash indexed by term, with each value being a hash indexed by document with a value being the TF-IDF
-  # { "cat" => { "Green Eggs and Ham" => 0.04535, "Dick Wittington" => 0.02343434 }, "Eggs" => { "Green Eggs and Ham" => 0.02764} } }
-  def calculate_tf_idf
-    @doc_count_per_term.each do |term, count_per_doc|
-      doc_list = {}
-      docs.each do |doc|
-        # if we have a frequency for this term, we can calculate TF-IDF
-        if @term_freq_per_doc[doc].key?(term)
-          doc_list[doc] = @term_freq_per_doc[doc][term] * Math.log10(@total_number_of_docs / count_per_doc)
-        else
-          doc_list[doc] = nil
-        end
-      end
-      @tf_idf[term] = doc_list
-    end
+    @term_count_per_doc[doc] =  term_count
+    @term_freq_per_doc[doc] =  term_freq
   end
-end
+end

data/tf-idf_csv.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{tf-idf_csv}
-  s.version = "0.2.0"
+  s.version = "0.2.1"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Julian Burgess"]
-  s.date = %q{2010-11-02}
+  s.date = %q{2010-11-18}
   s.default_executable = %q{tf-idf_csv}
   s.email = %q{jburgess@ap.org}
   s.executables = ["tf-idf_csv"]
@@ -27,8 +27,6 @@ Gem::Specification.new do |s|
      "VERSION",
      "bin/tf-idf_csv",
      "lib/tf-idf_csv.rb",
-     "sample-tf-idf.csv",
-     "sample.csv",
      "test/helper.rb",
      "test/test_tf-idf_csv.rb",
      "tf-idf_csv.gemspec"

metadata CHANGED Viewed

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 2
-  - 0
-  version: 0.2.0
+  - 1
+  version: 0.2.1
 platform: ruby
 authors:
 - Julian Burgess
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-11-02 00:00:00 -04:00
+date: 2010-11-18 00:00:00 -05:00
 default_executable: tf-idf_csv
 dependencies: []
@@ -37,8 +37,6 @@ files:
 - VERSION
 - bin/tf-idf_csv
 - lib/tf-idf_csv.rb
-- sample-tf-idf.csv
-- sample.csv
 - test/helper.rb
 - test/test_tf-idf_csv.rb
 - tf-idf_csv.gemspec

data/sample-tf-idf.csv DELETED Viewed

@@ -1,46 +0,0 @@
-term,doc1,doc2
-the,,
-limerick,0.01038034467806831646,
-packs,0.01038034467806831646,
-laughs,0.01038034467806831646,
-anatomical,0.01038034467806831646,
-in,0.01038034467806831646,
-space,0.01038034467806831646,
-that,,
-is,0.01038034467806831646,
-quite,0.01038034467806831646,
-economical,0.01038034467806831646,
-but,,
-good,0.01038034467806831646,
-ones,0.02076068935613663291,
-i've,0.01038034467806831646,
-seen,0.01038034467806831646,
-so,0.02076068935613663291,
-seldom,0.02076068935613663291,
-are,0.02076068935613663291,
-clean,0.02076068935613663291,
-and,,
-comical,0.01038034467806831646,
-there,,0.01003433318879937315
-was,,0.01003433318879937315
-a,,0.01003433318879937315
-young,,0.01003433318879937315
-person,,0.01003433318879937315
-of,,0.02006866637759874630
-smyrna,,0.02006866637759874630
-whose,,0.01003433318879937315
-grandmother,,0.01003433318879937315
-threatened,,0.01003433318879937315
-to,,0.01003433318879937315
-burn,,0.02006866637759874630
-her,,0.01003433318879937315
-she,,0.01003433318879937315
-seized,,0.01003433318879937315
-on,,0.01003433318879937315
-cat,,0.01003433318879937315
-said,,0.01003433318879937315
-'granny,,0.01003433318879937315
-you,,0.01003433318879937315
-incongruous,,0.01003433318879937315
-old,,0.01003433318879937315
-woman,,0.01003433318879937315

data/sample.csv DELETED Viewed

	@@ -1,2 +0,0 @@
1	- doc1,the,limerick,packs,laughs,anatomical,in,space,that,is,quite,economical,but,the,good,ones,i've,seen,so,seldom,are,clean,and,the,clean,ones,so,seldom,are,comical
2	- doc2,there,was,a,young,person,of,smyrna,whose,grandmother,threatened,to,burn,her,but,she,seized,on,the,cat,and,said,'granny,burn,that,you,incongruous,old,woman,of,smyrna