RubyGems - tf-idf_csv - Versions diffs - 0.2.0 → 0.2.1 - Mend

tf-idf_csv 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.2.0
1	+ 0.2.1

data/bin/tf-idf_csv CHANGED Viewed

@@ -4,19 +4,17 @@ $LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
 require 'tf-idf_csv'
 require 'csv'
+tf_idf = Tf_Idf_CSV.new()
 begin
   csv_file = ARGV[0]
-  csv = CSV.open(csv_file)
+  tf_idf.add_csv(csv_file)
 rescue
+  puts $!
   puts "Please specify a valid CSV file"
   Process.exit(1)
 end
-tf_idf = Tf_Idf_CSV.new()
-tf_idf.add_csv(csv)
-output_csv_file = csv_file.sub(/\.csv$/,'-tf-idf.csv')
-tf_idf.write(output_csv_file)
+output_csv_file = csv_file.sub(/\.csv$/,'-fast.csv')
+tf_idf.fast_write(output_csv_file)

data/lib/tf-idf_csv.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 require 'csv'
+require 'logger'
 # This class expects a CSV input
 # One row per document,
@@ -7,83 +8,139 @@ require 'csv'
 # TF-IDF will be returned based on the number of times the term appears in each document, relative to the total number of documents it appears in
 class Tf_Idf_CSV
-  def initialize
-    @tf_idf = {}
+  def initialize
+    @logger = Logger.new(STDERR)
+    reset_tf_idf
     @total_number_of_docs = 0
-    @doc_count_per_term = Hash.new(0)
-    @term_freq_per_doc = Hash.new
+    @term_count_per_doc = Hash.new # [document] => { "term1" => count1, "term2" => count2 }
+    @term_freq_per_doc = Hash.new # [document] => { "term1" => frequency1, "term2" => frequency2 }
+    @doc_count_per_term = Hash.new(0) # [term] => num_of_documents_which_contain_this_term
+  end
+  def docs
+    @term_freq_per_doc.keys
+  end
+  def terms
+    @doc_count_per_term.keys
+  end
+  def count(doc, term)
+    return nil unless @term_count_per_doc[doc]
+    @term_count_per_doc[doc][term]
+  end
+  def tf(doc, term)
+    return nil unless @term_freq_per_doc[doc]
+    @term_freq_per_doc[doc][term]
+  end
+  def idf(term)
+    @idf[term] ||= Math.log10(@total_number_of_docs / @doc_count_per_term[term])
+  end
+  def tf_idf(doc,term)
+    return nil unless tf(doc, term)
+    @tf_idf[doc][term] ||= tf(doc, term) * idf(term)
   end
-  def add_csv(csv)
-    csv.each do |row|
-      name = row[0]
-      terms = row[1..-1]
-      add_document(name, terms)
+  def stop_words
+    @doc_count_per_term.select { |term, count| count == @total_number_of_docs }.keys
+  end
+  def add_document(doc, terms)
+    reset_tf_idf
+    @total_number_of_docs += 1.0 # use float as we want divions later
+    calculate(doc, terms)
+    @logger.debug("Added document '#{doc}'")
+  end
+  def add_csv(file_name)
+    CSV.foreach(file_name) do |row|
+      add_document(row[0],row[1..-1])
     end
-    calculate_tf_idf
   end
+  def fast_write(csv_file_name, options = {})
+    CSV.open(csv_file_name,"w") do |f|
+      f << ["doc","term","count","tf","idf","tf_idf"]
+      docs.each do |doc|
+        @term_freq_per_doc[doc].each do |term,freq|
+          f << [doc,term,count(doc,term),tf(doc,term),idf(term),tf_idf(doc,term)] if tf(doc,term)
+        end
+      end
+    end
+  end
   # Save the results as CSV
   # Term, Doc1, Doc2, Doc3...
   # Eggs, 0.04535,,0.02
-  def write(csv_file_name, options = {})
+  def write_tf_idf(csv_file_name, options = {})
     decimal_places = options[:decimal_places] || 20
     CSV.open(csv_file_name,"w") do |f|
-      f << ["term", docs].flatten
-      @tf_idf.each do |term, values|
-        tmp_row = [term]
+      f << ["terms", docs].flatten
+      terms.each do |term|
+        row = [term]
         docs.each do |doc|
-          value = values[doc] ? ("%.#{decimal_places}f" % values[doc]) : nil
-          value = nil if value =~ /^0\.0+$/
-          tmp_row << value
+          value = tf_idf(doc,term) ? ("%.#{decimal_places}f" % tf_idf(doc,term))  : nil
+          value = nil if value.to_s =~ /^0.0+$/
+          row << value
         end
-        f << tmp_row
+        f << row
       end
     end
   end
+  def write_tf(csv_file_name, options = {})
+    decimal_places = options[:decimal_places] || 20
+    CSV.open(csv_file_name,"w") do |f|
+      f << ["terms", docs].flatten
+      terms.each do |term|
+        row = [term]
+        docs.each do |doc|
+          value = tf(doc,term) ? ("%.#{decimal_places}f" % tf(doc,term))  : nil
+          value = nil if value.to_s =~ /^0.0+$/
+          row << value
+        end
+        f << row
+        # @logger.debug(row)
+      end
+    end
+  end
   private
-  def add_document(doc, terms)
-    @total_number_of_docs += 1
-    term_counts_doc = Hash.new(0.0)
+  def reset_tf_idf
+    @idf = {}
+    @tf_idf = Hash.new { |hash, key| hash[key] = {} }
+    @logger.debug("Reset tf-idf")
+  end
+  def calculate(doc, terms)
+    term_size = terms.size.to_f
+    term_count = Hash.new(0)
+    term_freq = Hash.new
     # Count the number of times each term appears in this document
     terms.each do |term|
-      term_counts_doc[term] += 1.0
+      term_count[term] += 1
     end
     # Normalize the count to find term frequency. Divide count by total number of terms in document
-    term_counts_doc.each_key do |term|
-      term_counts_doc[term] /= terms.size
-      @doc_count_per_term[term] += 1.0
+    term_count.each do |term, count|
+      term_freq[term] = count / term_size
+      @doc_count_per_term[term] += 1
     end
-    @term_freq_per_doc[doc] = term_counts_doc
-  end
-  def docs
-    @term_freq_per_doc.keys
-  end
-  # produces a hash indexed by term, with each value being a hash indexed by document with a value being the TF-IDF
-  # { "cat" => { "Green Eggs and Ham" => 0.04535, "Dick Wittington" => 0.02343434 }, "Eggs" => { "Green Eggs and Ham" => 0.02764} } }
-  def calculate_tf_idf
-    @doc_count_per_term.each do |term, count_per_doc|
-      doc_list = {}
-      docs.each do |doc|
-        # if we have a frequency for this term, we can calculate TF-IDF
-        if @term_freq_per_doc[doc].key?(term)
-          doc_list[doc] = @term_freq_per_doc[doc][term] * Math.log10(@total_number_of_docs / count_per_doc)
-        else
-          doc_list[doc] = nil
-        end
-      end
-      @tf_idf[term] = doc_list
-    end
+    @term_count_per_doc[doc] =  term_count
+    @term_freq_per_doc[doc] =  term_freq
   end
-end
+end

data/tf-idf_csv.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{tf-idf_csv}
-  s.version = "0.2.0"
+  s.version = "0.2.1"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Julian Burgess"]
-  s.date = %q{2010-11-02}
+  s.date = %q{2010-11-18}
   s.default_executable = %q{tf-idf_csv}
   s.email = %q{jburgess@ap.org}
   s.executables = ["tf-idf_csv"]
@@ -27,8 +27,6 @@ Gem::Specification.new do |s|
      "VERSION",
      "bin/tf-idf_csv",
      "lib/tf-idf_csv.rb",
-     "sample-tf-idf.csv",
-     "sample.csv",
      "test/helper.rb",
      "test/test_tf-idf_csv.rb",
      "tf-idf_csv.gemspec"

metadata CHANGED Viewed

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 2
-  - 0
-  version: 0.2.0
+  - 1
+  version: 0.2.1
 platform: ruby
 authors:
 - Julian Burgess
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-11-02 00:00:00 -04:00
+date: 2010-11-18 00:00:00 -05:00
 default_executable: tf-idf_csv
 dependencies: []
@@ -37,8 +37,6 @@ files:
 - VERSION
 - bin/tf-idf_csv
 - lib/tf-idf_csv.rb
-- sample-tf-idf.csv
-- sample.csv
 - test/helper.rb
 - test/test_tf-idf_csv.rb
 - tf-idf_csv.gemspec

data/sample-tf-idf.csv DELETED Viewed

@@ -1,46 +0,0 @@
-term,doc1,doc2
-the,,
-limerick,0.01038034467806831646,
-packs,0.01038034467806831646,
-laughs,0.01038034467806831646,
-anatomical,0.01038034467806831646,
-in,0.01038034467806831646,
-space,0.01038034467806831646,
-that,,
-is,0.01038034467806831646,
-quite,0.01038034467806831646,
-economical,0.01038034467806831646,
-but,,
-good,0.01038034467806831646,
-ones,0.02076068935613663291,
-i've,0.01038034467806831646,
-seen,0.01038034467806831646,
-so,0.02076068935613663291,
-seldom,0.02076068935613663291,
-are,0.02076068935613663291,
-clean,0.02076068935613663291,
-and,,
-comical,0.01038034467806831646,
-there,,0.01003433318879937315
-was,,0.01003433318879937315
-a,,0.01003433318879937315
-young,,0.01003433318879937315
-person,,0.01003433318879937315
-of,,0.02006866637759874630
-smyrna,,0.02006866637759874630
-whose,,0.01003433318879937315
-grandmother,,0.01003433318879937315
-threatened,,0.01003433318879937315
-to,,0.01003433318879937315
-burn,,0.02006866637759874630
-her,,0.01003433318879937315
-she,,0.01003433318879937315
-seized,,0.01003433318879937315
-on,,0.01003433318879937315
-cat,,0.01003433318879937315
-said,,0.01003433318879937315
-'granny,,0.01003433318879937315
-you,,0.01003433318879937315
-incongruous,,0.01003433318879937315
-old,,0.01003433318879937315
-woman,,0.01003433318879937315

data/sample.csv DELETED Viewed

	@@ -1,2 +0,0 @@
1	- doc1,the,limerick,packs,laughs,anatomical,in,space,that,is,quite,economical,but,the,good,ones,i've,seen,so,seldom,are,clean,and,the,clean,ones,so,seldom,are,comical
2	- doc2,there,was,a,young,person,of,smyrna,whose,grandmother,threatened,to,burn,her,but,she,seized,on,the,cat,and,said,'granny,burn,that,you,incongruous,old,woman,of,smyrna