tf-idf_csv 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.2.1
data/bin/tf-idf_csv CHANGED
@@ -4,19 +4,17 @@ $LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
4
4
  require 'tf-idf_csv'
5
5
  require 'csv'
6
6
 
7
+ tf_idf = Tf_Idf_CSV.new()
8
+
7
9
  begin
8
10
  csv_file = ARGV[0]
9
- csv = CSV.open(csv_file)
11
+ tf_idf.add_csv(csv_file)
10
12
  rescue
13
+ puts $!
11
14
  puts "Please specify a valid CSV file"
12
15
  Process.exit(1)
13
16
  end
14
17
 
15
- tf_idf = Tf_Idf_CSV.new()
16
- tf_idf.add_csv(csv)
17
-
18
- output_csv_file = csv_file.sub(/\.csv$/,'-tf-idf.csv')
19
- tf_idf.write(output_csv_file)
20
-
21
-
18
+ output_csv_file = csv_file.sub(/\.csv$/,'-fast.csv')
19
+ tf_idf.fast_write(output_csv_file)
22
20
 
data/lib/tf-idf_csv.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'csv'
2
+ require 'logger'
2
3
 
3
4
  # This class expects a CSV input
4
5
  # One row per document,
@@ -7,83 +8,139 @@ require 'csv'
7
8
  # TF-IDF will be returned based on the number of times the term appears in each document, relative to the total number of documents it appears in
8
9
  class Tf_Idf_CSV
9
10
 
10
- def initialize
11
- @tf_idf = {}
11
+ def initialize
12
+ @logger = Logger.new(STDERR)
13
+
14
+ reset_tf_idf
12
15
  @total_number_of_docs = 0
13
- @doc_count_per_term = Hash.new(0)
14
- @term_freq_per_doc = Hash.new
16
+
17
+ @term_count_per_doc = Hash.new # [document] => { "term1" => count1, "term2" => count2 }
18
+ @term_freq_per_doc = Hash.new # [document] => { "term1" => frequency1, "term2" => frequency2 }
19
+ @doc_count_per_term = Hash.new(0) # [term] => num_of_documents_which_contain_this_term
20
+ end
21
+
22
+ def docs
23
+ @term_freq_per_doc.keys
24
+ end
25
+
26
+ def terms
27
+ @doc_count_per_term.keys
28
+ end
29
+
30
+ def count(doc, term)
31
+ return nil unless @term_count_per_doc[doc]
32
+ @term_count_per_doc[doc][term]
33
+ end
34
+
35
+ def tf(doc, term)
36
+ return nil unless @term_freq_per_doc[doc]
37
+ @term_freq_per_doc[doc][term]
38
+ end
39
+
40
+ def idf(term)
41
+ @idf[term] ||= Math.log10(@total_number_of_docs / @doc_count_per_term[term])
42
+ end
43
+
44
+ def tf_idf(doc,term)
45
+ return nil unless tf(doc, term)
46
+ @tf_idf[doc][term] ||= tf(doc, term) * idf(term)
15
47
  end
16
48
 
17
- def add_csv(csv)
18
- csv.each do |row|
19
- name = row[0]
20
- terms = row[1..-1]
21
- add_document(name, terms)
49
+ def stop_words
50
+ @doc_count_per_term.select { |term, count| count == @total_number_of_docs }.keys
51
+ end
52
+
53
+ def add_document(doc, terms)
54
+ reset_tf_idf
55
+ @total_number_of_docs += 1.0 # use float as we want division later
56
+
57
+ calculate(doc, terms)
58
+ @logger.debug("Added document '#{doc}'")
59
+ end
60
+
61
+ def add_csv(file_name)
62
+ CSV.foreach(file_name) do |row|
63
+ add_document(row[0],row[1..-1])
22
64
  end
23
- calculate_tf_idf
24
65
  end
25
66
 
67
+ def fast_write(csv_file_name, options = {})
68
+ CSV.open(csv_file_name,"w") do |f|
69
+ f << ["doc","term","count","tf","idf","tf_idf"]
70
+ docs.each do |doc|
71
+ @term_freq_per_doc[doc].each do |term,freq|
72
+ f << [doc,term,count(doc,term),tf(doc,term),idf(term),tf_idf(doc,term)] if tf(doc,term)
73
+ end
74
+ end
75
+ end
76
+ end
77
+
26
78
  # Save the results as CSV
27
79
  # Term, Doc1, Doc2, Doc3...
28
80
  # Eggs, 0.04535,,0.02
29
- def write(csv_file_name, options = {})
81
+ def write_tf_idf(csv_file_name, options = {})
30
82
  decimal_places = options[:decimal_places] || 20
31
83
 
32
84
  CSV.open(csv_file_name,"w") do |f|
33
- f << ["term", docs].flatten
34
- @tf_idf.each do |term, values|
35
- tmp_row = [term]
85
+ f << ["terms", docs].flatten
86
+ terms.each do |term|
87
+ row = [term]
36
88
  docs.each do |doc|
37
- value = values[doc] ? ("%.#{decimal_places}f" % values[doc]) : nil
38
- value = nil if value =~ /^0\.0+$/
39
- tmp_row << value
89
+ value = tf_idf(doc,term) ? ("%.#{decimal_places}f" % tf_idf(doc,term)) : nil
90
+ value = nil if value.to_s =~ /^0.0+$/
91
+ row << value
40
92
  end
41
- f << tmp_row
93
+ f << row
42
94
  end
43
95
  end
44
96
  end
45
97
 
98
+ def write_tf(csv_file_name, options = {})
99
+ decimal_places = options[:decimal_places] || 20
100
+
101
+ CSV.open(csv_file_name,"w") do |f|
102
+ f << ["terms", docs].flatten
103
+ terms.each do |term|
104
+ row = [term]
105
+ docs.each do |doc|
106
+ value = tf(doc,term) ? ("%.#{decimal_places}f" % tf(doc,term)) : nil
107
+ value = nil if value.to_s =~ /^0.0+$/
108
+ row << value
109
+ end
110
+ f << row
111
+ # @logger.debug(row)
112
+ end
113
+ end
114
+ end
115
+
116
+
46
117
  private
118
+
47
119
 
48
- def add_document(doc, terms)
49
- @total_number_of_docs += 1
50
-
51
- term_counts_doc = Hash.new(0.0)
120
+ def reset_tf_idf
121
+ @idf = {}
122
+ @tf_idf = Hash.new { |hash, key| hash[key] = {} }
123
+ @logger.debug("Reset tf-idf")
124
+ end
52
125
 
126
+ def calculate(doc, terms)
127
+ term_size = terms.size.to_f
128
+ term_count = Hash.new(0)
129
+ term_freq = Hash.new
130
+
53
131
  # Count the number of times each term appears in this document
54
132
  terms.each do |term|
55
- term_counts_doc[term] += 1.0
133
+ term_count[term] += 1
56
134
  end
57
135
 
58
136
  # Normalize the count to find term frequency. Divide count by total number of terms in document
59
- term_counts_doc.each_key do |term|
60
- term_counts_doc[term] /= terms.size
61
- @doc_count_per_term[term] += 1.0
137
+ term_count.each do |term, count|
138
+ term_freq[term] = count / term_size
139
+ @doc_count_per_term[term] += 1
62
140
  end
63
-
64
- @term_freq_per_doc[doc] = term_counts_doc
65
- end
66
-
67
- def docs
68
- @term_freq_per_doc.keys
69
- end
70
141
 
71
- # produces a hash indexed by term, with each value being a hash indexed by document with a value being the TF-IDF
72
- # { "cat" => { "Green Eggs and Ham" => 0.04535, "Dick Wittington" => 0.02343434 }, "Eggs" => { "Green Eggs and Ham" => 0.02764} } }
73
- def calculate_tf_idf
74
- @doc_count_per_term.each do |term, count_per_doc|
75
- doc_list = {}
76
- docs.each do |doc|
77
- # if we have a frequency for this term, we can calculate TF-IDF
78
- if @term_freq_per_doc[doc].key?(term)
79
- doc_list[doc] = @term_freq_per_doc[doc][term] * Math.log10(@total_number_of_docs / count_per_doc)
80
- else
81
- doc_list[doc] = nil
82
- end
83
- end
84
- @tf_idf[term] = doc_list
85
- end
142
+ @term_count_per_doc[doc] = term_count
143
+ @term_freq_per_doc[doc] = term_freq
86
144
  end
87
145
 
88
- end
89
-
146
+ end
data/tf-idf_csv.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{tf-idf_csv}
8
- s.version = "0.2.0"
8
+ s.version = "0.2.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Julian Burgess"]
12
- s.date = %q{2010-11-02}
12
+ s.date = %q{2010-11-18}
13
13
  s.default_executable = %q{tf-idf_csv}
14
14
  s.email = %q{jburgess@ap.org}
15
15
  s.executables = ["tf-idf_csv"]
@@ -27,8 +27,6 @@ Gem::Specification.new do |s|
27
27
  "VERSION",
28
28
  "bin/tf-idf_csv",
29
29
  "lib/tf-idf_csv.rb",
30
- "sample-tf-idf.csv",
31
- "sample.csv",
32
30
  "test/helper.rb",
33
31
  "test/test_tf-idf_csv.rb",
34
32
  "tf-idf_csv.gemspec"
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 2
8
- - 0
9
- version: 0.2.0
8
+ - 1
9
+ version: 0.2.1
10
10
  platform: ruby
11
11
  authors:
12
12
  - Julian Burgess
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-11-02 00:00:00 -04:00
17
+ date: 2010-11-18 00:00:00 -05:00
18
18
  default_executable: tf-idf_csv
19
19
  dependencies: []
20
20
 
@@ -37,8 +37,6 @@ files:
37
37
  - VERSION
38
38
  - bin/tf-idf_csv
39
39
  - lib/tf-idf_csv.rb
40
- - sample-tf-idf.csv
41
- - sample.csv
42
40
  - test/helper.rb
43
41
  - test/test_tf-idf_csv.rb
44
42
  - tf-idf_csv.gemspec
data/sample-tf-idf.csv DELETED
@@ -1,46 +0,0 @@
1
- term,doc1,doc2
2
- the,,
3
- limerick,0.01038034467806831646,
4
- packs,0.01038034467806831646,
5
- laughs,0.01038034467806831646,
6
- anatomical,0.01038034467806831646,
7
- in,0.01038034467806831646,
8
- space,0.01038034467806831646,
9
- that,,
10
- is,0.01038034467806831646,
11
- quite,0.01038034467806831646,
12
- economical,0.01038034467806831646,
13
- but,,
14
- good,0.01038034467806831646,
15
- ones,0.02076068935613663291,
16
- i've,0.01038034467806831646,
17
- seen,0.01038034467806831646,
18
- so,0.02076068935613663291,
19
- seldom,0.02076068935613663291,
20
- are,0.02076068935613663291,
21
- clean,0.02076068935613663291,
22
- and,,
23
- comical,0.01038034467806831646,
24
- there,,0.01003433318879937315
25
- was,,0.01003433318879937315
26
- a,,0.01003433318879937315
27
- young,,0.01003433318879937315
28
- person,,0.01003433318879937315
29
- of,,0.02006866637759874630
30
- smyrna,,0.02006866637759874630
31
- whose,,0.01003433318879937315
32
- grandmother,,0.01003433318879937315
33
- threatened,,0.01003433318879937315
34
- to,,0.01003433318879937315
35
- burn,,0.02006866637759874630
36
- her,,0.01003433318879937315
37
- she,,0.01003433318879937315
38
- seized,,0.01003433318879937315
39
- on,,0.01003433318879937315
40
- cat,,0.01003433318879937315
41
- said,,0.01003433318879937315
42
- 'granny,,0.01003433318879937315
43
- you,,0.01003433318879937315
44
- incongruous,,0.01003433318879937315
45
- old,,0.01003433318879937315
46
- woman,,0.01003433318879937315
data/sample.csv DELETED
@@ -1,2 +0,0 @@
1
- doc1,the,limerick,packs,laughs,anatomical,in,space,that,is,quite,economical,but,the,good,ones,i've,seen,so,seldom,are,clean,and,the,clean,ones,so,seldom,are,comical
2
- doc2,there,was,a,young,person,of,smyrna,whose,grandmother,threatened,to,burn,her,but,she,seized,on,the,cat,and,said,'granny,burn,that,you,incongruous,old,woman,of,smyrna