tf-idf_csv 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.2.1
data/bin/tf-idf_csv CHANGED
@@ -4,19 +4,17 @@ $LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
4
4
  require 'tf-idf_csv'
5
5
  require 'csv'
6
6
 
7
+ tf_idf = Tf_Idf_CSV.new()
8
+
7
9
  begin
8
10
  csv_file = ARGV[0]
9
- csv = CSV.open(csv_file)
11
+ tf_idf.add_csv(csv_file)
10
12
  rescue
13
+ puts $!
11
14
  puts "Please specify a valid CSV file"
12
15
  Process.exit(1)
13
16
  end
14
17
 
15
- tf_idf = Tf_Idf_CSV.new()
16
- tf_idf.add_csv(csv)
17
-
18
- output_csv_file = csv_file.sub(/\.csv$/,'-tf-idf.csv')
19
- tf_idf.write(output_csv_file)
20
-
21
-
18
+ output_csv_file = csv_file.sub(/\.csv$/,'-fast.csv')
19
+ tf_idf.fast_write(output_csv_file)
22
20
 
data/lib/tf-idf_csv.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'csv'
2
+ require 'logger'
2
3
 
3
4
  # This class expects a CSV input
4
5
  # One row per document,
@@ -7,83 +8,139 @@ require 'csv'
7
8
  # TF-IDF will be returned based on the number of times the term appears in each document, relative to the total number of documents it appears in
8
9
  class Tf_Idf_CSV
9
10
 
10
- def initialize
11
- @tf_idf = {}
11
+ def initialize
12
+ @logger = Logger.new(STDERR)
13
+
14
+ reset_tf_idf
12
15
  @total_number_of_docs = 0
13
- @doc_count_per_term = Hash.new(0)
14
- @term_freq_per_doc = Hash.new
16
+
17
+ @term_count_per_doc = Hash.new # [document] => { "term1" => count1, "term2" => count2 }
18
+ @term_freq_per_doc = Hash.new # [document] => { "term1" => frequency1, "term2" => frequency2 }
19
+ @doc_count_per_term = Hash.new(0) # [term] => num_of_documents_which_contain_this_term
20
+ end
21
+
22
+ def docs
23
+ @term_freq_per_doc.keys
24
+ end
25
+
26
+ def terms
27
+ @doc_count_per_term.keys
28
+ end
29
+
30
+ def count(doc, term)
31
+ return nil unless @term_count_per_doc[doc]
32
+ @term_count_per_doc[doc][term]
33
+ end
34
+
35
+ def tf(doc, term)
36
+ return nil unless @term_freq_per_doc[doc]
37
+ @term_freq_per_doc[doc][term]
38
+ end
39
+
40
+ def idf(term)
41
+ @idf[term] ||= Math.log10(@total_number_of_docs / @doc_count_per_term[term])
42
+ end
43
+
44
+ def tf_idf(doc,term)
45
+ return nil unless tf(doc, term)
46
+ @tf_idf[doc][term] ||= tf(doc, term) * idf(term)
15
47
  end
16
48
 
17
- def add_csv(csv)
18
- csv.each do |row|
19
- name = row[0]
20
- terms = row[1..-1]
21
- add_document(name, terms)
49
+ def stop_words
50
+ @doc_count_per_term.select { |term, count| count == @total_number_of_docs }.keys
51
+ end
52
+
53
+ def add_document(doc, terms)
54
+ reset_tf_idf
55
+ @total_number_of_docs += 1.0 # use float as we want division later
56
+
57
+ calculate(doc, terms)
58
+ @logger.debug("Added document '#{doc}'")
59
+ end
60
+
61
+ def add_csv(file_name)
62
+ CSV.foreach(file_name) do |row|
63
+ add_document(row[0],row[1..-1])
22
64
  end
23
- calculate_tf_idf
24
65
  end
25
66
 
67
+ def fast_write(csv_file_name, options = {})
68
+ CSV.open(csv_file_name,"w") do |f|
69
+ f << ["doc","term","count","tf","idf","tf_idf"]
70
+ docs.each do |doc|
71
+ @term_freq_per_doc[doc].each do |term,freq|
72
+ f << [doc,term,count(doc,term),tf(doc,term),idf(term),tf_idf(doc,term)] if tf(doc,term)
73
+ end
74
+ end
75
+ end
76
+ end
77
+
26
78
  # Save the results as CSV
27
79
  # Term, Doc1, Doc2, Doc3...
28
80
  # Eggs, 0.04535,,0.02
29
- def write(csv_file_name, options = {})
81
+ def write_tf_idf(csv_file_name, options = {})
30
82
  decimal_places = options[:decimal_places] || 20
31
83
 
32
84
  CSV.open(csv_file_name,"w") do |f|
33
- f << ["term", docs].flatten
34
- @tf_idf.each do |term, values|
35
- tmp_row = [term]
85
+ f << ["terms", docs].flatten
86
+ terms.each do |term|
87
+ row = [term]
36
88
  docs.each do |doc|
37
- value = values[doc] ? ("%.#{decimal_places}f" % values[doc]) : nil
38
- value = nil if value =~ /^0\.0+$/
39
- tmp_row << value
89
+ value = tf_idf(doc,term) ? ("%.#{decimal_places}f" % tf_idf(doc,term)) : nil
90
+ value = nil if value.to_s =~ /^0.0+$/
91
+ row << value
40
92
  end
41
- f << tmp_row
93
+ f << row
42
94
  end
43
95
  end
44
96
  end
45
97
 
98
+ def write_tf(csv_file_name, options = {})
99
+ decimal_places = options[:decimal_places] || 20
100
+
101
+ CSV.open(csv_file_name,"w") do |f|
102
+ f << ["terms", docs].flatten
103
+ terms.each do |term|
104
+ row = [term]
105
+ docs.each do |doc|
106
+ value = tf(doc,term) ? ("%.#{decimal_places}f" % tf(doc,term)) : nil
107
+ value = nil if value.to_s =~ /^0.0+$/
108
+ row << value
109
+ end
110
+ f << row
111
+ # @logger.debug(row)
112
+ end
113
+ end
114
+ end
115
+
116
+
46
117
  private
118
+
47
119
 
48
- def add_document(doc, terms)
49
- @total_number_of_docs += 1
50
-
51
- term_counts_doc = Hash.new(0.0)
120
+ def reset_tf_idf
121
+ @idf = {}
122
+ @tf_idf = Hash.new { |hash, key| hash[key] = {} }
123
+ @logger.debug("Reset tf-idf")
124
+ end
52
125
 
126
+ def calculate(doc, terms)
127
+ term_size = terms.size.to_f
128
+ term_count = Hash.new(0)
129
+ term_freq = Hash.new
130
+
53
131
  # Count the number of times each term appears in this document
54
132
  terms.each do |term|
55
- term_counts_doc[term] += 1.0
133
+ term_count[term] += 1
56
134
  end
57
135
 
58
136
  # Normalize the count to find term frequency. Divide count by total number of terms in document
59
- term_counts_doc.each_key do |term|
60
- term_counts_doc[term] /= terms.size
61
- @doc_count_per_term[term] += 1.0
137
+ term_count.each do |term, count|
138
+ term_freq[term] = count / term_size
139
+ @doc_count_per_term[term] += 1
62
140
  end
63
-
64
- @term_freq_per_doc[doc] = term_counts_doc
65
- end
66
-
67
- def docs
68
- @term_freq_per_doc.keys
69
- end
70
141
 
71
- # produces a hash indexed by term, with each value being a hash indexed by document with a value being the TF-IDF
72
- # { "cat" => { "Green Eggs and Ham" => 0.04535, "Dick Wittington" => 0.02343434 }, "Eggs" => { "Green Eggs and Ham" => 0.02764} } }
73
- def calculate_tf_idf
74
- @doc_count_per_term.each do |term, count_per_doc|
75
- doc_list = {}
76
- docs.each do |doc|
77
- # if we have a frequency for this term, we can calculate TF-IDF
78
- if @term_freq_per_doc[doc].key?(term)
79
- doc_list[doc] = @term_freq_per_doc[doc][term] * Math.log10(@total_number_of_docs / count_per_doc)
80
- else
81
- doc_list[doc] = nil
82
- end
83
- end
84
- @tf_idf[term] = doc_list
85
- end
142
+ @term_count_per_doc[doc] = term_count
143
+ @term_freq_per_doc[doc] = term_freq
86
144
  end
87
145
 
88
- end
89
-
146
+ end
data/tf-idf_csv.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{tf-idf_csv}
8
- s.version = "0.2.0"
8
+ s.version = "0.2.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Julian Burgess"]
12
- s.date = %q{2010-11-02}
12
+ s.date = %q{2010-11-18}
13
13
  s.default_executable = %q{tf-idf_csv}
14
14
  s.email = %q{jburgess@ap.org}
15
15
  s.executables = ["tf-idf_csv"]
@@ -27,8 +27,6 @@ Gem::Specification.new do |s|
27
27
  "VERSION",
28
28
  "bin/tf-idf_csv",
29
29
  "lib/tf-idf_csv.rb",
30
- "sample-tf-idf.csv",
31
- "sample.csv",
32
30
  "test/helper.rb",
33
31
  "test/test_tf-idf_csv.rb",
34
32
  "tf-idf_csv.gemspec"
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 2
8
- - 0
9
- version: 0.2.0
8
+ - 1
9
+ version: 0.2.1
10
10
  platform: ruby
11
11
  authors:
12
12
  - Julian Burgess
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-11-02 00:00:00 -04:00
17
+ date: 2010-11-18 00:00:00 -05:00
18
18
  default_executable: tf-idf_csv
19
19
  dependencies: []
20
20
 
@@ -37,8 +37,6 @@ files:
37
37
  - VERSION
38
38
  - bin/tf-idf_csv
39
39
  - lib/tf-idf_csv.rb
40
- - sample-tf-idf.csv
41
- - sample.csv
42
40
  - test/helper.rb
43
41
  - test/test_tf-idf_csv.rb
44
42
  - tf-idf_csv.gemspec
data/sample-tf-idf.csv DELETED
@@ -1,46 +0,0 @@
1
- term,doc1,doc2
2
- the,,
3
- limerick,0.01038034467806831646,
4
- packs,0.01038034467806831646,
5
- laughs,0.01038034467806831646,
6
- anatomical,0.01038034467806831646,
7
- in,0.01038034467806831646,
8
- space,0.01038034467806831646,
9
- that,,
10
- is,0.01038034467806831646,
11
- quite,0.01038034467806831646,
12
- economical,0.01038034467806831646,
13
- but,,
14
- good,0.01038034467806831646,
15
- ones,0.02076068935613663291,
16
- i've,0.01038034467806831646,
17
- seen,0.01038034467806831646,
18
- so,0.02076068935613663291,
19
- seldom,0.02076068935613663291,
20
- are,0.02076068935613663291,
21
- clean,0.02076068935613663291,
22
- and,,
23
- comical,0.01038034467806831646,
24
- there,,0.01003433318879937315
25
- was,,0.01003433318879937315
26
- a,,0.01003433318879937315
27
- young,,0.01003433318879937315
28
- person,,0.01003433318879937315
29
- of,,0.02006866637759874630
30
- smyrna,,0.02006866637759874630
31
- whose,,0.01003433318879937315
32
- grandmother,,0.01003433318879937315
33
- threatened,,0.01003433318879937315
34
- to,,0.01003433318879937315
35
- burn,,0.02006866637759874630
36
- her,,0.01003433318879937315
37
- she,,0.01003433318879937315
38
- seized,,0.01003433318879937315
39
- on,,0.01003433318879937315
40
- cat,,0.01003433318879937315
41
- said,,0.01003433318879937315
42
- 'granny,,0.01003433318879937315
43
- you,,0.01003433318879937315
44
- incongruous,,0.01003433318879937315
45
- old,,0.01003433318879937315
46
- woman,,0.01003433318879937315
data/sample.csv DELETED
@@ -1,2 +0,0 @@
1
- doc1,the,limerick,packs,laughs,anatomical,in,space,that,is,quite,economical,but,the,good,ones,i've,seen,so,seldom,are,clean,and,the,clean,ones,so,seldom,are,comical
2
- doc2,there,was,a,young,person,of,smyrna,whose,grandmother,threatened,to,burn,her,but,she,seized,on,the,cat,and,said,'granny,burn,that,you,incongruous,old,woman,of,smyrna