tf-idf_csv 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/bin/tf-idf_csv +6 -8
- data/lib/tf-idf_csv.rb +107 -50
- data/tf-idf_csv.gemspec +2 -4
- metadata +3 -5
- data/sample-tf-idf.csv +0 -46
- data/sample.csv +0 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.
|
1
|
+
0.2.1
|
data/bin/tf-idf_csv
CHANGED
@@ -4,19 +4,17 @@ $LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
|
|
4
4
|
require 'tf-idf_csv'
|
5
5
|
require 'csv'
|
6
6
|
|
7
|
+
tf_idf = Tf_Idf_CSV.new()
|
8
|
+
|
7
9
|
begin
|
8
10
|
csv_file = ARGV[0]
|
9
|
-
|
11
|
+
tf_idf.add_csv(csv_file)
|
10
12
|
rescue
|
13
|
+
puts $!
|
11
14
|
puts "Please specify a valid CSV file"
|
12
15
|
Process.exit(1)
|
13
16
|
end
|
14
17
|
|
15
|
-
|
16
|
-
tf_idf.
|
17
|
-
|
18
|
-
output_csv_file = csv_file.sub(/\.csv$/,'-tf-idf.csv')
|
19
|
-
tf_idf.write(output_csv_file)
|
20
|
-
|
21
|
-
|
18
|
+
output_csv_file = csv_file.sub(/\.csv$/,'-fast.csv')
|
19
|
+
tf_idf.fast_write(output_csv_file)
|
22
20
|
|
data/lib/tf-idf_csv.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'csv'
|
2
|
+
require 'logger'
|
2
3
|
|
3
4
|
# This class expects a CSV input
|
4
5
|
# One row per document,
|
@@ -7,83 +8,139 @@ require 'csv'
|
|
7
8
|
# TF-IDF will be returned based on the number of times the term appears in each document, relative to the total number of documents it appears in
|
8
9
|
class Tf_Idf_CSV
|
9
10
|
|
10
|
-
def initialize
|
11
|
-
@
|
11
|
+
def initialize
|
12
|
+
@logger = Logger.new(STDERR)
|
13
|
+
|
14
|
+
reset_tf_idf
|
12
15
|
@total_number_of_docs = 0
|
13
|
-
|
14
|
-
@
|
16
|
+
|
17
|
+
@term_count_per_doc = Hash.new # [document] => { "term1" => count1, "term2" => count2 }
|
18
|
+
@term_freq_per_doc = Hash.new # [document] => { "term1" => frequency1, "term2" => frequency2 }
|
19
|
+
@doc_count_per_term = Hash.new(0) # [term] => num_of_documents_which_contain_this_term
|
20
|
+
end
|
21
|
+
|
22
|
+
def docs
|
23
|
+
@term_freq_per_doc.keys
|
24
|
+
end
|
25
|
+
|
26
|
+
def terms
|
27
|
+
@doc_count_per_term.keys
|
28
|
+
end
|
29
|
+
|
30
|
+
def count(doc, term)
|
31
|
+
return nil unless @term_count_per_doc[doc]
|
32
|
+
@term_count_per_doc[doc][term]
|
33
|
+
end
|
34
|
+
|
35
|
+
def tf(doc, term)
|
36
|
+
return nil unless @term_freq_per_doc[doc]
|
37
|
+
@term_freq_per_doc[doc][term]
|
38
|
+
end
|
39
|
+
|
40
|
+
def idf(term)
|
41
|
+
@idf[term] ||= Math.log10(@total_number_of_docs / @doc_count_per_term[term])
|
42
|
+
end
|
43
|
+
|
44
|
+
def tf_idf(doc,term)
|
45
|
+
return nil unless tf(doc, term)
|
46
|
+
@tf_idf[doc][term] ||= tf(doc, term) * idf(term)
|
15
47
|
end
|
16
48
|
|
17
|
-
def
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
49
|
+
def stop_words
|
50
|
+
@doc_count_per_term.select { |term, count| count == @total_number_of_docs }.keys
|
51
|
+
end
|
52
|
+
|
53
|
+
def add_document(doc, terms)
|
54
|
+
reset_tf_idf
|
55
|
+
@total_number_of_docs += 1.0 # use float as we want division later
|
56
|
+
|
57
|
+
calculate(doc, terms)
|
58
|
+
@logger.debug("Added document '#{doc}'")
|
59
|
+
end
|
60
|
+
|
61
|
+
def add_csv(file_name)
|
62
|
+
CSV.foreach(file_name) do |row|
|
63
|
+
add_document(row[0],row[1..-1])
|
22
64
|
end
|
23
|
-
calculate_tf_idf
|
24
65
|
end
|
25
66
|
|
67
|
+
def fast_write(csv_file_name, options = {})
|
68
|
+
CSV.open(csv_file_name,"w") do |f|
|
69
|
+
f << ["doc","term","count","tf","idf","tf_idf"]
|
70
|
+
docs.each do |doc|
|
71
|
+
@term_freq_per_doc[doc].each do |term,freq|
|
72
|
+
f << [doc,term,count(doc,term),tf(doc,term),idf(term),tf_idf(doc,term)] if tf(doc,term)
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
26
78
|
# Save the results as CSV
|
27
79
|
# Term, Doc1, Doc2, Doc3...
|
28
80
|
# Eggs, 0.04535,,0.02
|
29
|
-
def
|
81
|
+
def write_tf_idf(csv_file_name, options = {})
|
30
82
|
decimal_places = options[:decimal_places] || 20
|
31
83
|
|
32
84
|
CSV.open(csv_file_name,"w") do |f|
|
33
|
-
f << ["
|
34
|
-
|
35
|
-
|
85
|
+
f << ["terms", docs].flatten
|
86
|
+
terms.each do |term|
|
87
|
+
row = [term]
|
36
88
|
docs.each do |doc|
|
37
|
-
value =
|
38
|
-
value = nil if value =~ /^0
|
39
|
-
|
89
|
+
value = tf_idf(doc,term) ? ("%.#{decimal_places}f" % tf_idf(doc,term)) : nil
|
90
|
+
value = nil if value.to_s =~ /^0.0+$/
|
91
|
+
row << value
|
40
92
|
end
|
41
|
-
f <<
|
93
|
+
f << row
|
42
94
|
end
|
43
95
|
end
|
44
96
|
end
|
45
97
|
|
98
|
+
def write_tf(csv_file_name, options = {})
|
99
|
+
decimal_places = options[:decimal_places] || 20
|
100
|
+
|
101
|
+
CSV.open(csv_file_name,"w") do |f|
|
102
|
+
f << ["terms", docs].flatten
|
103
|
+
terms.each do |term|
|
104
|
+
row = [term]
|
105
|
+
docs.each do |doc|
|
106
|
+
value = tf(doc,term) ? ("%.#{decimal_places}f" % tf(doc,term)) : nil
|
107
|
+
value = nil if value.to_s =~ /^0.0+$/
|
108
|
+
row << value
|
109
|
+
end
|
110
|
+
f << row
|
111
|
+
# @logger.debug(row)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
|
46
117
|
private
|
118
|
+
|
47
119
|
|
48
|
-
def
|
49
|
-
@
|
50
|
-
|
51
|
-
|
120
|
+
def reset_tf_idf
|
121
|
+
@idf = {}
|
122
|
+
@tf_idf = Hash.new { |hash, key| hash[key] = {} }
|
123
|
+
@logger.debug("Reset tf-idf")
|
124
|
+
end
|
52
125
|
|
126
|
+
def calculate(doc, terms)
|
127
|
+
term_size = terms.size.to_f
|
128
|
+
term_count = Hash.new(0)
|
129
|
+
term_freq = Hash.new
|
130
|
+
|
53
131
|
# Count the number of times each term appears in this document
|
54
132
|
terms.each do |term|
|
55
|
-
|
133
|
+
term_count[term] += 1
|
56
134
|
end
|
57
135
|
|
58
136
|
# Normalize the count to find term frequency. Divide count by total number of terms in document
|
59
|
-
|
60
|
-
|
61
|
-
@doc_count_per_term[term] += 1
|
137
|
+
term_count.each do |term, count|
|
138
|
+
term_freq[term] = count / term_size
|
139
|
+
@doc_count_per_term[term] += 1
|
62
140
|
end
|
63
|
-
|
64
|
-
@term_freq_per_doc[doc] = term_counts_doc
|
65
|
-
end
|
66
|
-
|
67
|
-
def docs
|
68
|
-
@term_freq_per_doc.keys
|
69
|
-
end
|
70
141
|
|
71
|
-
|
72
|
-
|
73
|
-
def calculate_tf_idf
|
74
|
-
@doc_count_per_term.each do |term, count_per_doc|
|
75
|
-
doc_list = {}
|
76
|
-
docs.each do |doc|
|
77
|
-
# if we have a frequency for this term, we can calculate TF-IDF
|
78
|
-
if @term_freq_per_doc[doc].key?(term)
|
79
|
-
doc_list[doc] = @term_freq_per_doc[doc][term] * Math.log10(@total_number_of_docs / count_per_doc)
|
80
|
-
else
|
81
|
-
doc_list[doc] = nil
|
82
|
-
end
|
83
|
-
end
|
84
|
-
@tf_idf[term] = doc_list
|
85
|
-
end
|
142
|
+
@term_count_per_doc[doc] = term_count
|
143
|
+
@term_freq_per_doc[doc] = term_freq
|
86
144
|
end
|
87
145
|
|
88
|
-
end
|
89
|
-
|
146
|
+
end
|
data/tf-idf_csv.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{tf-idf_csv}
|
8
|
-
s.version = "0.2.
|
8
|
+
s.version = "0.2.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Julian Burgess"]
|
12
|
-
s.date = %q{2010-11-
|
12
|
+
s.date = %q{2010-11-18}
|
13
13
|
s.default_executable = %q{tf-idf_csv}
|
14
14
|
s.email = %q{jburgess@ap.org}
|
15
15
|
s.executables = ["tf-idf_csv"]
|
@@ -27,8 +27,6 @@ Gem::Specification.new do |s|
|
|
27
27
|
"VERSION",
|
28
28
|
"bin/tf-idf_csv",
|
29
29
|
"lib/tf-idf_csv.rb",
|
30
|
-
"sample-tf-idf.csv",
|
31
|
-
"sample.csv",
|
32
30
|
"test/helper.rb",
|
33
31
|
"test/test_tf-idf_csv.rb",
|
34
32
|
"tf-idf_csv.gemspec"
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 2
|
8
|
-
-
|
9
|
-
version: 0.2.
|
8
|
+
- 1
|
9
|
+
version: 0.2.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Julian Burgess
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-11-
|
17
|
+
date: 2010-11-18 00:00:00 -05:00
|
18
18
|
default_executable: tf-idf_csv
|
19
19
|
dependencies: []
|
20
20
|
|
@@ -37,8 +37,6 @@ files:
|
|
37
37
|
- VERSION
|
38
38
|
- bin/tf-idf_csv
|
39
39
|
- lib/tf-idf_csv.rb
|
40
|
-
- sample-tf-idf.csv
|
41
|
-
- sample.csv
|
42
40
|
- test/helper.rb
|
43
41
|
- test/test_tf-idf_csv.rb
|
44
42
|
- tf-idf_csv.gemspec
|
data/sample-tf-idf.csv
DELETED
@@ -1,46 +0,0 @@
|
|
1
|
-
term,doc1,doc2
|
2
|
-
the,,
|
3
|
-
limerick,0.01038034467806831646,
|
4
|
-
packs,0.01038034467806831646,
|
5
|
-
laughs,0.01038034467806831646,
|
6
|
-
anatomical,0.01038034467806831646,
|
7
|
-
in,0.01038034467806831646,
|
8
|
-
space,0.01038034467806831646,
|
9
|
-
that,,
|
10
|
-
is,0.01038034467806831646,
|
11
|
-
quite,0.01038034467806831646,
|
12
|
-
economical,0.01038034467806831646,
|
13
|
-
but,,
|
14
|
-
good,0.01038034467806831646,
|
15
|
-
ones,0.02076068935613663291,
|
16
|
-
i've,0.01038034467806831646,
|
17
|
-
seen,0.01038034467806831646,
|
18
|
-
so,0.02076068935613663291,
|
19
|
-
seldom,0.02076068935613663291,
|
20
|
-
are,0.02076068935613663291,
|
21
|
-
clean,0.02076068935613663291,
|
22
|
-
and,,
|
23
|
-
comical,0.01038034467806831646,
|
24
|
-
there,,0.01003433318879937315
|
25
|
-
was,,0.01003433318879937315
|
26
|
-
a,,0.01003433318879937315
|
27
|
-
young,,0.01003433318879937315
|
28
|
-
person,,0.01003433318879937315
|
29
|
-
of,,0.02006866637759874630
|
30
|
-
smyrna,,0.02006866637759874630
|
31
|
-
whose,,0.01003433318879937315
|
32
|
-
grandmother,,0.01003433318879937315
|
33
|
-
threatened,,0.01003433318879937315
|
34
|
-
to,,0.01003433318879937315
|
35
|
-
burn,,0.02006866637759874630
|
36
|
-
her,,0.01003433318879937315
|
37
|
-
she,,0.01003433318879937315
|
38
|
-
seized,,0.01003433318879937315
|
39
|
-
on,,0.01003433318879937315
|
40
|
-
cat,,0.01003433318879937315
|
41
|
-
said,,0.01003433318879937315
|
42
|
-
'granny,,0.01003433318879937315
|
43
|
-
you,,0.01003433318879937315
|
44
|
-
incongruous,,0.01003433318879937315
|
45
|
-
old,,0.01003433318879937315
|
46
|
-
woman,,0.01003433318879937315
|
data/sample.csv
DELETED
@@ -1,2 +0,0 @@
|
|
1
|
-
doc1,the,limerick,packs,laughs,anatomical,in,space,that,is,quite,economical,but,the,good,ones,i've,seen,so,seldom,are,clean,and,the,clean,ones,so,seldom,are,comical
|
2
|
-
doc2,there,was,a,young,person,of,smyrna,whose,grandmother,threatened,to,burn,her,but,she,seized,on,the,cat,and,said,'granny,burn,that,you,incongruous,old,woman,of,smyrna
|