tf-idf_csv 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/bin/tf-idf_csv +6 -8
- data/lib/tf-idf_csv.rb +107 -50
- data/tf-idf_csv.gemspec +2 -4
- metadata +3 -5
- data/sample-tf-idf.csv +0 -46
- data/sample.csv +0 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.
|
1
|
+
0.2.1
|
data/bin/tf-idf_csv
CHANGED
@@ -4,19 +4,17 @@ $LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
|
|
4
4
|
require 'tf-idf_csv'
|
5
5
|
require 'csv'
|
6
6
|
|
7
|
+
tf_idf = Tf_Idf_CSV.new()
|
8
|
+
|
7
9
|
begin
|
8
10
|
csv_file = ARGV[0]
|
9
|
-
|
11
|
+
tf_idf.add_csv(csv_file)
|
10
12
|
rescue
|
13
|
+
puts $!
|
11
14
|
puts "Please specify a valid CSV file"
|
12
15
|
Process.exit(1)
|
13
16
|
end
|
14
17
|
|
15
|
-
|
16
|
-
tf_idf.
|
17
|
-
|
18
|
-
output_csv_file = csv_file.sub(/\.csv$/,'-tf-idf.csv')
|
19
|
-
tf_idf.write(output_csv_file)
|
20
|
-
|
21
|
-
|
18
|
+
output_csv_file = csv_file.sub(/\.csv$/,'-fast.csv')
|
19
|
+
tf_idf.fast_write(output_csv_file)
|
22
20
|
|
data/lib/tf-idf_csv.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'csv'
|
2
|
+
require 'logger'
|
2
3
|
|
3
4
|
# This class expects a CSV input
|
4
5
|
# One row per document,
|
@@ -7,83 +8,139 @@ require 'csv'
|
|
7
8
|
# TF-IDF will be returned based on the number of times the term appears in each document, relative to the total number of documents it appears in
|
8
9
|
class Tf_Idf_CSV
|
9
10
|
|
10
|
-
def initialize
|
11
|
-
@
|
11
|
+
def initialize
|
12
|
+
@logger = Logger.new(STDERR)
|
13
|
+
|
14
|
+
reset_tf_idf
|
12
15
|
@total_number_of_docs = 0
|
13
|
-
|
14
|
-
@
|
16
|
+
|
17
|
+
@term_count_per_doc = Hash.new # [document] => { "term1" => count1, "term2" => count2 }
|
18
|
+
@term_freq_per_doc = Hash.new # [document] => { "term1" => frequency1, "term2" => frequency2 }
|
19
|
+
@doc_count_per_term = Hash.new(0) # [term] => num_of_documents_which_contain_this_term
|
20
|
+
end
|
21
|
+
|
22
|
+
def docs
|
23
|
+
@term_freq_per_doc.keys
|
24
|
+
end
|
25
|
+
|
26
|
+
def terms
|
27
|
+
@doc_count_per_term.keys
|
28
|
+
end
|
29
|
+
|
30
|
+
def count(doc, term)
|
31
|
+
return nil unless @term_count_per_doc[doc]
|
32
|
+
@term_count_per_doc[doc][term]
|
33
|
+
end
|
34
|
+
|
35
|
+
def tf(doc, term)
|
36
|
+
return nil unless @term_freq_per_doc[doc]
|
37
|
+
@term_freq_per_doc[doc][term]
|
38
|
+
end
|
39
|
+
|
40
|
+
def idf(term)
|
41
|
+
@idf[term] ||= Math.log10(@total_number_of_docs / @doc_count_per_term[term])
|
42
|
+
end
|
43
|
+
|
44
|
+
def tf_idf(doc,term)
|
45
|
+
return nil unless tf(doc, term)
|
46
|
+
@tf_idf[doc][term] ||= tf(doc, term) * idf(term)
|
15
47
|
end
|
16
48
|
|
17
|
-
def
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
49
|
+
def stop_words
|
50
|
+
@doc_count_per_term.select { |term, count| count == @total_number_of_docs }.keys
|
51
|
+
end
|
52
|
+
|
53
|
+
def add_document(doc, terms)
|
54
|
+
reset_tf_idf
|
55
|
+
@total_number_of_docs += 1.0 # use float as we want divions later
|
56
|
+
|
57
|
+
calculate(doc, terms)
|
58
|
+
@logger.debug("Added document '#{doc}'")
|
59
|
+
end
|
60
|
+
|
61
|
+
def add_csv(file_name)
|
62
|
+
CSV.foreach(file_name) do |row|
|
63
|
+
add_document(row[0],row[1..-1])
|
22
64
|
end
|
23
|
-
calculate_tf_idf
|
24
65
|
end
|
25
66
|
|
67
|
+
def fast_write(csv_file_name, options = {})
|
68
|
+
CSV.open(csv_file_name,"w") do |f|
|
69
|
+
f << ["doc","term","count","tf","idf","tf_idf"]
|
70
|
+
docs.each do |doc|
|
71
|
+
@term_freq_per_doc[doc].each do |term,freq|
|
72
|
+
f << [doc,term,count(doc,term),tf(doc,term),idf(term),tf_idf(doc,term)] if tf(doc,term)
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
26
78
|
# Save the results as CSV
|
27
79
|
# Term, Doc1, Doc2, Doc3...
|
28
80
|
# Eggs, 0.04535,,0.02
|
29
|
-
def
|
81
|
+
def write_tf_idf(csv_file_name, options = {})
|
30
82
|
decimal_places = options[:decimal_places] || 20
|
31
83
|
|
32
84
|
CSV.open(csv_file_name,"w") do |f|
|
33
|
-
f << ["
|
34
|
-
|
35
|
-
|
85
|
+
f << ["terms", docs].flatten
|
86
|
+
terms.each do |term|
|
87
|
+
row = [term]
|
36
88
|
docs.each do |doc|
|
37
|
-
value =
|
38
|
-
value = nil if value =~ /^0
|
39
|
-
|
89
|
+
value = tf_idf(doc,term) ? ("%.#{decimal_places}f" % tf_idf(doc,term)) : nil
|
90
|
+
value = nil if value.to_s =~ /^0.0+$/
|
91
|
+
row << value
|
40
92
|
end
|
41
|
-
f <<
|
93
|
+
f << row
|
42
94
|
end
|
43
95
|
end
|
44
96
|
end
|
45
97
|
|
98
|
+
def write_tf(csv_file_name, options = {})
|
99
|
+
decimal_places = options[:decimal_places] || 20
|
100
|
+
|
101
|
+
CSV.open(csv_file_name,"w") do |f|
|
102
|
+
f << ["terms", docs].flatten
|
103
|
+
terms.each do |term|
|
104
|
+
row = [term]
|
105
|
+
docs.each do |doc|
|
106
|
+
value = tf(doc,term) ? ("%.#{decimal_places}f" % tf(doc,term)) : nil
|
107
|
+
value = nil if value.to_s =~ /^0.0+$/
|
108
|
+
row << value
|
109
|
+
end
|
110
|
+
f << row
|
111
|
+
# @logger.debug(row)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
|
46
117
|
private
|
118
|
+
|
47
119
|
|
48
|
-
def
|
49
|
-
@
|
50
|
-
|
51
|
-
|
120
|
+
def reset_tf_idf
|
121
|
+
@idf = {}
|
122
|
+
@tf_idf = Hash.new { |hash, key| hash[key] = {} }
|
123
|
+
@logger.debug("Reset tf-idf")
|
124
|
+
end
|
52
125
|
|
126
|
+
def calculate(doc, terms)
|
127
|
+
term_size = terms.size.to_f
|
128
|
+
term_count = Hash.new(0)
|
129
|
+
term_freq = Hash.new
|
130
|
+
|
53
131
|
# Count the number of times each term appears in this document
|
54
132
|
terms.each do |term|
|
55
|
-
|
133
|
+
term_count[term] += 1
|
56
134
|
end
|
57
135
|
|
58
136
|
# Normalize the count to find term frequency. Divide count by total number of terms in document
|
59
|
-
|
60
|
-
|
61
|
-
@doc_count_per_term[term] += 1
|
137
|
+
term_count.each do |term, count|
|
138
|
+
term_freq[term] = count / term_size
|
139
|
+
@doc_count_per_term[term] += 1
|
62
140
|
end
|
63
|
-
|
64
|
-
@term_freq_per_doc[doc] = term_counts_doc
|
65
|
-
end
|
66
|
-
|
67
|
-
def docs
|
68
|
-
@term_freq_per_doc.keys
|
69
|
-
end
|
70
141
|
|
71
|
-
|
72
|
-
|
73
|
-
def calculate_tf_idf
|
74
|
-
@doc_count_per_term.each do |term, count_per_doc|
|
75
|
-
doc_list = {}
|
76
|
-
docs.each do |doc|
|
77
|
-
# if we have a frequency for this term, we can calculate TF-IDF
|
78
|
-
if @term_freq_per_doc[doc].key?(term)
|
79
|
-
doc_list[doc] = @term_freq_per_doc[doc][term] * Math.log10(@total_number_of_docs / count_per_doc)
|
80
|
-
else
|
81
|
-
doc_list[doc] = nil
|
82
|
-
end
|
83
|
-
end
|
84
|
-
@tf_idf[term] = doc_list
|
85
|
-
end
|
142
|
+
@term_count_per_doc[doc] = term_count
|
143
|
+
@term_freq_per_doc[doc] = term_freq
|
86
144
|
end
|
87
145
|
|
88
|
-
end
|
89
|
-
|
146
|
+
end
|
data/tf-idf_csv.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{tf-idf_csv}
|
8
|
-
s.version = "0.2.
|
8
|
+
s.version = "0.2.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Julian Burgess"]
|
12
|
-
s.date = %q{2010-11-
|
12
|
+
s.date = %q{2010-11-18}
|
13
13
|
s.default_executable = %q{tf-idf_csv}
|
14
14
|
s.email = %q{jburgess@ap.org}
|
15
15
|
s.executables = ["tf-idf_csv"]
|
@@ -27,8 +27,6 @@ Gem::Specification.new do |s|
|
|
27
27
|
"VERSION",
|
28
28
|
"bin/tf-idf_csv",
|
29
29
|
"lib/tf-idf_csv.rb",
|
30
|
-
"sample-tf-idf.csv",
|
31
|
-
"sample.csv",
|
32
30
|
"test/helper.rb",
|
33
31
|
"test/test_tf-idf_csv.rb",
|
34
32
|
"tf-idf_csv.gemspec"
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 2
|
8
|
-
-
|
9
|
-
version: 0.2.
|
8
|
+
- 1
|
9
|
+
version: 0.2.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Julian Burgess
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-11-
|
17
|
+
date: 2010-11-18 00:00:00 -05:00
|
18
18
|
default_executable: tf-idf_csv
|
19
19
|
dependencies: []
|
20
20
|
|
@@ -37,8 +37,6 @@ files:
|
|
37
37
|
- VERSION
|
38
38
|
- bin/tf-idf_csv
|
39
39
|
- lib/tf-idf_csv.rb
|
40
|
-
- sample-tf-idf.csv
|
41
|
-
- sample.csv
|
42
40
|
- test/helper.rb
|
43
41
|
- test/test_tf-idf_csv.rb
|
44
42
|
- tf-idf_csv.gemspec
|
data/sample-tf-idf.csv
DELETED
@@ -1,46 +0,0 @@
|
|
1
|
-
term,doc1,doc2
|
2
|
-
the,,
|
3
|
-
limerick,0.01038034467806831646,
|
4
|
-
packs,0.01038034467806831646,
|
5
|
-
laughs,0.01038034467806831646,
|
6
|
-
anatomical,0.01038034467806831646,
|
7
|
-
in,0.01038034467806831646,
|
8
|
-
space,0.01038034467806831646,
|
9
|
-
that,,
|
10
|
-
is,0.01038034467806831646,
|
11
|
-
quite,0.01038034467806831646,
|
12
|
-
economical,0.01038034467806831646,
|
13
|
-
but,,
|
14
|
-
good,0.01038034467806831646,
|
15
|
-
ones,0.02076068935613663291,
|
16
|
-
i've,0.01038034467806831646,
|
17
|
-
seen,0.01038034467806831646,
|
18
|
-
so,0.02076068935613663291,
|
19
|
-
seldom,0.02076068935613663291,
|
20
|
-
are,0.02076068935613663291,
|
21
|
-
clean,0.02076068935613663291,
|
22
|
-
and,,
|
23
|
-
comical,0.01038034467806831646,
|
24
|
-
there,,0.01003433318879937315
|
25
|
-
was,,0.01003433318879937315
|
26
|
-
a,,0.01003433318879937315
|
27
|
-
young,,0.01003433318879937315
|
28
|
-
person,,0.01003433318879937315
|
29
|
-
of,,0.02006866637759874630
|
30
|
-
smyrna,,0.02006866637759874630
|
31
|
-
whose,,0.01003433318879937315
|
32
|
-
grandmother,,0.01003433318879937315
|
33
|
-
threatened,,0.01003433318879937315
|
34
|
-
to,,0.01003433318879937315
|
35
|
-
burn,,0.02006866637759874630
|
36
|
-
her,,0.01003433318879937315
|
37
|
-
she,,0.01003433318879937315
|
38
|
-
seized,,0.01003433318879937315
|
39
|
-
on,,0.01003433318879937315
|
40
|
-
cat,,0.01003433318879937315
|
41
|
-
said,,0.01003433318879937315
|
42
|
-
'granny,,0.01003433318879937315
|
43
|
-
you,,0.01003433318879937315
|
44
|
-
incongruous,,0.01003433318879937315
|
45
|
-
old,,0.01003433318879937315
|
46
|
-
woman,,0.01003433318879937315
|
data/sample.csv
DELETED
@@ -1,2 +0,0 @@
|
|
1
|
-
doc1,the,limerick,packs,laughs,anatomical,in,space,that,is,quite,economical,but,the,good,ones,i've,seen,so,seldom,are,clean,and,the,clean,ones,so,seldom,are,comical
|
2
|
-
doc2,there,was,a,young,person,of,smyrna,whose,grandmother,threatened,to,burn,her,but,she,seized,on,the,cat,and,said,'granny,burn,that,you,incongruous,old,woman,of,smyrna
|