tf-idf_csv 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +3 -0
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/bin/tf-idf_csv +15 -1
- data/lib/tf-idf_csv.rb +33 -44
- data/tf-idf_csv.gemspec +56 -0
- metadata +6 -16
data/Gemfile
ADDED
data/Rakefile
CHANGED
@@ -9,7 +9,7 @@ begin
|
|
9
9
|
gem.email = "jburgess@ap.org"
|
10
10
|
gem.homepage = "http://github.com/aubergene/tf-idf_csv"
|
11
11
|
gem.authors = ["Julian Burgess"]
|
12
|
-
gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
|
12
|
+
# gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
|
13
13
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
14
14
|
end
|
15
15
|
Jeweler::GemcutterTasks.new
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.2.0
|
data/bin/tf-idf_csv
CHANGED
@@ -2,7 +2,21 @@
|
|
2
2
|
|
3
3
|
$LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
|
4
4
|
require 'tf-idf_csv'
|
5
|
+
require 'csv'
|
6
|
+
|
7
|
+
begin
|
8
|
+
csv_file = ARGV[0]
|
9
|
+
csv = CSV.open(csv_file)
|
10
|
+
rescue
|
11
|
+
puts "Please specify a valid CSV file"
|
12
|
+
Process.exit(1)
|
13
|
+
end
|
14
|
+
|
15
|
+
tf_idf = Tf_Idf_CSV.new()
|
16
|
+
tf_idf.add_csv(csv)
|
17
|
+
|
18
|
+
output_csv_file = csv_file.sub(/\.csv$/,'-tf-idf.csv')
|
19
|
+
tf_idf.write(output_csv_file)
|
5
20
|
|
6
|
-
tf = TfIdf_CSV.new(ARGV[0])
|
7
21
|
|
8
22
|
|
data/lib/tf-idf_csv.rb
CHANGED
@@ -1,46 +1,53 @@
|
|
1
1
|
require 'csv'
|
2
|
-
require 'set'
|
3
2
|
|
4
3
|
# This class expects a CSV input
|
5
4
|
# One row per document,
|
6
5
|
# the first cell should be a document identifier
|
7
6
|
# each subsequent cell contains one term.
|
8
7
|
# TF-IDF will be returned based on the number of times the term appears in each document, relative to the total number of documents it appears in
|
8
|
+
class Tf_Idf_CSV
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
DECIMAL_PLACES = 20 # number of decimal places to use in output
|
13
|
-
|
14
|
-
# n the n-grams of the data http://en.wikipedia.org/wiki/N-gram
|
15
|
-
def initialize(csv_file)
|
16
|
-
@output_csv_file = csv_file.sub(/\.csv$/,'-tf-idf.csv')
|
17
|
-
|
10
|
+
def initialize
|
18
11
|
@tf_idf = {}
|
19
12
|
@total_number_of_docs = 0
|
20
13
|
@doc_count_per_term = Hash.new(0)
|
21
|
-
@term_freq_per_doc = Hash.new
|
22
|
-
|
23
|
-
add_csv(csv_file)
|
24
|
-
calculate_tf_idf
|
25
|
-
puts "Finished calculations"
|
14
|
+
@term_freq_per_doc = Hash.new
|
15
|
+
end
|
26
16
|
|
27
|
-
|
28
|
-
|
17
|
+
def add_csv(csv)
|
18
|
+
csv.each do |row|
|
19
|
+
name = row[0]
|
20
|
+
terms = row[1..-1]
|
21
|
+
add_document(name, terms)
|
22
|
+
end
|
23
|
+
calculate_tf_idf
|
29
24
|
end
|
30
25
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
26
|
+
# Save the results as CSV
|
27
|
+
# Term, Doc1, Doc2, Doc3...
|
28
|
+
# Eggs, 0.04535,,0.02
|
29
|
+
def write(csv_file_name, options = {})
|
30
|
+
decimal_places = options[:decimal_places] || 20
|
31
|
+
|
32
|
+
CSV.open(csv_file_name,"w") do |f|
|
33
|
+
f << ["term", docs].flatten
|
34
|
+
@tf_idf.each do |term, values|
|
35
|
+
tmp_row = [term]
|
36
|
+
docs.each do |doc|
|
37
|
+
value = values[doc] ? ("%.#{decimal_places}f" % values[doc]) : nil
|
38
|
+
value = nil if value =~ /^0\.0+$/
|
39
|
+
tmp_row << value
|
40
|
+
end
|
41
|
+
f << tmp_row
|
36
42
|
end
|
37
|
-
rescue Exception
|
38
|
-
puts "Error opening #{csv_file}. Please specify a valid CSV file"
|
39
|
-
Process.exit(1 )
|
40
43
|
end
|
41
44
|
end
|
42
45
|
|
43
|
-
|
46
|
+
private
|
47
|
+
|
48
|
+
def add_document(doc, terms)
|
49
|
+
@total_number_of_docs += 1
|
50
|
+
|
44
51
|
term_counts_doc = Hash.new(0.0)
|
45
52
|
|
46
53
|
# Count the number of times each term appears in this document
|
@@ -56,7 +63,7 @@ class TfIdf_CSV
|
|
56
63
|
|
57
64
|
@term_freq_per_doc[doc] = term_counts_doc
|
58
65
|
end
|
59
|
-
|
66
|
+
|
60
67
|
def docs
|
61
68
|
@term_freq_per_doc.keys
|
62
69
|
end
|
@@ -78,23 +85,5 @@ class TfIdf_CSV
|
|
78
85
|
end
|
79
86
|
end
|
80
87
|
|
81
|
-
# Save the results as CSV
|
82
|
-
# Term, Doc1, Doc2, Doc3...
|
83
|
-
# Eggs, 0.04535,,0.02
|
84
|
-
def save_output
|
85
|
-
CSV.open(@output_csv_file,"w") do |f|
|
86
|
-
f << ["term", docs].flatten
|
87
|
-
@tf_idf.each do |term, values|
|
88
|
-
tmp_row = [term]
|
89
|
-
docs.each do |doc|
|
90
|
-
value = values[doc] ? ("%.#{DECIMAL_PLACES}f" % values[doc]) : nil
|
91
|
-
value = nil if value =~ /^0\.0+$/
|
92
|
-
tmp_row << value
|
93
|
-
end
|
94
|
-
f << tmp_row
|
95
|
-
end
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
88
|
end
|
100
89
|
|
data/tf-idf_csv.gemspec
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{tf-idf_csv}
|
8
|
+
s.version = "0.2.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Julian Burgess"]
|
12
|
+
s.date = %q{2010-11-02}
|
13
|
+
s.default_executable = %q{tf-idf_csv}
|
14
|
+
s.email = %q{jburgess@ap.org}
|
15
|
+
s.executables = ["tf-idf_csv"]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"LICENSE",
|
18
|
+
"README.rdoc"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
".document",
|
22
|
+
".gitignore",
|
23
|
+
"Gemfile",
|
24
|
+
"LICENSE",
|
25
|
+
"README.rdoc",
|
26
|
+
"Rakefile",
|
27
|
+
"VERSION",
|
28
|
+
"bin/tf-idf_csv",
|
29
|
+
"lib/tf-idf_csv.rb",
|
30
|
+
"sample-tf-idf.csv",
|
31
|
+
"sample.csv",
|
32
|
+
"test/helper.rb",
|
33
|
+
"test/test_tf-idf_csv.rb",
|
34
|
+
"tf-idf_csv.gemspec"
|
35
|
+
]
|
36
|
+
s.homepage = %q{http://github.com/aubergene/tf-idf_csv}
|
37
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
38
|
+
s.require_paths = ["lib"]
|
39
|
+
s.rubygems_version = %q{1.3.7}
|
40
|
+
s.summary = %q{Calculate the TF-IDF for terms in a CSV file}
|
41
|
+
s.test_files = [
|
42
|
+
"test/helper.rb",
|
43
|
+
"test/test_tf-idf_csv.rb"
|
44
|
+
]
|
45
|
+
|
46
|
+
if s.respond_to? :specification_version then
|
47
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
48
|
+
s.specification_version = 3
|
49
|
+
|
50
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
51
|
+
else
|
52
|
+
end
|
53
|
+
else
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
- 1
|
7
|
+
- 2
|
8
8
|
- 0
|
9
|
-
version: 0.1.0
|
9
|
+
version: 0.2.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Julian Burgess
|
@@ -16,20 +16,8 @@ cert_chain: []
|
|
16
16
|
|
17
17
|
date: 2010-11-02 00:00:00 -04:00
|
18
18
|
default_executable: tf-idf_csv
|
19
|
-
dependencies:
|
20
|
-
|
21
|
-
name: thoughtbot-shoulda
|
22
|
-
prerelease: false
|
23
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
-
none: false
|
25
|
-
requirements:
|
26
|
-
- - ">="
|
27
|
-
- !ruby/object:Gem::Version
|
28
|
-
segments:
|
29
|
-
- 0
|
30
|
-
version: "0"
|
31
|
-
type: :development
|
32
|
-
version_requirements: *id001
|
19
|
+
dependencies: []
|
20
|
+
|
33
21
|
description:
|
34
22
|
email: jburgess@ap.org
|
35
23
|
executables:
|
@@ -42,6 +30,7 @@ extra_rdoc_files:
|
|
42
30
|
files:
|
43
31
|
- .document
|
44
32
|
- .gitignore
|
33
|
+
- Gemfile
|
45
34
|
- LICENSE
|
46
35
|
- README.rdoc
|
47
36
|
- Rakefile
|
@@ -52,6 +41,7 @@ files:
|
|
52
41
|
- sample.csv
|
53
42
|
- test/helper.rb
|
54
43
|
- test/test_tf-idf_csv.rb
|
44
|
+
- tf-idf_csv.gemspec
|
55
45
|
has_rdoc: true
|
56
46
|
homepage: http://github.com/aubergene/tf-idf_csv
|
57
47
|
licenses: []
|