tf-idf_csv 0.1.0 → 0.2.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "http://rubygems.org"
2
+ gem "csv"
3
+
data/Rakefile CHANGED
@@ -9,7 +9,7 @@ begin
9
9
  gem.email = "jburgess@ap.org"
10
10
  gem.homepage = "http://github.com/aubergene/tf-idf_csv"
11
11
  gem.authors = ["Julian Burgess"]
12
- gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
12
+ # gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
13
13
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
14
14
  end
15
15
  Jeweler::GemcutterTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.2.0
data/bin/tf-idf_csv CHANGED
@@ -2,7 +2,21 @@
2
2
 
3
3
  $LOAD_PATH.unshift File.dirname(__FILE__) + '/../lib'
4
4
  require 'tf-idf_csv'
5
+ require 'csv'
6
+
7
+ begin
8
+ csv_file = ARGV[0]
9
+ csv = CSV.open(csv_file)
10
+ rescue
11
+ puts "Please specify a valid CSV file"
12
+ Process.exit(1)
13
+ end
14
+
15
+ tf_idf = Tf_Idf_CSV.new()
16
+ tf_idf.add_csv(csv)
17
+
18
+ output_csv_file = csv_file.sub(/\.csv$/,'-tf-idf.csv')
19
+ tf_idf.write(output_csv_file)
5
20
 
6
- tf = TfIdf_CSV.new(ARGV[0])
7
21
 
8
22
 
data/lib/tf-idf_csv.rb CHANGED
@@ -1,46 +1,53 @@
1
1
  require 'csv'
2
- require 'set'
3
2
 
4
3
  # This class expects a CSV input
5
4
  # One row per document,
6
5
  # the first cell should be a document identifier
7
6
  # each subsequent cell contains one term.
8
7
  # TF-IDF will be returned based on the number of times the term appears in each document, relative to the total number of documents it appears in
8
+ class Tf_Idf_CSV
9
9
 
10
- class TfIdf_CSV
11
-
12
- DECIMAL_PLACES = 20 # number of decimal places to use in output
13
-
14
- # n the n-grams of the data http://en.wikipedia.org/wiki/N-gram
15
- def initialize(csv_file)
16
- @output_csv_file = csv_file.sub(/\.csv$/,'-tf-idf.csv')
17
-
10
+ def initialize
18
11
  @tf_idf = {}
19
12
  @total_number_of_docs = 0
20
13
  @doc_count_per_term = Hash.new(0)
21
- @term_freq_per_doc = Hash.new
22
-
23
- add_csv(csv_file)
24
- calculate_tf_idf
25
- puts "Finished calculations"
14
+ @term_freq_per_doc = Hash.new
15
+ end
26
16
 
27
- save_output
28
- puts "Finished saving to #{@output_csv_file}"
17
+ def add_csv(csv)
18
+ csv.each do |row|
19
+ name = row[0]
20
+ terms = row[1..-1]
21
+ add_document(name, terms)
22
+ end
23
+ calculate_tf_idf
29
24
  end
30
25
 
31
- def add_csv(csv_file)
32
- begin
33
- CSV.foreach(csv_file) do |row|
34
- add_doc(row[0], row[1..-1])
35
- @total_number_of_docs += 1
26
+ # Save the results as CSV
27
+ # Term, Doc1, Doc2, Doc3...
28
+ # Eggs, 0.04535,,0.02
29
+ def write(csv_file_name, options = {})
30
+ decimal_places = options[:decimal_places] || 20
31
+
32
+ CSV.open(csv_file_name,"w") do |f|
33
+ f << ["term", docs].flatten
34
+ @tf_idf.each do |term, values|
35
+ tmp_row = [term]
36
+ docs.each do |doc|
37
+ value = values[doc] ? ("%.#{decimal_places}f" % values[doc]) : nil
38
+ value = nil if value =~ /^0\.0+$/
39
+ tmp_row << value
40
+ end
41
+ f << tmp_row
36
42
  end
37
- rescue Exception
38
- puts "Error opening #{csv_file}. Please specify a valid CSV file"
39
- Process.exit(1 )
40
43
  end
41
44
  end
42
45
 
43
- def add_doc(doc, terms)
46
+ private
47
+
48
+ def add_document(doc, terms)
49
+ @total_number_of_docs += 1
50
+
44
51
  term_counts_doc = Hash.new(0.0)
45
52
 
46
53
  # Count the number of times each term appears in this document
@@ -56,7 +63,7 @@ class TfIdf_CSV
56
63
 
57
64
  @term_freq_per_doc[doc] = term_counts_doc
58
65
  end
59
-
66
+
60
67
  def docs
61
68
  @term_freq_per_doc.keys
62
69
  end
@@ -78,23 +85,5 @@ class TfIdf_CSV
78
85
  end
79
86
  end
80
87
 
81
- # Save the results as CSV
82
- # Term, Doc1, Doc2, Doc3...
83
- # Eggs, 0.04535,,0.02
84
- def save_output
85
- CSV.open(@output_csv_file,"w") do |f|
86
- f << ["term", docs].flatten
87
- @tf_idf.each do |term, values|
88
- tmp_row = [term]
89
- docs.each do |doc|
90
- value = values[doc] ? ("%.#{DECIMAL_PLACES}f" % values[doc]) : nil
91
- value = nil if value =~ /^0\.0+$/
92
- tmp_row << value
93
- end
94
- f << tmp_row
95
- end
96
- end
97
- end
98
-
99
88
  end
100
89
 
@@ -0,0 +1,56 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{tf-idf_csv}
8
+ s.version = "0.2.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Julian Burgess"]
12
+ s.date = %q{2010-11-02}
13
+ s.default_executable = %q{tf-idf_csv}
14
+ s.email = %q{jburgess@ap.org}
15
+ s.executables = ["tf-idf_csv"]
16
+ s.extra_rdoc_files = [
17
+ "LICENSE",
18
+ "README.rdoc"
19
+ ]
20
+ s.files = [
21
+ ".document",
22
+ ".gitignore",
23
+ "Gemfile",
24
+ "LICENSE",
25
+ "README.rdoc",
26
+ "Rakefile",
27
+ "VERSION",
28
+ "bin/tf-idf_csv",
29
+ "lib/tf-idf_csv.rb",
30
+ "sample-tf-idf.csv",
31
+ "sample.csv",
32
+ "test/helper.rb",
33
+ "test/test_tf-idf_csv.rb",
34
+ "tf-idf_csv.gemspec"
35
+ ]
36
+ s.homepage = %q{http://github.com/aubergene/tf-idf_csv}
37
+ s.rdoc_options = ["--charset=UTF-8"]
38
+ s.require_paths = ["lib"]
39
+ s.rubygems_version = %q{1.3.7}
40
+ s.summary = %q{Calculate the TF-IDF for terms in a CSV file}
41
+ s.test_files = [
42
+ "test/helper.rb",
43
+ "test/test_tf-idf_csv.rb"
44
+ ]
45
+
46
+ if s.respond_to? :specification_version then
47
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
48
+ s.specification_version = 3
49
+
50
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
51
+ else
52
+ end
53
+ else
54
+ end
55
+ end
56
+
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 1
7
+ - 2
8
8
  - 0
9
- version: 0.1.0
9
+ version: 0.2.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Julian Burgess
@@ -16,20 +16,8 @@ cert_chain: []
16
16
 
17
17
  date: 2010-11-02 00:00:00 -04:00
18
18
  default_executable: tf-idf_csv
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
21
- name: thoughtbot-shoulda
22
- prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
24
- none: false
25
- requirements:
26
- - - ">="
27
- - !ruby/object:Gem::Version
28
- segments:
29
- - 0
30
- version: "0"
31
- type: :development
32
- version_requirements: *id001
19
+ dependencies: []
20
+
33
21
  description:
34
22
  email: jburgess@ap.org
35
23
  executables:
@@ -42,6 +30,7 @@ extra_rdoc_files:
42
30
  files:
43
31
  - .document
44
32
  - .gitignore
33
+ - Gemfile
45
34
  - LICENSE
46
35
  - README.rdoc
47
36
  - Rakefile
@@ -52,6 +41,7 @@ files:
52
41
  - sample.csv
53
42
  - test/helper.rb
54
43
  - test/test_tf-idf_csv.rb
44
+ - tf-idf_csv.gemspec
55
45
  has_rdoc: true
56
46
  homepage: http://github.com/aubergene/tf-idf_csv
57
47
  licenses: []