tf_idf 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,10 +11,9 @@ http://en.wikipedia.org/wiki/Tf–idf
11
11
  require 'rubygems'
12
12
  require 'tf_idf'
13
13
 
14
- data = ['a a a a a a a a b b', 'a a']
14
+ data = [%w{a a a a a a a a b b}, %w{a a}]
15
15
 
16
- # 1 is the ngram setter => http://en.wikipedia.org/wiki/N-gram
17
- a = TfIdf.new(data, 1)
16
+ a = TfIdf.new(data)
18
17
 
19
18
  # To find the term frequencies
20
19
  a.tf
data/Rakefile CHANGED
@@ -11,7 +11,6 @@ begin
11
11
  gem.homepage = "http://github.com/reddavis/TF-IDF"
12
12
  gem.authors = ["reddavis"]
13
13
  gem.add_development_dependency "rspec", ">= 1.2.9"
14
- gem.add_dependency('n_gram', ">= 0.0.0")
15
14
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
16
15
  end
17
16
  Jeweler::GemcutterTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.1
1
+ 0.0.2
@@ -1,11 +1,8 @@
1
- require 'n_gram'
2
-
3
1
  class TfIdf
4
2
 
5
3
  # data is an array of documents, each an array of pre-tokenized terms (e.g. n-grams: http://en.wikipedia.org/wiki/N-gram)
6
- def initialize(data, n=1)
4
+ def initialize(data)
7
5
  @data = data
8
- @n = n
9
6
  end
10
7
 
11
8
  def tf
@@ -28,54 +25,62 @@ class TfIdf
28
25
 
29
26
  tf_idf
30
27
  end
28
+
29
+ private
31
30
 
32
31
  def total_documents
33
32
  @data.size.to_f
34
33
  end
35
34
 
36
- private
35
+ # Returns all terms, once
36
+ def terms
37
+ @data.flatten.uniq
38
+ end
37
39
 
38
40
  # IDF = log10(total_documents / number_of_documents_term_appears_in)
39
41
  # This calculates how important a term is.
40
42
  def calculate_inverse_document_frequency
41
- original_ngrams = n_gram.ngrams_of_all_data[@n].clone
43
+ results = {}
42
44
 
43
- original_ngrams.each_key do |term|
44
-
45
- # Calculate how many documents the term appears in
45
+ terms.each do |term|
46
46
  count = 0.0
47
- n_gram.ngrams_of_inputs.each do |document|
48
- count += 1 if document[@n].key?(term)
47
+
48
+ @data.each do |document|
49
+ count += 1 if document.include?(term)
49
50
  end
50
-
51
- original_ngrams[term] = Math.log10(total_documents / count)
51
+
52
+ results[term] = Math.log10(total_documents / count)
52
53
  end
53
54
 
54
- original_ngrams
55
+ results
55
56
  end
56
57
 
57
58
  # TF = number_of_times_term_appears_in_document / number_of_terms_in_document
58
59
  # Calculates the number of times a term appears in the document
59
60
  # It is then normalized (as some documents are longer than others)
60
61
  def calculate_term_frequencies
61
- original_ngrams = n_gram.ngrams_of_inputs.clone
62
-
63
- original_ngrams.each_with_index do |document, index|
62
+ results = []
63
+
64
+ @data.each do |document|
65
+ document_result = {}
66
+
67
+ document.each do |term|
68
+ if document_result.key?(term)
69
+ document_result[term] += 1.0
70
+ else
71
+ document_result[term] = 1.0
72
+ end
73
+ end
64
74
 
65
- # Calculate the total number of terms
66
- total_terms = 0.0
67
- document[@n].each_value {|v| total_terms += v}
68
-
69
- document[@n].each_pair do |key, value|
70
- original_ngrams[index][@n][key] = (value.to_f / total_terms)
75
+ # Normalize the count
76
+ document_result.each_key do |term|
77
+ document_result[term] /= document.size
71
78
  end
79
+
80
+ results << document_result
72
81
  end
73
82
 
74
- original_ngrams.map {|x| x.map {|y| y[1] }}.flatten
83
+ results
75
84
  end
76
-
77
- def n_gram
78
- @n_gram ||= NGram.new(@data, :n => @n)
79
- end
80
-
85
+
81
86
  end
@@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
3
3
  describe "TfIdf" do
4
4
  describe "Term Frequency" do
5
5
  before do
6
- @a = TfIdf.new(data, 1)
6
+ @a = TfIdf.new(data)
7
7
  end
8
8
 
9
9
  it "should return 0.2" do
@@ -13,16 +13,16 @@ describe "TfIdf" do
13
13
 
14
14
  describe "Inverse Document Frequency" do
15
15
  before do
16
- @a = TfIdf.new(data, 1)
16
+ @a = TfIdf.new(data)
17
17
  end
18
18
 
19
19
  it "should return 0.3010" do
20
20
  @a.idf['b'].to_s.should match(/0.30102999/)
21
21
  end
22
22
  end
23
-
23
+
24
24
  before do
25
- @a = TfIdf.new(data, 1)
25
+ @a = TfIdf.new(data)
26
26
  end
27
27
 
28
28
  it "should return 0.0602" do
@@ -32,6 +32,6 @@ describe "TfIdf" do
32
32
  private
33
33
 
34
34
  def data
35
- ['a a a a a a a a b b', 'a a']
35
+ [%w{a a a a a a a a b b}, %w{a a}]
36
36
  end
37
37
  end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{tf_idf}
8
- s.version = "0.0.1"
8
+ s.version = "0.0.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["reddavis"]
12
- s.date = %q{2009-12-21}
12
+ s.date = %q{2010-01-06}
13
13
  s.description = %q{A TF-IDF in ruby - http://en.wikipedia.org/wiki/Tf–idf}
14
14
  s.email = %q{reddavis@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -45,14 +45,11 @@ Gem::Specification.new do |s|
45
45
 
46
46
  if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
47
47
  s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
48
- s.add_runtime_dependency(%q<n_gram>, [">= 0.0.0"])
49
48
  else
50
49
  s.add_dependency(%q<rspec>, [">= 1.2.9"])
51
- s.add_dependency(%q<n_gram>, [">= 0.0.0"])
52
50
  end
53
51
  else
54
52
  s.add_dependency(%q<rspec>, [">= 1.2.9"])
55
- s.add_dependency(%q<n_gram>, [">= 0.0.0"])
56
53
  end
57
54
  end
58
55
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tf_idf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - reddavis
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-12-21 00:00:00 +00:00
12
+ date: 2010-01-06 00:00:00 +00:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -22,16 +22,6 @@ dependencies:
22
22
  - !ruby/object:Gem::Version
23
23
  version: 1.2.9
24
24
  version:
25
- - !ruby/object:Gem::Dependency
26
- name: n_gram
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: 0.0.0
34
- version:
35
25
  description: "A TF-IDF in ruby - http://en.wikipedia.org/wiki/Tf\xE2\x80\x93idf"
36
26
  email: reddavis@gmail.com
37
27
  executables: []