tf_idf 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -11,10 +11,9 @@ http://en.wikipedia.org/wiki/Tf–idf
11
11
  require 'rubygems'
12
12
  require 'tf_idf'
13
13
 
14
- data = ['a a a a a a a a b b', 'a a']
14
+ data = [%w{a a a a a a a a b b}, %w{a a}]
15
15
 
16
- # 1 is the ngram setter => http://en.wikipedia.org/wiki/N-gram
17
- a = TfIdf.new(data, 1)
16
+ a = TfIdf.new(data)
18
17
 
19
18
  # To find the term frequencies
20
19
  a.tf
data/Rakefile CHANGED
@@ -11,7 +11,6 @@ begin
11
11
  gem.homepage = "http://github.com/reddavis/TF-IDF"
12
12
  gem.authors = ["reddavis"]
13
13
  gem.add_development_dependency "rspec", ">= 1.2.9"
14
- gem.add_dependency('n_gram', ">= 0.0.0")
15
14
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
16
15
  end
17
16
  Jeweler::GemcutterTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.1
1
+ 0.0.2
@@ -1,11 +1,8 @@
1
- require 'n_gram'
2
-
3
1
  class TfIdf
4
2
 
5
3
  # n is the n-gram size used on the data; see http://en.wikipedia.org/wiki/N-gram
6
- def initialize(data, n=1)
4
+ def initialize(data)
7
5
  @data = data
8
- @n = n
9
6
  end
10
7
 
11
8
  def tf
@@ -28,54 +25,62 @@ class TfIdf
28
25
 
29
26
  tf_idf
30
27
  end
28
+
29
+ private
31
30
 
32
31
  def total_documents
33
32
  @data.size.to_f
34
33
  end
35
34
 
36
- private
35
+ # Returns all terms, once
36
+ def terms
37
+ @data.flatten.uniq
38
+ end
37
39
 
38
40
  # IDF = log10(total_documents / number_of_documents_term_appears_in)
39
41
  # This calculates how important a term is.
40
42
  def calculate_inverse_document_frequency
41
- original_ngrams = n_gram.ngrams_of_all_data[@n].clone
43
+ results = {}
42
44
 
43
- original_ngrams.each_key do |term|
44
-
45
- # Calculate how many documents the term appears in
45
+ terms.each do |term|
46
46
  count = 0.0
47
- n_gram.ngrams_of_inputs.each do |document|
48
- count += 1 if document[@n].key?(term)
47
+
48
+ @data.each do |document|
49
+ count += 1 if document.include?(term)
49
50
  end
50
-
51
- original_ngrams[term] = Math.log10(total_documents / count)
51
+
52
+ results[term] = Math.log10(total_documents / count)
52
53
  end
53
54
 
54
- original_ngrams
55
+ results
55
56
  end
56
57
 
57
58
  # TF = number_of_times_term_appears_in_document / number_of_terms_in_document
58
59
  # Calculates the number of times a term appears in the document
59
60
  # It is then normalized (as some documents are longer than others)
60
61
  def calculate_term_frequencies
61
- original_ngrams = n_gram.ngrams_of_inputs.clone
62
-
63
- original_ngrams.each_with_index do |document, index|
62
+ results = []
63
+
64
+ @data.each do |document|
65
+ document_result = {}
66
+
67
+ document.each do |term|
68
+ if document_result.key?(term)
69
+ document_result[term] += 1.0
70
+ else
71
+ document_result[term] = 1.0
72
+ end
73
+ end
64
74
 
65
- # Calculate the total number of terms
66
- total_terms = 0.0
67
- document[@n].each_value {|v| total_terms += v}
68
-
69
- document[@n].each_pair do |key, value|
70
- original_ngrams[index][@n][key] = (value.to_f / total_terms)
75
+ # Normalize the count
76
+ document_result.each_key do |term|
77
+ document_result[term] /= document.size
71
78
  end
79
+
80
+ results << document_result
72
81
  end
73
82
 
74
- original_ngrams.map {|x| x.map {|y| y[1] }}.flatten
83
+ results
75
84
  end
76
-
77
- def n_gram
78
- @n_gram ||= NGram.new(@data, :n => @n)
79
- end
80
-
85
+
81
86
  end
@@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
3
3
  describe "TfIdf" do
4
4
  describe "Term Frequency" do
5
5
  before do
6
- @a = TfIdf.new(data, 1)
6
+ @a = TfIdf.new(data)
7
7
  end
8
8
 
9
9
  it "should return 0.2" do
@@ -13,16 +13,16 @@ describe "TfIdf" do
13
13
 
14
14
  describe "Inverse Document Frequency" do
15
15
  before do
16
- @a = TfIdf.new(data, 1)
16
+ @a = TfIdf.new(data)
17
17
  end
18
18
 
19
19
  it "should return 0.3010" do
20
20
  @a.idf['b'].to_s.should match(/0.30102999/)
21
21
  end
22
22
  end
23
-
23
+
24
24
  before do
25
- @a = TfIdf.new(data, 1)
25
+ @a = TfIdf.new(data)
26
26
  end
27
27
 
28
28
  it "should return 0.0602" do
@@ -32,6 +32,6 @@ describe "TfIdf" do
32
32
  private
33
33
 
34
34
  def data
35
- ['a a a a a a a a b b', 'a a']
35
+ [%w{a a a a a a a a b b}, %w{a a}]
36
36
  end
37
37
  end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{tf_idf}
8
- s.version = "0.0.1"
8
+ s.version = "0.0.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["reddavis"]
12
- s.date = %q{2009-12-21}
12
+ s.date = %q{2010-01-06}
13
13
  s.description = %q{A TF-IDF in ruby - http://en.wikipedia.org/wiki/Tf–idf}
14
14
  s.email = %q{reddavis@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -45,14 +45,11 @@ Gem::Specification.new do |s|
45
45
 
46
46
  if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
47
47
  s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
48
- s.add_runtime_dependency(%q<n_gram>, [">= 0.0.0"])
49
48
  else
50
49
  s.add_dependency(%q<rspec>, [">= 1.2.9"])
51
- s.add_dependency(%q<n_gram>, [">= 0.0.0"])
52
50
  end
53
51
  else
54
52
  s.add_dependency(%q<rspec>, [">= 1.2.9"])
55
- s.add_dependency(%q<n_gram>, [">= 0.0.0"])
56
53
  end
57
54
  end
58
55
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tf_idf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - reddavis
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-12-21 00:00:00 +00:00
12
+ date: 2010-01-06 00:00:00 +00:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -22,16 +22,6 @@ dependencies:
22
22
  - !ruby/object:Gem::Version
23
23
  version: 1.2.9
24
24
  version:
25
- - !ruby/object:Gem::Dependency
26
- name: n_gram
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: 0.0.0
34
- version:
35
25
  description: "A TF-IDF in ruby - http://en.wikipedia.org/wiki/Tf\xE2\x80\x93idf"
36
26
  email: reddavis@gmail.com
37
27
  executables: []