tf_idf 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +2 -3
- data/Rakefile +0 -1
- data/VERSION +1 -1
- data/lib/tf_idf.rb +34 -29
- data/spec/tf_idf_spec.rb +5 -5
- data/tf_idf.gemspec +2 -5
- metadata +2 -12
data/README.rdoc
CHANGED
@@ -11,10 +11,9 @@ http://en.wikipedia.org/wiki/Tf–idf
|
|
11
11
|
require 'rubygems'
|
12
12
|
require 'tf_idf'
|
13
13
|
|
14
|
-
data = [
|
14
|
+
data = [%w{a a a a a a a a b b}, %w{a a}]
|
15
15
|
|
16
|
-
|
17
|
-
a = TfIdf.new(data, 1)
|
16
|
+
a = TfIdf.new(data)
|
18
17
|
|
19
18
|
# To find the term frequencies
|
20
19
|
a.tf
|
data/Rakefile
CHANGED
@@ -11,7 +11,6 @@ begin
|
|
11
11
|
gem.homepage = "http://github.com/reddavis/TF-IDF"
|
12
12
|
gem.authors = ["reddavis"]
|
13
13
|
gem.add_development_dependency "rspec", ">= 1.2.9"
|
14
|
-
gem.add_dependency('n_gram', ">= 0.0.0")
|
15
14
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
16
15
|
end
|
17
16
|
Jeweler::GemcutterTasks.new
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.1
|
1
|
+
0.0.2
|
data/lib/tf_idf.rb
CHANGED
@@ -1,11 +1,8 @@
|
|
1
|
-
require 'n_gram'
|
2
|
-
|
3
1
|
class TfIdf
|
4
2
|
|
5
3
|
# n the n-grams of the data http://en.wikipedia.org/wiki/N-gram
|
6
|
-
def initialize(data, n)
|
4
|
+
def initialize(data)
|
7
5
|
@data = data
|
8
|
-
@n = n
|
9
6
|
end
|
10
7
|
|
11
8
|
def tf
|
@@ -28,54 +25,62 @@ class TfIdf
|
|
28
25
|
|
29
26
|
tf_idf
|
30
27
|
end
|
28
|
+
|
29
|
+
private
|
31
30
|
|
32
31
|
def total_documents
|
33
32
|
@data.size.to_f
|
34
33
|
end
|
35
34
|
|
36
|
-
|
35
|
+
# Returns all terms, once
|
36
|
+
def terms
|
37
|
+
@data.flatten.uniq
|
38
|
+
end
|
37
39
|
|
38
40
|
# IDF = total_documents / number_of_document_term_appears_in
|
39
41
|
# This calculates how important a term is.
|
40
42
|
def calculate_inverse_document_frequency
|
41
|
-
|
43
|
+
results = {}
|
42
44
|
|
43
|
-
|
44
|
-
|
45
|
-
# Calculate how many documents the term appears in
|
45
|
+
terms.each do |term|
|
46
46
|
count = 0.0
|
47
|
-
|
48
|
-
|
47
|
+
|
48
|
+
@data.each do |document|
|
49
|
+
count += 1 if document.include?(term)
|
49
50
|
end
|
50
|
-
|
51
|
-
|
51
|
+
|
52
|
+
results[term] = Math.log10(total_documents / count)
|
52
53
|
end
|
53
54
|
|
54
|
-
|
55
|
+
results
|
55
56
|
end
|
56
57
|
|
57
58
|
# TF = number_of_n_term_in_document / number_of_terms_in_document
|
58
59
|
# Calculates the number of times a term appears in the document
|
59
60
|
# It is then normalized (as some documents are longer than others)
|
60
61
|
def calculate_term_frequencies
|
61
|
-
|
62
|
-
|
63
|
-
|
62
|
+
results = []
|
63
|
+
|
64
|
+
@data.each do |document|
|
65
|
+
document_result = {}
|
66
|
+
|
67
|
+
document.each do |term|
|
68
|
+
if document_result.key?(term)
|
69
|
+
document_result[term] += 1.0
|
70
|
+
else
|
71
|
+
document_result[term] = 1.0
|
72
|
+
end
|
73
|
+
end
|
64
74
|
|
65
|
-
#
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
document[@n].each_pair do |key, value|
|
70
|
-
original_ngrams[index][@n][key] = (value.to_f / total_terms)
|
75
|
+
# Normalize the count
|
76
|
+
document_result.each_key do |term|
|
77
|
+
document_result[term] /= document.size
|
71
78
|
end
|
79
|
+
|
80
|
+
results << document_result
|
72
81
|
end
|
73
82
|
|
74
|
-
|
83
|
+
results
|
75
84
|
end
|
76
|
-
|
77
|
-
def n_gram
|
78
|
-
@n_gram ||= NGram.new(@data, :n => @n)
|
79
|
-
end
|
80
|
-
|
85
|
+
|
81
86
|
end
|
data/spec/tf_idf_spec.rb
CHANGED
@@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
|
3
3
|
describe "TfIdf" do
|
4
4
|
describe "Term Frequency" do
|
5
5
|
before do
|
6
|
-
@a = TfIdf.new(data
|
6
|
+
@a = TfIdf.new(data)
|
7
7
|
end
|
8
8
|
|
9
9
|
it "should return 0.2" do
|
@@ -13,16 +13,16 @@ describe "TfIdf" do
|
|
13
13
|
|
14
14
|
describe "Inverse Document Frequency" do
|
15
15
|
before do
|
16
|
-
@a = TfIdf.new(data
|
16
|
+
@a = TfIdf.new(data)
|
17
17
|
end
|
18
18
|
|
19
19
|
it "should return 0.3010" do
|
20
20
|
@a.idf['b'].to_s.should match(/0.30102999/)
|
21
21
|
end
|
22
22
|
end
|
23
|
-
|
23
|
+
|
24
24
|
before do
|
25
|
-
@a = TfIdf.new(data
|
25
|
+
@a = TfIdf.new(data)
|
26
26
|
end
|
27
27
|
|
28
28
|
it "should return 0.0602" do
|
@@ -32,6 +32,6 @@ describe "TfIdf" do
|
|
32
32
|
private
|
33
33
|
|
34
34
|
def data
|
35
|
-
[
|
35
|
+
[%w{a a a a a a a a b b}, %w{a a}]
|
36
36
|
end
|
37
37
|
end
|
data/tf_idf.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{tf_idf}
|
8
|
-
s.version = "0.0.1"
|
8
|
+
s.version = "0.0.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["reddavis"]
|
12
|
-
s.date = %q{
|
12
|
+
s.date = %q{2010-01-06}
|
13
13
|
s.description = %q{A TF-IDF in ruby - http://en.wikipedia.org/wiki/Tf–idf}
|
14
14
|
s.email = %q{reddavis@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -45,14 +45,11 @@ Gem::Specification.new do |s|
|
|
45
45
|
|
46
46
|
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
47
47
|
s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
|
48
|
-
s.add_runtime_dependency(%q<n_gram>, [">= 0.0.0"])
|
49
48
|
else
|
50
49
|
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
51
|
-
s.add_dependency(%q<n_gram>, [">= 0.0.0"])
|
52
50
|
end
|
53
51
|
else
|
54
52
|
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
55
|
-
s.add_dependency(%q<n_gram>, [">= 0.0.0"])
|
56
53
|
end
|
57
54
|
end
|
58
55
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tf_idf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- reddavis
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-01-06 00:00:00 +00:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -22,16 +22,6 @@ dependencies:
|
|
22
22
|
- !ruby/object:Gem::Version
|
23
23
|
version: 1.2.9
|
24
24
|
version:
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: n_gram
|
27
|
-
type: :runtime
|
28
|
-
version_requirement:
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: 0.0.0
|
34
|
-
version:
|
35
25
|
description: "A TF-IDF in ruby - http://en.wikipedia.org/wiki/Tf\xE2\x80\x93idf"
|
36
26
|
email: reddavis@gmail.com
|
37
27
|
executables: []
|