tf_idf 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +2 -3
- data/Rakefile +0 -1
- data/VERSION +1 -1
- data/lib/tf_idf.rb +34 -29
- data/spec/tf_idf_spec.rb +5 -5
- data/tf_idf.gemspec +2 -5
- metadata +2 -12
data/README.rdoc
CHANGED
@@ -11,10 +11,9 @@ http://en.wikipedia.org/wiki/Tf–idf
|
|
11
11
|
require 'rubygems'
|
12
12
|
require 'tf_idf'
|
13
13
|
|
14
|
-
data = [
|
14
|
+
data = [%w{a a a a a a a a b b}, %w{a a}]
|
15
15
|
|
16
|
-
|
17
|
-
a = TfIdf.new(data, 1)
|
16
|
+
a = TfIdf.new(data)
|
18
17
|
|
19
18
|
# To find the term frequencies
|
20
19
|
a.tf
|
data/Rakefile
CHANGED
@@ -11,7 +11,6 @@ begin
|
|
11
11
|
gem.homepage = "http://github.com/reddavis/TF-IDF"
|
12
12
|
gem.authors = ["reddavis"]
|
13
13
|
gem.add_development_dependency "rspec", ">= 1.2.9"
|
14
|
-
gem.add_dependency('n_gram', ">= 0.0.0")
|
15
14
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
16
15
|
end
|
17
16
|
Jeweler::GemcutterTasks.new
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.1
|
1
|
+
0.0.2
|
data/lib/tf_idf.rb
CHANGED
@@ -1,11 +1,8 @@
|
|
1
|
-
require 'n_gram'
|
2
|
-
|
3
1
|
class TfIdf
|
4
2
|
|
5
3
|
# n the n-grams of the data http://en.wikipedia.org/wiki/N-gram
|
6
|
-
def initialize(data, n)
|
4
|
+
def initialize(data)
|
7
5
|
@data = data
|
8
|
-
@n = n
|
9
6
|
end
|
10
7
|
|
11
8
|
def tf
|
@@ -28,54 +25,62 @@ class TfIdf
|
|
28
25
|
|
29
26
|
tf_idf
|
30
27
|
end
|
28
|
+
|
29
|
+
private
|
31
30
|
|
32
31
|
def total_documents
|
33
32
|
@data.size.to_f
|
34
33
|
end
|
35
34
|
|
36
|
-
|
35
|
+
# Returns all terms, once
|
36
|
+
def terms
|
37
|
+
@data.flatten.uniq
|
38
|
+
end
|
37
39
|
|
38
40
|
# IDF = total_documents / number_of_document_term_appears_in
|
39
41
|
# This calculates how important a term is.
|
40
42
|
def calculate_inverse_document_frequency
|
41
|
-
|
43
|
+
results = {}
|
42
44
|
|
43
|
-
|
44
|
-
|
45
|
-
# Calculate how many documents the term appears in
|
45
|
+
terms.each do |term|
|
46
46
|
count = 0.0
|
47
|
-
|
48
|
-
|
47
|
+
|
48
|
+
@data.each do |document|
|
49
|
+
count += 1 if document.include?(term)
|
49
50
|
end
|
50
|
-
|
51
|
-
|
51
|
+
|
52
|
+
results[term] = Math.log10(total_documents / count)
|
52
53
|
end
|
53
54
|
|
54
|
-
|
55
|
+
results
|
55
56
|
end
|
56
57
|
|
57
58
|
# TF = number_of_n_term_in_document / number_of_terms_in_document
|
58
59
|
# Calculates the number of times a term appears in the document
|
59
60
|
# It is then normalized (as some documents are longer than others)
|
60
61
|
def calculate_term_frequencies
|
61
|
-
|
62
|
-
|
63
|
-
|
62
|
+
results = []
|
63
|
+
|
64
|
+
@data.each do |document|
|
65
|
+
document_result = {}
|
66
|
+
|
67
|
+
document.each do |term|
|
68
|
+
if document_result.key?(term)
|
69
|
+
document_result[term] += 1.0
|
70
|
+
else
|
71
|
+
document_result[term] = 1.0
|
72
|
+
end
|
73
|
+
end
|
64
74
|
|
65
|
-
#
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
document[@n].each_pair do |key, value|
|
70
|
-
original_ngrams[index][@n][key] = (value.to_f / total_terms)
|
75
|
+
# Normalize the count
|
76
|
+
document_result.each_key do |term|
|
77
|
+
document_result[term] /= document.size
|
71
78
|
end
|
79
|
+
|
80
|
+
results << document_result
|
72
81
|
end
|
73
82
|
|
74
|
-
|
83
|
+
results
|
75
84
|
end
|
76
|
-
|
77
|
-
def n_gram
|
78
|
-
@n_gram ||= NGram.new(@data, :n => @n)
|
79
|
-
end
|
80
|
-
|
85
|
+
|
81
86
|
end
|
data/spec/tf_idf_spec.rb
CHANGED
@@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
|
3
3
|
describe "TfIdf" do
|
4
4
|
describe "Term Frequency" do
|
5
5
|
before do
|
6
|
-
@a = TfIdf.new(data, 1)
|
6
|
+
@a = TfIdf.new(data)
|
7
7
|
end
|
8
8
|
|
9
9
|
it "should return 0.2" do
|
@@ -13,16 +13,16 @@ describe "TfIdf" do
|
|
13
13
|
|
14
14
|
describe "Inverse Document Frequency" do
|
15
15
|
before do
|
16
|
-
@a = TfIdf.new(data, 1)
|
16
|
+
@a = TfIdf.new(data)
|
17
17
|
end
|
18
18
|
|
19
19
|
it "should return 0.3010" do
|
20
20
|
@a.idf['b'].to_s.should match(/0.30102999/)
|
21
21
|
end
|
22
22
|
end
|
23
|
-
|
23
|
+
|
24
24
|
before do
|
25
|
-
@a = TfIdf.new(data, 1)
|
25
|
+
@a = TfIdf.new(data)
|
26
26
|
end
|
27
27
|
|
28
28
|
it "should return 0.0602" do
|
@@ -32,6 +32,6 @@ describe "TfIdf" do
|
|
32
32
|
private
|
33
33
|
|
34
34
|
def data
|
35
|
-
[
|
35
|
+
[%w{a a a a a a a a b b}, %w{a a}]
|
36
36
|
end
|
37
37
|
end
|
data/tf_idf.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{tf_idf}
|
8
|
-
s.version = "0.0.1"
|
8
|
+
s.version = "0.0.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["reddavis"]
|
12
|
-
s.date = %q{
|
12
|
+
s.date = %q{2010-01-06}
|
13
13
|
s.description = %q{A TF-IDF in ruby - http://en.wikipedia.org/wiki/Tf–idf}
|
14
14
|
s.email = %q{reddavis@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -45,14 +45,11 @@ Gem::Specification.new do |s|
|
|
45
45
|
|
46
46
|
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
47
47
|
s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
|
48
|
-
s.add_runtime_dependency(%q<n_gram>, [">= 0.0.0"])
|
49
48
|
else
|
50
49
|
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
51
|
-
s.add_dependency(%q<n_gram>, [">= 0.0.0"])
|
52
50
|
end
|
53
51
|
else
|
54
52
|
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
55
|
-
s.add_dependency(%q<n_gram>, [">= 0.0.0"])
|
56
53
|
end
|
57
54
|
end
|
58
55
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tf_idf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- reddavis
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-01-06 00:00:00 +00:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -22,16 +22,6 @@ dependencies:
|
|
22
22
|
- !ruby/object:Gem::Version
|
23
23
|
version: 1.2.9
|
24
24
|
version:
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: n_gram
|
27
|
-
type: :runtime
|
28
|
-
version_requirement:
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: 0.0.0
|
34
|
-
version:
|
35
25
|
description: "A TF-IDF in ruby - http://en.wikipedia.org/wiki/Tf\xE2\x80\x93idf"
|
36
26
|
email: reddavis@gmail.com
|
37
27
|
executables: []
|