tf_idf 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 reddavis
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,33 @@
1
+ = Tf-Idf
2
+
3
+ http://en.wikipedia.org/wiki/Tf–idf
4
+
5
+ == Install
6
+
7
+ gem sources -a http://gems.github.com
8
+ sudo gem install tf_idf
9
+
10
+ == How To Use
11
+ require 'rubygems'
12
+ require 'tf_idf'
13
+
14
+ data = ['a a a a a a a a b b', 'a a']
15
+
16
+ # The second argument is the n-gram size (1 = single terms) => http://en.wikipedia.org/wiki/N-gram
17
+ a = TfIdf.new(data, 1)
18
+
19
+ # To find the term frequencies
20
+ a.tf
21
+ #=> [{'b' => 0.2, 'a' => 0.8}, {'a' => 1.0}]
22
+
23
+ # To find the inverse document frequency
24
+ a.idf
25
+ #=> {'b' => 0.301... etc...}
26
+
27
+ # And to find the tf-idf
28
+ a.tf_idf
29
+ #=> [{'b' => 0.0602, 'a' => etc...}, {etc...}]
30
+
31
+ == Copyright
32
+
33
+ Copyright (c) 2009 Red Davis. See LICENSE for details.
@@ -0,0 +1,46 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
# Gem packaging tasks. Jeweler is optional: when it (or one of its
# dependencies) cannot be loaded we print a hint and carry on, so the
# remaining tasks stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "tf_idf"
    gem.summary = %Q{A TF-IDF in ruby - http://en.wikipedia.org/wiki/Tf–idf}
    gem.description = %Q{A TF-IDF in ruby - http://en.wikipedia.org/wiki/Tf–idf}
    gem.email = "reddavis@gmail.com"
    gem.homepage = "http://github.com/reddavis/TF-IDF"
    gem.authors = ["reddavis"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    gem.add_dependency('n_gram', ">= 0.0.0")
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new

  # check_dependencies is not defined anywhere in this Rakefile; it is
  # presumably provided by Jeweler::Tasks (TODO confirm). Declaring the
  # prerequisite only when Jeweler loaded keeps `rake spec` (and the default
  # task) runnable without Jeweler instead of aborting on an unknown task.
  task :spec => :check_dependencies
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Runs every spec under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# Same specs, run with rcov enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

task :default => :spec

# Generates RDoc for the README and everything under lib/ into ./rdoc.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip surrounding whitespace so a trailing newline in VERSION does not
  # end up inside the documentation title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "tf_idf #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
@@ -0,0 +1,80 @@
1
+ require 'n_gram'
2
+
3
# Computes TF-IDF (term frequency - inverse document frequency) scores for a
# collection of documents. Term counting is delegated to NGram (n_gram gem).
#
#   scorer = TfIdf.new(['a a a a a a a a b b', 'a a'], 1)
#   scorer.tf      #=> [{'a' => 0.8, 'b' => 0.2}, {'a' => 1.0}]
#   scorer.idf     #=> {'a' => 0.0, 'b' => 0.301...}
#   scorer.tf_idf  #=> [{'a' => 0.0, 'b' => 0.0602...}, {'a' => 0.0}]
#
# None of the public methods modify the memoized results or the counts held
# by the underlying NGram object, so they can be called repeatedly and in any
# order.
class TfIdf

  # data - the documents to score, handed to NGram unchanged
  # n    - the n-gram size to score (http://en.wikipedia.org/wiki/N-gram);
  #        used both as NGram's :n option and as the key into its results
  def initialize(data, n=1)
    @data = data
    @n = n
  end

  # Term frequencies: one Hash of term => Float per document, in input order.
  # Memoized.
  def tf
    @tf ||= calculate_term_frequencies
  end

  # Inverse document frequencies: a Hash of term => Float covering every term
  # NGram reports for the whole data set. Memoized.
  def idf
    @idf ||= calculate_inverse_document_frequency
  end

  # TF-IDF scores: for every document a Hash of term => (tf * idf).
  #
  # A fresh Array of fresh Hashes is built on every call. Writing into a
  # shallow copy of #tf would overwrite the memoized term frequencies, so a
  # later #tf would return tf-idf values and a second #tf_idf call would
  # multiply by idf twice.
  def tf_idf
    tf.map do |document|
      scores = {}
      document.each_pair { |term, tf_score| scores[term] = tf_score * idf[term] }
      scores
    end
  end

  # Number of documents, as a Float so it can be used directly in divisions.
  def total_documents
    @data.size.to_f
  end

  private

  # IDF = log10(total_documents / number_of_documents_the_term_appears_in)
  # This calculates how important a term is: terms found in every document
  # score 0, rarer terms score higher.
  def calculate_inverse_document_frequency
    documents = n_gram.ngrams_of_inputs
    scores = {}

    n_gram.ngrams_of_all_data[@n].each_key do |term|
      # Count how many documents the term appears in (kept as a Float so the
      # division below is a floating point division).
      count = documents.inject(0.0) do |sum, document|
        document[@n].key?(term) ? sum + 1 : sum
      end

      scores[term] = Math.log10(total_documents / count)
    end

    scores
  end

  # TF = number_of_times_term_appears_in_document / number_of_terms_in_document
  # The raw count is normalized because some documents are longer than others.
  #
  # Each document's counts are looked up with document[@n] and the results go
  # into new Hashes, so this works for any n (not only n == 1) and leaves
  # NGram's raw counts untouched for #idf.
  def calculate_term_frequencies
    n_gram.ngrams_of_inputs.map do |document|
      # Guard in case NGram has no entry for this n in a document.
      counts = document[@n] || {}
      total_terms = counts.values.inject(0.0) { |sum, count| sum + count }

      frequencies = {}
      counts.each_pair { |term, count| frequencies[term] = count.to_f / total_terms }
      frequencies
    end
  end

  # Lazily built NGram instance shared by the tf and idf calculations.
  def n_gram
    @n_gram ||= NGram.new(@data, :n => @n)
  end
end
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,10 @@
1
# Put the spec directory and the gem's lib directory on the load path.
# lib is unshifted last, so it ends up first and takes precedence.
spec_dir = File.dirname(__FILE__)
lib_dir = File.join(spec_dir, '..', 'lib')
$LOAD_PATH.unshift(spec_dir)
$LOAD_PATH.unshift(lib_dir)

require 'rubygems'
require 'tf_idf'
require 'spec'
require 'spec/autorun'

# No runner customisation is needed yet; the empty block is kept as the hook
# for future configuration.
Spec::Runner.configure { |config| }
@@ -0,0 +1,37 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
describe "TfIdf" do
  # One hook for the whole group: it also runs for the nested groups, so they
  # do not need their own copies.
  before do
    @a = TfIdf.new(data, 1)
  end

  describe "Term Frequency" do
    # 'b' occurs 2 times among the 10 terms of the first document.
    it "should return 0.2" do
      @a.tf[0]['b'].should == 0.2
    end
  end

  describe "Inverse Document Frequency" do
    # 'b' occurs in 1 of the 2 documents: log10(2 / 1) = 0.30102999...
    it "should return 0.3010" do
      # The dot is escaped so it only matches a literal decimal point.
      @a.idf['b'].to_s.should match(/0\.30102999/)
    end
  end

  # tf * idf for 'b' in the first document: 0.2 * 0.30102999... = 0.0602...
  it "should return 0.0602" do
    @a.tf_idf[0]['b'].to_s.should match(/0\.0602/)
  end

  private

  # Fixture documents shared by every example above.
  def data
    ['a a a a a a a a b b', 'a a']
  end
end
@@ -0,0 +1,58 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{tf_idf}
8
+ s.version = "0.0.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["reddavis"]
12
+ s.date = %q{2009-12-18}
13
+ s.description = %q{A TF-IDF in ruby - http://en.wikipedia.org/wiki/Tf–idf}
14
+ s.email = %q{reddavis@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "lib/tf_idf.rb",
27
+ "spec/spec.opts",
28
+ "spec/spec_helper.rb",
29
+ "spec/tf_idf_spec.rb",
30
+ "tf_idf.gemspec"
31
+ ]
32
+ s.homepage = %q{http://github.com/reddavis/TF-IDF}
33
+ s.rdoc_options = ["--charset=UTF-8"]
34
+ s.require_paths = ["lib"]
35
+ s.rubygems_version = %q{1.3.5}
36
+ s.summary = %q{A TF-IDF in ruby - http://en.wikipedia.org/wiki/Tf–idf}
37
+ s.test_files = [
38
+ "spec/spec_helper.rb",
39
+ "spec/tf_idf_spec.rb"
40
+ ]
41
+
42
+ if s.respond_to? :specification_version then
43
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
44
+ s.specification_version = 3
45
+
46
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
47
+ s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
48
+ s.add_runtime_dependency(%q<n_gram>, [">= 0.0.0"])
49
+ else
50
+ s.add_dependency(%q<rspec>, [">= 1.2.9"])
51
+ s.add_dependency(%q<n_gram>, [">= 0.0.0"])
52
+ end
53
+ else
54
+ s.add_dependency(%q<rspec>, [">= 1.2.9"])
55
+ s.add_dependency(%q<n_gram>, [">= 0.0.0"])
56
+ end
57
+ end
58
+
metadata ADDED
@@ -0,0 +1,86 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tf_idf
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - reddavis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-12-18 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rspec
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.2.9
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: n_gram
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.0.0
34
+ version:
35
+ description: "A TF-IDF in ruby - http://en.wikipedia.org/wiki/Tf\xE2\x80\x93idf"
36
+ email: reddavis@gmail.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files:
42
+ - LICENSE
43
+ - README.rdoc
44
+ files:
45
+ - .document
46
+ - .gitignore
47
+ - LICENSE
48
+ - README.rdoc
49
+ - Rakefile
50
+ - VERSION
51
+ - lib/tf_idf.rb
52
+ - spec/spec.opts
53
+ - spec/spec_helper.rb
54
+ - spec/tf_idf_spec.rb
55
+ - tf_idf.gemspec
56
+ has_rdoc: true
57
+ homepage: http://github.com/reddavis/TF-IDF
58
+ licenses: []
59
+
60
+ post_install_message:
61
+ rdoc_options:
62
+ - --charset=UTF-8
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: "0"
70
+ version:
71
+ required_rubygems_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: "0"
76
+ version:
77
+ requirements: []
78
+
79
+ rubyforge_project:
80
+ rubygems_version: 1.3.5
81
+ signing_key:
82
+ specification_version: 3
83
+ summary: "A TF-IDF in ruby - http://en.wikipedia.org/wiki/Tf\xE2\x80\x93idf"
84
+ test_files:
85
+ - spec/spec_helper.rb
86
+ - spec/tf_idf_spec.rb