feature_selection 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 reddavis
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,38 @@
1
+ = Feature Selection
2
+
3
+ Feature Selection is a library of feature selection algorithms.
4
+
5
+ http://en.wikipedia.org/wiki/Feature_selection
6
+
7
+ == Install
8
+
9
+ gem sources -a http://gems.github.com
10
+ sudo gem install feature_selection
11
+
12
+ == How To Use
13
+
14
+ There are currently 3 implemented feature selection algorithms: Chi-Squared, Mutual Information and Frequency Based.
15
+
16
+ Each returns a hash that looks similar to:
17
+
18
+ {klass => {term => score, term => score}, klass => {term => score}}
19
+
20
+ Example:
21
+
22
+ data = {
23
+ :spam => [['this', 'is', 'some', 'information'], ['this', 'is', 'something', 'that', 'is', 'information']],
24
+ :ham => [['this', 'test', 'some', 'more', 'information'], ['there', 'are', 'some', 'things']],
25
+ }
26
+
27
+ a = FeatureSelection::ChiSquared.new(data)
28
+
29
+ # You can also use...
30
+ # FeatureSelection::MutualInformation
31
+ # FeatureSelection::FrequencyBased
32
+
33
+ a.rank_features
34
+ #=> {:spam => {term => score, term => score}, :ham => {term => score}}
35
+
36
+ == Copyright
37
+
38
+ Copyright (c) 2009 reddavis. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,45 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ # Gem packaging tasks (Jeweler). When Jeweler cannot be loaded, a hint is printed and these tasks are skipped.
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "feature_selection"
8
+ gem.summary = %Q{A library of feature selection algorithms}
9
+ gem.description = %Q{A library of feature selection algorithms}
10
+ gem.email = "reddavis@gmail.com"
11
+ gem.homepage = "http://github.com/reddavis/Feature-Selection"
12
+ gem.authors = ["reddavis"]
13
+ gem.add_development_dependency "rspec", ">= 1.2.9"
14
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
15
+ end
16
+ Jeweler::GemcutterTasks.new
17
+ rescue LoadError
18
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
19
+ end
20
+ # RSpec tasks: :spec runs spec/**/*_spec.rb; :rcov uses the same file pattern with rcov switched on.
21
+ require 'spec/rake/spectask'
22
+ Spec::Rake::SpecTask.new(:spec) do |spec|
23
+ spec.libs << 'lib' << 'spec'
24
+ spec.spec_files = FileList['spec/**/*_spec.rb']
25
+ end
26
+
27
+ Spec::Rake::SpecTask.new(:rcov) do |spec|
28
+ spec.libs << 'lib' << 'spec'
29
+ spec.pattern = 'spec/**/*_spec.rb'
30
+ spec.rcov = true
31
+ end
32
+ # NOTE(review): :check_dependencies is not defined in this file; presumably Jeweler provides it, so :spec may fail when Jeweler is missing - confirm.
33
+ task :spec => :check_dependencies
34
+
35
+ task :default => :spec
36
+ # RDoc task: the title includes the contents of the VERSION file when that file exists, otherwise an empty string.
37
+ require 'rake/rdoctask'
38
+ Rake::RDocTask.new do |rdoc|
39
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
40
+
41
+ rdoc.rdoc_dir = 'rdoc'
42
+ rdoc.title = "feature_selection #{version}"
43
+ rdoc.rdoc_files.include('README*')
44
+ rdoc.rdoc_files.include('lib/**/*.rb')
45
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
@@ -0,0 +1,67 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{feature_selection}
8
+ s.version = "0.0.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["reddavis"]
12
+ s.date = %q{2010-01-05}
13
+ s.description = %q{A library of feature selection algorithms}
14
+ s.email = %q{reddavis@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "feature_selection.gemspec",
27
+ "lib/feature_selection.rb",
28
+ "lib/feature_selection/base.rb",
29
+ "lib/feature_selection/chi_squared.rb",
30
+ "lib/feature_selection/frequency_based.rb",
31
+ "lib/feature_selection/mutual_information.rb",
32
+ "spec/feature_selection/base_spec.rb",
33
+ "spec/feature_selection/chi_squared_spec.rb",
34
+ "spec/feature_selection/frequency_based_spec.rb",
35
+ "spec/feature_selection/mutual_information_spec.rb",
36
+ "spec/feature_selection_spec.rb",
37
+ "spec/spec.opts",
38
+ "spec/spec_helper.rb"
39
+ ]
40
+ s.homepage = %q{http://github.com/reddavis/Feature-Selection}
41
+ s.rdoc_options = ["--charset=UTF-8"]
42
+ s.require_paths = ["lib"]
43
+ s.rubygems_version = %q{1.3.5}
44
+ s.summary = %q{A library of feature selection algorithms}
45
+ s.test_files = [
46
+ "spec/feature_selection/base_spec.rb",
47
+ "spec/feature_selection/chi_squared_spec.rb",
48
+ "spec/feature_selection/frequency_based_spec.rb",
49
+ "spec/feature_selection/mutual_information_spec.rb",
50
+ "spec/feature_selection_spec.rb",
51
+ "spec/spec_helper.rb"
52
+ ]
53
+
54
+ if s.respond_to? :specification_version then
55
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
56
+ s.specification_version = 3
57
+
58
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
59
+ s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
60
+ else
61
+ s.add_dependency(%q<rspec>, [">= 1.2.9"])
62
+ end
63
+ else
64
+ s.add_dependency(%q<rspec>, [">= 1.2.9"])
65
+ end
66
+ end
67
+
@@ -0,0 +1,4 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/feature_selection/base')
2
+ require File.expand_path(File.dirname(__FILE__) + '/feature_selection/mutual_information')
3
+ require File.expand_path(File.dirname(__FILE__) + '/feature_selection/chi_squared')
4
+ require File.expand_path(File.dirname(__FILE__) + '/feature_selection/frequency_based')
@@ -0,0 +1,84 @@
1
module FeatureSelection
  # Document-counting primitives shared by the feature selection algorithms.
  #
  # The data handed to the constructor is a Hash that maps each class to an
  # Array of documents, where every document is an Array of terms, e.g.
  #
  #   { :spam => [['buy', 'now']], :ham => [['hello', 'there']] }
  #
  # The n_t_c helpers return Floats so that callers can divide the counts
  # without falling into integer division.
  class Base

    # data - Hash of class => Array of documents (each an Array of terms).
    def initialize(data)
      @data = data
    end

    # Returns an Array with every class key found in the data (memoized).
    def classes
      @classes ||= find_all_classes
    end

    private

    # Contains term and belongs to class
    def n_1_1(term, klass)
      count_documents_with(documents_in(klass), term, true)
    end

    # Contains term but does not belong to class
    def n_1_0(term, klass)
      count_documents_with(documents_outside(klass), term, true)
    end

    # Does not contain term but belongs to class
    def n_0_1(term, klass)
      count_documents_with(documents_in(klass), term, false)
    end

    # Does not contain term and does not belong to class
    def n_0_0(term, klass)
      count_documents_with(documents_outside(klass), term, false)
    end

    # Total number of documents over all classes (Integer).
    def count_documents
      @data.values.inject(0) { |total, documents| total + documents.size }
    end

    def find_all_classes
      @data.keys
    end

    # Every term of every document of every class, flattened into one Array.
    # Duplicates are kept on purpose: callers that need distinct terms
    # should call +uniq+ on the result.
    def terms
      @data.values.flatten
    end

    # Documents filed under klass; an unknown class simply has no documents.
    def documents_in(klass)
      @data[klass] || []
    end

    # Documents filed under every class except klass.
    def documents_outside(klass)
      @data.inject([]) do |others, (key, documents)|
        key == klass ? others : others + documents
      end
    end

    # Counts the documents that contain term (present == true) or that do
    # not contain it (present == false). Always returns a Float.
    def count_documents_with(documents, term, present)
      containing = documents.select { |document| document.include?(term) }.size
      (present ? containing : documents.size - containing).to_f
    end
  end
end
@@ -0,0 +1,37 @@
1
# Chi-Squared takes two events (the class and the term) and calculates
# whether the two events are dependent. If they are, then the occurrence
# of the term makes the occurrence of the class more or less likely.
#
# Equation - for every class and term
#
#           n * (n(1,1) * n(0,0) - n(1,0) * n(0,1))^2
#   -----------------------------------------------------------------------
#   (n(1,1) + n(0,1)) * (n(1,1) + n(1,0)) * (n(1,0) + n(0,0)) * (n(0,1) + n(0,0))
#
# n(t,c) counts documents by term presence (t) and class membership (c);
# n is the sum of the four counts.

module FeatureSelection
  class ChiSquared < Base

    # Scores every distinct term against every class.
    #
    # Returns:
    #=> {:class => {'term' => score, 'term' => score}}
    def rank_features
      @results = {}
      # A term's score does not depend on how often it is listed, so each
      # distinct term is scored only once.
      distinct_terms = terms.uniq

      classes.each do |klass|
        @results[klass] = {}

        distinct_terms.each do |term|
          @results[klass][term] = calculate_contribution(term, klass)
        end #distinct_terms.each
      end #classes.each
      @results
    end

    private

    # Chi-squared statistic for the (term, klass) pair, as a Float.
    #
    # Returns 0.0 when any marginal total is zero (for example when the term
    # occurs in every document): the statistic is undefined there and the
    # term cannot discriminate the class.
    def calculate_contribution(term, klass)
      both       = n_1_1(term, klass)
      term_only  = n_1_0(term, klass)
      class_only = n_0_1(term, klass)
      neither    = n_0_0(term, klass)

      total = both + term_only + class_only + neither
      # The whole product is the denominator, hence the explicit grouping.
      denominator = (both + class_only) * (both + term_only) *
                    (term_only + neither) * (class_only + neither)
      return 0.0 if denominator.zero?

      total * ((both * neither - term_only * class_only) ** 2) / denominator
    end

  end
end
@@ -0,0 +1,24 @@
1
module FeatureSelection
  # Frequency based selection ranks the terms of a class by how many times
  # they occur in the documents of that class.
  class FrequencyBased < Base

    # Counts, per class, the occurrences of each term in that class's own
    # documents. A term that never occurs in a class has no entry for it.
    #
    # Returns:
    #=> {:class => {'term' => count, 'term' => count}}
    def rank_features
      @results = {}

      classes.each do |klass|
        counts = {}

        # Only the documents filed under klass contribute to its counts;
        # counting the terms of every class would give all classes the
        # same corpus-wide totals.
        @data[klass].flatten.each do |term|
          counts[term] = counts.fetch(term, 0) + 1
        end

        @results[klass] = counts
      end #classes.each
      @results
    end

  end
end
@@ -0,0 +1,79 @@
1
# Mutual Information measures how much information the
# presence/absence of a term contributes to making the correct
# classification

# Equation - summed over t in {1, 0} and c in {1, 0} for every class and term
#
# P(U=e(t), C=e(c)) * log2( P(U=e(t), C=e(c)) / (P(U=e(t)) * P(C=e(c))) )

# We represent each function in the form - n(t,c)
# n(1,1) = count documents that have term and belong to specified class
# n(0,1) = count documents that do not have term and belong to specified class
# n(1,0) = count documents that have term but do not belong to specified class
# n(0,0) = count documents that do not contain term and do not belong to specified class
# n = n(1,1) + n(0,1) + n(1,0) + n(0,0)

module FeatureSelection
  class MutualInformation < Base

    # The four (t, c) cells of the term/class contingency table.
    CELLS = [[1, 1], [1, 0], [0, 1], [0, 0]].freeze

    # Scores every distinct term against every class.
    #
    # Returns:
    #=> {:class => {'term' => score, 'term' => score}}
    def rank_features
      @results = {}
      # A term's score does not depend on how often it is listed, so each
      # distinct term is scored only once.
      distinct_terms = terms.uniq

      classes.each do |klass|
        @results[klass] = {}

        distinct_terms.each do |term|
          @results[klass][term] = calculate_contribution(term, klass)
        end #distinct_terms.each
      end #classes.each
      @results
    end

    private

    # Mutual information between term presence and klass membership: the
    # sum of the four sections. The counts are gathered once and shared.
    def calculate_contribution(term, klass)
      counts = cell_counts(term, klass)
      total = count_documents

      CELLS.inject(0.0) do |sum, (t, c)|
        sum + section_value(counts, total, t, c)
      end
    end

    # One section of the sum, for term presence t (1 or 0) and class
    # membership c (1 or 0).
    #
    # Raises ArgumentError when t or c is neither 0 nor 1.
    def calculate_section(term, klass, t, c)
      section_value(cell_counts(term, klass), count_documents, t, c)
    end

    # Contingency table for (term, klass), keyed by [t, c].
    def cell_counts(term, klass)
      {
        [1, 1] => n_1_1(term, klass),
        [1, 0] => n_1_0(term, klass),
        [0, 1] => n_0_1(term, klass),
        [0, 0] => n_0_0(term, klass)
      }
    end

    # n(t,c)/n * log2( n * n(t,c) / (n(t,.) * n(.,c)) )
    #
    # n(t,.) is the number of documents with term presence t in any class and
    # n(.,c) the number of documents with class membership c, with or
    # without the term.
    def section_value(counts, total, t, c)
      joint = counts.fetch([t, c]) do
        raise ArgumentError, "t and c must each be 0 or 1, got t=#{t.inspect} c=#{c.inspect}"
      end
      # An empty cell contributes nothing (0 * log 0 is taken as 0). Float
      # arithmetic would otherwise produce NaN here instead of raising.
      return 0.0 if joint.zero?

      term_marginal  = counts[[t, 1]] + counts[[t, 0]]
      class_marginal = counts[[1, c]] + counts[[0, c]]

      (joint / total) * log2((total * joint) / (term_marginal * class_marginal))
    end

    # Base-2 logarithm, as used by the documented equation.
    def log2(value)
      Math.log(value) / Math.log(2)
    end

  end
end
@@ -0,0 +1,14 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ # Checks that Base#classes includes both class keys (:spam and :ham) of the `data` fixture.
3
+ describe "Base" do
4
+
5
+ before do
6
+ @a = FeatureSelection::Base.new(data)
7
+ end
8
+
9
+ it "should include classes ham spam" do
10
+ @a.classes.should include(:spam)
11
+ @a.classes.should include(:ham)
12
+ end
13
+
14
+ end
@@ -0,0 +1,13 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ # Smoke test: ChiSquared#rank_features returns a Hash for the `data` fixture; the scores themselves are not checked.
3
+ describe "Chi Squared" do
4
+
5
+ before do
6
+ @a = FeatureSelection::ChiSquared.new(data)
7
+ end
8
+
9
+ it "should return an hash" do
10
+ @a.rank_features.should be_a(Hash)
11
+ end
12
+
13
+ end
@@ -0,0 +1,19 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ # Checks that FrequencyBased#rank_features returns a Hash and reports a count of 3 for 'is' under :spam in the `data` fixture.
3
+ describe "Frequency Based" do
4
+
5
+ before do
6
+ @a = FeatureSelection::FrequencyBased.new(data)
7
+ end
8
+
9
+ it "should return an hash" do
10
+ @a.rank_features.should be_a(Hash)
11
+ end
12
+
13
+ describe "Should count how many times is occurs in spam" do
14
+ it "should return 3" do
15
+ @a.rank_features[:spam]['is'].should == 3
16
+ end
17
+ end
18
+
19
+ end
@@ -0,0 +1,13 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ # Smoke test: MutualInformation#rank_features returns a Hash for the `data` fixture; the scores themselves are not checked.
3
+ describe "Mutual Information" do
4
+
5
+ before do
6
+ @a = FeatureSelection::MutualInformation.new(data)
7
+ end
8
+
9
+ it "should return an hash" do
10
+ @a.rank_features.should be_a(Hash)
11
+ end
12
+
13
+ end
@@ -0,0 +1,5 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
+ describe "FeatureSelection" do
4
+ # Placeholder: no examples are defined in this group yet.
5
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,16 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'feature_selection'
4
+ require 'spec'
5
+ require 'spec/autorun'
6
+
7
# Fixture shared by the specs: two classes (:spam and :ham), each holding
# two documents that are already split into terms.
def data
  spam_documents = [
    %w[this is some information],
    %w[this is something that is information]
  ]
  ham_documents = [
    %w[this test some more information],
    %w[there are some things]
  ]

  { :spam => spam_documents, :ham => ham_documents }
end
13
+
14
+ Spec::Runner.configure do |config|
15
+ # Intentionally empty: no runner-wide configuration is set here.
16
+ end
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: feature_selection
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - reddavis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-05 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rspec
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.2.9
24
+ version:
25
+ description: A library of feature selection algorithms
26
+ email: reddavis@gmail.com
27
+ executables: []
28
+
29
+ extensions: []
30
+
31
+ extra_rdoc_files:
32
+ - LICENSE
33
+ - README.rdoc
34
+ files:
35
+ - .document
36
+ - .gitignore
37
+ - LICENSE
38
+ - README.rdoc
39
+ - Rakefile
40
+ - VERSION
41
+ - feature_selection.gemspec
42
+ - lib/feature_selection.rb
43
+ - lib/feature_selection/base.rb
44
+ - lib/feature_selection/chi_squared.rb
45
+ - lib/feature_selection/frequency_based.rb
46
+ - lib/feature_selection/mutual_information.rb
47
+ - spec/feature_selection/base_spec.rb
48
+ - spec/feature_selection/chi_squared_spec.rb
49
+ - spec/feature_selection/frequency_based_spec.rb
50
+ - spec/feature_selection/mutual_information_spec.rb
51
+ - spec/feature_selection_spec.rb
52
+ - spec/spec.opts
53
+ - spec/spec_helper.rb
54
+ has_rdoc: true
55
+ homepage: http://github.com/reddavis/Feature-Selection
56
+ licenses: []
57
+
58
+ post_install_message:
59
+ rdoc_options:
60
+ - --charset=UTF-8
61
+ require_paths:
62
+ - lib
63
+ required_ruby_version: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: "0"
68
+ version:
69
+ required_rubygems_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: "0"
74
+ version:
75
+ requirements: []
76
+
77
+ rubyforge_project:
78
+ rubygems_version: 1.3.5
79
+ signing_key:
80
+ specification_version: 3
81
+ summary: A library of feature selection algorithms
82
+ test_files:
83
+ - spec/feature_selection/base_spec.rb
84
+ - spec/feature_selection/chi_squared_spec.rb
85
+ - spec/feature_selection/frequency_based_spec.rb
86
+ - spec/feature_selection/mutual_information_spec.rb
87
+ - spec/feature_selection_spec.rb
88
+ - spec/spec_helper.rb