feature_selection 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 reddavis
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,38 @@
1
+ = Feature Selection
2
+
3
+ Feature Selection is a library of feature selection algorithms.
4
+
5
+ http://en.wikipedia.org/wiki/Feature_selection
6
+
7
+ == Install
8
+
9
+ gem sources -a http://gems.github.com
10
+ sudo gem install feature_selection
11
+
12
+ == How To Use
13
+
14
+ There are currently 3 implemented feature selection algorithms: Chi-Squared, Mutual Information and Frequency Based.
15
+
16
+ Each returns a hash that looks similar to:
17
+
18
+ {klass => {term => score, term => score}, klass => {term => score}}
19
+
20
+ Example:
21
+
22
+ data = {
23
+ :spam => [['this', 'is', 'some', 'information'], ['this', 'is', 'something', 'that', 'is', 'information']],
24
+ :ham => [['this', 'test', 'some', 'more', 'information'], ['there', 'are', 'some', 'things']],
25
+ }
26
+
27
+ a = FeatureSelection::ChiSquared.new(data)
28
+
29
+ # You can also use...
30
+ # FeatureSelection::MutualInformation
31
+ # FeatureSelection::FrequencyBased
32
+
33
+ a.rank_features
34
+ #=> {:spam => {term => score, term => score}, :ham => {term => score}}
35
+
36
+ == Copyright
37
+
38
+ Copyright (c) 2009 reddavis. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,45 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "feature_selection"
8
+ gem.summary = %Q{A library of feature selection algorithms}
9
+ gem.description = %Q{A library of feature selection algorithms}
10
+ gem.email = "reddavis@gmail.com"
11
+ gem.homepage = "http://github.com/reddavis/Feature-Selection"
12
+ gem.authors = ["reddavis"]
13
+ gem.add_development_dependency "rspec", ">= 1.2.9"
14
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
15
+ end
16
+ Jeweler::GemcutterTasks.new
17
+ rescue LoadError
18
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
19
+ end
20
+
21
+ require 'spec/rake/spectask'
22
+ Spec::Rake::SpecTask.new(:spec) do |spec|
23
+ spec.libs << 'lib' << 'spec'
24
+ spec.spec_files = FileList['spec/**/*_spec.rb']
25
+ end
26
+
27
+ Spec::Rake::SpecTask.new(:rcov) do |spec|
28
+ spec.libs << 'lib' << 'spec'
29
+ spec.pattern = 'spec/**/*_spec.rb'
30
+ spec.rcov = true
31
+ end
32
+
33
+ task :spec => :check_dependencies
34
+
35
+ task :default => :spec
36
+
37
+ require 'rake/rdoctask'
38
+ Rake::RDocTask.new do |rdoc|
39
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
40
+
41
+ rdoc.rdoc_dir = 'rdoc'
42
+ rdoc.title = "feature_selection #{version}"
43
+ rdoc.rdoc_files.include('README*')
44
+ rdoc.rdoc_files.include('lib/**/*.rb')
45
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
@@ -0,0 +1,67 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{feature_selection}
8
+ s.version = "0.0.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["reddavis"]
12
+ s.date = %q{2010-01-05}
13
+ s.description = %q{A library of feature selection algorithms}
14
+ s.email = %q{reddavis@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "feature_selection.gemspec",
27
+ "lib/feature_selection.rb",
28
+ "lib/feature_selection/base.rb",
29
+ "lib/feature_selection/chi_squared.rb",
30
+ "lib/feature_selection/frequency_based.rb",
31
+ "lib/feature_selection/mutual_information.rb",
32
+ "spec/feature_selection/base_spec.rb",
33
+ "spec/feature_selection/chi_squared_spec.rb",
34
+ "spec/feature_selection/frequency_based_spec.rb",
35
+ "spec/feature_selection/mutual_information_spec.rb",
36
+ "spec/feature_selection_spec.rb",
37
+ "spec/spec.opts",
38
+ "spec/spec_helper.rb"
39
+ ]
40
+ s.homepage = %q{http://github.com/reddavis/Feature-Selection}
41
+ s.rdoc_options = ["--charset=UTF-8"]
42
+ s.require_paths = ["lib"]
43
+ s.rubygems_version = %q{1.3.5}
44
+ s.summary = %q{A library of feature selection algorithms}
45
+ s.test_files = [
46
+ "spec/feature_selection/base_spec.rb",
47
+ "spec/feature_selection/chi_squared_spec.rb",
48
+ "spec/feature_selection/frequency_based_spec.rb",
49
+ "spec/feature_selection/mutual_information_spec.rb",
50
+ "spec/feature_selection_spec.rb",
51
+ "spec/spec_helper.rb"
52
+ ]
53
+
54
+ if s.respond_to? :specification_version then
55
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
56
+ s.specification_version = 3
57
+
58
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
59
+ s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
60
+ else
61
+ s.add_dependency(%q<rspec>, [">= 1.2.9"])
62
+ end
63
+ else
64
+ s.add_dependency(%q<rspec>, [">= 1.2.9"])
65
+ end
66
+ end
67
+
@@ -0,0 +1,4 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/feature_selection/base')
2
+ require File.expand_path(File.dirname(__FILE__) + '/feature_selection/mutual_information')
3
+ require File.expand_path(File.dirname(__FILE__) + '/feature_selection/chi_squared')
4
+ require File.expand_path(File.dirname(__FILE__) + '/feature_selection/frequency_based')
@@ -0,0 +1,84 @@
1
module FeatureSelection
  # Common state and document-counting helpers shared by the feature
  # selection algorithms.
  #
  # The data handed to the constructor is a Hash that maps each class to
  # an Array of documents; a document is an Array of terms, e.g.
  #
  #   {:spam => [['buy', 'now']], :ham => [['see', 'you', 'soon']]}
  class Base

    # data - Hash of class => Array of documents (each an Array of terms).
    def initialize(data)
      @data = data
    end

    # Returns an Array holding every class (key) of the data. The list is
    # computed on the first call and memoized.
    def classes
      @classes ||= find_all_classes
    end

    private

    # Contains term and belongs to class.
    #
    # Returns the number of such documents as a Float.
    def n_1_1(term, klass)
      count_matching(@data[klass]) { |document| document.include?(term) }
    end

    # Contains term but does not belong to class.
    #
    # Returns the number of such documents as a Float.
    def n_1_0(term, klass)
      count_matching(documents_outside(klass)) { |document| document.include?(term) }
    end

    # Does not contain term but belongs to class.
    #
    # Returns the number of such documents as a Float.
    def n_0_1(term, klass)
      count_matching(@data[klass]) { |document| !document.include?(term) }
    end

    # Does not contain term and does not belong to class.
    #
    # Returns the number of such documents as a Float.
    def n_0_0(term, klass)
      count_matching(documents_outside(klass)) { |document| !document.include?(term) }
    end

    # Total number of documents across all classes, as an Integer.
    def count_documents
      @data.values.inject(0) { |size, documents| size + documents.size }
    end

    # Every class present in the data, i.e. the keys of the data Hash.
    def find_all_classes
      @data.keys
    end

    # Every term occurrence of every document of every class, flattened
    # into a single Array. Duplicates are kept on purpose: callers may
    # rely on one entry per occurrence.
    def terms
      @data.values.flatten
    end

    # All documents that belong to any class other than klass.
    # Assumes each class maps to an Array of documents.
    def documents_outside(klass)
      @data.inject([]) do |others, (key, documents)|
        key == klass ? others : others + documents
      end
    end

    # Number of documents for which the block is true, as a Float so that
    # callers can divide the counts without integer truncation.
    def count_matching(documents, &block)
      documents.select(&block).size.to_f
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # Chi-Squared takes two events (the class and the term) and calculates
2
+ # whether the two events are dependent. If they are, then the occurrence
3
+ # of the term makes the occurrence of the class more or less likely.
4
+
5
+ module FeatureSelection
6
+ class ChiSquared < Base
7
+
8
# Scores every distinct term against every class with the chi-squared
# statistic.
#
# Returns a Hash of the form
#   {:class => {'term' => score, 'term' => score}}
# The same Hash is also kept in @results.
def rank_features
  # terms holds one entry per occurrence, so drop the duplicates first:
  # the score of a term/class pair never changes, and every repeat would
  # only recompute it and overwrite the same Hash entry.
  unique_terms = terms.uniq

  @results = {}
  classes.each do |klass|
    scores = {}
    unique_terms.each do |term|
      scores[term] = calculate_contribution(term, klass)
    end
    @results[klass] = scores
  end
  @results
end
23
+
24
+ private
25
+
26
# Chi-squared statistic for one term/class pair, computed from the 2x2
# table of document counts (first index: term present, second index:
# document in class):
#
#                N * (N11 * N00 - N10 * N01)^2
#   -------------------------------------------------------
#   (N11 + N01) * (N11 + N10) * (N10 + N00) * (N01 + N00)
#
# term  - the term being scored.
# klass - the class the term is scored against.
#
# Returns the statistic, or 0.0 when one of the four marginal totals is
# zero (the statistic is undefined then, and the term cannot separate
# the class from the rest).
def calculate_contribution(term, klass)
  n11 = n_1_1(term, klass)
  n10 = n_1_0(term, klass)
  n01 = n_0_1(term, klass)
  n00 = n_0_0(term, klass)

  total = n11 + n10 + n01 + n00

  # The whole product is the denominator. Without the grouping, Ruby's
  # left-to-right evaluation of * and / divides by the first factor only
  # and multiplies by the remaining three.
  denominator = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00)
  return 0.0 if denominator.zero?

  total * ((n11 * n00 - n10 * n01)**2) / denominator
end
35
+
36
+ end
37
+ end
@@ -0,0 +1,24 @@
1
+ module FeatureSelection
2
+ class FrequencyBased < Base
3
+
4
# Counts, for every class, how often each term occurs in the documents
# of that class.
#
# Returns a Hash of the form
#   {:class => {'term' => count, 'term' => count}}
# Every class lists every term of the whole data set; a term that never
# occurs in a class has a count of 0 there. The same Hash is also kept
# in @results.
def rank_features
  vocabulary = terms.uniq

  @results = {}
  classes.each do |klass|
    counts = {}
    vocabulary.each { |term| counts[term] = 0 }

    # Count only the occurrences inside this class's own documents.
    # Counting the global term list here would give every class the
    # same corpus-wide totals.
    @data[klass].flatten.each { |term| counts[term] += 1 }

    @results[klass] = counts
  end
  @results
end
22
+
23
+ end
24
+ end
@@ -0,0 +1,79 @@
1
+ # Mutual Information measures how much information the
2
+ # presence/absence of a term contributes to making the correct
3
+ # classification
4
+
5
+ # Equation - for every class and term
6
+ #
7
+ # P(U=e(t), C=e(c)) * log2( P(U=e(t), C=e(c)) / (P(U=e(t)) * P(C=e(c))) )
8
+
9
+ # We represent each function in the form - n(t,c)
10
+ # n(1,1) = count documents that have term and belong to specified class
11
+ # n(0,1) = count documents that do not have term and belong to specified class
12
+ # n(1,0) = count documents that have term but do not belong to specified class
13
+ # n(0,0) = count documents that do not contain term and do not belong to specified class
14
+ # n = n(1,1) + n(0,1) + n(1,0) + n(0,0)
15
+
16
+ module FeatureSelection
17
+ class MutualInformation < Base
18
+
19
# Scores every distinct term against every class with its mutual
# information.
#
# Returns a Hash of the form
#   {:class => {'term' => score, 'term' => score}}
# The same Hash is also kept in @results.
def rank_features
  # terms holds one entry per occurrence, so drop the duplicates first:
  # the score of a term/class pair never changes, and every repeat would
  # only recompute it and overwrite the same Hash entry.
  unique_terms = terms.uniq

  @results = {}
  classes.each do |klass|
    scores = {}
    unique_terms.each do |term|
      scores[term] = calculate_contribution(term, klass)
    end
    @results[klass] = scores
  end
  @results
end
34
+
35
+ private
36
+
37
# Mutual information, in bits, between the presence of term and
# membership of klass: the sum of the four term/class sections
# (present/absent x in class/not in class).
#
# term  - the term being scored.
# klass - the class the term is scored against.
#
# Returns the score as a Float.
def calculate_contribution(term, klass)
  calculate_section(term, klass, 1, 1) +
    calculate_section(term, klass, 1, 0) +
    calculate_section(term, klass, 0, 1) +
    calculate_section(term, klass, 0, 0)
end

# One summand of the mutual information:
#
#   N(t,c) / N * log2( N * N(t,c) / (N(t,.) * N(.,c)) )
#
# where N(t,.) is the number of documents with the same term presence
# as t (in any class) and N(.,c) the number of documents with the same
# class membership as c (with or without the term).
#
# term  - the term being scored.
# klass - the class the term is scored against.
# t     - 1 for "document contains term", 0 for "does not contain it".
# c     - 1 for "document belongs to klass", 0 for "does not belong".
#
# Returns the summand as a Float; 0.0 when no document falls into the
# requested section (the usual 0 * log 0 = 0 convention).
def calculate_section(term, klass, t, c)
  n = count_documents
  n11 = n_1_1(term, klass)
  n10 = n_1_0(term, klass)
  n01 = n_0_1(term, klass)
  n00 = n_0_0(term, klass)

  if t == 1
    joint = c == 1 ? n11 : n10
    term_total = n11 + n10 # documents containing the term
  else
    joint = c == 1 ? n01 : n00
    term_total = n01 + n00 # documents not containing the term
  end
  class_total = c == 1 ? n11 + n01 : n10 + n00

  # Float arithmetic never raises on 0 / 0 or log(0); it silently yields
  # NaN, which would poison the whole sum. An empty section contributes
  # nothing, and a non-empty one implies both totals are non-zero.
  return 0.0 if joint.zero?

  # Math.log is the natural logarithm; divide by log(2) to get bits.
  (joint / n) * (Math.log((n * joint) / (term_total * class_total)) / Math.log(2))
end
77
+
78
+ end
79
+ end
@@ -0,0 +1,14 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+
3
+ describe "Base" do
4
+
5
+ before do
6
+ @a = FeatureSelection::Base.new(data)
7
+ end
8
+
9
+ it "should include classes ham spam" do
10
+ @a.classes.should include(:spam)
11
+ @a.classes.should include(:ham)
12
+ end
13
+
14
+ end
@@ -0,0 +1,13 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+
3
+ describe "Chi Squared" do
4
+
5
+ before do
6
+ @a = FeatureSelection::ChiSquared.new(data)
7
+ end
8
+
9
+ it "should return an hash" do
10
+ @a.rank_features.should be_a(Hash)
11
+ end
12
+
13
+ end
@@ -0,0 +1,19 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+
3
+ describe "Frequency Based" do
4
+
5
+ before do
6
+ @a = FeatureSelection::FrequencyBased.new(data)
7
+ end
8
+
9
+ it "should return an hash" do
10
+ @a.rank_features.should be_a(Hash)
11
+ end
12
+
13
+ describe "Should count how many times is occurs in spam" do
14
+ it "should return 3" do
15
+ @a.rank_features[:spam]['is'].should == 3
16
+ end
17
+ end
18
+
19
+ end
@@ -0,0 +1,13 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+
3
+ describe "Mutual Information" do
4
+
5
+ before do
6
+ @a = FeatureSelection::MutualInformation.new(data)
7
+ end
8
+
9
+ it "should return an hash" do
10
+ @a.rank_features.should be_a(Hash)
11
+ end
12
+
13
+ end
@@ -0,0 +1,5 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
+ describe "FeatureSelection" do
4
+
5
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,16 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'feature_selection'
4
+ require 'spec'
5
+ require 'spec/autorun'
6
+
7
+ def data # Fixture used by the specs: a Hash mapping each class (:spam, :ham) to an Array of documents, each document being an Array of term Strings.
8
+ {
9
+ :spam => [['this', 'is', 'some', 'information'], ['this', 'is', 'something', 'that', 'is', 'information']],
10
+ :ham => [['this', 'test', 'some', 'more', 'information'], ['there', 'are', 'some', 'things']],
11
+ }
12
+ end
13
+
14
+ Spec::Runner.configure do |config|
15
+
16
+ end
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: feature_selection
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - reddavis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-05 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rspec
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.2.9
24
+ version:
25
+ description: A library of feature selection algorithms
26
+ email: reddavis@gmail.com
27
+ executables: []
28
+
29
+ extensions: []
30
+
31
+ extra_rdoc_files:
32
+ - LICENSE
33
+ - README.rdoc
34
+ files:
35
+ - .document
36
+ - .gitignore
37
+ - LICENSE
38
+ - README.rdoc
39
+ - Rakefile
40
+ - VERSION
41
+ - feature_selection.gemspec
42
+ - lib/feature_selection.rb
43
+ - lib/feature_selection/base.rb
44
+ - lib/feature_selection/chi_squared.rb
45
+ - lib/feature_selection/frequency_based.rb
46
+ - lib/feature_selection/mutual_information.rb
47
+ - spec/feature_selection/base_spec.rb
48
+ - spec/feature_selection/chi_squared_spec.rb
49
+ - spec/feature_selection/frequency_based_spec.rb
50
+ - spec/feature_selection/mutual_information_spec.rb
51
+ - spec/feature_selection_spec.rb
52
+ - spec/spec.opts
53
+ - spec/spec_helper.rb
54
+ has_rdoc: true
55
+ homepage: http://github.com/reddavis/Feature-Selection
56
+ licenses: []
57
+
58
+ post_install_message:
59
+ rdoc_options:
60
+ - --charset=UTF-8
61
+ require_paths:
62
+ - lib
63
+ required_ruby_version: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: "0"
68
+ version:
69
+ required_rubygems_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: "0"
74
+ version:
75
+ requirements: []
76
+
77
+ rubyforge_project:
78
+ rubygems_version: 1.3.5
79
+ signing_key:
80
+ specification_version: 3
81
+ summary: A library of feature selection algorithms
82
+ test_files:
83
+ - spec/feature_selection/base_spec.rb
84
+ - spec/feature_selection/chi_squared_spec.rb
85
+ - spec/feature_selection/frequency_based_spec.rb
86
+ - spec/feature_selection/mutual_information_spec.rb
87
+ - spec/feature_selection_spec.rb
88
+ - spec/spec_helper.rb