feature_selection 0.0.0 → 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.0
1
+ 0.0.1
@@ -0,0 +1,17 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../lib/feature_selection')
2
+ require 'benchmark'
3
+
4
+ data = {:ham => [], :spam => []}
5
+
6
+ 1000.times do
7
+ a = rand(999).to_s
8
+
9
+ data[:ham] << [a] * 5
10
+ data[:spam] << [a] * 5
11
+ end
12
+
13
+ Benchmark.bm do |x|
14
+ x.report do
15
+ FeatureSelection::MutualInformation.new(data).rank_features
16
+ end
17
+ end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{feature_selection}
8
- s.version = "0.0.0"
8
+ s.version = "0.0.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["reddavis"]
12
- s.date = %q{2010-01-05}
12
+ s.date = %q{2010-01-07}
13
13
  s.description = %q{A library of feature selection algorithms}
14
14
  s.email = %q{reddavis@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -23,6 +23,7 @@ Gem::Specification.new do |s|
23
23
  "README.rdoc",
24
24
  "Rakefile",
25
25
  "VERSION",
26
+ "benchmark/benchmark.rb",
26
27
  "feature_selection.gemspec",
27
28
  "lib/feature_selection.rb",
28
29
  "lib/feature_selection/base.rb",
@@ -13,54 +13,129 @@ module FeatureSelection
13
13
 
14
14
  # Contains term and belongs to class
15
15
  def n_1_1(term, klass)
16
- count = 0.0
16
+ @n_1_1 = pre_compute_n_1_1 unless @n_1_1
17
+
18
+ @n_1_1[klass][term]
19
+ end
20
+
21
+ def pre_compute_n_1_1
22
+ results = {}
17
23
 
18
- @data[klass].each do |document|
19
- count += 1 if document.include?(term)
24
+ classes.each do |q_klass|
25
+ results[q_klass] = {}
26
+
27
+ uniq_terms.each do |term|
28
+ count = 0.0
29
+
30
+ @data.each_pair do |klass, documents|
31
+ if klass == q_klass
32
+ documents.each do |document|
33
+ count += 1 if document.include?(term)
34
+ end
35
+ end
36
+ end
37
+
38
+ results[q_klass][term] = count
39
+ end
20
40
  end
21
-
22
- count
41
+
42
+ results
23
43
  end
24
44
 
25
45
  # Contains term but does not belong to class
26
46
  def n_1_0(term, klass)
27
- count = 0.0
47
+ @n_1_0 = pre_compute_n_1_0 unless @n_1_0
48
+
49
+ @n_1_0[klass][term]
50
+ end
51
+
52
+ # Pre-computes all n_1_0 queries
53
+ def pre_compute_n_1_0
54
+ results = {}
28
55
 
29
- @data.each_pair do |key, documents|
30
- if key != klass
31
- documents.each do |document|
32
- count += 1 if document.include?(term)
56
+ classes.each do |q_klass|
57
+ results[q_klass] = {}
58
+
59
+ uniq_terms.each do |term|
60
+ count = 0.0
61
+
62
+ @data.each_pair do |klass, documents|
63
+ if klass != q_klass
64
+ documents.each do |document|
65
+ count += 1 if document.include?(term)
66
+ end
67
+ end
33
68
  end
69
+
70
+ results[q_klass][term] = count
34
71
  end
35
72
  end
36
-
37
- count
73
+
74
+ results
38
75
  end
39
76
 
40
77
  # Does not contain term but belongs to class
41
78
  def n_0_1(term, klass)
42
- count = 0.0
79
+ @n_0_1 = pre_compute_n_0_1 unless @n_0_1
80
+
81
+ @n_0_1[klass][term]
82
+ end
83
+
84
+ # Pre-computes all n_0_1 queries
85
+ def pre_compute_n_0_1
86
+ results = {}
43
87
 
44
- @data[klass].each do |document|
45
- count += 1 if !document.include?(term)
88
+ classes.each do |q_klass|
89
+ results[q_klass] = {}
90
+
91
+ uniq_terms.each do |term|
92
+ count = 0.0
93
+
94
+ @data.each_pair do |klass, documents|
95
+ if klass == q_klass
96
+ documents.each do |document|
97
+ count += 1 if !document.include?(term)
98
+ end
99
+ end
100
+ end
101
+
102
+ results[q_klass][term] = count
103
+ end
46
104
  end
47
-
48
- count
105
+
106
+ results
49
107
  end
50
108
 
51
109
  # Does not contain term and does not belong to class
52
110
  def n_0_0(term, klass)
53
- count = 0.0
111
+ @n_0_0 = pre_compute_n_0_0 unless @n_0_0
112
+
113
+ @n_0_0[klass][term]
114
+ end
115
+
116
+ # Pre-Computes all n_0_0 queries
117
+ def pre_compute_n_0_0
118
+ results = {}
54
119
 
55
- @data.each_pair do |key, documents|
56
- if key != klass
57
- documents.each do |document|
58
- count += 1 if !document.include?(term)
120
+ classes.each do |q_klass|
121
+ results[q_klass] = {}
122
+
123
+ uniq_terms.each do |term|
124
+ count = 0.0
125
+
126
+ @data.each_pair do |klass, documents|
127
+ if klass != q_klass
128
+ documents.each do |document|
129
+ count += 1 if !document.include?(term)
130
+ end
131
+ end
59
132
  end
60
- end #if key
61
- end #@data.each_pair
62
-
63
- count
133
+
134
+ results[q_klass][term] = count
135
+ end
136
+ end
137
+
138
+ results
64
139
  end
65
140
 
66
141
  # All of the counts added together
@@ -76,9 +151,14 @@ module FeatureSelection
76
151
  def find_all_classes
77
152
  @data.map {|x| x[0]}
78
153
  end
154
+
155
+ def uniq_terms
156
+ @uniq_terms ||= @data.map {|x| x[1]}.flatten.uniq
157
+ end
79
158
 
80
159
  def terms
81
- @data.map {|x| x[1]}.flatten
160
+ @terms ||= @data.map {|x| x[1]}.flatten
82
161
  end
162
+
83
163
  end
84
164
  end
@@ -13,7 +13,7 @@ module FeatureSelection
13
13
  classes.each do |klass|
14
14
  @results[klass] = {}
15
15
 
16
- terms.each do |term|
16
+ uniq_terms.each do |term|
17
17
  answer = calculate_contribution(term, klass)
18
18
  @results[klass][term] = answer
19
19
  end #terms.each
@@ -24,7 +24,7 @@ module FeatureSelection
24
24
  classes.each do |klass|
25
25
  @results[klass] = {}
26
26
 
27
- terms.each do |term|
27
+ uniq_terms.each do |term|
28
28
  answer = calculate_contribution(term, klass)
29
29
  @results[klass][term] = answer
30
30
  end #terms.each
@@ -49,28 +49,44 @@ module FeatureSelection
49
49
  begin
50
50
  if t == 1 && c == 1
51
51
  n_1_1 = n_1_1(term, klass)
52
-
52
+
53
+ # return 0 if a == 0
54
+ a = (n * n_1_1) / ((n_0_1 + n_1_1) * (n_1_1 + n_1_0))
55
+ return 0.0 if a == 0
56
+
53
57
  n_1_1 / n *
54
- Math.log( (n * n_1_1) / ((n_0_1 + n_1_1) * (n_1_1 + n_1_0)) )
58
+ Math.log(a)
55
59
  elsif t == 1 && c == 0
56
60
  n_1_1 = n_1_1(term, klass)
57
61
  n_0_0 = n_0_0(term, klass)
58
-
62
+
63
+ # return 0 if a == 0
64
+ a = (n * n_1_0) / ((n_1_1 + n_0_1) * (n_0_1 + n_0_0))
65
+ return 0.0 if a == 0
66
+
59
67
  n_1_0 / n *
60
- Math.log( (n * n_1_0) / ((n_1_1 + n_0_1) * (n_0_1 + n_0_0)) )
68
+ Math.log(a)
61
69
  elsif t == 0 && c == 1
62
70
  n_0_0 = n_0_0(term, klass)
63
71
  n_1_1 = n_1_1(term, klass)
64
-
72
+
73
+ # return 0 if a == 0
74
+ a = (n * n_0_1) / ((n_1_0 + n_0_0) * (n_1_1 + n_1_0))
75
+ return 0.0 if a == 0
76
+
65
77
  n_0_1 / n *
66
- Math.log( (n * n_0_1) / ((n_1_0 + n_0_0) * (n_1_1 + n_1_0)) )
78
+ Math.log(a)
67
79
  elsif t == 0 && c == 0
68
80
  n_0_0 = n_0_0(term, klass)
81
+
82
+ # return 0 if a == 0
83
+ a = (n * n_0_0) / ((n_1_0 + n_0_0) * (n_0_1 + n_0_0))
84
+ return 0.0 if a == 0
69
85
 
70
86
  n_0_0 / n *
71
- Math.log( (n * n_0_0) / ((n_1_0 + n_0_0) * (n_0_1 + n_0_0)) )
87
+ Math.log(a)
72
88
  end
73
- rescue ZeroDivisionError, Errno::EDOM
89
+ rescue ZeroDivisionError, Errno::EDOM #1.9 Infinity
74
90
  0.0
75
91
  end
76
92
  end
@@ -9,5 +9,9 @@ describe "Chi Squared" do
9
9
  it "should return an hash" do
10
10
  @a.rank_features.should be_a(Hash)
11
11
  end
12
+
13
+ it "should give this a score of 48.0" do
14
+ @a.rank_features[:spam]['this'].should == 48.0
15
+ end
12
16
 
13
17
  end
@@ -9,5 +9,9 @@ describe "Mutual Information" do
9
9
  it "should return an hash" do
10
10
  @a.rank_features.should be_a(Hash)
11
11
  end
12
+
13
+ it "should give this a score of 0.4904..." do
14
+ @a.rank_features[:spam]['this'].to_s.should match(/0.4904/)
15
+ end
12
16
 
13
17
  end
data/spec/spec_helper.rb CHANGED
@@ -6,7 +6,7 @@ require 'spec/autorun'
6
6
 
7
7
  def data
8
8
  {
9
- :spam => [['this', 'is', 'some', 'information'], ['this', 'is', 'something', 'that', 'is', 'information']],
9
+ :spam => [['this', 'is', 'some', 'yer', 'information'], ['this', 'is', 'something', 'that', 'is', 'information']],
10
10
  :ham => [['this', 'test', 'some', 'more', 'information'], ['there', 'are', 'some', 'things']],
11
11
  }
12
12
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feature_selection
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - reddavis
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-05 00:00:00 +00:00
12
+ date: 2010-01-07 00:00:00 +00:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -38,6 +38,7 @@ files:
38
38
  - README.rdoc
39
39
  - Rakefile
40
40
  - VERSION
41
+ - benchmark/benchmark.rb
41
42
  - feature_selection.gemspec
42
43
  - lib/feature_selection.rb
43
44
  - lib/feature_selection/base.rb