feature_selection 0.0.0 → 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.0
1
+ 0.0.1
@@ -0,0 +1,17 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../lib/feature_selection')
2
+ require 'benchmark'
3
+
4
+ data = {:ham => [], :spam => []}
5
+
6
+ 1000.times do
7
+ a = rand(999).to_s
8
+
9
+ data[:ham] << [a] * 5
10
+ data[:spam] << [a] * 5
11
+ end
12
+
13
+ Benchmark.bm do |x|
14
+ x.report do
15
+ FeatureSelection::MutualInformation.new(data).rank_features
16
+ end
17
+ end
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{feature_selection}
8
- s.version = "0.0.0"
8
+ s.version = "0.0.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["reddavis"]
12
- s.date = %q{2010-01-05}
12
+ s.date = %q{2010-01-07}
13
13
  s.description = %q{A library of feature selection algorithms}
14
14
  s.email = %q{reddavis@gmail.com}
15
15
  s.extra_rdoc_files = [
@@ -23,6 +23,7 @@ Gem::Specification.new do |s|
23
23
  "README.rdoc",
24
24
  "Rakefile",
25
25
  "VERSION",
26
+ "benchmark/benchmark.rb",
26
27
  "feature_selection.gemspec",
27
28
  "lib/feature_selection.rb",
28
29
  "lib/feature_selection/base.rb",
@@ -13,54 +13,129 @@ module FeatureSelection
13
13
 
14
14
  # Contains term and belongs to class
15
15
  def n_1_1(term, klass)
16
- count = 0.0
16
+ @n_1_1 = pre_compute_n_1_1 unless @n_1_1
17
+
18
+ @n_1_1[klass][term]
19
+ end
20
+
21
+ def pre_compute_n_1_1
22
+ results = {}
17
23
 
18
- @data[klass].each do |document|
19
- count += 1 if document.include?(term)
24
+ classes.each do |q_klass|
25
+ results[q_klass] = {}
26
+
27
+ uniq_terms.each do |term|
28
+ count = 0.0
29
+
30
+ @data.each_pair do |klass, documents|
31
+ if klass == q_klass
32
+ documents.each do |document|
33
+ count += 1 if document.include?(term)
34
+ end
35
+ end
36
+ end
37
+
38
+ results[q_klass][term] = count
39
+ end
20
40
  end
21
-
22
- count
41
+
42
+ results
23
43
  end
24
44
 
25
45
  # Contains term but does not belong to class
26
46
  def n_1_0(term, klass)
27
- count = 0.0
47
+ @n_1_0 = pre_compute_n_1_0 unless @n_1_0
48
+
49
+ @n_1_0[klass][term]
50
+ end
51
+
52
+ # Pre-Computes all n_1_0 queries
53
+ def pre_compute_n_1_0
54
+ results = {}
28
55
 
29
- @data.each_pair do |key, documents|
30
- if key != klass
31
- documents.each do |document|
32
- count += 1 if document.include?(term)
56
+ classes.each do |q_klass|
57
+ results[q_klass] = {}
58
+
59
+ uniq_terms.each do |term|
60
+ count = 0.0
61
+
62
+ @data.each_pair do |klass, documents|
63
+ if klass != q_klass
64
+ documents.each do |document|
65
+ count += 1 if document.include?(term)
66
+ end
67
+ end
33
68
  end
69
+
70
+ results[q_klass][term] = count
34
71
  end
35
72
  end
36
-
37
- count
73
+
74
+ results
38
75
  end
39
76
 
40
77
  # Does not contain term but belongs to class
41
78
  def n_0_1(term, klass)
42
- count = 0.0
79
+ @n_0_1 = pre_compute_n_0_1 unless @n_0_1
80
+
81
+ @n_0_1[klass][term]
82
+ end
83
+
84
+ # Pre-Computes all n_0_1 queries
85
+ def pre_compute_n_0_1
86
+ results = {}
43
87
 
44
- @data[klass].each do |document|
45
- count += 1 if !document.include?(term)
88
+ classes.each do |q_klass|
89
+ results[q_klass] = {}
90
+
91
+ uniq_terms.each do |term|
92
+ count = 0.0
93
+
94
+ @data.each_pair do |klass, documents|
95
+ if klass == q_klass
96
+ documents.each do |document|
97
+ count += 1 if !document.include?(term)
98
+ end
99
+ end
100
+ end
101
+
102
+ results[q_klass][term] = count
103
+ end
46
104
  end
47
-
48
- count
105
+
106
+ results
49
107
  end
50
108
 
51
109
  # Does not contain term and does not belong to class
52
110
  def n_0_0(term, klass)
53
- count = 0.0
111
+ @n_0_0 = pre_compute_n_0_0 unless @n_0_0
112
+
113
+ @n_0_0[klass][term]
114
+ end
115
+
116
+ # Pre-Computes all n_0_0 queries
117
+ def pre_compute_n_0_0
118
+ results = {}
54
119
 
55
- @data.each_pair do |key, documents|
56
- if key != klass
57
- documents.each do |document|
58
- count += 1 if !document.include?(term)
120
+ classes.each do |q_klass|
121
+ results[q_klass] = {}
122
+
123
+ uniq_terms.each do |term|
124
+ count = 0.0
125
+
126
+ @data.each_pair do |klass, documents|
127
+ if klass != q_klass
128
+ documents.each do |document|
129
+ count += 1 if !document.include?(term)
130
+ end
131
+ end
59
132
  end
60
- end #if key
61
- end #@data.each_pair
62
-
63
- count
133
+
134
+ results[q_klass][term] = count
135
+ end
136
+ end
137
+
138
+ results
64
139
  end
65
140
 
66
141
  # All of the counts added together
@@ -76,9 +151,14 @@ module FeatureSelection
76
151
  def find_all_classes
77
152
  @data.map {|x| x[0]}
78
153
  end
154
+
155
+ def uniq_terms
156
+ @uniq_terms ||= @data.map {|x| x[1]}.flatten.uniq
157
+ end
79
158
 
80
159
  def terms
81
- @data.map {|x| x[1]}.flatten
160
+ @terms ||= @data.map {|x| x[1]}.flatten
82
161
  end
162
+
83
163
  end
84
164
  end
@@ -13,7 +13,7 @@ module FeatureSelection
13
13
  classes.each do |klass|
14
14
  @results[klass] = {}
15
15
 
16
- terms.each do |term|
16
+ uniq_terms.each do |term|
17
17
  answer = calculate_contribution(term, klass)
18
18
  @results[klass][term] = answer
19
19
  end #terms.each
@@ -24,7 +24,7 @@ module FeatureSelection
24
24
  classes.each do |klass|
25
25
  @results[klass] = {}
26
26
 
27
- terms.each do |term|
27
+ uniq_terms.each do |term|
28
28
  answer = calculate_contribution(term, klass)
29
29
  @results[klass][term] = answer
30
30
  end #terms.each
@@ -49,28 +49,44 @@ module FeatureSelection
49
49
  begin
50
50
  if t == 1 && c == 1
51
51
  n_1_1 = n_1_1(term, klass)
52
-
52
+
53
+ # return 0 if a == 0
54
+ a = (n * n_1_1) / ((n_0_1 + n_1_1) * (n_1_1 + n_1_0))
55
+ return 0.0 if a == 0
56
+
53
57
  n_1_1 / n *
54
- Math.log( (n * n_1_1) / ((n_0_1 + n_1_1) * (n_1_1 + n_1_0)) )
58
+ Math.log(a)
55
59
  elsif t == 1 && c == 0
56
60
  n_1_1 = n_1_1(term, klass)
57
61
  n_0_0 = n_0_0(term, klass)
58
-
62
+
63
+ # return 0 if a == 0
64
+ a = (n * n_1_0) / ((n_1_1 + n_0_1) * (n_0_1 + n_0_0))
65
+ return 0.0 if a == 0
66
+
59
67
  n_1_0 / n *
60
- Math.log( (n * n_1_0) / ((n_1_1 + n_0_1) * (n_0_1 + n_0_0)) )
68
+ Math.log(a)
61
69
  elsif t == 0 && c == 1
62
70
  n_0_0 = n_0_0(term, klass)
63
71
  n_1_1 = n_1_1(term, klass)
64
-
72
+
73
+ # return 0 if a == 0
74
+ a = (n * n_0_1) / ((n_1_0 + n_0_0) * (n_1_1 + n_1_0))
75
+ return 0.0 if a == 0
76
+
65
77
  n_0_1 / n *
66
- Math.log( (n * n_0_1) / ((n_1_0 + n_0_0) * (n_1_1 + n_1_0)) )
78
+ Math.log(a)
67
79
  elsif t == 0 && c == 0
68
80
  n_0_0 = n_0_0(term, klass)
81
+
82
+ # return 0 if a == 0
83
+ a = (n * n_0_0) / ((n_1_0 + n_0_0) * (n_0_1 + n_0_0))
84
+ return 0.0 if a == 0
69
85
 
70
86
  n_0_0 / n *
71
- Math.log( (n * n_0_0) / ((n_1_0 + n_0_0) * (n_0_1 + n_0_0)) )
87
+ Math.log(a)
72
88
  end
73
- rescue ZeroDivisionError, Errno::EDOM
89
+ rescue ZeroDivisionError, Errno::EDOM #1.9 Infinity
74
90
  0.0
75
91
  end
76
92
  end
@@ -9,5 +9,9 @@ describe "Chi Squared" do
9
9
  it "should return an hash" do
10
10
  @a.rank_features.should be_a(Hash)
11
11
  end
12
+
13
+ it "should give this a score of 48.0" do
14
+ @a.rank_features[:spam]['this'].should == 48.0
15
+ end
12
16
 
13
17
  end
@@ -9,5 +9,9 @@ describe "Mutual Information" do
9
9
  it "should return an hash" do
10
10
  @a.rank_features.should be_a(Hash)
11
11
  end
12
+
13
+ it "should give this a score of 0.4904..." do
14
+ @a.rank_features[:spam]['this'].to_s.should match(/0.4904/)
15
+ end
12
16
 
13
17
  end
data/spec/spec_helper.rb CHANGED
@@ -6,7 +6,7 @@ require 'spec/autorun'
6
6
 
7
7
  def data
8
8
  {
9
- :spam => [['this', 'is', 'some', 'information'], ['this', 'is', 'something', 'that', 'is', 'information']],
9
+ :spam => [['this', 'is', 'some', 'yer', 'information'], ['this', 'is', 'something', 'that', 'is', 'information']],
10
10
  :ham => [['this', 'test', 'some', 'more', 'information'], ['there', 'are', 'some', 'things']],
11
11
  }
12
12
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feature_selection
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - reddavis
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-05 00:00:00 +00:00
12
+ date: 2010-01-07 00:00:00 +00:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -38,6 +38,7 @@ files:
38
38
  - README.rdoc
39
39
  - Rakefile
40
40
  - VERSION
41
+ - benchmark/benchmark.rb
41
42
  - feature_selection.gemspec
42
43
  - lib/feature_selection.rb
43
44
  - lib/feature_selection/base.rb