feature_selection 0.0.0 → 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/benchmark/benchmark.rb +17 -0
- data/feature_selection.gemspec +3 -2
- data/lib/feature_selection/base.rb +107 -27
- data/lib/feature_selection/chi_squared.rb +1 -1
- data/lib/feature_selection/mutual_information.rb +25 -9
- data/spec/feature_selection/chi_squared_spec.rb +4 -0
- data/spec/feature_selection/mutual_information_spec.rb +4 -0
- data/spec/spec_helper.rb +1 -1
- metadata +3 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.0
|
1
|
+
0.0.1
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../lib/feature_selection')
|
2
|
+
require 'benchmark'
|
3
|
+
|
4
|
+
data = {:ham => [], :spam => []}
|
5
|
+
|
6
|
+
1000.times do
|
7
|
+
a = rand(999).to_s
|
8
|
+
|
9
|
+
data[:ham] << [a] * 5
|
10
|
+
data[:spam] << [a] * 5
|
11
|
+
end
|
12
|
+
|
13
|
+
Benchmark.bm do |x|
|
14
|
+
x.report do
|
15
|
+
FeatureSelection::MutualInformation.new(data).rank_features
|
16
|
+
end
|
17
|
+
end
|
data/feature_selection.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{feature_selection}
|
8
|
-
s.version = "0.0.0"
|
8
|
+
s.version = "0.0.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["reddavis"]
|
12
|
-
s.date = %q{2010-01-
|
12
|
+
s.date = %q{2010-01-07}
|
13
13
|
s.description = %q{A library of feature selection algorithms}
|
14
14
|
s.email = %q{reddavis@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -23,6 +23,7 @@ Gem::Specification.new do |s|
|
|
23
23
|
"README.rdoc",
|
24
24
|
"Rakefile",
|
25
25
|
"VERSION",
|
26
|
+
"benchmark/benchmark.rb",
|
26
27
|
"feature_selection.gemspec",
|
27
28
|
"lib/feature_selection.rb",
|
28
29
|
"lib/feature_selection/base.rb",
|
@@ -13,54 +13,129 @@ module FeatureSelection
|
|
13
13
|
|
14
14
|
# Contains term and belongs to class
|
15
15
|
def n_1_1(term, klass)
|
16
|
-
|
16
|
+
@n_1_1 = pre_compute_n_1_1 unless @n_1_1
|
17
|
+
|
18
|
+
@n_1_1[klass][term]
|
19
|
+
end
|
20
|
+
|
21
|
+
def pre_compute_n_1_1
|
22
|
+
results = {}
|
17
23
|
|
18
|
-
|
19
|
-
|
24
|
+
classes.each do |q_klass|
|
25
|
+
results[q_klass] = {}
|
26
|
+
|
27
|
+
uniq_terms.each do |term|
|
28
|
+
count = 0.0
|
29
|
+
|
30
|
+
@data.each_pair do |klass, documents|
|
31
|
+
if klass == q_klass
|
32
|
+
documents.each do |document|
|
33
|
+
count += 1 if document.include?(term)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
results[q_klass][term] = count
|
39
|
+
end
|
20
40
|
end
|
21
|
-
|
22
|
-
|
41
|
+
|
42
|
+
results
|
23
43
|
end
|
24
44
|
|
25
45
|
# Contains term but does not belong to class
|
26
46
|
def n_1_0(term, klass)
|
27
|
-
|
47
|
+
@n_1_0 = pre_compute_n_1_0 unless @n_1_0
|
48
|
+
|
49
|
+
@n_1_0[klass][term]
|
50
|
+
end
|
51
|
+
|
52
|
+
# Pre-computes all n_1_0 queries
|
53
|
+
def pre_compute_n_1_0
|
54
|
+
results = {}
|
28
55
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
56
|
+
classes.each do |q_klass|
|
57
|
+
results[q_klass] = {}
|
58
|
+
|
59
|
+
uniq_terms.each do |term|
|
60
|
+
count = 0.0
|
61
|
+
|
62
|
+
@data.each_pair do |klass, documents|
|
63
|
+
if klass != q_klass
|
64
|
+
documents.each do |document|
|
65
|
+
count += 1 if document.include?(term)
|
66
|
+
end
|
67
|
+
end
|
33
68
|
end
|
69
|
+
|
70
|
+
results[q_klass][term] = count
|
34
71
|
end
|
35
72
|
end
|
36
|
-
|
37
|
-
|
73
|
+
|
74
|
+
results
|
38
75
|
end
|
39
76
|
|
40
77
|
# Does not contain term but belongs to class
|
41
78
|
def n_0_1(term, klass)
|
42
|
-
|
79
|
+
@n_0_1 = pre_compute_n_0_1 unless @n_0_1
|
80
|
+
|
81
|
+
@n_0_1[klass][term]
|
82
|
+
end
|
83
|
+
|
84
|
+
# Pre-computes all n_0_1 queries
|
85
|
+
def pre_compute_n_0_1
|
86
|
+
results = {}
|
43
87
|
|
44
|
-
|
45
|
-
|
88
|
+
classes.each do |q_klass|
|
89
|
+
results[q_klass] = {}
|
90
|
+
|
91
|
+
uniq_terms.each do |term|
|
92
|
+
count = 0.0
|
93
|
+
|
94
|
+
@data.each_pair do |klass, documents|
|
95
|
+
if klass == q_klass
|
96
|
+
documents.each do |document|
|
97
|
+
count += 1 if !document.include?(term)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
results[q_klass][term] = count
|
103
|
+
end
|
46
104
|
end
|
47
|
-
|
48
|
-
|
105
|
+
|
106
|
+
results
|
49
107
|
end
|
50
108
|
|
51
109
|
# Does not contain term and does not belong to class
|
52
110
|
def n_0_0(term, klass)
|
53
|
-
|
111
|
+
@n_0_0 = pre_compute_n_0_0 unless @n_0_0
|
112
|
+
|
113
|
+
@n_0_0[klass][term]
|
114
|
+
end
|
115
|
+
|
116
|
+
# Pre-Computes all n_0_0 queries
|
117
|
+
def pre_compute_n_0_0
|
118
|
+
results = {}
|
54
119
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
120
|
+
classes.each do |q_klass|
|
121
|
+
results[q_klass] = {}
|
122
|
+
|
123
|
+
uniq_terms.each do |term|
|
124
|
+
count = 0.0
|
125
|
+
|
126
|
+
@data.each_pair do |klass, documents|
|
127
|
+
if klass != q_klass
|
128
|
+
documents.each do |document|
|
129
|
+
count += 1 if !document.include?(term)
|
130
|
+
end
|
131
|
+
end
|
59
132
|
end
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
133
|
+
|
134
|
+
results[q_klass][term] = count
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
results
|
64
139
|
end
|
65
140
|
|
66
141
|
# All of the counts added together
|
@@ -76,9 +151,14 @@ module FeatureSelection
|
|
76
151
|
def find_all_classes
|
77
152
|
@data.map {|x| x[0]}
|
78
153
|
end
|
154
|
+
|
155
|
+
def uniq_terms
|
156
|
+
@uniq_terms ||= @data.map {|x| x[1]}.flatten.uniq
|
157
|
+
end
|
79
158
|
|
80
159
|
def terms
|
81
|
-
@data.map {|x| x[1]}.flatten
|
160
|
+
@terms ||= @data.map {|x| x[1]}.flatten
|
82
161
|
end
|
162
|
+
|
83
163
|
end
|
84
164
|
end
|
@@ -24,7 +24,7 @@ module FeatureSelection
|
|
24
24
|
classes.each do |klass|
|
25
25
|
@results[klass] = {}
|
26
26
|
|
27
|
-
|
27
|
+
uniq_terms.each do |term|
|
28
28
|
answer = calculate_contribution(term, klass)
|
29
29
|
@results[klass][term] = answer
|
30
30
|
end #terms.each
|
@@ -49,28 +49,44 @@ module FeatureSelection
|
|
49
49
|
begin
|
50
50
|
if t == 1 && c == 1
|
51
51
|
n_1_1 = n_1_1(term, klass)
|
52
|
-
|
52
|
+
|
53
|
+
# return 0 if a == 0
|
54
|
+
a = (n * n_1_1) / ((n_0_1 + n_1_1) * (n_1_1 + n_1_0))
|
55
|
+
return 0.0 if a == 0
|
56
|
+
|
53
57
|
n_1_1 / n *
|
54
|
-
Math.log(
|
58
|
+
Math.log(a)
|
55
59
|
elsif t == 1 && c == 0
|
56
60
|
n_1_1 = n_1_1(term, klass)
|
57
61
|
n_0_0 = n_0_0(term, klass)
|
58
|
-
|
62
|
+
|
63
|
+
# return 0 if a == 0
|
64
|
+
a = (n * n_1_0) / ((n_1_1 + n_0_1) * (n_0_1 + n_0_0))
|
65
|
+
return 0.0 if a == 0
|
66
|
+
|
59
67
|
n_1_0 / n *
|
60
|
-
Math.log(
|
68
|
+
Math.log(a)
|
61
69
|
elsif t == 0 && c == 1
|
62
70
|
n_0_0 = n_0_0(term, klass)
|
63
71
|
n_1_1 = n_1_1(term, klass)
|
64
|
-
|
72
|
+
|
73
|
+
# return 0 if a == 0
|
74
|
+
a = (n * n_0_1) / ((n_1_0 + n_0_0) * (n_1_1 + n_1_0))
|
75
|
+
return 0.0 if a == 0
|
76
|
+
|
65
77
|
n_0_1 / n *
|
66
|
-
Math.log(
|
78
|
+
Math.log(a)
|
67
79
|
elsif t == 0 && c == 0
|
68
80
|
n_0_0 = n_0_0(term, klass)
|
81
|
+
|
82
|
+
# return 0 if a == 0
|
83
|
+
a = (n * n_0_0) / ((n_1_0 + n_0_0) * (n_0_1 + n_0_0))
|
84
|
+
return 0.0 if a == 0
|
69
85
|
|
70
86
|
n_0_0 / n *
|
71
|
-
Math.log(
|
87
|
+
Math.log(a)
|
72
88
|
end
|
73
|
-
rescue ZeroDivisionError, Errno::EDOM
|
89
|
+
rescue ZeroDivisionError, Errno::EDOM #1.9 Infinity
|
74
90
|
0.0
|
75
91
|
end
|
76
92
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -6,7 +6,7 @@ require 'spec/autorun'
|
|
6
6
|
|
7
7
|
def data
|
8
8
|
{
|
9
|
-
:spam => [['this', 'is', 'some', 'information'], ['this', 'is', 'something', 'that', 'is', 'information']],
|
9
|
+
:spam => [['this', 'is', 'some', 'yer', 'information'], ['this', 'is', 'something', 'that', 'is', 'information']],
|
10
10
|
:ham => [['this', 'test', 'some', 'more', 'information'], ['there', 'are', 'some', 'things']],
|
11
11
|
}
|
12
12
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feature_selection
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.0
|
4
|
+
version: 0.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- reddavis
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-01-
|
12
|
+
date: 2010-01-07 00:00:00 +00:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -38,6 +38,7 @@ files:
|
|
38
38
|
- README.rdoc
|
39
39
|
- Rakefile
|
40
40
|
- VERSION
|
41
|
+
- benchmark/benchmark.rb
|
41
42
|
- feature_selection.gemspec
|
42
43
|
- lib/feature_selection.rb
|
43
44
|
- lib/feature_selection/base.rb
|