otherinbox-classifier 1.3.1.1 → 1.3.1.2.20121218
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/classifier/bayes.rb +37 -7
- data/lib/classifier/extensions/word_hash.rb +2 -0
- metadata +37 -42
data/lib/classifier/bayes.rb
CHANGED
@@ -28,6 +28,7 @@ class Bayes
|
|
28
28
|
@categories[category][word] += count
|
29
29
|
@total_words += count
|
30
30
|
end
|
31
|
+
reset_correct_counts!
|
31
32
|
end
|
32
33
|
|
33
34
|
#
|
@@ -55,6 +56,7 @@ class Bayes
|
|
55
56
|
@total_words -= count
|
56
57
|
end
|
57
58
|
end
|
59
|
+
reset_correct_counts!
|
58
60
|
end
|
59
61
|
|
60
62
|
#
|
@@ -80,24 +82,28 @@ class Bayes
|
|
80
82
|
myclassify_with_word_hash(text.word_hash)
|
81
83
|
end
|
82
84
|
|
83
|
-
|
84
|
-
def myclassify_with_word_hash(word_hash)
|
85
|
+
def myclassify_with_word_hash(word_hash, debugging_info = nil)
|
85
86
|
member_term_count = @categories[:Member].size
|
86
87
|
nonmember_term_count = @categories[:"Not member"].size
|
88
|
+
|
87
89
|
term_count = member_term_count + nonmember_term_count
|
88
90
|
score = 0
|
89
91
|
word_hash.each do |word, count|
|
90
92
|
# count of words in each category
|
91
|
-
member_count = @categories[:Member][word].to_i + 1
|
92
|
-
nonmember_count = @categories[:"Not member"][word].to_i + 1
|
93
|
-
next if member_count
|
93
|
+
member_count = @categories[:Member][word].to_i + 0.1
|
94
|
+
nonmember_count = @categories[:"Not member"][word].to_i + 0.1
|
95
|
+
next if member_count == 0.1 && nonmember_count == 0.1
|
94
96
|
|
95
97
|
# find relative prob word is in class -- p(w|c)
|
96
|
-
word_member_p = (member_count) / (
|
97
|
-
word_nonmember_p = (nonmember_count) / (
|
98
|
+
word_member_p = (member_count) / (total_member_count_correct + term_count).to_f
|
99
|
+
word_nonmember_p = (nonmember_count) / (total_nonmember_count_correct + term_count).to_f
|
98
100
|
|
99
101
|
word_pr = Math.log(word_member_p / word_nonmember_p)
|
100
102
|
score += word_pr * count
|
103
|
+
if debugging_info
|
104
|
+
debugging_info[word] = word_pr * count
|
105
|
+
end
|
106
|
+
#print "#{word_pr * count}: #{word}\n"
|
101
107
|
end
|
102
108
|
if score > 0
|
103
109
|
return "Member", score
|
@@ -155,10 +161,34 @@ class Bayes
|
|
155
161
|
def add_category(category)
|
156
162
|
@categories[category.prepare_category_name] = Hash.new
|
157
163
|
end
|
164
|
+
|
165
|
+
def remove_low_frequency_words(threshold = 5)
|
166
|
+
@categories.each do |_, word_counts|
|
167
|
+
word_counts.to_a.each do |word, count|
|
168
|
+
if count < threshold
|
169
|
+
word_counts.delete(word)
|
170
|
+
end
|
171
|
+
end
|
172
|
+
end
|
173
|
+
reset_correct_counts!
|
174
|
+
end
|
158
175
|
|
159
176
|
alias append_category add_category
|
160
177
|
|
161
178
|
private
|
179
|
+
def reset_correct_counts!
|
180
|
+
@total_member_count_correct = nil
|
181
|
+
@total_nonmember_count_correct = nil
|
182
|
+
end
|
183
|
+
|
184
|
+
def total_member_count_correct
|
185
|
+
@total_member_count_correct ||= @categories[:Member].values.inject(0) {|sum, element| sum+element}
|
186
|
+
end
|
187
|
+
|
188
|
+
def total_nonmember_count_correct
|
189
|
+
@total_nonmember_count_correct ||= @categories[:"Not member"].values.inject(0) {|sum, element| sum+element}
|
190
|
+
end
|
191
|
+
|
162
192
|
def total_member_count
|
163
193
|
@total_member_count ||= @categories[:Member].values.inject(0) {|sum, element| sum+element}
|
164
194
|
end
|
metadata
CHANGED
@@ -1,85 +1,80 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: otherinbox-classifier
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.1.1
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.3.1.2.20121218
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
|
-
authors:
|
7
|
+
authors:
|
7
8
|
- Lucas Carlson
|
8
9
|
autorequire: classifier
|
9
10
|
bindir: bin
|
10
|
-
cert_chain:
|
11
|
-
date: 2008-01-19 00:00:00
|
12
|
-
|
13
|
-
|
14
|
-
- !ruby/object:Gem::Dependency
|
11
|
+
cert_chain: []
|
12
|
+
date: 2008-01-19 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
15
|
name: stemmer
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.0.0
|
16
22
|
type: :runtime
|
17
|
-
|
18
|
-
version_requirements: !ruby/object:Gem::Requirement
|
19
|
-
|
20
|
-
|
21
|
-
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
22
29
|
version: 1.0.0
|
23
|
-
version:
|
24
30
|
description: A general classifier module to allow Bayesian and other types of classifications.
|
25
31
|
email: lucas@rufy.com
|
26
32
|
executables: []
|
27
|
-
|
28
33
|
extensions: []
|
29
|
-
|
30
34
|
extra_rdoc_files: []
|
31
|
-
|
32
|
-
files:
|
35
|
+
files:
|
33
36
|
- lib/classifier.rb
|
34
|
-
- lib/classifier
|
35
37
|
- lib/classifier/bayes.rb
|
36
38
|
- lib/classifier/lsi.rb
|
37
|
-
- lib/classifier/extensions
|
38
39
|
- lib/classifier/extensions/string.rb
|
39
40
|
- lib/classifier/extensions/vector.rb
|
40
41
|
- lib/classifier/extensions/vector_serialize.rb
|
41
42
|
- lib/classifier/extensions/word_hash.rb
|
42
|
-
- lib/classifier/lsi
|
43
43
|
- lib/classifier/lsi/content_node.rb
|
44
44
|
- lib/classifier/lsi/summary.rb
|
45
45
|
- lib/classifier/lsi/word_list.rb
|
46
46
|
- bin/bayes.rb
|
47
47
|
- bin/summarize.rb
|
48
|
-
- test/bayes
|
49
48
|
- test/bayes/bayesian_test.rb
|
50
49
|
- test/test_helper.rb
|
51
|
-
- test/extensions
|
52
50
|
- test/extensions/word_hash_test.rb
|
53
|
-
- test/lsi
|
54
51
|
- test/lsi/lsi_test.rb
|
55
52
|
- README
|
56
53
|
- Rakefile
|
57
54
|
- LICENSE
|
58
|
-
has_rdoc: true
|
59
55
|
homepage: http://classifier.rufy.com/
|
56
|
+
licenses: []
|
60
57
|
post_install_message:
|
61
58
|
rdoc_options: []
|
62
|
-
|
63
|
-
require_paths:
|
59
|
+
require_paths:
|
64
60
|
- lib
|
65
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
66
|
-
|
67
|
-
|
68
|
-
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
none: false
|
63
|
+
requirements:
|
64
|
+
- - ! '>'
|
65
|
+
- !ruby/object:Gem::Version
|
69
66
|
version: 0.0.0
|
70
|
-
|
71
|
-
|
72
|
-
requirements:
|
73
|
-
- -
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version:
|
76
|
-
|
77
|
-
requirements:
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
+
none: false
|
69
|
+
requirements:
|
70
|
+
- - ! '>='
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: '0'
|
73
|
+
requirements:
|
78
74
|
- A porter-stemmer module to split word stems.
|
79
75
|
rubyforge_project:
|
80
|
-
rubygems_version: 1.
|
76
|
+
rubygems_version: 1.8.24
|
81
77
|
signing_key:
|
82
78
|
specification_version: 1
|
83
79
|
summary: A general classifier module to allow Bayesian and other types of classifications.
|
84
80
|
test_files: []
|
85
|
-
|