otherinbox-classifier 1.3.1.1 → 1.3.1.2.20121218

Sign up to get free protection for your applications and to get access to all the features.
@@ -28,6 +28,7 @@ class Bayes
28
28
  @categories[category][word] += count
29
29
  @total_words += count
30
30
  end
31
+ reset_correct_counts!
31
32
  end
32
33
 
33
34
  #
@@ -55,6 +56,7 @@ class Bayes
55
56
  @total_words -= count
56
57
  end
57
58
  end
59
+ reset_correct_counts!
58
60
  end
59
61
 
60
62
  #
@@ -80,24 +82,28 @@ class Bayes
80
82
  myclassify_with_word_hash(text.word_hash)
81
83
  end
82
84
 
83
- # http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
84
- def myclassify_with_word_hash(word_hash)
85
+ def myclassify_with_word_hash(word_hash, debugging_info = nil)
85
86
  member_term_count = @categories[:Member].size
86
87
  nonmember_term_count = @categories[:"Not member"].size
88
+
87
89
  term_count = member_term_count + nonmember_term_count
88
90
  score = 0
89
91
  word_hash.each do |word, count|
90
92
  # count of words in each category
91
- member_count = @categories[:Member][word].to_i + 1
92
- nonmember_count = @categories[:"Not member"][word].to_i + 1
93
- next if member_count.to_i == 1 && nonmember_count.to_i == 1
93
+ member_count = @categories[:Member][word].to_i + 0.1
94
+ nonmember_count = @categories[:"Not member"][word].to_i + 0.1
95
+ next if member_count == 0.1 && nonmember_count == 0.1
94
96
 
95
97
  # find relative prob word is in class -- p(w|c)
96
- word_member_p = (member_count) / (total_member_count + term_count).to_f
97
- word_nonmember_p = (nonmember_count) / (total_nonmember_count + term_count).to_f
98
+ word_member_p = (member_count) / (total_member_count_correct + term_count).to_f
99
+ word_nonmember_p = (nonmember_count) / (total_nonmember_count_correct + term_count).to_f
98
100
 
99
101
  word_pr = Math.log(word_member_p / word_nonmember_p)
100
102
  score += word_pr * count
103
+ if debugging_info
104
+ debugging_info[word] = word_pr * count
105
+ end
106
+ #print "#{word_pr * count}: #{word}\n"
101
107
  end
102
108
  if score > 0
103
109
  return "Member", score
@@ -155,10 +161,34 @@ class Bayes
155
161
  def add_category(category)
156
162
  @categories[category.prepare_category_name] = Hash.new
157
163
  end
164
+
165
+ def remove_low_frequency_words(threshold = 5)
166
+ @categories.each do |_, word_counts|
167
+ word_counts.to_a.each do |word, count|
168
+ if count < threshold
169
+ word_counts.delete(word)
170
+ end
171
+ end
172
+ end
173
+ reset_correct_counts!
174
+ end
158
175
 
159
176
  alias append_category add_category
160
177
 
161
178
  private
179
+ def reset_correct_counts!
180
+ @total_member_count_correct = nil
181
+ @total_nonmember_count_correct = nil
182
+ end
183
+
184
+ def total_member_count_correct
185
+ @total_member_count_correct ||= @categories[:Member].values.inject(0) {|sum, element| sum+element}
186
+ end
187
+
188
+ def total_nonmember_count_correct
189
+ @total_nonmember_count_correct ||= @categories[:"Not member"].values.inject(0) {|sum, element| sum+element}
190
+ end
191
+
162
192
  def total_member_count
163
193
  @total_member_count ||= @categories[:Member].values.inject(0) {|sum, element| sum+element}
164
194
  end
@@ -94,6 +94,7 @@ class String
94
94
  "dont",
95
95
  "ever",
96
96
  "first",
97
+ "for",
97
98
  "from",
98
99
  "have",
99
100
  "her",
@@ -150,5 +151,6 @@ class String
150
151
  "yes",
151
152
  "you",
152
153
  "youll",
154
+ "your"
153
155
  ].to_set
154
156
  end
metadata CHANGED
@@ -1,85 +1,80 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: otherinbox-classifier
3
- version: !ruby/object:Gem::Version
4
- version: 1.3.1.1
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.3.1.2.20121218
5
+ prerelease:
5
6
  platform: ruby
6
- authors:
7
+ authors:
7
8
  - Lucas Carlson
8
9
  autorequire: classifier
9
10
  bindir: bin
10
- cert_chain:
11
- date: 2008-01-19 00:00:00 -08:00
12
- default_executable:
13
- dependencies:
14
- - !ruby/object:Gem::Dependency
11
+ cert_chain: []
12
+ date: 2008-01-19 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
15
  name: stemmer
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 1.0.0
16
22
  type: :runtime
17
- version_requirement:
18
- version_requirements: !ruby/object:Gem::Requirement
19
- requirements:
20
- - - ">="
21
- - !ruby/object:Gem::Version
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
22
29
  version: 1.0.0
23
- version:
24
30
  description: A general classifier module to allow Bayesian and other types of classifications.
25
31
  email: lucas@rufy.com
26
32
  executables: []
27
-
28
33
  extensions: []
29
-
30
34
  extra_rdoc_files: []
31
-
32
- files:
35
+ files:
33
36
  - lib/classifier.rb
34
- - lib/classifier
35
37
  - lib/classifier/bayes.rb
36
38
  - lib/classifier/lsi.rb
37
- - lib/classifier/extensions
38
39
  - lib/classifier/extensions/string.rb
39
40
  - lib/classifier/extensions/vector.rb
40
41
  - lib/classifier/extensions/vector_serialize.rb
41
42
  - lib/classifier/extensions/word_hash.rb
42
- - lib/classifier/lsi
43
43
  - lib/classifier/lsi/content_node.rb
44
44
  - lib/classifier/lsi/summary.rb
45
45
  - lib/classifier/lsi/word_list.rb
46
46
  - bin/bayes.rb
47
47
  - bin/summarize.rb
48
- - test/bayes
49
48
  - test/bayes/bayesian_test.rb
50
49
  - test/test_helper.rb
51
- - test/extensions
52
50
  - test/extensions/word_hash_test.rb
53
- - test/lsi
54
51
  - test/lsi/lsi_test.rb
55
52
  - README
56
53
  - Rakefile
57
54
  - LICENSE
58
- has_rdoc: true
59
55
  homepage: http://classifier.rufy.com/
56
+ licenses: []
60
57
  post_install_message:
61
58
  rdoc_options: []
62
-
63
- require_paths:
59
+ require_paths:
64
60
  - lib
65
- required_ruby_version: !ruby/object:Gem::Requirement
66
- requirements:
67
- - - ">"
68
- - !ruby/object:Gem::Version
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ! '>'
65
+ - !ruby/object:Gem::Version
69
66
  version: 0.0.0
70
- version:
71
- required_rubygems_version: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- version: "0"
76
- version:
77
- requirements:
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ none: false
69
+ requirements:
70
+ - - ! '>='
71
+ - !ruby/object:Gem::Version
72
+ version: '0'
73
+ requirements:
78
74
  - A porter-stemmer module to split word stems.
79
75
  rubyforge_project:
80
- rubygems_version: 1.2.0
76
+ rubygems_version: 1.8.24
81
77
  signing_key:
82
78
  specification_version: 1
83
79
  summary: A general classifier module to allow Bayesian and other types of classifications.
84
80
  test_files: []
85
-