otherinbox-classifier 1.3.1.1 → 1.3.1.2.20121218

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,6 +28,7 @@ class Bayes
28
28
  @categories[category][word] += count
29
29
  @total_words += count
30
30
  end
31
+ reset_correct_counts!
31
32
  end
32
33
 
33
34
  #
@@ -55,6 +56,7 @@ class Bayes
55
56
  @total_words -= count
56
57
  end
57
58
  end
59
+ reset_correct_counts!
58
60
  end
59
61
 
60
62
  #
@@ -80,24 +82,28 @@ class Bayes
80
82
  myclassify_with_word_hash(text.word_hash)
81
83
  end
82
84
 
83
- # http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
84
- def myclassify_with_word_hash(word_hash)
85
+ def myclassify_with_word_hash(word_hash, debugging_info = nil)
85
86
  member_term_count = @categories[:Member].size
86
87
  nonmember_term_count = @categories[:"Not member"].size
88
+
87
89
  term_count = member_term_count + nonmember_term_count
88
90
  score = 0
89
91
  word_hash.each do |word, count|
90
92
  # count of words in each category
91
- member_count = @categories[:Member][word].to_i + 1
92
- nonmember_count = @categories[:"Not member"][word].to_i + 1
93
- next if member_count.to_i == 1 && nonmember_count.to_i == 1
93
+ member_count = @categories[:Member][word].to_i + 0.1
94
+ nonmember_count = @categories[:"Not member"][word].to_i + 0.1
95
+ next if member_count == 0.1 && nonmember_count == 0.1
94
96
 
95
97
  # find relative prob word is in class -- p(w|c)
96
- word_member_p = (member_count) / (total_member_count + term_count).to_f
97
- word_nonmember_p = (nonmember_count) / (total_nonmember_count + term_count).to_f
98
+ word_member_p = (member_count) / (total_member_count_correct + term_count).to_f
99
+ word_nonmember_p = (nonmember_count) / (total_nonmember_count_correct + term_count).to_f
98
100
 
99
101
  word_pr = Math.log(word_member_p / word_nonmember_p)
100
102
  score += word_pr * count
103
+ if debugging_info
104
+ debugging_info[word] = word_pr * count
105
+ end
106
+ #print "#{word_pr * count}: #{word}\n"
101
107
  end
102
108
  if score > 0
103
109
  return "Member", score
@@ -155,10 +161,34 @@ class Bayes
155
161
  def add_category(category)
156
162
  @categories[category.prepare_category_name] = Hash.new
157
163
  end
164
+
165
+ def remove_low_frequency_words(threshold = 5)
166
+ @categories.each do |_, word_counts|
167
+ word_counts.to_a.each do |word, count|
168
+ if count < threshold
169
+ word_counts.delete(word)
170
+ end
171
+ end
172
+ end
173
+ reset_correct_counts!
174
+ end
158
175
 
159
176
  alias append_category add_category
160
177
 
161
178
  private
179
+ def reset_correct_counts!
180
+ @total_member_count_correct = nil
181
+ @total_nonmember_count_correct = nil
182
+ end
183
+
184
+ def total_member_count_correct
185
+ @total_member_count_correct ||= @categories[:Member].values.inject(0) {|sum, element| sum+element}
186
+ end
187
+
188
+ def total_nonmember_count_correct
189
+ @total_nonmember_count_correct ||= @categories[:"Not member"].values.inject(0) {|sum, element| sum+element}
190
+ end
191
+
162
192
  def total_member_count
163
193
  @total_member_count ||= @categories[:Member].values.inject(0) {|sum, element| sum+element}
164
194
  end
@@ -94,6 +94,7 @@ class String
94
94
  "dont",
95
95
  "ever",
96
96
  "first",
97
+ "for",
97
98
  "from",
98
99
  "have",
99
100
  "her",
@@ -150,5 +151,6 @@ class String
150
151
  "yes",
151
152
  "you",
152
153
  "youll",
154
+ "your"
153
155
  ].to_set
154
156
  end
metadata CHANGED
@@ -1,85 +1,80 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: otherinbox-classifier
3
- version: !ruby/object:Gem::Version
4
- version: 1.3.1.1
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.3.1.2.20121218
5
+ prerelease:
5
6
  platform: ruby
6
- authors:
7
+ authors:
7
8
  - Lucas Carlson
8
9
  autorequire: classifier
9
10
  bindir: bin
10
- cert_chain:
11
- date: 2008-01-19 00:00:00 -08:00
12
- default_executable:
13
- dependencies:
14
- - !ruby/object:Gem::Dependency
11
+ cert_chain: []
12
+ date: 2008-01-19 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
15
  name: stemmer
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 1.0.0
16
22
  type: :runtime
17
- version_requirement:
18
- version_requirements: !ruby/object:Gem::Requirement
19
- requirements:
20
- - - ">="
21
- - !ruby/object:Gem::Version
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
22
29
  version: 1.0.0
23
- version:
24
30
  description: A general classifier module to allow Bayesian and other types of classifications.
25
31
  email: lucas@rufy.com
26
32
  executables: []
27
-
28
33
  extensions: []
29
-
30
34
  extra_rdoc_files: []
31
-
32
- files:
35
+ files:
33
36
  - lib/classifier.rb
34
- - lib/classifier
35
37
  - lib/classifier/bayes.rb
36
38
  - lib/classifier/lsi.rb
37
- - lib/classifier/extensions
38
39
  - lib/classifier/extensions/string.rb
39
40
  - lib/classifier/extensions/vector.rb
40
41
  - lib/classifier/extensions/vector_serialize.rb
41
42
  - lib/classifier/extensions/word_hash.rb
42
- - lib/classifier/lsi
43
43
  - lib/classifier/lsi/content_node.rb
44
44
  - lib/classifier/lsi/summary.rb
45
45
  - lib/classifier/lsi/word_list.rb
46
46
  - bin/bayes.rb
47
47
  - bin/summarize.rb
48
- - test/bayes
49
48
  - test/bayes/bayesian_test.rb
50
49
  - test/test_helper.rb
51
- - test/extensions
52
50
  - test/extensions/word_hash_test.rb
53
- - test/lsi
54
51
  - test/lsi/lsi_test.rb
55
52
  - README
56
53
  - Rakefile
57
54
  - LICENSE
58
- has_rdoc: true
59
55
  homepage: http://classifier.rufy.com/
56
+ licenses: []
60
57
  post_install_message:
61
58
  rdoc_options: []
62
-
63
- require_paths:
59
+ require_paths:
64
60
  - lib
65
- required_ruby_version: !ruby/object:Gem::Requirement
66
- requirements:
67
- - - ">"
68
- - !ruby/object:Gem::Version
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ! '>'
65
+ - !ruby/object:Gem::Version
69
66
  version: 0.0.0
70
- version:
71
- required_rubygems_version: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- version: "0"
76
- version:
77
- requirements:
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ none: false
69
+ requirements:
70
+ - - ! '>='
71
+ - !ruby/object:Gem::Version
72
+ version: '0'
73
+ requirements:
78
74
  - A porter-stemmer module to split word stems.
79
75
  rubyforge_project:
80
- rubygems_version: 1.2.0
76
+ rubygems_version: 1.8.24
81
77
  signing_key:
82
78
  specification_version: 1
83
79
  summary: A general classifier module to allow Bayesian and other types of classifications.
84
80
  test_files: []
85
-