classifier 1.0

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,33 @@
1
+ == Welcome to Classifier
2
+
3
+ Classifier is a general module to allow Bayesian and other types of classifications.
4
+
5
+ == Usage
6
+ require 'classifier'
7
+ b = Classifier::Bayes.new 'Interesting', 'Uninteresting'
8
+ b.train_interesting "here are some good words. I hope you love them"
9
+ b.train_uninteresting "here are some bad words, I hate you"
10
+ b.classify "I hate bad words and you" # returns 'Uninteresting'
11
+
12
+ require 'madeleine'
13
+ m = SnapshotMadeleine.new("bayes_data") {
14
+ Classifier::Bayes.new 'Interesting', 'Uninteresting'
15
+ }
16
+ m.system.train_interesting "here are some good words. I hope you love them"
17
+ m.system.train_uninteresting "here are some bad words, I hate you"
18
+ m.take_snapshot
19
+ m.system.classify "I love you" # returns 'Interesting'
20
+
21
+ Using Madeleine, your application can persist the learned data over time.
22
+
23
+ == Bayesian Classification
24
+
25
+ * http://www.process.com/precisemail/bayesian_filtering.htm
26
+ * http://en.wikipedia.org/wiki/Bayesian_filtering
27
+ * http://www.paulgraham.com/spam.html
28
+
29
+ == About
30
+
31
+ Author:: Lucas Carlson (mailto:lucas@rufy.com)
32
+ Copyright:: Copyright (c) 2005 Lucas Carlson
33
+ License:: GPL
@@ -0,0 +1,77 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'rake/testtask'
4
+ require 'rake/rdoctask'
5
+ require 'rake/gempackagetask'
6
+ require 'rake/contrib/rubyforgepublisher'
7
+
8
+ PKG_VERSION = "1.0.0"
9
+
10
+ PKG_FILES = FileList[
11
+ "lib/**/*", "bin/*", "test/**/*", "[A-Z]*", "Rakefile"
12
+ ]
13
+
14
+ desc "Default Task"
15
+ task :default => [ :test ]
16
+
17
+ # Run the unit tests
18
+ desc "Run all unit tests"
19
+ Rake::TestTask.new("test") { |t|
20
+ t.libs << "lib"
21
+ t.pattern = 'test/*/*_test.rb'
22
+ t.verbose = true
23
+ }
24
+
25
+ # Generate the RDoc documentation
26
+ desc "Create documentation"
27
+ Rake::RDocTask.new("doc") { |rdoc|
28
+ rdoc.rdoc_dir = 'doc'
29
+ rdoc.title = "Classifier library"
30
+ rdoc.options << '--line-numbers --inline-source --accessor'
31
+ rdoc.rdoc_files.include('README')
32
+ rdoc.rdoc_files.include('lib/**/*.rb')
33
+ }
34
+
35
+ # Generate the package
36
+ spec = Gem::Specification.new do |s|
37
+
38
+ #### Basic information.
39
+
40
+ s.name = 'classifier'
41
+ s.version = "1.0"
42
+ s.summary = <<-EOF
43
+ A general classifier module to allow Bayesian and other types of classifications.
44
+ EOF
45
+ s.description = <<-EOF
46
+ A general classifier module to allow Bayesian and other types of classifications.
47
+ EOF
48
+
49
+ #### Which files are to be included in this gem? Everything! (Except CVS directories.)
50
+
51
+ s.files = PKG_FILES
52
+
53
+ #### Load-time details: library and application (you will need one or both).
54
+
55
+ s.require_path = 'lib'
56
+ s.autorequire = 'classifier'
57
+
58
+ #### Documentation and testing.
59
+
60
+ s.has_rdoc = true
61
+
62
+ #### Author and project details.
63
+
64
+ s.author = "Lucas Carlson"
65
+ s.email = "lucas@rufy.com"
66
+ s.homepage = "http://rubyforge.org/projects/classifier/"
67
+ end
68
+
69
+ Rake::GemPackageTask.new(spec) do |pkg|
70
+ pkg.need_zip = false
71
+ pkg.need_tar = true
72
+ end
73
+
74
+ desc "Publish to RubyForge"
75
+ task :rubyforge do
76
+ Rake::RubyForgePublisher.new('classifier', 'cardmagic').upload
77
+ end
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'classifier'
4
+ require 'madeleine'
5
+
6
# Command-line front end for a persistent Interesting/Uninteresting classifier.
#
#   bayes.rb add interesting   FILE   # train the "interesting" category on FILE
#   bayes.rb add uninteresting FILE   # train the "uninteresting" category on FILE
#   bayes.rb classify FILE            # print the best category for FILE
#
# The trained classifier is stored through Madeleine under ~/.bayes_data and a
# snapshot is taken at the end of every successful run.
m = SnapshotMadeleine.new(File.expand_path("~/.bayes_data")) {
  Classifier::Bayes.new 'Interesting', 'Uninteresting'
}

usage = "Invalid option: choose add [category] [file] or classify [file]"

case ARGV[0]
when "add"
  # to_s guards against a missing category argument (nil has no #downcase).
  category = ARGV[1].to_s.downcase
  path = ARGV[2]

  unless %w[interesting uninteresting].include?(category)
    puts "Invalid category: choose between interesting and uninteresting"
    exit(1)
  end

  if path.nil?
    puts usage
    exit(1)
  end

  # File.read opens, reads and closes the file; File.open(...).read leaked the handle.
  text = File.read(path)
  if category == "interesting"
    m.system.train_interesting text
  else
    m.system.train_uninteresting text
  end
  puts "#{path} has been classified as #{category}"
when "classify"
  if ARGV[1].nil?
    puts usage
    exit(1)
  end
  puts m.system.classify(File.read(ARGV[1]))
else
  puts usage
  exit(1)
end

m.take_snapshot
@@ -0,0 +1,28 @@
1
+ #--
2
+ # Copyright (c) 2005 Lucas Carlson
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
24
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
25
+ # License:: LGPL
26
+
27
+ require 'classifier/string_extensions'
28
+ require 'classifier/bayes'
@@ -0,0 +1,53 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
module Classifier

  # Naive Bayesian text classifier over a fixed set of categories.
  #
  # Categories are given to the constructor; each one is trained through a
  # dynamically dispatched <tt>train_<category></tt> method:
  #
  #   b = Classifier::Bayes.new 'Interesting', 'Uninteresting'
  #   b.train_interesting "here are some good words"
  #   b.classify "good words"   # => "Interesting"
  #
  # Texts must respond to +word_hash+ and return a Hash of word => count.
  class Bayes
    # Per-word score used when a category has no training data at all.
    # Negative infinity (written as a division so it also works on Rubies
    # without Float::INFINITY): a category without evidence can never outrank
    # a trained one. The previous behaviour divided by a zero total and
    # produced +Infinity, which made untrained categories always win.
    NO_EVIDENCE_SCORE = -1.0 / 0.0

    # categories:: category names (Strings); each is capitalized and interned
    #              to form its internal key.
    def initialize(*categories)
      @categories = Hash.new
      categories.each { |category| @categories[category.capitalize.intern] = Hash.new }
      @total_words = 0
    end

    # Returns the name (String) of the highest scoring category for +text+.
    def classify(text)
      (classifications(text).sort { |a, b| b[1] <=> a[1] })[0][0]
    end

    # Returns a Hash mapping every category name (String) to its score for
    # +text+. A score is the sum, over the distinct words of +text+, of
    # log(weight / total), where weight is the trained count of the word in the
    # category (0.1 if the word was never seen there) and total is the number
    # of words the category was trained on. Higher (less negative) is better.
    def classifications(text)
      words = text.word_hash.keys # computed once, not once per category
      score = Hash.new
      @categories.each do |category, category_words|
        total = category_words.values.inject(0) { |sum, element| sum + element }
        score[category.to_s] = words.inject(0) do |sum, word|
          if total.zero?
            sum + NO_EVIDENCE_SCORE
          else
            sum + Math.log(category_words.fetch(word, 0.1) / total.to_f)
          end
        end
      end
      score
    end

    # Dispatches <tt>train_<category>(text, ...)</tt> calls to the matching
    # category, adding the words of every given text to it.
    #
    # Raises StandardError ("No such category: ...") for a train_ call naming
    # an unknown category. Any other unknown message is passed to +super+ and
    # raises NoMethodError (a StandardError subclass), which is what Ruby's
    # implicit-conversion probes such as +to_ary+ expect.
    def method_missing(name, *args)
      category = category_for(name)
      if @categories.has_key? category
        args.each { |text| add_words category, text }
      elsif name.to_s =~ /train_([\w]+)/
        raise StandardError, "No such category: #{category}"
      else
        super
      end
    end

    # Keeps +respond_to?+ in agreement with +method_missing+. The defined?
    # guard covers instances that were allocated without +initialize+ having
    # run yet (deserializers may probe such objects).
    def respond_to_missing?(name, include_private = false)
      (defined?(@categories) && @categories.has_key?(category_for(name))) || super
    end

    private

    # Maps a message name to the category key it would train
    # (e.g. :train_interesting => :Interesting).
    def category_for(name)
      name.to_s.gsub(/train_([\w]+)/, '\1').capitalize.intern
    end

    # Adds the word counts of +text+ to +category+ and to the global word total.
    def add_words(category, text)
      text.word_hash.each do |word, count|
        @categories[category][word] ||= 0
        @categories[category][word] += count
        @total_words += count
      end
    end
  end

end
@@ -0,0 +1,11 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ require 'classifier/string_extensions/porter_stemmer'
6
+ require 'classifier/string_extensions/word_hash'
7
+
8
# Extend every String with the classifier helpers: the stemming methods from
# Classifier::Stemmable and the word-counting methods from Classifier::WordHash.
String.class_eval do
  include Classifier::Stemmable
  include Classifier::WordHash
end
@@ -0,0 +1,188 @@
1
module Classifier

  # Porter stemmer in Ruby.
  #
  # This is the Porter stemming algorithm, ported to Ruby from the version
  # coded up in Perl. It's easy to follow against the rules in the original
  # paper in:
  #
  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  #   no. 3, pp 130-137,
  #
  # See also http://www.tartarus.org/~martin/PorterStemmer
  #
  # Send comments to raypereda@hotmail.com
  module Stemmable

    # Step 2: suffix => replacement, applied when the remaining stem has m > 0.
    STEP_2_LIST = {
      'ational' => 'ate', 'tional' => 'tion', 'enci' => 'ence', 'anci' => 'ance',
      'izer' => 'ize', 'bli' => 'ble',
      'alli' => 'al', 'entli' => 'ent', 'eli' => 'e', 'ousli' => 'ous',
      'ization' => 'ize', 'ation' => 'ate',
      'ator' => 'ate', 'alism' => 'al', 'iveness' => 'ive', 'fulness' => 'ful',
      'ousness' => 'ous', 'aliti' => 'al',
      'iviti' => 'ive', 'biliti' => 'ble', 'logi' => 'log'
    }

    # Step 3: suffix => replacement, applied when the remaining stem has m > 0.
    STEP_3_LIST = {
      'icate' => 'ic', 'ative' => '', 'alize' => 'al', 'iciti' => 'ic',
      'ical' => 'ic', 'ful' => '', 'ness' => ''
    }

    # Suffixes handled by step 2 (the keys of STEP_2_LIST).
    SUFFIX_1_REGEXP = /(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/

    # Suffixes removed by step 4 when the remaining stem has m > 1.
    SUFFIX_2_REGEXP = /(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/

    C = "[^aeiou]"              # consonant
    V = "[aeiouy]"              # vowel
    CC = "#{C}(?>[^aeiouy]*)"   # consonant sequence
    VV = "#{V}(?>[aeiou]*)"     # vowel sequence

    MGR0 = /^(#{CC})?#{VV}#{CC}/                 # [cc]vvcc... is m>0
    MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/        # [cc]vvcc[vv] is m=1
    MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/       # [cc]vvccvvcc... is m>1
    VOWEL_IN_STEM = /^(#{CC})?#{V}/              # vowel in stem

    # consonant sequence + vowel + a final consonant other than w, x or y
    CVC_ENDING = /^#{CC}#{V}[^aeiouwxy]$/

    # Returns the Porter stem of the receiver as a new String; the receiver is
    # left untouched. Words shorter than three characters are returned as is.
    def stem_porter
      word = dup.to_str
      return word if word.length < 3

      # Map an initial y to Y so that the patterns never treat it as a vowel.
      word[0] = 'Y' if word[0] == ?y

      # Step 1a
      if (m = /(ss|i)es$/.match(word)) || (m = /([^s])s$/.match(word))
        word = m.pre_match + m[1]
      end

      # Step 1b
      if (m = /eed$/.match(word))
        word = word.chop if m.pre_match =~ MGR0
      elsif (m = /(ed|ing)$/.match(word))
        stem = m.pre_match
        if stem =~ VOWEL_IN_STEM
          word =
            if stem =~ /(at|bl|iz)$/
              stem + "e"
            elsif stem =~ /([^aeiouylsz])\1$/
              stem.chop
            elsif stem =~ CVC_ENDING
              stem + "e"
            else
              stem
            end
        end
      end

      # Trailing y becomes i when the stem contains a vowel.
      if (m = /y$/.match(word))
        stem = m.pre_match
        word = stem + "i" if stem =~ VOWEL_IN_STEM
      end

      # Step 2
      if (m = SUFFIX_1_REGEXP.match(word))
        stem = m.pre_match
        word = stem + STEP_2_LIST[m[1]] if stem =~ MGR0
      end

      # Step 3
      if (m = /(icate|ative|alize|iciti|ical|ful|ness)$/.match(word))
        stem = m.pre_match
        word = stem + STEP_3_LIST[m[1]] if stem =~ MGR0
      end

      # Step 4
      if (m = SUFFIX_2_REGEXP.match(word))
        stem = m.pre_match
        word = stem if stem =~ MGR1
      elsif (m = /(s|t)(ion)$/.match(word))
        stem = m.pre_match + m[1]
        word = stem if stem =~ MGR1
      end

      # Step 5
      if (m = /e$/.match(word))
        stem = m.pre_match
        if stem =~ MGR1 || (stem =~ MEQ1 && stem !~ CVC_ENDING)
          word = stem
        end
      end

      word = word.chop if word =~ /ll$/ && word =~ MGR1

      # Turn the initial Y back into y.
      word[0] = 'y' if word[0] == ?Y

      word
    end

    #
    # Make stem_porter the default stem method, just in case we feel like
    # having multiple stemmers available later.
    #
    alias stem stem_porter

  end

end
@@ -0,0 +1,24 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
module Classifier

  # Word-counting helpers, meant to be mixed into String. The receiver must
  # also provide +stem+ on the strings it yields (see Classifier::Stemmable).
  module WordHash
    # Returns a copy of the receiver with each listed punctuation character
    # replaced by a single space.
    def without_punctuation
      tr(',?.!;:\'"@#$%^&*()_=+[]{}\|<>/`~', ' ')
    end

    # Returns a Hash of stemmed, downcased token (Symbol) => occurrence count.
    # Tokens are the whitespace-separated words of the punctuation-free text
    # plus the whitespace-separated runs left once every word character has
    # been removed from the receiver (i.e. the punctuation itself).
    def word_hash
      words = without_punctuation.split
      symbols = gsub(/[\w]/, "").split
      (words + symbols).inject(Hash.new) do |counts, token|
        key = token.downcase.stem.intern
        counts[key] = (counts[key] || 0) + 1
        counts
      end
    end
  end

end
@@ -0,0 +1,20 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Unit tests for Classifier::Bayes training and classification.
class BayesianTest < Test::Unit::TestCase
  # Every test starts from an untrained two-category classifier.
  def setup
    @classifier = Classifier::Bayes.new('Interesting', 'Uninteresting')
  end

  # Training a known category must not raise.
  def test_good_training
    assert_nothing_raised do
      @classifier.train_interesting("love")
    end
  end

  # Training a category that was never declared must raise.
  def test_bad_training
    assert_raise(StandardError) do
      @classifier.train_no_category("words")
    end
  end

  # After training both categories, a text made mostly of "uninteresting"
  # words is classified as such.
  def test_classification
    @classifier.train_interesting("here are some good words. I hope you love them")
    @classifier.train_uninteresting("here are some bad words, I hate you")

    result = @classifier.classify("I hate bad words and you")
    assert_equal('Uninteresting', result)
  end
end
@@ -0,0 +1,8 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Unit tests for the String extensions (stemming + word counting).
class StringExtensionsTest < Test::Unit::TestCase
  # word_hash counts stemmed, downcased words and punctuation tokens.
  def test_word_hash
    expected = {
      :some => 1, :good => 1, :hope => 1, :word => 1, :you => 1, :here => 1,
      :love => 1, :i => 1, :ar => 1, :them => 1, :"." => 1, :"!" => 1
    }
    actual = "here are some good words. I hope you love them!".word_hash

    assert_equal(expected, actual)
  end
end
@@ -0,0 +1,4 @@
1
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
2
+
3
+ require 'test/unit'
4
+ require 'classifier'
metadata ADDED
@@ -0,0 +1,50 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.6
3
+ specification_version: 1
4
+ name: classifier
5
+ version: !ruby/object:Gem::Version
6
+ version: "1.0"
7
+ date: 2005-04-10
8
+ summary: A general classifier module to allow Bayesian and other types of classifications.
9
+ require_paths:
10
+ - lib
11
+ email: lucas@rufy.com
12
+ homepage: http://rubyforge.org/projects/classifier/
13
+ rubyforge_project:
14
+ description: A general classifier module to allow Bayesian and other types of classifications.
15
+ autorequire: classifier
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ -
22
+ - ">"
23
+ - !ruby/object:Gem::Version
24
+ version: 0.0.0
25
+ version:
26
+ platform: ruby
27
+ authors:
28
+ - Lucas Carlson
29
+ files:
30
+ - lib/classifier
31
+ - lib/classifier.rb
32
+ - lib/classifier/bayes.rb
33
+ - lib/classifier/string_extensions
34
+ - lib/classifier/string_extensions.rb
35
+ - lib/classifier/string_extensions/porter_stemmer.rb
36
+ - lib/classifier/string_extensions/word_hash.rb
37
+ - bin/bayes.rb
38
+ - test/bayes
39
+ - test/test_helper.rb
40
+ - test/bayes/bayesian_test.rb
41
+ - test/bayes/string_extensions_test.rb
42
+ - Rakefile
43
+ - README
44
+ test_files: []
45
+ rdoc_options: []
46
+ extra_rdoc_files: []
47
+ executables: []
48
+ extensions: []
49
+ requirements: []
50
+ dependencies: []