classifier 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,33 @@
1
+ == Welcome to Classifier
2
+
3
+ Classifier is a general module to allow Bayesian and other types of classifications.
4
+
5
+ == Usage
6
+ require 'classifier'
7
+ b = Classifier::Bayes.new 'Interesting', 'Uninteresting'
8
+ b.train_interesting "here are some good words. I hope you love them"
9
+ b.train_uninteresting "here are some bad words, I hate you"
10
+ b.classify "I hate bad words and you" # returns 'Uninteresting'
11
+
12
+ require 'madeleine'
13
+ m = SnapshotMadeleine.new("bayes_data") {
14
+ Classifier::Bayes.new 'Interesting', 'Uninteresting'
15
+ }
16
+ m.system.train_interesting "here are some good words. I hope you love them"
17
+ m.system.train_uninteresting "here are some bad words, I hate you"
18
+ m.take_snapshot
19
+ m.system.classify "I love you" # returns 'Interesting'
20
+
21
+ Using Madeleine, your application can persist the learned data over time.
22
+
23
+ == Bayesian Classification
24
+
25
+ * http://www.process.com/precisemail/bayesian_filtering.htm
26
+ * http://en.wikipedia.org/wiki/Bayesian_filtering
27
+ * http://www.paulgraham.com/spam.html
28
+
29
+ == About
30
+
31
+ Author:: Lucas Carlson (mailto:lucas@rufy.com)
32
+ Copyright:: Copyright (c) 2005 Lucas Carlson
33
+ License:: GPL
@@ -0,0 +1,77 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'rake/testtask'
4
+ require 'rake/rdoctask'
5
+ require 'rake/gempackagetask'
6
+ require 'rake/contrib/rubyforgepublisher'
7
+
8
+ PKG_VERSION = "1.0.0"
9
+
10
+ PKG_FILES = FileList[
11
+ "lib/**/*", "bin/*", "test/**/*", "[A-Z]*", "Rakefile"
12
+ ]
13
+
14
+ desc "Default Task"
15
+ task :default => [ :test ]
16
+
17
+ # Run the unit tests
18
+ desc "Run all unit tests"
19
+ Rake::TestTask.new("test") { |t|
20
+ t.libs << "lib"
21
+ t.pattern = 'test/*/*_test.rb'
22
+ t.verbose = true
23
+ }
24
+
25
+ # Generate the RDoc documentation
26
+ desc "Create documentation"
27
+ Rake::RDocTask.new("doc") { |rdoc|
28
+ rdoc.rdoc_dir = 'doc'
29
+ rdoc.title = "Classifier library"
30
+ rdoc.options << '--line-numbers --inline-source --accessor'
31
+ rdoc.rdoc_files.include('README')
32
+ rdoc.rdoc_files.include('lib/**/*.rb')
33
+ }
34
+
35
+ # Generate the package
36
+ spec = Gem::Specification.new do |s|
37
+
38
+ #### Basic information.
39
+
40
+ s.name = 'classifier'
41
+ s.version = "1.0"
42
+ s.summary = <<-EOF
43
+ A general classifier module to allow Bayesian and other types of classifications.
44
+ EOF
45
+ s.description = <<-EOF
46
+ A general classifier module to allow Bayesian and other types of classifications.
47
+ EOF
48
+
49
+ #### Which files are to be included in this gem? Everything! (Except CVS directories.)
50
+
51
+ s.files = PKG_FILES
52
+
53
+ #### Load-time details: library and application (you will need one or both).
54
+
55
+ s.require_path = 'lib'
56
+ s.autorequire = 'classifier'
57
+
58
+ #### Documentation and testing.
59
+
60
+ s.has_rdoc = true
61
+
62
+ #### Author and project details.
63
+
64
+ s.author = "Lucas Carlson"
65
+ s.email = "lucas@rufy.com"
66
+ s.homepage = "http://rubyforge.org/projects/classifier/"
67
+ end
68
+
69
+ Rake::GemPackageTask.new(spec) do |pkg|
70
+ pkg.need_zip = false
71
+ pkg.need_tar = true
72
+ end
73
+
74
+ desc "Publish to RubyForge"
75
+ task :rubyforge do
76
+ Rake::RubyForgePublisher.new('classifier', 'cardmagic').upload
77
+ end
@@ -0,0 +1,30 @@
1
#!/usr/bin/env ruby
#
# Command-line front end for a two-category (Interesting / Uninteresting)
# Bayesian classifier whose state is persisted with Madeleine under
# ~/.bayes_data.
#
#   add interesting FILE     train the "Interesting" category with FILE
#   add uninteresting FILE   train the "Uninteresting" category with FILE
#   classify FILE            print the category that best matches FILE

require 'classifier'
require 'madeleine'

# Returns the full contents of +path+. Prints a message and exits with
# status 1 when no path was given on the command line. File.read closes
# the file handle itself, unlike File.open(path).read.
def read_input(path)
  if path.nil?
    puts "Missing file argument"
    exit(1)
  end
  File.read(path)
end

m = SnapshotMadeleine.new(File.expand_path("~/.bayes_data")) {
  Classifier::Bayes.new 'Interesting', 'Uninteresting'
}

case ARGV[0]
when "add"
  # to_s keeps a missing category argument (nil) from raising NoMethodError;
  # it falls through to the "Invalid category" branch instead.
  case ARGV[1].to_s.downcase
  when "interesting"
    m.system.train_interesting read_input(ARGV[2])
    puts "#{ARGV[2]} has been classified as interesting"
  when "uninteresting"
    m.system.train_uninteresting read_input(ARGV[2])
    puts "#{ARGV[2]} has been classified as uninteresting"
  else
    puts "Invalid category: choose between interesting and uninteresting"
    exit(1)
  end
when "classify"
  puts m.system.classify(read_input(ARGV[1]))
else
  puts "Invalid option: choose add [category] [file] or classify [file]"
  exit(1)
end

# Persist whatever was learned during this run.
m.take_snapshot
@@ -0,0 +1,28 @@
1
+ #--
2
+ # Copyright (c) 2005 Lucas Carlson
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
24
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
25
+ # License:: LGPL
26
+
27
+ require 'classifier/string_extensions'
28
+ require 'classifier/bayes'
@@ -0,0 +1,53 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
module Classifier

  # Naive Bayesian text classifier.
  #
  # Categories are fixed at construction time. Training happens through
  # dynamically dispatched <tt>train_<category></tt> methods, e.g.
  # <tt>train_interesting "some text"</tt>. Texts must respond to
  # +word_hash+ (a Hash of word => occurrence count).
  class Bayes
    # categories - category names; each is capitalized and stored as a Symbol.
    def initialize(*categories)
      @categories = {}
      categories.each { |category| @categories[category.capitalize.intern] = {} }
      @total_words = 0
    end

    # Returns the name (String) of the best scoring category for +text+,
    # or nil when the classifier has no categories.
    def classify(text)
      best = classifications(text).max_by { |_category, score| score }
      best && best.first
    end

    # Returns a Hash of category name (String) => log-likelihood score.
    #
    # Words never seen in a category are given a pseudo count of 0.1.
    # A category without any training data scores -Infinity so that it can
    # never outrank a trained category (dividing by its zero word total
    # would otherwise yield +Infinity and make it win every time).
    def classifications(text)
      words = text.word_hash.keys # computed once, not once per category
      @categories.each_with_object({}) do |(category, category_words), score|
        total = category_words.values.inject(0) { |sum, element| sum + element }
        score[category.to_s] =
          if total.zero?
            -Float::INFINITY
          else
            words.inject(0) do |sum, word|
              seen = category_words.has_key?(word) ? category_words[word] : 0.1
              sum + Math.log(seen / total.to_f)
            end
          end
      end
    end

    # Dispatches <tt>train_<category>(*texts)</tt> calls.
    #
    # Raises StandardError for a train_* call naming an unknown category and
    # NoMethodError (via super) for every other unknown message, so that
    # Ruby's implicit conversion probes (to_ary, to_str, ...) behave normally.
    def method_missing(name, *args)
      category = training_category(name)
      return super unless category
      unless @categories.has_key?(category)
        raise StandardError, "No such category: #{category}"
      end
      args.each { |text| add_words category, text }
    end

    # True for train_* names that refer to a known category.
    def respond_to_missing?(name, include_private = false)
      category = training_category(name)
      (category && @categories.has_key?(category)) || super
    end

    private

    # Maps a "train_xyz" method name to the category key :Xyz, or returns
    # nil when +name+ is not a training method name.
    def training_category(name)
      match = /\Atrain_(\w+)\z/.match(name.to_s)
      match && match[1].capitalize.intern
    end

    # Adds every word of +text+ to the word counts of +category+.
    def add_words(category, text)
      text.word_hash.each do |word, count|
        @categories[category][word] ||= 0
        @categories[category][word] += count
        @total_words += count
      end
    end
  end

end
@@ -0,0 +1,11 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ require 'classifier/string_extensions/porter_stemmer'
6
+ require 'classifier/string_extensions/word_hash'
7
+
8
+ class String
9
+ include Classifier::Stemmable
10
+ include Classifier::WordHash
11
+ end
@@ -0,0 +1,188 @@
1
module Classifier

  # Porter suffix-stripping stemmer, meant to be mixed into String.
  #
  # Ported to Ruby from the Perl implementation of:
  #
  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  #   no. 3, pp 130-137,
  #
  # See also http://www.tartarus.org/~martin/PorterStemmer
  #
  # Send comments to raypereda@hotmail.com
  module Stemmable

    # Step 2: suffix => replacement.
    STEP_2_LIST = {
      'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
      'izer'=>'ize', 'bli'=>'ble',
      'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
      'ization'=>'ize', 'ation'=>'ate',
      'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
      'ousness'=>'ous', 'aliti'=>'al',
      'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
    }

    # Step 3: suffix => replacement.
    STEP_3_LIST = {
      'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
      'ical'=>'ic', 'ful'=>'', 'ness'=>''
    }

    # Suffixes handled by step 2 (the keys of STEP_2_LIST).
    SUFFIX_1_REGEXP = /(
      ational |
      tional |
      enci |
      anci |
      izer |
      bli |
      alli |
      entli |
      eli |
      ousli |
      ization |
      ation |
      ator |
      alism |
      iveness |
      fulness |
      ousness |
      aliti |
      iviti |
      biliti |
      logi)$/x

    # Suffixes dropped by step 4.
    SUFFIX_2_REGEXP = /(
      al |
      ance |
      ence |
      er |
      ic |
      able |
      ible |
      ant |
      ement |
      ment |
      ent |
      ou |
      ism |
      ate |
      iti |
      ous |
      ive |
      ize)$/x

    C = "[^aeiou]" # consonant
    V = "[aeiouy]" # vowel
    CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
    VV = "#{V}(?>[aeiou]*)" # vowel sequence

    MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
    MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
    MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
    VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem

    # Returns the Porter stem of the receiver as a new String; the receiver
    # itself is left untouched. Strings shorter than three characters are
    # returned as an unchanged copy.
    def stem_porter
      word = dup.to_str
      return word if word.length < 3

      # An initial y is temporarily upper-cased so that no pattern treats
      # it as a vowel.
      word[0] = 'Y' if word[0] == ?y

      # Step 1a: plurals.
      if (m = /(ss|i)es$/.match(word))
        word = m.pre_match + m[1]
      elsif (m = /([^s])s$/.match(word))
        word = m.pre_match + m[1]
      end

      # Step 1b: -eed, -ed, -ing.
      if (m = /eed$/.match(word))
        word.chop! if m.pre_match =~ MGR0
      elsif (m = /(ed|ing)$/.match(word))
        stem = m.pre_match
        if stem =~ VOWEL_IN_STEM
          word = stem
          if word =~ /(at|bl|iz)$/
            word << "e"
          elsif word =~ /([^aeiouylsz])\1$/
            word.chop!
          elsif word =~ /^#{CC}#{V}[^aeiouwxy]$/o
            word << "e"
          end
        end
      end

      # Trailing y becomes i when the stem contains a vowel.
      if (m = /y$/.match(word))
        stem = m.pre_match
        word = stem + "i" if stem =~ VOWEL_IN_STEM
      end

      # Step 2
      if (m = SUFFIX_1_REGEXP.match(word))
        stem = m.pre_match
        word = stem + STEP_2_LIST[m[1]] if stem =~ MGR0
      end

      # Step 3
      if (m = /(icate|ative|alize|iciti|ical|ful|ness)$/.match(word))
        stem = m.pre_match
        word = stem + STEP_3_LIST[m[1]] if stem =~ MGR0
      end

      # Step 4
      if (m = SUFFIX_2_REGEXP.match(word))
        stem = m.pre_match
        word = stem if stem =~ MGR1
      elsif (m = /(s|t)(ion)$/.match(word))
        stem = m.pre_match + m[1]
        word = stem if stem =~ MGR1
      end

      # Step 5
      if (m = /e$/.match(word))
        stem = m.pre_match
        if (stem =~ MGR1) ||
           (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
          word = stem
        end
      end

      word.chop! if word =~ /ll$/ && word =~ MGR1

      # Restore the initial y protected at the start.
      word[0] = 'y' if word[0] == ?Y

      word
    end

    #
    # stem_porter is the default stem method, which leaves room for other
    # stemmers to be added later.
    #
    alias stem stem_porter

  end

end
@@ -0,0 +1,24 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
module Classifier

  # String helpers that turn a text into a bag of stemmed words.
  # The receiver must also respond to +stem+.
  module WordHash
    # Returns a copy of the receiver with the listed punctuation
    # characters replaced by spaces.
    def without_punctuation
      tr( ',?.!;:\'"@#$%^&*()_=+[]{}\|<>/`~', " " )
    end

    # Returns a Hash mapping each token (Symbol) to its occurrence count.
    # Tokens are the whitespace-separated words of the punctuation-free
    # text plus the punctuation runs left once every word character is
    # removed; each token is downcased and stemmed before being interned.
    def word_hash
      tokens = without_punctuation.split + gsub(/[\w]/,"").split
      tokens.inject({}) do |counts, token|
        key = token.downcase.stem.intern
        counts[key] = (counts[key] || 0) + 1
        counts
      end
    end
  end

end
@@ -0,0 +1,20 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
+ class BayesianTest < Test::Unit::TestCase
3
+ def setup
4
+ @classifier = Classifier::Bayes.new 'Interesting', 'Uninteresting'
5
+ end
6
+
7
+ def test_good_training
8
+ assert_nothing_raised { @classifier.train_interesting "love" }
9
+ end
10
+
11
+ def test_bad_training
12
+ assert_raise(StandardError) { @classifier.train_no_category "words" }
13
+ end
14
+
15
+ def test_classification
16
+ @classifier.train_interesting "here are some good words. I hope you love them"
17
+ @classifier.train_uninteresting "here are some bad words, I hate you"
18
+ assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
19
+ end
20
+ end
@@ -0,0 +1,8 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
+ class StringExtensionsTest < Test::Unit::TestCase
3
+ def test_word_hash
4
+ hash = {:some=>1, :good=>1, :hope=>1, :word=>1, :you=>1, :here=>1, :love=>1, :i=>1, :ar=>1, :them=>1, :"."=>1, :"!"=>1}
5
+
6
+ assert_equal hash, "here are some good words. I hope you love them!".word_hash
7
+ end
8
+ end
@@ -0,0 +1,4 @@
1
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
2
+
3
+ require 'test/unit'
4
+ require 'classifier'
metadata ADDED
@@ -0,0 +1,50 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.6
3
+ specification_version: 1
4
+ name: classifier
5
+ version: !ruby/object:Gem::Version
6
+ version: "1.0"
7
+ date: 2005-04-10
8
+ summary: A general classifier module to allow Bayesian and other types of classifications.
9
+ require_paths:
10
+ - lib
11
+ email: lucas@rufy.com
12
+ homepage: http://rubyforge.org/projects/classifier/
13
+ rubyforge_project:
14
+ description: A general classifier module to allow Bayesian and other types of classifications.
15
+ autorequire: classifier
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ -
22
+ - ">"
23
+ - !ruby/object:Gem::Version
24
+ version: 0.0.0
25
+ version:
26
+ platform: ruby
27
+ authors:
28
+ - Lucas Carlson
29
+ files:
30
+ - lib/classifier
31
+ - lib/classifier.rb
32
+ - lib/classifier/bayes.rb
33
+ - lib/classifier/string_extensions
34
+ - lib/classifier/string_extensions.rb
35
+ - lib/classifier/string_extensions/porter_stemmer.rb
36
+ - lib/classifier/string_extensions/word_hash.rb
37
+ - bin/bayes.rb
38
+ - test/bayes
39
+ - test/test_helper.rb
40
+ - test/bayes/bayesian_test.rb
41
+ - test/bayes/string_extensions_test.rb
42
+ - Rakefile
43
+ - README
44
+ test_files: []
45
+ rdoc_options: []
46
+ extra_rdoc_files: []
47
+ executables: []
48
+ extensions: []
49
+ requirements: []
50
+ dependencies: []