classifier-fork 1.3.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,97 @@
1
+ ## Welcome to Classifier
2
+
3
+ Classifier is a general module to allow Bayesian and other types of classifications.
4
+
5
+ ## Download
6
+
7
+ * https://github.com/cardmagic/classifier
8
+ * gem install classifier
9
+ * git clone https://github.com/cardmagic/classifier.git
10
+
11
+ ## Dependencies
12
+
13
+ If you install Classifier from source, you'll need to install Roman Shterenzon's fast-stemmer gem with RubyGems as follows:
14
+
15
+ gem install fast-stemmer
16
+
17
+ If you would like to speed up LSI classification by at least 10x, please install the following libraries:
18
+ GNU GSL:: http://www.gnu.org/software/gsl
19
+ rb-gsl:: http://rb-gsl.rubyforge.org
20
+
21
+ Note that LSI will work without these libraries, but as soon as they are installed, Classifier will make use of them. No configuration changes are needed; we like to keep things ridiculously easy for you.
22
+
23
+ ## Bayes
24
+
25
+ A Bayesian classifier by Lucas Carlson. Bayesian Classifiers are accurate, fast, and have modest memory requirements.
26
+
27
+ ### Usage
28
+
29
+ require 'classifier'
30
+ b = Classifier::Bayes.new 'Interesting', 'Uninteresting'
31
+ b.train_interesting "here are some good words. I hope you love them"
32
+ b.train_uninteresting "here are some bad words, I hate you"
33
+ b.classify "I hate bad words and you" # returns 'Uninteresting'
34
+
35
+ require 'madeleine'
36
+ m = SnapshotMadeleine.new("bayes_data") {
37
+ Classifier::Bayes.new 'Interesting', 'Uninteresting'
38
+ }
39
+ m.system.train_interesting "here are some good words. I hope you love them"
40
+ m.system.train_uninteresting "here are some bad words, I hate you"
41
+ m.take_snapshot
42
+ m.system.classify "I love you" # returns 'Interesting'
43
+
44
+ Using Madeleine, your application can persist the learned data over time.
45
+
46
+ ### Bayesian Classification
47
+
48
+ * http://www.process.com/precisemail/bayesian_filtering.htm
49
+ * http://en.wikipedia.org/wiki/Bayesian_filtering
50
+ * http://www.paulgraham.com/spam.html
51
+
52
+ ## LSI
53
+
54
+ A Latent Semantic Indexer by David Fayram. Latent Semantic Indexing engines
55
+ are not as fast or as small as Bayesian classifiers, but are more flexible, providing
56
+ fast search and clustering detection as well as semantic analysis of the text that
57
+ theoretically simulates human learning.
58
+
59
+ ### Usage
60
+
61
+ require 'classifier'
62
+ lsi = Classifier::LSI.new
63
+ strings = [ ["This text deals with dogs. Dogs.", :dog],
64
+ ["This text involves dogs too. Dogs! ", :dog],
65
+ ["This text revolves around cats. Cats.", :cat],
66
+ ["This text also involves cats. Cats!", :cat],
67
+ ["This text involves birds. Birds.",:bird ]]
68
+ strings.each {|x| lsi.add_item x.first, x.last}
69
+
70
+ lsi.search("dog", 3)
71
+ # returns => ["This text deals with dogs. Dogs.", "This text involves dogs too. Dogs! ",
72
+ # "This text also involves cats. Cats!"]
73
+
74
+ lsi.find_related(strings[2], 2)
75
+ # returns => ["This text revolves around cats. Cats.", "This text also involves cats. Cats!"]
76
+
77
+ lsi.classify "This text is also about dogs!"
78
+ # returns => :dog
79
+
80
+ Please see the Classifier::LSI documentation for more information. It is possible to index, search and classify
81
+ with more than just simple strings.
82
+
83
+ ### Latent Semantic Indexing
84
+
85
+ * http://www.c2.com/cgi/wiki?LatentSemanticIndexing
86
+ * http://www.chadfowler.com/index.cgi/Computing/LatentSemanticIndexing.rdoc
87
+ * http://en.wikipedia.org/wiki/Latent_semantic_analysis
88
+
89
+ ## Authors
90
+
91
+ * Lucas Carlson (lucas@rufy.com)
92
+ * David Fayram II (dfayram@gmail.com)
93
+ * Cameron McBride (cameron.mcbride@gmail.com)
94
+ * Ivan Acosta-Rubio (ivan@softwarecriollo.com)
95
+
96
+ This library is released under the terms of the GNU LGPL. See LICENSE for more details.
97
+
@@ -0,0 +1,97 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'rake/testtask'
4
+ require 'rdoc/task'
5
+ require 'rubygems/package_task'
6
+ require 'rake/contrib/rubyforgepublisher'
7
+
8
# Version string of the gem; the gem specification below reads it.
PKG_VERSION = "1.3.4"

# Glob patterns selecting every file that is packaged into the gem.
PKG_FILES = FileList[
  "lib/**/*",
  "bin/*",
  "test/**/*",
  "[A-Z]*",
  "Rakefile",
  "Gemfile",
  "html/**/*"
]

# Running plain `rake` executes the unit tests.
desc "Default Task"
task :default => [:test]
16
+
17
# Unit-test runner: puts lib/ on the load path and runs every *_test.rb
# file located exactly one directory below test/.
desc "Run all unit tests"
Rake::TestTask.new("test") do |runner|
  runner.libs.push("lib")
  runner.pattern = 'test/*/*_test.rb'
  runner.verbose = true
end
24
+
25
# Opens an irb session with lib/ on the load path and the library already
# required; useful when working on tests.
desc "Generate a test console"
task :console do
  verbose(false) do
    sh "irb -I lib/ -r 'classifier'"
  end
end
30
+
31
# Generate the RDoc documentation into html/ from the README and every
# Ruby file under lib/.
#
# RDoc::Task is the task class supplied by 'rdoc/task' (required at the top
# of this Rakefile); Rake::RDocTask is only its legacy alias.
desc "Create documentation"
RDoc::Task.new("doc") do |rdoc|
  rdoc.title = "Ruby Classifier - Bayesian and LSI classification library"
  rdoc.rdoc_dir = 'html'
  rdoc.rdoc_files.include('README.markdown')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
39
+
40
# Gem specification consumed by the packaging task.
spec = Gem::Specification.new do |gem|

  #### Basic information.

  gem.name = 'classifier'
  gem.version = PKG_VERSION
  # When ENV['TRAVIS'] is set, append "-alpha-<TRAVIS_BUILD_NUMBER>" to the version.
  gem.version = "#{gem.version}-alpha-#{ENV['TRAVIS_BUILD_NUMBER']}" if ENV['TRAVIS']
  gem.summary = <<-EOF
    A general classifier module to allow Bayesian and other types of classifications.
  EOF
  gem.description = <<-EOF
    A general classifier module to allow Bayesian and other types of classifications.
  EOF

  #### Files shipped in the gem: everything matched by PKG_FILES.

  gem.files = PKG_FILES

  #### Load-time details: library and application (you will need one or both).

  gem.require_path = 'lib'
  gem.autorequire = 'classifier'

  #### Documentation and testing.

  gem.has_rdoc = true

  #### Dependencies and requirements.

  gem.add_dependency('fast-stemmer', '>= 1.0.0')
  gem.requirements << "A porter-stemmer module to split word stems."

  #### Author and project details.

  gem.author = "Lucas Carlson"
  gem.email = "lucas@rufy.com"
  gem.homepage = "http://classifier.rufy.com/"
end
78
+
79
# Packaging tasks for the gem specification; zip and tar archives are
# requested in addition to the gem itself.
Gem::PackageTask.new(spec) { |package|
  package.need_zip = true
  package.need_tar = true
}
83
+
84
# Code statistics for lib/ and test/. 'code_statistics' is loaded lazily so
# the other tasks do not depend on it being installed.
desc "Report code statistics (KLOCs, etc) from the application"
task :stats do
  require 'code_statistics'
  labelled_dirs = [
    ["Library", "lib"],
    ["Units", "test"]
  ]
  CodeStatistics.new(*labelled_dirs).to_s
end
92
+
93
# Runs the remote documentation update command over ssh (its output is
# discarded), then uploads through the RubyForge publisher.
desc "Publish new documentation"
task :publish do
  %x(ssh rufy update-classifier-doc)
  publisher = Rake::RubyForgePublisher.new('classifier', 'cardmagic')
  publisher.upload
end
@@ -0,0 +1,36 @@
1
#!/usr/bin/env ruby
#
# Command-line front end for a persistent two-category Bayesian classifier.
#
#   <script> add interesting FILE     train FILE as "Interesting"
#   <script> add uninteresting FILE   train FILE as "Uninteresting"
#   <script> classify FILE            print the category chosen for FILE
#
# Classifier state is kept by Madeleine under ~/.bayes_data; a snapshot is
# taken after every command that completes.

begin
  require 'rubygems'
rescue LoadError
  # RubyGems is optional: 'classifier' may already be on the load path.
  # (A bare `rescue` would not catch LoadError, which is not a StandardError.)
end
require 'classifier'
require 'madeleine'

USAGE = "Invalid option: choose add [category] [file] or classify [file]"

# Returns the contents of the file at +path+. Prints the usage message and
# exits with status -1 when no path was supplied on the command line.
def read_document(path)
  if path.nil?
    puts USAGE
    exit(-1)
  end
  # File.read closes the handle itself, unlike File.open(path).read.
  File.read(path)
end

m = SnapshotMadeleine.new(File.expand_path("~/.bayes_data")) {
  Classifier::Bayes.new 'Interesting', 'Uninteresting'
}

case ARGV[0]
when "add"
  # to_s lets a missing category fall through to the "Invalid category"
  # branch instead of raising NoMethodError on nil.
  case ARGV[1].to_s.downcase
  when "interesting"
    m.system.train_interesting read_document(ARGV[2])
    puts "#{ARGV[2]} has been classified as interesting"
  when "uninteresting"
    m.system.train_uninteresting read_document(ARGV[2])
    puts "#{ARGV[2]} has been classified as uninteresting"
  else
    puts "Invalid category: choose between interesting and uninteresting"
    exit(1)
  end
when "classify"
  puts m.system.classify(read_document(ARGV[1]))
else
  puts USAGE
  exit(-1)
end

m.take_snapshot
@@ -0,0 +1,16 @@
1
#!/usr/bin/env ruby
#
# Prints a summary of a document fetched from a URL or read from a local file.
#
#   <script> SOURCE [COUNT]
#
#   SOURCE  URL or local file path of the document
#   COUNT   value handed to String#summary (presumably the number of
#           sentences - confirm against the LSI docs); values below 1
#           fall back to 10

begin
  require 'rubygems'
rescue LoadError
  # RubyGems is optional: 'classifier' may already be on the load path.
  # (A bare `rescue` would not catch LoadError, which is not a StandardError.)
end
require 'classifier'
require 'open-uri'

source = ARGV.first
abort "Usage: #{File.basename($0)} SOURCE [COUNT]" if source.nil?

requested = ARGV[1].to_i
num = requested < 1 ? 10 : requested

text =
  if source =~ %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
    # Remote document. URI.open is the supported open-uri entry point; the
    # patched Kernel#open is kept only as a fallback for Rubies without it.
    if URI.respond_to?(:open)
      URI.open(source) { |io| io.read }
    else
      open(source) { |io| io.read }
    end
  else
    # Local document. File.read never treats a leading "|" as a command to
    # run, which Kernel#open would.
    File.read(source)
  end

# Drop anything that looks like an HTML tag, collapse whitespace runs, then summarize.
puts text.gsub(/<[^>]+>/, "").gsub(/[\s]+/, " ").summary(num)
@@ -0,0 +1,30 @@
1
+ #--
2
+ # Copyright (c) 2005 Lucas Carlson
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
24
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
25
+ # License:: LGPL
26
+
27
+ require 'rubygems'
28
+ require 'classifier/extensions/string'
29
+ require 'classifier/bayes'
30
+ require 'classifier/lsi'
@@ -0,0 +1,135 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
module Classifier

  # Naive Bayes text classifier. Categories are stored under normalized
  # Symbol keys (see Object#prepare_category_name); each category maps words
  # (as produced by String#word_hash) to their trained counts.
  class Bayes
    # Names of the dynamic per-category trainers answered by #method_missing:
    # train_<category> and untrain_<category>.
    TRAINER_PATTERN = /\A(un)?train_(\w+)\z/

    # The class can be created with one or more categories, each of which will be
    # initialized and given a training method. E.g.,
    #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
    def initialize(*categories)
      @categories = {}
      categories.each { |category| @categories[category.prepare_category_name] = {} }
      @total_words = 0
      @category_counts = Hash.new(0)
    end

    #
    # Provides a general training method for all categories specified in Bayes#new
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.train "that", "That text"
    #     b.train "The other", "The other text"
    def train(category, text)
      category = category.prepare_category_name
      @category_counts[category] += 1
      text.word_hash.each do |word, count|
        @categories[category][word] ||= 0
        @categories[category][word] += count
        @total_words += count
      end
    end

    #
    # Provides a untraining method for all categories specified in Bayes#new
    # Be very careful with this method.
    #
    # Words whose count would drop to zero or below are removed from the
    # category; only the amount that was actually stored is subtracted from
    # the running word total. Words never trained in the category are ignored.
    #
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.untrain :this, "This text"
    def untrain(category, text)
      category = category.prepare_category_name
      @category_counts[category] -= 1
      text.word_hash.each do |word, count|
        next unless @total_words >= 0

        # Default to 0 so an unseen word cannot feed nil into the subtraction below.
        orig = @categories[category][word] || 0
        @categories[category][word] ||= 0
        @categories[category][word] -= count
        if @categories[category][word] <= 0
          @categories[category].delete(word)
          count = orig
        end
        @total_words -= count
      end
    end

    #
    # Returns the scores in each category the provided +text+. E.g.,
    #    b.classifications "I hate bad words and you"
    #    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
    # The largest of these scores (the one closest to 0) is the one picked out by #classify
    def classifications(text)
      score = {}
      training_count = @category_counts.values.inject { |x, y| x + y }.to_f
      # The word counts of the text are the same for every category.
      words = text.word_hash
      @categories.each do |category, category_words|
        label = category.to_s
        score[label] = 0
        total = category_words.values.inject(0) { |sum, element| sum + element }.to_f
        words.each do |word, _count|
          # Words unseen in this category get a small pseudo-count of 0.1.
          s = category_words.has_key?(word) ? category_words[word] : 0.1
          score[label] += Math.log(s / total)
        end
        # now add prior probability for the category
        s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
        score[label] += Math.log(s / training_count)
      end
      score
    end

    #
    # Returns the classification of the provided +text+, which is one of the
    # categories given in the initializer. E.g.,
    #    b.classify "I hate bad words and you"
    #    =>  'Uninteresting'
    def classify(text)
      (classifications(text).sort_by { |a| -a[1] })[0][0]
    end

    #
    # Provides training and untraining methods for the categories specified in Bayes#new
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train_this "This text"
    #     b.train_that "That text"
    #     b.untrain_that "That text"
    #     b.train_the_other "The other text"
    #
    # Every argument is trained (or untrained) in turn. Raises StandardError
    # when the name has the trainer shape but the category is unknown; any
    # other name is passed on to +super+.
    def method_missing(name, *args)
      match = TRAINER_PATTERN.match(name.to_s)
      return super if match.nil?

      category = match[2].prepare_category_name
      raise StandardError, "No such category: #{category}" unless @categories.has_key?(category)

      trainer = match[1] ? :untrain : :train
      args.each { |text| send(trainer, category, text) }
    end

    # Reports the dynamic train_<category>/untrain_<category> methods of
    # known categories to #respond_to?.
    def respond_to_missing?(name, include_private = false)
      match = TRAINER_PATTERN.match(name.to_s)
      return super if match.nil?

      @categories.has_key?(match[2].prepare_category_name) || super
    end

    #
    # Provides a list of category names
    # For example:
    #     b.categories
    #     =>   ['This', 'That', 'the_other']
    def categories # :nodoc:
      @categories.keys.collect { |c| c.to_s }
    end

    #
    # Allows you to add categories to the classifier.
    # For example:
    #     b.add_category "Not spam"
    #
    # WARNING: Adding categories to a trained classifier will
    # result in an undertrained category that will tend to match
    # more criteria than the trained selective categories. In short,
    # try to initialize your categories at initialization.
    def add_category(category)
      @categories[category.prepare_category_name] = {}
    end

    alias append_category add_category
  end

end
@@ -0,0 +1,10 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ require 'fast_stemmer'
6
+ require 'classifier/extensions/word_hash'
7
+
8
class Object
  # Normalizes any object into a category key: its string form with
  # underscores replaced by spaces, capitalized, and returned as a Symbol.
  def prepare_category_name
    to_s.tr("_", " ").capitalize.to_sym
  end
end
@@ -0,0 +1,112 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
class Array
  # Adds up the elements with +. When a block is given, each element is
  # first mapped through it. +identity+ is returned only for an empty
  # array; it is not added to a non-empty total.
  def sum(identity = 0, &block)
    return identity if empty?
    return map(&block).sum if block

    reduce(:+)
  end
end
20
+
21
class Vector
  # Euclidean length of the vector, returned as a Float.
  def magnitude
    squares = to_a.inject(0.0) { |total, component| total + component ** 2.0 }
    Math.sqrt(squares)
  end

  # Returns a new Vector with every component divided by #magnitude.
  def normalize
    length = magnitude
    Vector[*to_a.map { |component| component / length }]
  end
end
40
+
41
class Matrix
  # Builds a diagonal matrix from the array +s+.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  alias :trans :transpose

  # Pure-Ruby singular value decomposition by repeated plane rotations.
  #
  # The smaller Gram matrix (self' * self for tall or square matrices,
  # self * self' for wide ones) is rotated until its diagonal changes by no
  # more than 0.001 between sweeps, or +maxSweeps+ sweeps have run.
  #
  # Returns [mu, v, s]:
  #   v  - the accumulated rotations
  #   s  - Array of square roots of the rotated Gram matrix's diagonal
  #   mu - self * v * diag(s)^-1 when row_size >= column_size,
  #        otherwise self' * v * diag(s)^-1
  #
  # NOTE(review): the quotient inside Math.atan is integer division for
  # Integer entries unless 'mathn' is loaded - confirm callers pass Floats.
  def SV_decomp(maxSweeps = 20)
    tall = row_size >= column_size
    q = tall ? trans * self : self * trans

    qrot = q.dup
    v = Matrix.identity(q.row_size)
    cnt = 0
    s_old = nil

    loop do
      cnt += 1
      (0...qrot.row_size - 1).each do |row|
        (1..qrot.row_size - 1).each do |col|
          next if row == col

          # Rotation angle that zeroes the (row, col) entry of qrot.
          h = Math.atan((2 * qrot[row, col]) / (qrot[row, row] - qrot[col, col])) / 2.0
          hcos = Math.cos(h)
          hsin = Math.sin(h)
          mzrot = Matrix.identity(qrot.row_size)
          mzrot[row, row] = hcos
          mzrot[row, col] = -hsin
          mzrot[col, row] = hsin
          mzrot[col, col] = hcos
          qrot = mzrot.trans * qrot * mzrot
          v = v * mzrot
        end
      end

      # From the second sweep on, total up the diagonal entries that moved
      # by more than 0.001 since the previous sweep.
      sum_qrot = 0.0
      if cnt > 1
        qrot.row_size.times do |r|
          delta = (qrot[r, r] - s_old[r, r]).abs
          sum_qrot += delta if delta > 0.001
        end
      end
      s_old = qrot.dup

      break if (cnt > 1 && sum_qrot <= 0.001) || cnt >= maxSweeps
    end

    s = Array.new(qrot.row_size) { |r| Math.sqrt(qrot[r, r]) }
    s_inverse = Matrix.diagonal(*s).inverse
    mu = tall ? self * v * s_inverse : trans * v * s_inverse
    [mu, v, s]
  end

  # In-place element assignment; SV_decomp uses it to build rotation matrices.
  def []=(i, j, val)
    @rows[i][j] = val
  end
end