secobarbital-classifier 1.3.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile ADDED
@@ -0,0 +1,96 @@
1
require 'rubygems'
require 'rake'
require 'rake/testtask'
require 'rake/rdoctask'
require 'rake/gempackagetask'
require 'rake/contrib/rubyforgepublisher'

# Version string stamped onto the packaged gem.
PKG_VERSION = "1.3.1.1"

# Files that are shipped inside the gem.
PKG_FILES = FileList[
  "lib/**/*", "bin/*", "test/**/*", "[A-Z]*", "Rakefile", "html/**/*"
]

desc "Default Task"
task :default => [ :test ]

# Unit tests: every test/*/*_test.rb file, run with lib/ on the load path.
desc "Run all unit tests"
Rake::TestTask.new("test") do |t|
  t.libs << "lib"
  t.pattern = 'test/*/*_test.rb'
  t.verbose = true
end

# Open an irb session with the library preloaded; handy when working on tests.
desc "Generate a test console"
task :console do
  verbose(false) do
    sh "irb -I lib/ -r 'classifier'"
  end
end

# Generate the RDoc documentation into html/.
desc "Create documentation"
Rake::RDocTask.new("doc") do |rdoc|
  rdoc.title = "Ruby Classifier - Bayesian and LSI classification library"
  rdoc.rdoc_dir = 'html'
  rdoc.rdoc_files.include('README')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

# Gem specification used by the packaging task below.
spec = Gem::Specification.new do |s|
  # Basic information.
  s.name = 'secobarbital-classifier'
  s.version = PKG_VERSION
  s.summary = <<-EOF
    A general classifier module to allow Bayesian and other types of classifications.
  EOF
  s.description = <<-EOF
    A general classifier module to allow Bayesian and other types of classifications.
  EOF

  # Package contents: everything matched by PKG_FILES.
  s.files = PKG_FILES

  # Load-time details.
  s.require_path = 'lib'
  s.autorequire = 'classifier'

  # Documentation.
  s.has_rdoc = true

  # Dependencies and requirements.
  s.add_dependency('stemmer', '>= 1.0.0')
  s.requirements << "A porter-stemmer module to split word stems."

  # Author and project details.
  s.authors = ["Lucas Carlson", "Seggy Umboh"]
  s.email = ["lucas@rufy.com", "seggy.umboh@gmail.com"]
  s.homepage = "http://classifier.rufy.com/"
end

# Build the gem plus zip and tar archives of the package.
Rake::GemPackageTask.new(spec) do |pkg|
  pkg.need_zip = true
  pkg.need_tar = true
end

desc "Report code statistics (KLOCs, etc) from the application"
task :stats do
  require 'code_statistics'
  CodeStatistics.new(["Library", "lib"], ["Units", "test"]).to_s
end

desc "Publish new documentation"
task :publish do
  `ssh rufy update-classifier-doc`
  Rake::RubyForgePublisher.new('classifier', 'cardmagic').upload
end
data/bin/bayes.rb ADDED
@@ -0,0 +1,36 @@
1
#!/usr/bin/env ruby
#
# Command-line front end for a two-category Bayes classifier whose state is
# persisted with Madeleine under ~/.bayes_data.
#
#   bayes.rb add interesting|uninteresting FILE   # train on FILE
#   bayes.rb classify FILE                        # print the category of FILE

begin
  require 'rubygems'
  require 'classifier'
rescue LoadError
  # LoadError is a ScriptError, not a StandardError, so a bare `rescue`
  # would never reach this fallback.
  require 'classifier'
end

require 'madeleine'

USAGE = "Invalid option: choose add [category] [file] or classify [file]"

# Prints +message+ and terminates the script with +status+.
def bail(message, status)
  puts message
  exit(status)
end

m = SnapshotMadeleine.new(File.expand_path("~/.bayes_data")) {
  Classifier::Bayes.new 'Interesting', 'Uninteresting'
}

case ARGV[0]
when "add"
  # to_s keeps a missing category argument from raising NoMethodError.
  category = ARGV[1].to_s.downcase
  unless %w(interesting uninteresting).include?(category)
    bail("Invalid category: choose between interesting and uninteresting", 1)
  end
  path = ARGV[2]
  bail(USAGE, -1) unless path
  # File.read closes the handle; File.open(...).read left it open.
  m.system.train(category, File.read(path))
  puts "#{path} has been classified as #{category}"
when "classify"
  path = ARGV[1]
  bail(USAGE, -1) unless path
  puts m.system.classify(File.read(path))
else
  bail(USAGE, -1)
end

m.take_snapshot
data/bin/summarize.rb ADDED
@@ -0,0 +1,16 @@
1
#!/usr/bin/env ruby
#
# Prints a summary of a local file or a URL with markup stripped.
#
#   summarize.rb FILE_OR_URL [COUNT]
#
# COUNT is passed straight to String#summary and defaults to 10 when it is
# missing or smaller than 1.

begin
  require 'rubygems'
  require 'classifier'
rescue LoadError
  # LoadError is a ScriptError, not a StandardError, so a bare `rescue`
  # would never reach this fallback.
  require 'classifier'
end

require 'open-uri'

source = ARGV.first
if source.nil?
  puts "Usage: summarize.rb FILE_OR_URL [COUNT]"
  exit(-1)
end

num = ARGV[1].to_i
num = 10 if num < 1

# Kernel#open would run "|command" arguments as a subprocess and no longer
# opens URLs on Ruby 3; dispatch explicitly between a URL and a local path.
text =
  if source =~ %r{\A(?:https?|ftp)://}i
    URI.parse(source).open.read
  else
    File.read(source)
  end

puts text.gsub(/<[^>]+>/,"").gsub(/[\s]+/," ").summary(num)
data/lib/classifier.rb ADDED
@@ -0,0 +1,30 @@
1
+ #--
2
+ # Copyright (c) 2005 Lucas Carlson
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
24
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
25
+ # License:: LGPL
26
+
27
+ require 'rubygems'
28
+ require 'classifier/extensions/string'
29
+ require 'classifier/bayes'
30
+ require 'classifier/lsi'
@@ -0,0 +1,128 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
module Classifier

  # Bayesian text classifier. Each category keeps a Hash of word => count
  # built from the +word_hash+ of the training texts; classification scores
  # a text against every category by summing log relative frequencies.
  class Bayes
    # Dynamic method names handled by #method_missing:
    # train_<category> and untrain_<category>.
    TRAINER_PATTERN = /\A(un)?train_(\w+)\z/

    # The class can be created with one or more categories, each of which will be
    # initialized and given a training method. E.g.,
    #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
    def initialize(*categories)
      @categories = Hash.new
      categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
      @total_words = 0
    end

    #
    # Provides a general training method for all categories specified in Bayes#new
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.train "that", "That text"
    #     b.train "The other", "The other text"
    #
    # Raises StandardError when +category+ is not a known category.
    def train(category, text)
      words = words_for(category)
      text.word_hash.each do |word, count|
        words[word] = words.fetch(word, 0) + count
        @total_words += count
      end
    end

    #
    # Provides a untraining method for all categories specified in Bayes#new
    # Be very careful with this method.
    #
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.untrain :this, "This text"
    #
    # A word is never driven below zero: when the text contains it more often
    # than the category does, the word is dropped and only the occurrences
    # that were actually stored are subtracted from the running total. Words
    # the category has never seen are ignored.
    #
    # Raises StandardError when +category+ is not a known category.
    def untrain(category, text)
      words = words_for(category)
      text.word_hash.each do |word, count|
        next unless @total_words >= 0
        trained = words.fetch(word, 0)
        if trained > count
          words[word] = trained - count
          removed = count
        else
          words.delete(word)
          removed = trained
        end
        @total_words -= removed
      end
    end

    #
    # Returns the scores in each category the provided +text+. E.g.,
    #    b.classifications "I hate bad words and you"
    #    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
    # The largest of these scores (the one closest to 0) is the one picked out by #classify
    #
    # Each distinct word of +text+ contributes once; a word the category has
    # not seen is scored as if it had occurred 0.1 times.
    #
    # NOTE(review): a category with no training data has a total of zero, so
    # every word scores log(Infinity) and that category outranks all others.
    def classifications(text)
      words = text.word_hash.keys
      score = Hash.new
      @categories.each do |category, category_words|
        total = category_words.values.inject(0) { |sum, n| sum + n }.to_f
        score[category.to_s] = words.inject(0) do |sum, word|
          sum + Math.log(category_words.fetch(word, 0.1) / total)
        end
      end
      score
    end

    #
    # Returns the classification of the provided +text+, which is one of the
    # categories given in the initializer. E.g.,
    #    b.classify "I hate bad words and you"
    #    =>  'Uninteresting'
    def classify(text)
      classifications(text).max_by { |_category, score| score }.first
    end

    #
    # Provides training and untraining methods for the categories specified in Bayes#new
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train_this "This text"
    #     b.train_that "That text"
    #     b.untrain_that "That text"
    #     b.train_the_other "The other text"
    #
    # Every argument is trained (or untrained) in turn. A train_/untrain_ name
    # for an unknown category raises StandardError; any other name falls
    # through to the default NoMethodError.
    def method_missing(name, *args)
      match = TRAINER_PATTERN.match(name.to_s)
      return super unless match

      category = match[2].prepare_category_name
      raise StandardError, "No such category: #{category}" unless @categories.has_key?(category)

      args.each { |text| match[1] ? untrain(category, text) : train(category, text) }
    end

    # Keeps respond_to? in step with #method_missing: true for
    # train_<category> / untrain_<category> when the category exists.
    def respond_to_missing?(name, include_private = false)
      match = TRAINER_PATTERN.match(name.to_s)
      return true if match && @categories.has_key?(match[2].prepare_category_name)
      super
    end

    #
    # Provides a list of category names
    # For example:
    #     b.categories
    #     =>   ['This', 'That', 'the_other']
    def categories # :nodoc:
      @categories.keys.map { |c| c.to_s }
    end

    #
    # Allows you to add categories to the classifier.
    # For example:
    #     b.add_category "Not spam"
    #
    # WARNING: Adding categories to a trained classifier will
    # result in an undertrained category that will tend to match
    # more criteria than the trained selective categories. In short,
    # try to initialize your categories at initialization.
    def add_category(category)
      @categories[category.prepare_category_name] = Hash.new
    end

    alias append_category add_category

    private

    # Returns the word-count Hash for +category+, or raises StandardError
    # when the normalized name is not a known category.
    def words_for(category)
      name = category.prepare_category_name
      @categories.fetch(name) { raise StandardError, "No such category: #{name}" }
    end
  end

end
@@ -0,0 +1,16 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ begin
6
+ require 'stemmer'
7
+ rescue LoadError
8
+ puts "Please install stemmer from http://rubyforge.org/projects/stemmer or 'gem install stemmer'"
9
+ exit(-1)
10
+ end
11
+
12
+ require 'classifier/extensions/word_hash'
13
+
14
class Object
  # Normalizes any object into the Symbol used as a category key: its string
  # form with underscores replaced by spaces, capitalized, then interned.
  # E.g. :the_other => :"The other"
  def prepare_category_name
    # tr is the direct tool for a character-for-character replacement.
    to_s.tr("_", " ").capitalize.to_sym
  end
end
@@ -0,0 +1,100 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
class Vector
  # Euclidean length of the vector: the square root of the sum of squared
  # components. The sum starts from 0.0, so an empty vector yields 0.0.
  def magnitude
    Math.sqrt(to_a.inject(0.0) { |sum, component| sum + component ** 2.0 })
  end

  # Returns a new Vector with every component divided by #magnitude.
  # No special handling is done for a zero-length vector.
  def normalize
    length = magnitude
    Vector[*to_a.map { |component| component / length }]
  end
end
28
+
29
class Matrix
  # Builds a diagonal matrix from an Array of diagonal values.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  alias :trans :transpose

  # Pure-Ruby singular value decomposition using repeated plane rotations
  # of the symmetric product of the matrix with its own transpose.
  #
  # maxSweeps:: upper bound on the number of full rotation sweeps.
  #
  # Returns [mu, v, s], where +s+ is an Array holding the square roots of
  # the diagonal of the rotated product and +v+ is the accumulated rotation
  # matrix. When the matrix has at least as many rows as columns,
  # mu = self * v * diag(s)^-1; otherwise the product self * self^T is
  # rotated and mu = self^T * v * diag(s)^-1.
  #
  # Sweeping stops once the diagonal moved by no more than 0.001 in total
  # between two consecutive sweeps, or after +maxSweeps+ sweeps.
  def SV_decomp(maxSweeps = 20)
    tall = row_size >= column_size
    qrot = tall ? trans * self : self * trans
    size = qrot.row_size
    v = Matrix.identity(size)
    previous = nil
    sweeps = 0

    loop do
      sweeps += 1
      (0...size - 1).each do |row|
        (1..size - 1).each do |col|
          next if row == col

          off_diagonal = qrot[row, col]
          # A zero off-diagonal entry needs no rotation (the angle would be
          # 0); skipping it also avoids 0/0 when the two diagonal entries
          # are equal.
          next if off_diagonal.zero?

          # Float arithmetic is forced so the angle does not depend on
          # mathn-style division; a zero spread gives +/-Infinity, whose
          # arctangent is well defined.
          spread = qrot[row, row] - qrot[col, col]
          angle = Math.atan((2.0 * off_diagonal) / spread) / 2.0
          cos = Math.cos(angle)
          sin = Math.sin(angle)

          # Identity matrix with the (row, col) plane replaced by the rotation.
          cells = Array.new(size) { |i| Array.new(size) { |j| i == j ? 1 : 0 } }
          cells[row][row] = cos
          cells[row][col] = -sin
          cells[col][row] = sin
          cells[col][col] = cos
          rotation = Matrix.rows(cells)

          qrot = rotation.trans * qrot * rotation
          v = v * rotation
        end
      end

      # Total movement of the diagonal since the previous sweep, counting
      # only entries that moved by more than 0.001.
      drift = 0.0
      if sweeps > 1
        size.times do |r|
          change = (qrot[r, r] - previous[r, r]).abs
          drift += change if change > 0.001
        end
      end
      previous = qrot

      break if (drift <= 0.001 && sweeps > 1) || sweeps >= maxSweeps
    end

    s = (0...size).map { |r| Math.sqrt(qrot[r, r]) }
    scale = Matrix.diagonal(*s).inverse
    mu = tall ? self * v * scale : trans * v * scale
    [mu, v, s]
  end

  # Element assignment. Defined only when the standard library does not
  # already provide a public Matrix#[]=, so a newer stdlib setter is not
  # replaced by this minimal one.
  unless public_method_defined?(:[]=)
    def []=(i,j,val)
      @rows[i][j] = val
    end
  end
end
@@ -0,0 +1,20 @@
1
module GSL

  class Vector
    # Marshal hook: serializes the vector as the Marshal dump of its
    # Array form. The +v+ argument is not used.
    def _dump(v)
      Marshal.dump(to_a)
    end

    # Marshal hook: rebuilds a GSL::Vector from the Array stored by #_dump.
    def self._load(arr)
      GSL::Vector.alloc(Marshal.load(arr))
    end
  end

  class Matrix
    class << self
      # Lets GSL::Matrix.diag be called as another name for GSL::Matrix.diagonal.
      alias_method :diag, :diagonal
    end
  end
end