classifier-reborn 2.0.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,36 @@
#!/usr/bin/env ruby
# Command-line front end for a persistent two-category Bayes classifier.
#
# Usage:
#   add [interesting|uninteresting] [file]   train the classifier with a file
#   classify [file]                          print the category chosen for a file
#
# The classifier is kept in a Madeleine snapshot store under ~/.bayes_data and
# a snapshot is taken after every successful command.

# A bare `rescue` does not catch LoadError, so the former begin/rescue
# fallback around these requires could never run; plain requires behave the same.
require 'rubygems'
# The script uses ClassifierReborn::Bayes, so load the library under its own name.
require 'classifier-reborn'

require 'madeleine'

m = SnapshotMadeleine.new(File.expand_path("~/.bayes_data")) {
  ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting'
}

# Prints the usage message and terminates with the script's historical
# "invalid option" exit status.
invalid_option = lambda do
  puts "Invalid option: choose add [category] [file] or classify [file]"
  exit(-1)
end

case ARGV[0]
when "add"
  # to_s keeps a missing category from crashing on nil; it falls through
  # to the "Invalid category" branch instead.
  case ARGV[1].to_s.downcase
  when "interesting"
    invalid_option.call if ARGV[2].nil?
    # File.read closes the file handle, unlike File.open(...).read.
    m.system.train_interesting File.read(ARGV[2])
    puts "#{ARGV[2]} has been classified as interesting"
  when "uninteresting"
    invalid_option.call if ARGV[2].nil?
    m.system.train_uninteresting File.read(ARGV[2])
    puts "#{ARGV[2]} has been classified as uninteresting"
  else
    puts "Invalid category: choose between interesting and uninteresting"
    exit(1)
  end
when "classify"
  invalid_option.call if ARGV[1].nil?
  puts m.system.classify(File.read(ARGV[1]))
else
  invalid_option.call
end

m.take_snapshot
@@ -0,0 +1,16 @@
#!/usr/bin/env ruby
# Prints a summary of a document given as a local path or an http(s)/ftp URL.
#
# Arguments:
#   ARGV[0]  path or URL of the document (required)
#   ARGV[1]  optional positive integer handed to String#summary; defaults to 10

# A bare `rescue` does not catch LoadError, so the former begin/rescue
# fallback around these requires could never run; plain requires behave the same.
require 'rubygems'
# String#summary comes from the classifier library; load it under its own name.
require 'classifier-reborn'

require 'open-uri'

source = ARGV.first
abort "Missing argument: give a file path or URL to summarize" if source.nil?

num = ARGV[1].to_i
num = 10 if num < 1

# Never hand a user-supplied string to Kernel#open: a leading "|" would run a
# shell command, and Ruby 3 no longer opens URLs through it. Fetch URLs through
# the URI object and read everything else as a plain file.
text = if source =~ %r{\A(?:https?|ftp)://}i
         URI.parse(source).open.read
       else
         File.read(source)
       end

# Strip markup tags and collapse whitespace runs before summarizing.
puts text.gsub(/<[^>]+>/,"").gsub(/[\s]+/," ").summary(num)
@@ -0,0 +1,30 @@
1
+ #--
2
+ # Copyright (c) 2005 Lucas Carlson
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
24
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
25
+ # License:: LGPL
26
+
27
+ require 'rubygems'
28
+ require_relative 'classifier-reborn/extensions/string'
29
+ require_relative 'classifier-reborn/bayes'
30
+ require_relative 'classifier-reborn/lsi'
@@ -0,0 +1,126 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
module ClassifierReborn
  # Naive Bayes text classifier. Category names are normalized with
  # Object#prepare_category_name and texts are tokenized with String#word_hash.
  class Bayes
    # Matches the dynamic train_<category> / untrain_<category> method names.
    TRAINING_METHOD = /(un)?train_([\w]+)/

    # The class can be created with one or more categories, each of which will be
    # initialized and given a training method. E.g.,
    #      b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
    def initialize(*categories)
      @categories = {}
      categories.each { |category| @categories[category.prepare_category_name] = {} }
      @total_words = 0
      @category_counts = Hash.new(0)
    end

    # Provides a general training method for all categories specified in Bayes#new
    # For example:
    #     b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.train "that", "That text"
    #     b.train "The other", "The other text"
    def train(category, text)
      category = category.prepare_category_name
      @category_counts[category] += 1
      text.word_hash.each do |word, count|
        @categories[category][word] ||= 0
        @categories[category][word] += count
        @total_words += count
      end
    end

    # Provides an untraining method for all categories specified in Bayes#new
    # Be very careful with this method.
    #
    # For example:
    #     b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.untrain :this, "This text"
    def untrain(category, text)
      category = category.prepare_category_name
      @category_counts[category] -= 1
      text.word_hash.each do |word, count|
        next unless @total_words >= 0

        words = @categories[category]
        # A word that was never trained counts as zero occurrences; the old
        # code subtracted nil from @total_words here and raised TypeError.
        known = words[word] || 0
        if known > count
          words[word] = known - count
          removed = count
        else
          # Never remove more occurrences than were actually recorded.
          words.delete(word)
          removed = known
        end
        @total_words -= removed
      end
    end

    # Returns the scores in each category the provided +text+. E.g.,
    #    b.classifications "I hate bad words and you"
    #    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
    # The largest of these scores (the one closest to 0) is the one picked out by #classify
    def classifications(text)
      score = {}
      # Tokenize once instead of once per category.
      word_hash = text.word_hash
      training_count = @category_counts.values.inject { |x, y| x + y }.to_f
      @categories.each do |category, category_words|
        score[category.to_s] = 0
        total = category_words.values.inject(0) { |sum, element| sum + element }
        word_hash.each_key do |word|
          # Unseen words get a small pseudo-count so the logarithm stays finite.
          s = category_words.has_key?(word) ? category_words[word] : 0.1
          score[category.to_s] += Math.log(s / total.to_f)
        end
        # now add prior probability for the category
        s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
        score[category.to_s] += Math.log(s / training_count)
      end
      score
    end

    # Returns the classification of the provided +text+, which is one of the
    # categories given in the initializer. E.g.,
    #    b.classify "I hate bad words and you"
    #    =>  'Uninteresting'
    def classify(text)
      (classifications(text).sort_by { |a| -a[1] })[0][0]
    end

    # Provides training and untraining methods for the categories specified in Bayes#new
    # For example:
    #     b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
    #     b.train_this "This text"
    #     b.train_that "That text"
    #     b.untrain_that "That text"
    #     b.train_the_other "The other text"
    #
    # Raises StandardError when the name looks like a training method but the
    # category does not exist; any other unknown name goes to +super+.
    def method_missing(name, *args)
      method_name = name.to_s
      match = TRAINING_METHOD.match(method_name)
      category = method_name.gsub(TRAINING_METHOD, '\2').prepare_category_name
      if @categories.has_key? category
        # Dispatch directly instead of building a string for eval.
        trainer = match && match[1] ? :untrain : :train
        args.each { |text| send(trainer, category, text) }
      elsif match
        raise StandardError, "No such category: #{category}"
      else
        super  #raise StandardError, "No such method: #{name}"
      end
    end

    # Reports the dynamic train_<category> / untrain_<category> methods to
    # respond_to? for categories that exist.
    def respond_to_missing?(name, include_private = false)
      method_name = name.to_s
      if @categories && method_name =~ TRAINING_METHOD
        category = method_name.gsub(TRAINING_METHOD, '\2').prepare_category_name
        return true if @categories.has_key?(category)
      end
      super
    end

    # Provides a list of category names
    # For example:
    #     b.categories
    #     =>   ['This', 'That', 'the_other']
    def categories # :nodoc:
      @categories.keys.collect { |c| c.to_s }
    end

    # Allows you to add categories to the classifier.
    # For example:
    #     b.add_category "Not spam"
    #
    # WARNING: Adding categories to a trained classifier will
    # result in an undertrained category that will tend to match
    # more criteria than the trained selective categories. In short,
    # try to initialize your categories at initialization.
    def add_category(category)
      @categories[category.prepare_category_name] = {}
    end

    alias append_category add_category
  end
end
@@ -0,0 +1,10 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ require 'fast_stemmer'
6
+ require 'classifier-reborn/extensions/word_hash'
7
+
class Object
  # Normalizes any object into a category key: its string form with
  # underscores turned into spaces, capitalized, and converted to a Symbol.
  # E.g. :the_other.prepare_category_name => :"The other"
  def prepare_category_name
    label = to_s.tr("_", " ")
    label.capitalize.to_sym
  end
end
@@ -0,0 +1,112 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
class Array
  # Adds up the elements with +.
  #
  # Returns +identity+ only when the array is empty; for a non-empty array
  # +identity+ is ignored. When a block is given, the elements are mapped
  # through it first and the mapped values are added up.
  def sum(identity = 0, &block)
    return identity if empty?

    terms = block ? map(&block) : self
    terms.reduce(:+)
  end
end
20
+
class Vector
  # Euclidean length of the vector: the square root of the sum of the
  # squared components, accumulated as a Float.
  def magnitude
    squares = to_a.inject(0.0) { |acc, component| acc + component ** 2.0 }
    Math.sqrt(squares)
  end

  # Returns a new Vector with every component divided by #magnitude.
  def normalize
    length = magnitude
    Vector[*to_a.map { |component| component / length }]
  end
end
40
+
class Matrix
  # Builds a diagonal matrix from the array +s+ of diagonal values.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  alias :trans :transpose

  # Pure-Ruby singular value decomposition using repeated plane rotations.
  #
  # The rotations are applied to self^T * self (or self * self^T when the
  # matrix has fewer rows than columns) until the diagonal stops changing or
  # +maxSweeps+ sweeps have run.
  #
  # Returns [mu, v, s]: +v+ is the accumulated rotation matrix, +s+ is an
  # Array holding the square roots of the rotated diagonal, and +mu+ is
  # self * v * diag(s)^-1 (self^T * v * diag(s)^-1 for the wide case).
  def SV_decomp(maxSweeps = 20)
    tall = row_size >= column_size
    q = tall ? trans * self : self * trans

    qrot = q.dup
    v = Matrix.identity(q.row_size)
    cnt = 0
    s_old = nil

    loop do
      cnt += 1
      (0...qrot.row_size - 1).each do |row|
        (1..qrot.row_size - 1).each do |col|
          next if row == col

          numerator = 2 * qrot[row, col]
          denominator = qrot[row, row] - qrot[col, col]
          h = if !denominator.zero?
                # to_f forces float division so integer entries are not truncated.
                Math.atan(numerator.to_f / denominator) / 2.0
              elsif numerator.zero?
                # Entry is already zero and the diagonal is equal: nothing to
                # rotate (this used to evaluate 0/0 and poison the result with NaN).
                0.0
              else
                # Limit of atan(x)/2 as x goes to +/-Infinity.
                numerator > 0 ? Math::PI / 4 : -Math::PI / 4
              end
          hcos = Math.cos(h)
          hsin = Math.sin(h)
          mzrot = Matrix.identity(qrot.row_size)
          mzrot[row, row] = hcos
          mzrot[row, col] = -hsin
          mzrot[col, row] = hsin
          mzrot[col, col] = hcos
          qrot = mzrot.trans * qrot * mzrot
          v = v * mzrot
        end
      end

      # Convergence check: total change of the diagonal since the last sweep,
      # counting only entries that moved by more than 0.001.
      s_old = qrot.dup if cnt == 1
      sum_qrot = 0.0
      if cnt > 1
        qrot.row_size.times do |r|
          delta = (qrot[r, r] - s_old[r, r]).abs
          sum_qrot += delta if delta > 0.001
        end
        s_old = qrot.dup
      end
      break if (sum_qrot <= 0.001 && cnt > 1) || cnt >= maxSweeps
    end

    s = Array.new(qrot.row_size) { |r| Math.sqrt(qrot[r, r]) }

    # The wide case previously printed five debugging values to stdout here.
    mu = (tall ? self : trans) * v * Matrix.diagonal(*s).inverse
    [mu, v, s]
  end

  # Writes +val+ directly into the internal row storage at row +i+, column +j+.
  def []=(i,j,val)
    @rows[i][j] = val
  end
end
@@ -0,0 +1,20 @@
module GSL
  class Vector
    # Marshal hook: serializes the vector as the marshaled form of its
    # Array representation. The depth argument is not used.
    def _dump(_depth)
      Marshal.dump(to_a)
    end

    # Marshal hook: rebuilds a GSL::Vector from the data written by #_dump.
    def self._load(arr)
      GSL::Vector.alloc(Marshal.load(arr))
    end
  end

  class Matrix
    class << self
      # Matrix.diag is another name for Matrix.diagonal.
      alias_method :diag, :diagonal
    end
  end
end
@@ -0,0 +1,136 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ require "set"
6
+
# Extensions to the String class providing the tokenizing helpers used by
# the Classifier package.
class String
  # Returns a new string in which common punctuation symbols are replaced by
  # spaces and apostrophes and hyphens are deleted. Runs of spaces are kept.
  # E.g.,
  #   "Hello (greeting's), with {braces}".without_punctuation
  #   => "Hello  greetings   with  braces "
  def without_punctuation
    spaced = tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " ")
    spaced.tr("'\-", "")
  end

  # Returns a Hash of Symbol => Integer counts. It merges the stemmed word
  # counts from #clean_word_hash with counts for the non-word tokens that
  # remain once every word character has been blanked out.
  def word_hash
    symbol_tokens = gsub(/[\w]/," ").split
    clean_word_hash.merge(word_hash_for_symbols(symbol_tokens))
  end

  # Returns the word counts only: punctuation is removed, then words are
  # lowercased, filtered, and stemmed.
  def clean_word_hash
    bare_words = gsub(/[^\w\s]/,"").split
    word_hash_for_words(bare_words)
  end

  private

  # Counts the stems of +words+, lowercasing each word in place and skipping
  # words of two characters or fewer and words listed in CORPUS_SKIP_WORDS.
  def word_hash_for_words(words)
    words.each_with_object(Hash.new(0)) do |word, counts|
      word.downcase!
      next if CORPUS_SKIP_WORDS.include?(word) || word.length <= 2

      counts[word.stem.intern] += 1
    end
  end

  # Counts every token of +words+ as-is, keyed by Symbol.
  def word_hash_for_symbols(words)
    words.each_with_object(Hash.new(0)) { |token, counts| counts[token.intern] += 1 }
  end

  # Words left out of the word counts by word_hash_for_words.
  CORPUS_SKIP_WORDS = Set.new(%w[
    a again all along are also an and as at
    but by came can cant couldnt did didn didnt do
    doesnt dont ever first from have her here him how
    i if in into is isnt it itll just last
    least like most my new no not now of on
    or should sinc so some th than this that the
    their then those to told too true try until url
    us were when whether while with within yes you youll
  ])
end