classifier 1.1 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +2 -2
- data/Rakefile +2 -4
- data/doc/classes/Classifier.html +135 -0
- data/doc/classes/Classifier/Bayes.html +287 -0
- data/doc/classes/Classifier/Bayes.src/M000005.html +20 -0
- data/doc/classes/Classifier/Bayes.src/M000006.html +23 -0
- data/doc/classes/Classifier/Bayes.src/M000007.html +27 -0
- data/doc/classes/Classifier/Bayes.src/M000008.html +18 -0
- data/doc/classes/Classifier/Bayes.src/M000009.html +25 -0
- data/doc/classes/Classifier/Bayes.src/M000010.html +18 -0
- data/doc/classes/Classifier/Stemmable.html +243 -0
- data/doc/classes/Classifier/Stemmable.src/M000003.html +102 -0
- data/doc/classes/Classifier/WordHash.html +178 -0
- data/doc/classes/Classifier/WordHash.src/M000001.html +18 -0
- data/doc/classes/Classifier/WordHash.src/M000002.html +28 -0
- data/doc/classes/String.html +119 -0
- data/doc/created.rid +1 -0
- data/doc/files/README.html +156 -0
- data/doc/files/lib/classifier/bayes_rb.html +115 -0
- data/doc/files/lib/classifier/string_extensions/porter_stemmer_rb.html +112 -0
- data/doc/files/lib/classifier/string_extensions/word_hash_rb.html +115 -0
- data/doc/files/lib/classifier/string_extensions_rb.html +123 -0
- data/doc/files/lib/classifier_rb.html +123 -0
- data/doc/fr_class_index.html +31 -0
- data/doc/fr_file_index.html +32 -0
- data/doc/fr_method_index.html +37 -0
- data/doc/index.html +24 -0
- data/doc/rdoc-style.css +208 -0
- data/lib/classifier/bayes.rb +63 -12
- data/lib/classifier/string_extensions/porter_stemmer.rb +18 -15
- data/lib/classifier/string_extensions/word_hash.rb +96 -3
- data/test/bayes/bayesian_test.rb +13 -0
- data/test/string_extensions/word_hash_test.rb +7 -3
- metadata +36 -1
data/lib/classifier/bayes.rb
CHANGED
@@ -5,16 +5,36 @@
|
|
5
5
|
module Classifier
|
6
6
|
|
7
7
|
class Bayes
|
8
|
+
# The class can be created with one or more categories, each of which will be
|
9
|
+
# initialized and given a training method. E.g.,
|
10
|
+
# b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
8
11
|
def initialize(*categories)
|
9
12
|
@categories = Hash.new
|
10
|
-
categories.each { |category| @categories[category.capitalize.intern] = Hash.new }
|
13
|
+
categories.each { |category| @categories[category.to_s.gsub("_"," ").capitalize.intern] = Hash.new }
|
11
14
|
@total_words = 0
|
12
15
|
end
|
13
16
|
|
14
|
-
|
15
|
-
|
17
|
+
#
|
18
|
+
# Provides a general training method for all categories specified in Bayes#new
|
19
|
+
# For example:
|
20
|
+
# b = Classifier::Bayes.new 'This', 'That', 'the_other'
|
21
|
+
# b.train :this, "This text"
|
22
|
+
# b.train "that", "That text"
|
23
|
+
# b.train "The other", "The other text"
|
24
|
+
def train(category, text)
|
25
|
+
category = category.to_s.gsub("_"," ").capitalize.intern
|
26
|
+
text.word_hash.each do |word, count|
|
27
|
+
@categories[category][word] ||= 0
|
28
|
+
@categories[category][word] += count
|
29
|
+
@total_words += count
|
30
|
+
end
|
16
31
|
end
|
17
32
|
|
33
|
+
#
|
34
|
+
# Returns the scores in each category for the provided +text+. E.g.,
|
35
|
+
# b.classifications "I hate bad words and you"
|
36
|
+
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
37
|
+
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
18
38
|
def classifications(text)
|
19
39
|
score = Hash.new
|
20
40
|
@categories.each do |category, category_words|
|
@@ -28,6 +48,22 @@ class Bayes
|
|
28
48
|
return score
|
29
49
|
end
|
30
50
|
|
51
|
+
#
|
52
|
+
# Returns the classification of the provided +text+, which is one of the
|
53
|
+
# categories given in the initializer. E.g.,
|
54
|
+
# b.classify "I hate bad words and you"
|
55
|
+
# => 'Uninteresting'
|
56
|
+
def classify(text)
|
57
|
+
(classifications(text).sort_by { |a| -a[1] })[0][0]
|
58
|
+
end
|
59
|
+
|
60
|
+
#
|
61
|
+
# Provides training methods for the categories specified in Bayes#new
|
62
|
+
# For example:
|
63
|
+
# b = Classifier::Bayes.new 'This', 'That', 'the_other'
|
64
|
+
# b.train_this "This text"
|
65
|
+
# b.train_that "That text"
|
66
|
+
# b.train_the_other "The other text"
|
31
67
|
def method_missing(name, *args)
|
32
68
|
category = name.to_s.gsub(/train_([\w]+)/, '\1').gsub("_"," ").capitalize.intern
|
33
69
|
if @categories.has_key? category
|
@@ -35,18 +71,33 @@ class Bayes
|
|
35
71
|
elsif name.to_s =~ /train_([\w]+)/
|
36
72
|
raise StandardError, "No such category: #{category}"
|
37
73
|
else
|
38
|
-
|
74
|
+
super #raise StandardError, "No such method: #{name}"
|
39
75
|
end
|
40
76
|
end
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
77
|
+
|
78
|
+
#
|
79
|
+
# Provides a list of category names
|
80
|
+
# For example:
|
81
|
+
# b.categories
|
82
|
+
# => ['This', 'That', 'The other']
|
83
|
+
def categories # :nodoc:
|
84
|
+
@categories.keys.collect {|c| c.to_s}
|
49
85
|
end
|
86
|
+
|
87
|
+
#
|
88
|
+
# Allows you to add categories to the classifier.
|
89
|
+
# For example:
|
90
|
+
# b.add_category "Not spam"
|
91
|
+
#
|
92
|
+
# WARNING: Adding categories to a trained classifier will
|
93
|
+
# result in an undertrained category that will tend to match
|
94
|
+
# more criteria than the trained selective categories. In short,
|
95
|
+
# try to specify all of your categories when creating the classifier.
|
96
|
+
def add_category(category)
|
97
|
+
@categories[category.to_s.gsub("_"," ").capitalize.intern] = Hash.new
|
98
|
+
end
|
99
|
+
|
100
|
+
alias append_category add_category # :nodoc:
|
50
101
|
end
|
51
102
|
|
52
103
|
end
|
@@ -7,7 +7,21 @@
|
|
7
7
|
# Copyright 2005 Greg Fast <gdf@speakeasy.net>
|
8
8
|
|
9
9
|
module Classifier
|
10
|
-
|
10
|
+
|
11
|
+
#
|
12
|
+
# Porter stemmer in Ruby.
|
13
|
+
#
|
14
|
+
# This is the Porter stemming algorithm, ported to Ruby from the
|
15
|
+
# version coded up in Perl. It's easy to follow against the rules
|
16
|
+
# in the original paper in:
|
17
|
+
#
|
18
|
+
# Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
19
|
+
# no. 3, pp 130-137,
|
20
|
+
#
|
21
|
+
# See also http://www.tartarus.org/~martin/PorterStemmer
|
22
|
+
#
|
23
|
+
# Send comments to raypereda@hotmail.com
|
24
|
+
#
|
11
25
|
module Stemmable
|
12
26
|
|
13
27
|
STEP_2_LIST = {
|
@@ -82,20 +96,9 @@ module Stemmable
|
|
82
96
|
VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
|
83
97
|
|
84
98
|
#
|
85
|
-
#
|
86
|
-
#
|
87
|
-
#
|
88
|
-
# version coded up in Perl. It's easy to follow against the rules
|
89
|
-
# in the original paper in:
|
90
|
-
#
|
91
|
-
# Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
92
|
-
# no. 3, pp 130-137,
|
93
|
-
#
|
94
|
-
# See also http://www.tartarus.org/~martin/PorterStemmer
|
95
|
-
#
|
96
|
-
# Send comments to raypereda@hotmail.com
|
97
|
-
#
|
98
|
-
|
99
|
+
# Stems the word contained in the current object. E.g.,
|
100
|
+
# "actually".stem_porter
|
101
|
+
# => "actual"
|
99
102
|
def stem_porter
|
100
103
|
|
101
104
|
# make a copy of the given object and convert it to a string.
|
@@ -4,23 +4,116 @@
|
|
4
4
|
|
5
5
|
module Classifier
|
6
6
|
|
7
|
+
# This module is mixed into String to provide convenience
|
8
|
+
# methods for the Classifier package.
|
7
9
|
module WordHash
|
10
|
+
|
11
|
+
# Removes common punctuation symbols, returning a new string. E.g.,
|
12
|
+
# "Hello (greeting's), with {braces} < >...?".without_punctuation
|
13
|
+
# => "Hello greetings with braces "
|
8
14
|
def without_punctuation
|
9
|
-
tr( '
|
15
|
+
tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
|
10
16
|
end
|
11
17
|
|
18
|
+
# Return a Hash of symbols => ints. Each word in the string is stemmed,
|
19
|
+
# interned, and mapped to its frequency in the document.
|
12
20
|
def word_hash
|
13
21
|
d = Hash.new
|
14
22
|
corpus = without_punctuation
|
15
23
|
(corpus.split + gsub(/[\w+]/,"").split).each do |word|
|
16
|
-
|
24
|
+
item = word.downcase
|
25
|
+
key = item.stem.intern
|
17
26
|
if !(word =~ /[\w+]/) || word.length > 2
|
18
27
|
d[key] ||= 0
|
19
28
|
d[key] += 1
|
20
|
-
end
|
29
|
+
end unless CORPUS_SKIP_WORDS[item]
|
21
30
|
end
|
22
31
|
return d
|
23
32
|
end
|
33
|
+
|
34
|
+
private
|
35
|
+
CORPUS_SKIP_WORDS = {
|
36
|
+
"a" => 1,
|
37
|
+
"again" => 1,
|
38
|
+
"all" => 1,
|
39
|
+
"along" => 1,
|
40
|
+
"are" => 1,
|
41
|
+
"also" => 1,
|
42
|
+
"an" => 1,
|
43
|
+
"and" => 1,
|
44
|
+
"as" => 1,
|
45
|
+
"at" => 1,
|
46
|
+
"but" => 1,
|
47
|
+
"by" => 1,
|
48
|
+
"came" => 1,
|
49
|
+
"can" => 1,
|
50
|
+
"cant" => 1,
|
51
|
+
"couldnt" => 1,
|
52
|
+
"did" => 1,
|
53
|
+
"didn" => 1,
|
54
|
+
"didnt" => 1,
|
55
|
+
"do" => 1,
|
56
|
+
"doesnt" => 1,
|
57
|
+
"dont" => 1,
|
58
|
+
"ever" => 1,
|
59
|
+
"first" => 1,
|
60
|
+
"from" => 1,
|
61
|
+
"have" => 1,
|
62
|
+
"her" => 1,
|
63
|
+
"here" => 1,
|
64
|
+
"him" => 1,
|
65
|
+
"how" => 1,
|
66
|
+
"i" => 1,
|
67
|
+
"if" => 1,
|
68
|
+
"in" => 1,
|
69
|
+
"into" => 1,
|
70
|
+
"is" => 1,
|
71
|
+
"isnt" => 1,
|
72
|
+
"it" => 1,
|
73
|
+
"itll" => 1,
|
74
|
+
"just" => 1,
|
75
|
+
"last" => 1,
|
76
|
+
"least" => 1,
|
77
|
+
"like" => 1,
|
78
|
+
"most" => 1,
|
79
|
+
"my" => 1,
|
80
|
+
"new" => 1,
|
81
|
+
"no" => 1,
|
82
|
+
"not" => 1,
|
83
|
+
"now" => 1,
|
84
|
+
"of" => 1,
|
85
|
+
"on" => 1,
|
86
|
+
"or" => 1,
|
87
|
+
"should" => 1,
|
88
|
+
"sinc" => 1,
|
89
|
+
"so" => 1,
|
90
|
+
"some" => 1,
|
91
|
+
"th" => 1,
|
92
|
+
"than" => 1,
|
93
|
+
"this" => 1,
|
94
|
+
"that" => 1,
|
95
|
+
"the" => 1,
|
96
|
+
"their" => 1,
|
97
|
+
"then" => 1,
|
98
|
+
"those" => 1,
|
99
|
+
"to" => 1,
|
100
|
+
"told" => 1,
|
101
|
+
"too" => 1,
|
102
|
+
"true" => 1,
|
103
|
+
"try" => 1,
|
104
|
+
"until" => 1,
|
105
|
+
"url" => 1,
|
106
|
+
"us" => 1,
|
107
|
+
"were" => 1,
|
108
|
+
"when" => 1,
|
109
|
+
"whether" => 1,
|
110
|
+
"while" => 1,
|
111
|
+
"with" => 1,
|
112
|
+
"within" => 1,
|
113
|
+
"yes" => 1,
|
114
|
+
"you" => 1,
|
115
|
+
"youll" => 1,
|
116
|
+
}
|
24
117
|
end
|
25
118
|
|
26
119
|
end
|
data/test/bayes/bayesian_test.rb
CHANGED
@@ -11,6 +11,19 @@ class BayesianTest < Test::Unit::TestCase
|
|
11
11
|
def test_bad_training
|
12
12
|
assert_raise(StandardError) { @classifier.train_no_category "words" }
|
13
13
|
end
|
14
|
+
|
15
|
+
def test_bad_method
|
16
|
+
assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_categories
|
20
|
+
assert_equal ['Interesting', 'Uninteresting'], @classifier.categories
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_add_category
|
24
|
+
@classifier.add_category 'Test'
|
25
|
+
assert_equal ['Interesting', 'Test', 'Uninteresting'], @classifier.categories
|
26
|
+
end
|
14
27
|
|
15
28
|
def test_classification
|
16
29
|
@classifier.train_interesting "here are some good words. I hope you love them"
|
@@ -1,8 +1,12 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/../test_helper'
|
2
2
|
class StringExtensionsTest < Test::Unit::TestCase
|
3
3
|
def test_word_hash
|
4
|
-
hash = {:
|
5
|
-
|
6
|
-
|
4
|
+
hash = {:good=>1, :"!"=>1, :hope=>1, :"'."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
|
5
|
+
assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
|
6
|
+
end
|
7
|
+
|
8
|
+
def test_without_punctuation
|
9
|
+
sample = "Hello thats welcome to no punctuation"
|
10
|
+
assert_equal sample, "Hello, that's welcome () to ! no *& punctuation".without_punctuation
|
7
11
|
end
|
8
12
|
end
|
metadata
CHANGED
@@ -3,7 +3,7 @@ rubygems_version: 0.8.6
|
|
3
3
|
specification_version: 1
|
4
4
|
name: classifier
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version:
|
6
|
+
version: 1.1.1
|
7
7
|
date: 2005-04-11
|
8
8
|
summary: A general classifier module to allow Bayesian and other types of classifications.
|
9
9
|
require_paths:
|
@@ -42,6 +42,41 @@ files:
|
|
42
42
|
- test/string_extensions/word_hash_test.rb
|
43
43
|
- Rakefile
|
44
44
|
- README
|
45
|
+
- doc/classes
|
46
|
+
- doc/created.rid
|
47
|
+
- doc/files
|
48
|
+
- doc/fr_class_index.html
|
49
|
+
- doc/fr_file_index.html
|
50
|
+
- doc/fr_method_index.html
|
51
|
+
- doc/index.html
|
52
|
+
- doc/rdoc-style.css
|
53
|
+
- doc/classes/Classifier
|
54
|
+
- doc/classes/Classifier.html
|
55
|
+
- doc/classes/String.html
|
56
|
+
- doc/classes/Classifier/Bayes.html
|
57
|
+
- doc/classes/Classifier/Bayes.src
|
58
|
+
- doc/classes/Classifier/Stemmable.html
|
59
|
+
- doc/classes/Classifier/Stemmable.src
|
60
|
+
- doc/classes/Classifier/WordHash.html
|
61
|
+
- doc/classes/Classifier/WordHash.src
|
62
|
+
- doc/classes/Classifier/Bayes.src/M000005.html
|
63
|
+
- doc/classes/Classifier/Bayes.src/M000006.html
|
64
|
+
- doc/classes/Classifier/Bayes.src/M000007.html
|
65
|
+
- doc/classes/Classifier/Bayes.src/M000008.html
|
66
|
+
- doc/classes/Classifier/Bayes.src/M000009.html
|
67
|
+
- doc/classes/Classifier/Bayes.src/M000010.html
|
68
|
+
- doc/classes/Classifier/Stemmable.src/M000003.html
|
69
|
+
- doc/classes/Classifier/WordHash.src/M000001.html
|
70
|
+
- doc/classes/Classifier/WordHash.src/M000002.html
|
71
|
+
- doc/files/lib
|
72
|
+
- doc/files/README.html
|
73
|
+
- doc/files/lib/classifier
|
74
|
+
- doc/files/lib/classifier_rb.html
|
75
|
+
- doc/files/lib/classifier/bayes_rb.html
|
76
|
+
- doc/files/lib/classifier/string_extensions
|
77
|
+
- doc/files/lib/classifier/string_extensions_rb.html
|
78
|
+
- doc/files/lib/classifier/string_extensions/porter_stemmer_rb.html
|
79
|
+
- doc/files/lib/classifier/string_extensions/word_hash_rb.html
|
45
80
|
test_files: []
|
46
81
|
rdoc_options: []
|
47
82
|
extra_rdoc_files: []
|