classifier 1.1 → 1.1.1
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- data/README +2 -2
- data/Rakefile +2 -4
- data/doc/classes/Classifier.html +135 -0
- data/doc/classes/Classifier/Bayes.html +287 -0
- data/doc/classes/Classifier/Bayes.src/M000005.html +20 -0
- data/doc/classes/Classifier/Bayes.src/M000006.html +23 -0
- data/doc/classes/Classifier/Bayes.src/M000007.html +27 -0
- data/doc/classes/Classifier/Bayes.src/M000008.html +18 -0
- data/doc/classes/Classifier/Bayes.src/M000009.html +25 -0
- data/doc/classes/Classifier/Bayes.src/M000010.html +18 -0
- data/doc/classes/Classifier/Stemmable.html +243 -0
- data/doc/classes/Classifier/Stemmable.src/M000003.html +102 -0
- data/doc/classes/Classifier/WordHash.html +178 -0
- data/doc/classes/Classifier/WordHash.src/M000001.html +18 -0
- data/doc/classes/Classifier/WordHash.src/M000002.html +28 -0
- data/doc/classes/String.html +119 -0
- data/doc/created.rid +1 -0
- data/doc/files/README.html +156 -0
- data/doc/files/lib/classifier/bayes_rb.html +115 -0
- data/doc/files/lib/classifier/string_extensions/porter_stemmer_rb.html +112 -0
- data/doc/files/lib/classifier/string_extensions/word_hash_rb.html +115 -0
- data/doc/files/lib/classifier/string_extensions_rb.html +123 -0
- data/doc/files/lib/classifier_rb.html +123 -0
- data/doc/fr_class_index.html +31 -0
- data/doc/fr_file_index.html +32 -0
- data/doc/fr_method_index.html +37 -0
- data/doc/index.html +24 -0
- data/doc/rdoc-style.css +208 -0
- data/lib/classifier/bayes.rb +63 -12
- data/lib/classifier/string_extensions/porter_stemmer.rb +18 -15
- data/lib/classifier/string_extensions/word_hash.rb +96 -3
- data/test/bayes/bayesian_test.rb +13 -0
- data/test/string_extensions/word_hash_test.rb +7 -3
- metadata +36 -1
data/lib/classifier/bayes.rb
CHANGED
@@ -5,16 +5,36 @@
|
|
5
5
|
module Classifier
|
6
6
|
|
7
7
|
class Bayes
|
8
|
+
# The class can be created with one or more categories, each of which will be
|
9
|
+
# initialized and given a training method. E.g.,
|
10
|
+
# b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
8
11
|
def initialize(*categories)
|
9
12
|
@categories = Hash.new
|
10
|
-
categories.each { |category| @categories[category.capitalize.intern] = Hash.new }
|
13
|
+
categories.each { |category| @categories[category.to_s.gsub("_"," ").capitalize.intern] = Hash.new }
|
11
14
|
@total_words = 0
|
12
15
|
end
|
13
16
|
|
14
|
-
|
15
|
-
|
17
|
+
#
|
18
|
+
# Provides a general training method for all categories specified in Bayes#new
|
19
|
+
# For example:
|
20
|
+
# b = Classifier::Bayes.new 'This', 'That', 'the_other'
|
21
|
+
# b.train :this, "This text"
|
22
|
+
# b.train "that", "That text"
|
23
|
+
# b.train "The other", "The other text"
|
24
|
+
def train(category, text)
|
25
|
+
category = category.to_s.gsub("_"," ").capitalize.intern
|
26
|
+
text.word_hash.each do |word, count|
|
27
|
+
@categories[category][word] ||= 0
|
28
|
+
@categories[category][word] += count
|
29
|
+
@total_words += count
|
30
|
+
end
|
16
31
|
end
|
17
32
|
|
33
|
+
#
|
34
|
+
# Returns the scores in each category for the provided +text+. E.g.,
|
35
|
+
# b.classifications "I hate bad words and you"
|
36
|
+
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
37
|
+
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
18
38
|
def classifications(text)
|
19
39
|
score = Hash.new
|
20
40
|
@categories.each do |category, category_words|
|
@@ -28,6 +48,22 @@ class Bayes
|
|
28
48
|
return score
|
29
49
|
end
|
30
50
|
|
51
|
+
#
|
52
|
+
# Returns the classification of the provided +text+, which is one of the
|
53
|
+
# categories given in the initializer. E.g.,
|
54
|
+
# b.classify "I hate bad words and you"
|
55
|
+
# => 'Uninteresting'
|
56
|
+
def classify(text)
|
57
|
+
(classifications(text).sort_by { |a| -a[1] })[0][0]
|
58
|
+
end
|
59
|
+
|
60
|
+
#
|
61
|
+
# Provides training methods for the categories specified in Bayes#new
|
62
|
+
# For example:
|
63
|
+
# b = Classifier::Bayes.new 'This', 'That', 'the_other'
|
64
|
+
# b.train_this "This text"
|
65
|
+
# b.train_that "That text"
|
66
|
+
# b.train_the_other "The other text"
|
31
67
|
def method_missing(name, *args)
|
32
68
|
category = name.to_s.gsub(/train_([\w]+)/, '\1').gsub("_"," ").capitalize.intern
|
33
69
|
if @categories.has_key? category
|
@@ -35,18 +71,33 @@ class Bayes
|
|
35
71
|
elsif name.to_s =~ /train_([\w]+)/
|
36
72
|
raise StandardError, "No such category: #{category}"
|
37
73
|
else
|
38
|
-
|
74
|
+
super #raise StandardError, "No such method: #{name}"
|
39
75
|
end
|
40
76
|
end
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
77
|
+
|
78
|
+
#
|
79
|
+
# Provides a list of category names
|
80
|
+
# For example:
|
81
|
+
# b.categories
|
82
|
+
# => ['This', 'That', 'The other']
|
83
|
+
def categories # :nodoc:
|
84
|
+
@categories.keys.collect {|c| c.to_s}
|
49
85
|
end
|
86
|
+
|
87
|
+
#
|
88
|
+
# Allows you to add categories to the classifier.
|
89
|
+
# For example:
|
90
|
+
# b.add_category "Not spam"
|
91
|
+
#
|
92
|
+
# WARNING: Adding categories to a trained classifier will
|
93
|
+
# result in an undertrained category that will tend to match
|
94
|
+
# more criteria than the trained selective categories. In short,
|
95
|
+
# try to specify all of your categories when creating the classifier.
|
96
|
+
def add_category(category)
|
97
|
+
@categories[category.to_s.gsub("_"," ").capitalize.intern] = Hash.new
|
98
|
+
end
|
99
|
+
|
100
|
+
alias append_category add_category # :nodoc:
|
50
101
|
end
|
51
102
|
|
52
103
|
end
|
@@ -7,7 +7,21 @@
|
|
7
7
|
# Copyright 2005 Greg Fast <gdf@speakeasy.net>
|
8
8
|
|
9
9
|
module Classifier
|
10
|
-
|
10
|
+
|
11
|
+
#
|
12
|
+
# Porter stemmer in Ruby.
|
13
|
+
#
|
14
|
+
# This is the Porter stemming algorithm, ported to Ruby from the
|
15
|
+
# version coded up in Perl. It's easy to follow against the rules
|
16
|
+
# in the original paper in:
|
17
|
+
#
|
18
|
+
# Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
19
|
+
# no. 3, pp 130-137,
|
20
|
+
#
|
21
|
+
# See also http://www.tartarus.org/~martin/PorterStemmer
|
22
|
+
#
|
23
|
+
# Send comments to raypereda@hotmail.com
|
24
|
+
#
|
11
25
|
module Stemmable
|
12
26
|
|
13
27
|
STEP_2_LIST = {
|
@@ -82,20 +96,9 @@ module Stemmable
|
|
82
96
|
VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
|
83
97
|
|
84
98
|
#
|
85
|
-
#
|
86
|
-
#
|
87
|
-
#
|
88
|
-
# version coded up in Perl. It's easy to follow against the rules
|
89
|
-
# in the original paper in:
|
90
|
-
#
|
91
|
-
# Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
92
|
-
# no. 3, pp 130-137,
|
93
|
-
#
|
94
|
-
# See also http://www.tartarus.org/~martin/PorterStemmer
|
95
|
-
#
|
96
|
-
# Send comments to raypereda@hotmail.com
|
97
|
-
#
|
98
|
-
|
99
|
+
# Stems the word contained in the current object. E.g.,
|
100
|
+
# "actually".stem_porter
|
101
|
+
# => "actual"
|
99
102
|
def stem_porter
|
100
103
|
|
101
104
|
# make a copy of the given object and convert it to a string.
|
@@ -4,23 +4,116 @@
|
|
4
4
|
|
5
5
|
module Classifier
|
6
6
|
|
7
|
+
# This module is mixed into String to provide convenience
|
8
|
+
# methods for the Classifier package.
|
7
9
|
module WordHash
|
10
|
+
|
11
|
+
# Removes common punctuation symbols, returning a new string. E.g.,
|
12
|
+
# "Hello (greeting's), with {braces} < >...?".without_punctuation
|
13
|
+
# => "Hello greetings with braces "
|
8
14
|
def without_punctuation
|
9
|
-
tr( '
|
15
|
+
tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
|
10
16
|
end
|
11
17
|
|
18
|
+
# Return a Hash of symbols => ints. Each word in the string is stemmed,
|
19
|
+
# interned, and mapped to its frequency in the document.
|
12
20
|
def word_hash
|
13
21
|
d = Hash.new
|
14
22
|
corpus = without_punctuation
|
15
23
|
(corpus.split + gsub(/[\w+]/,"").split).each do |word|
|
16
|
-
|
24
|
+
item = word.downcase
|
25
|
+
key = item.stem.intern
|
17
26
|
if !(word =~ /[\w+]/) || word.length > 2
|
18
27
|
d[key] ||= 0
|
19
28
|
d[key] += 1
|
20
|
-
end
|
29
|
+
end unless CORPUS_SKIP_WORDS[item]
|
21
30
|
end
|
22
31
|
return d
|
23
32
|
end
|
33
|
+
|
34
|
+
private
|
35
|
+
CORPUS_SKIP_WORDS = {
|
36
|
+
"a" => 1,
|
37
|
+
"again" => 1,
|
38
|
+
"all" => 1,
|
39
|
+
"along" => 1,
|
40
|
+
"are" => 1,
|
41
|
+
"also" => 1,
|
42
|
+
"an" => 1,
|
43
|
+
"and" => 1,
|
44
|
+
"as" => 1,
|
45
|
+
"at" => 1,
|
46
|
+
"but" => 1,
|
47
|
+
"by" => 1,
|
48
|
+
"came" => 1,
|
49
|
+
"can" => 1,
|
50
|
+
"cant" => 1,
|
51
|
+
"couldnt" => 1,
|
52
|
+
"did" => 1,
|
53
|
+
"didn" => 1,
|
54
|
+
"didnt" => 1,
|
55
|
+
"do" => 1,
|
56
|
+
"doesnt" => 1,
|
57
|
+
"dont" => 1,
|
58
|
+
"ever" => 1,
|
59
|
+
"first" => 1,
|
60
|
+
"from" => 1,
|
61
|
+
"have" => 1,
|
62
|
+
"her" => 1,
|
63
|
+
"here" => 1,
|
64
|
+
"him" => 1,
|
65
|
+
"how" => 1,
|
66
|
+
"i" => 1,
|
67
|
+
"if" => 1,
|
68
|
+
"in" => 1,
|
69
|
+
"into" => 1,
|
70
|
+
"is" => 1,
|
71
|
+
"isnt" => 1,
|
72
|
+
"it" => 1,
|
73
|
+
"itll" => 1,
|
74
|
+
"just" => 1,
|
75
|
+
"last" => 1,
|
76
|
+
"least" => 1,
|
77
|
+
"like" => 1,
|
78
|
+
"most" => 1,
|
79
|
+
"my" => 1,
|
80
|
+
"new" => 1,
|
81
|
+
"no" => 1,
|
82
|
+
"not" => 1,
|
83
|
+
"now" => 1,
|
84
|
+
"of" => 1,
|
85
|
+
"on" => 1,
|
86
|
+
"or" => 1,
|
87
|
+
"should" => 1,
|
88
|
+
"sinc" => 1,
|
89
|
+
"so" => 1,
|
90
|
+
"some" => 1,
|
91
|
+
"th" => 1,
|
92
|
+
"than" => 1,
|
93
|
+
"this" => 1,
|
94
|
+
"that" => 1,
|
95
|
+
"the" => 1,
|
96
|
+
"their" => 1,
|
97
|
+
"then" => 1,
|
98
|
+
"those" => 1,
|
99
|
+
"to" => 1,
|
100
|
+
"told" => 1,
|
101
|
+
"too" => 1,
|
102
|
+
"true" => 1,
|
103
|
+
"try" => 1,
|
104
|
+
"until" => 1,
|
105
|
+
"url" => 1,
|
106
|
+
"us" => 1,
|
107
|
+
"were" => 1,
|
108
|
+
"when" => 1,
|
109
|
+
"whether" => 1,
|
110
|
+
"while" => 1,
|
111
|
+
"with" => 1,
|
112
|
+
"within" => 1,
|
113
|
+
"yes" => 1,
|
114
|
+
"you" => 1,
|
115
|
+
"youll" => 1,
|
116
|
+
}
|
24
117
|
end
|
25
118
|
|
26
119
|
end
|
data/test/bayes/bayesian_test.rb
CHANGED
@@ -11,6 +11,19 @@ class BayesianTest < Test::Unit::TestCase
|
|
11
11
|
def test_bad_training
|
12
12
|
assert_raise(StandardError) { @classifier.train_no_category "words" }
|
13
13
|
end
|
14
|
+
|
15
|
+
def test_bad_method
|
16
|
+
assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_categories
|
20
|
+
assert_equal ['Interesting', 'Uninteresting'], @classifier.categories
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_add_category
|
24
|
+
@classifier.add_category 'Test'
|
25
|
+
assert_equal ['Interesting', 'Test', 'Uninteresting'], @classifier.categories
|
26
|
+
end
|
14
27
|
|
15
28
|
def test_classification
|
16
29
|
@classifier.train_interesting "here are some good words. I hope you love them"
|
@@ -1,8 +1,12 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/../test_helper'
|
2
2
|
class StringExtensionsTest < Test::Unit::TestCase
|
3
3
|
def test_word_hash
|
4
|
-
hash = {:
|
5
|
-
|
6
|
-
|
4
|
+
hash = {:good=>1, :"!"=>1, :hope=>1, :"'."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
|
5
|
+
assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
|
6
|
+
end
|
7
|
+
|
8
|
+
def test_without_punctuation
|
9
|
+
sample = "Hello thats welcome to no punctuation"
|
10
|
+
assert_equal sample, "Hello, that's welcome () to ! no *& punctuation".without_punctuation
|
7
11
|
end
|
8
12
|
end
|
metadata
CHANGED
@@ -3,7 +3,7 @@ rubygems_version: 0.8.6
|
|
3
3
|
specification_version: 1
|
4
4
|
name: classifier
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version:
|
6
|
+
version: 1.1.1
|
7
7
|
date: 2005-04-11
|
8
8
|
summary: A general classifier module to allow Bayesian and other types of classifications.
|
9
9
|
require_paths:
|
@@ -42,6 +42,41 @@ files:
|
|
42
42
|
- test/string_extensions/word_hash_test.rb
|
43
43
|
- Rakefile
|
44
44
|
- README
|
45
|
+
- doc/classes
|
46
|
+
- doc/created.rid
|
47
|
+
- doc/files
|
48
|
+
- doc/fr_class_index.html
|
49
|
+
- doc/fr_file_index.html
|
50
|
+
- doc/fr_method_index.html
|
51
|
+
- doc/index.html
|
52
|
+
- doc/rdoc-style.css
|
53
|
+
- doc/classes/Classifier
|
54
|
+
- doc/classes/Classifier.html
|
55
|
+
- doc/classes/String.html
|
56
|
+
- doc/classes/Classifier/Bayes.html
|
57
|
+
- doc/classes/Classifier/Bayes.src
|
58
|
+
- doc/classes/Classifier/Stemmable.html
|
59
|
+
- doc/classes/Classifier/Stemmable.src
|
60
|
+
- doc/classes/Classifier/WordHash.html
|
61
|
+
- doc/classes/Classifier/WordHash.src
|
62
|
+
- doc/classes/Classifier/Bayes.src/M000005.html
|
63
|
+
- doc/classes/Classifier/Bayes.src/M000006.html
|
64
|
+
- doc/classes/Classifier/Bayes.src/M000007.html
|
65
|
+
- doc/classes/Classifier/Bayes.src/M000008.html
|
66
|
+
- doc/classes/Classifier/Bayes.src/M000009.html
|
67
|
+
- doc/classes/Classifier/Bayes.src/M000010.html
|
68
|
+
- doc/classes/Classifier/Stemmable.src/M000003.html
|
69
|
+
- doc/classes/Classifier/WordHash.src/M000001.html
|
70
|
+
- doc/classes/Classifier/WordHash.src/M000002.html
|
71
|
+
- doc/files/lib
|
72
|
+
- doc/files/README.html
|
73
|
+
- doc/files/lib/classifier
|
74
|
+
- doc/files/lib/classifier_rb.html
|
75
|
+
- doc/files/lib/classifier/bayes_rb.html
|
76
|
+
- doc/files/lib/classifier/string_extensions
|
77
|
+
- doc/files/lib/classifier/string_extensions_rb.html
|
78
|
+
- doc/files/lib/classifier/string_extensions/porter_stemmer_rb.html
|
79
|
+
- doc/files/lib/classifier/string_extensions/word_hash_rb.html
|
45
80
|
test_files: []
|
46
81
|
rdoc_options: []
|
47
82
|
extra_rdoc_files: []
|