classifier 1.1 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (34) hide show
  1. data/README +2 -2
  2. data/Rakefile +2 -4
  3. data/doc/classes/Classifier.html +135 -0
  4. data/doc/classes/Classifier/Bayes.html +287 -0
  5. data/doc/classes/Classifier/Bayes.src/M000005.html +20 -0
  6. data/doc/classes/Classifier/Bayes.src/M000006.html +23 -0
  7. data/doc/classes/Classifier/Bayes.src/M000007.html +27 -0
  8. data/doc/classes/Classifier/Bayes.src/M000008.html +18 -0
  9. data/doc/classes/Classifier/Bayes.src/M000009.html +25 -0
  10. data/doc/classes/Classifier/Bayes.src/M000010.html +18 -0
  11. data/doc/classes/Classifier/Stemmable.html +243 -0
  12. data/doc/classes/Classifier/Stemmable.src/M000003.html +102 -0
  13. data/doc/classes/Classifier/WordHash.html +178 -0
  14. data/doc/classes/Classifier/WordHash.src/M000001.html +18 -0
  15. data/doc/classes/Classifier/WordHash.src/M000002.html +28 -0
  16. data/doc/classes/String.html +119 -0
  17. data/doc/created.rid +1 -0
  18. data/doc/files/README.html +156 -0
  19. data/doc/files/lib/classifier/bayes_rb.html +115 -0
  20. data/doc/files/lib/classifier/string_extensions/porter_stemmer_rb.html +112 -0
  21. data/doc/files/lib/classifier/string_extensions/word_hash_rb.html +115 -0
  22. data/doc/files/lib/classifier/string_extensions_rb.html +123 -0
  23. data/doc/files/lib/classifier_rb.html +123 -0
  24. data/doc/fr_class_index.html +31 -0
  25. data/doc/fr_file_index.html +32 -0
  26. data/doc/fr_method_index.html +37 -0
  27. data/doc/index.html +24 -0
  28. data/doc/rdoc-style.css +208 -0
  29. data/lib/classifier/bayes.rb +63 -12
  30. data/lib/classifier/string_extensions/porter_stemmer.rb +18 -15
  31. data/lib/classifier/string_extensions/word_hash.rb +96 -3
  32. data/test/bayes/bayesian_test.rb +13 -0
  33. data/test/string_extensions/word_hash_test.rb +7 -3
  34. metadata +36 -1
@@ -5,16 +5,36 @@
5
5
  module Classifier
6
6
 
7
7
  class Bayes
8
+ # The class can be created with one or more categories, each of which will be
9
+ # initialized and given a training method. E.g.,
10
+ # b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
8
11
  def initialize(*categories)
9
12
  @categories = Hash.new
10
- categories.each { |category| @categories[category.capitalize.intern] = Hash.new }
13
+ categories.each { |category| @categories[category.to_s.gsub("_"," ").capitalize.intern] = Hash.new }
11
14
  @total_words = 0
12
15
  end
13
16
 
14
- def classify(text)
15
- (classifications(text).sort_by { |a| -a[1] })[0][0]
17
+ #
18
+ # Provides a general training method for all categories specified in Bayes#new
19
+ # For example:
20
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
21
+ # b.train :this, "This text"
22
+ # b.train "that", "That text"
23
+ # b.train "The other", "The other text"
24
+ def train(category, text)
25
+ category = category.to_s.gsub("_"," ").capitalize.intern
26
+ text.word_hash.each do |word, count|
27
+ @categories[category][word] ||= 0
28
+ @categories[category][word] += count
29
+ @total_words += count
30
+ end
16
31
  end
17
32
 
33
+ #
34
+ # Returns the scores in each category for the provided +text+. E.g.,
35
+ # b.classifications "I hate bad words and you"
36
+ # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
37
+ # The largest of these scores (the one closest to 0) is the one picked out by #classify
18
38
  def classifications(text)
19
39
  score = Hash.new
20
40
  @categories.each do |category, category_words|
@@ -28,6 +48,22 @@ class Bayes
28
48
  return score
29
49
  end
30
50
 
51
+ #
52
+ # Returns the classification of the provided +text+, which is one of the
53
+ # categories given in the initializer. E.g.,
54
+ # b.classify "I hate bad words and you"
55
+ # => 'Uninteresting'
56
+ def classify(text)
57
+ (classifications(text).sort_by { |a| -a[1] })[0][0]
58
+ end
59
+
60
+ #
61
+ # Provides training methods for the categories specified in Bayes#new
62
+ # For example:
63
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
64
+ # b.train_this "This text"
65
+ # b.train_that "That text"
66
+ # b.train_the_other "The other text"
31
67
  def method_missing(name, *args)
32
68
  category = name.to_s.gsub(/train_([\w]+)/, '\1').gsub("_"," ").capitalize.intern
33
69
  if @categories.has_key? category
@@ -35,18 +71,33 @@ class Bayes
35
71
  elsif name.to_s =~ /train_([\w]+)/
36
72
  raise StandardError, "No such category: #{category}"
37
73
  else
38
- raise StandardError, "No such method: #{name}"
74
+ super #raise StandardError, "No such method: #{name}"
39
75
  end
40
76
  end
41
-
42
- def train(category, text)
43
- category = category.to_s.gsub("_"," ").capitalize.intern
44
- text.word_hash.each do |word, count|
45
- @categories[category][word] ||= 0
46
- @categories[category][word] += count
47
- @total_words += count
48
- end
77
+
78
+ #
79
+ # Provides a list of category names
80
+ # For example:
81
+ # b.categories
82
+ # => ['This', 'That', 'The other']
83
+ def categories # :nodoc:
84
+ @categories.keys.collect {|c| c.to_s}
49
85
  end
86
+
87
+ #
88
+ # Allows you to add categories to the classifier.
89
+ # For example:
90
+ # b.add_category "Not spam"
91
+ #
92
+ # WARNING: Adding categories to a trained classifier will
93
+ # result in an undertrained category that will tend to match
94
+ # more criteria than the trained selective categories. In short,
95
+ # try to declare all of your categories when the classifier is created.
96
+ def add_category(category)
97
+ @categories[category.to_s.gsub("_"," ").capitalize.intern] = Hash.new
98
+ end
99
+
100
+ alias append_category add_category # :nodoc:
50
101
  end
51
102
 
52
103
  end
@@ -7,7 +7,21 @@
7
7
  # Copyright 2005 Greg Fast <gdf@speakeasy.net>
8
8
 
9
9
  module Classifier
10
-
10
+
11
+ #
12
+ # Porter stemmer in Ruby.
13
+ #
14
+ # This is the Porter stemming algorithm, ported to Ruby from the
15
+ # version coded up in Perl. It's easy to follow against the rules
16
+ # in the original paper in:
17
+ #
18
+ # Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
19
+ # no. 3, pp 130-137,
20
+ #
21
+ # See also http://www.tartarus.org/~martin/PorterStemmer
22
+ #
23
+ # Send comments to raypereda@hotmail.com
24
+ #
11
25
  module Stemmable
12
26
 
13
27
  STEP_2_LIST = {
@@ -82,20 +96,9 @@ module Stemmable
82
96
  VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
83
97
 
84
98
  #
85
- # Porter stemmer in Ruby.
86
- #
87
- # This is the Porter stemming algorithm, ported to Ruby from the
88
- # version coded up in Perl. It's easy to follow against the rules
89
- # in the original paper in:
90
- #
91
- # Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
92
- # no. 3, pp 130-137,
93
- #
94
- # See also http://www.tartarus.org/~martin/PorterStemmer
95
- #
96
- # Send comments to raypereda@hotmail.com
97
- #
98
-
99
+ # Stems the word contained in the current object. E.g.,
100
+ # "actually".stem_porter
101
+ # => "actual"
99
102
  def stem_porter
100
103
 
101
104
  # make a copy of the given object and convert it to a string.
@@ -4,23 +4,116 @@
4
4
 
5
5
  module Classifier
6
6
 
7
+ # This module is mixed into String to provide convenience
8
+ # methods for the Classifier package.
7
9
  module WordHash
10
+
11
+ # Removes common punctuation symbols, returning a new string. E.g.,
12
+ # "Hello (greeting's), with {braces} < >...?".without_punctuation
13
+ # => "Hello greetings with braces "
8
14
  def without_punctuation
9
- tr( ',?.!;:\'"@#$%^&*()_=+[]{}\|<>/`~', " " )
15
+ tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
10
16
  end
11
17
 
18
+ # Return a Hash of symbols => ints. Each word in the string is stemmed,
19
+ # interned, and mapped to its frequency in the document.
12
20
  def word_hash
13
21
  d = Hash.new
14
22
  corpus = without_punctuation
15
23
  (corpus.split + gsub(/[\w+]/,"").split).each do |word|
16
- key = word.downcase.stem.intern
24
+ item = word.downcase
25
+ key = item.stem.intern
17
26
  if !(word =~ /[\w+]/) || word.length > 2
18
27
  d[key] ||= 0
19
28
  d[key] += 1
20
- end
29
+ end unless CORPUS_SKIP_WORDS[item]
21
30
  end
22
31
  return d
23
32
  end
33
+
34
+ private
35
+ CORPUS_SKIP_WORDS = {
36
+ "a" => 1,
37
+ "again" => 1,
38
+ "all" => 1,
39
+ "along" => 1,
40
+ "are" => 1,
41
+ "also" => 1,
42
+ "an" => 1,
43
+ "and" => 1,
44
+ "as" => 1,
45
+ "at" => 1,
46
+ "but" => 1,
47
+ "by" => 1,
48
+ "came" => 1,
49
+ "can" => 1,
50
+ "cant" => 1,
51
+ "couldnt" => 1,
52
+ "did" => 1,
53
+ "didn" => 1,
54
+ "didnt" => 1,
55
+ "do" => 1,
56
+ "doesnt" => 1,
57
+ "dont" => 1,
58
+ "ever" => 1,
59
+ "first" => 1,
60
+ "from" => 1,
61
+ "have" => 1,
62
+ "her" => 1,
63
+ "here" => 1,
64
+ "him" => 1,
65
+ "how" => 1,
66
+ "i" => 1,
67
+ "if" => 1,
68
+ "in" => 1,
69
+ "into" => 1,
70
+ "is" => 1,
71
+ "isnt" => 1,
72
+ "it" => 1,
73
+ "itll" => 1,
74
+ "just" => 1,
75
+ "last" => 1,
76
+ "least" => 1,
77
+ "like" => 1,
78
+ "most" => 1,
79
+ "my" => 1,
80
+ "new" => 1,
81
+ "no" => 1,
82
+ "not" => 1,
83
+ "now" => 1,
84
+ "of" => 1,
85
+ "on" => 1,
86
+ "or" => 1,
87
+ "should" => 1,
88
+ "sinc" => 1,
89
+ "so" => 1,
90
+ "some" => 1,
91
+ "th" => 1,
92
+ "than" => 1,
93
+ "this" => 1,
94
+ "that" => 1,
95
+ "the" => 1,
96
+ "their" => 1,
97
+ "then" => 1,
98
+ "those" => 1,
99
+ "to" => 1,
100
+ "told" => 1,
101
+ "too" => 1,
102
+ "true" => 1,
103
+ "try" => 1,
104
+ "until" => 1,
105
+ "url" => 1,
106
+ "us" => 1,
107
+ "were" => 1,
108
+ "when" => 1,
109
+ "whether" => 1,
110
+ "while" => 1,
111
+ "with" => 1,
112
+ "within" => 1,
113
+ "yes" => 1,
114
+ "you" => 1,
115
+ "youll" => 1,
116
+ }
24
117
  end
25
118
 
26
119
  end
@@ -11,6 +11,19 @@ class BayesianTest < Test::Unit::TestCase
11
11
  def test_bad_training
12
12
  assert_raise(StandardError) { @classifier.train_no_category "words" }
13
13
  end
14
+
15
+ def test_bad_method
16
+ assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
17
+ end
18
+
19
+ def test_categories
20
+ assert_equal ['Interesting', 'Uninteresting'], @classifier.categories
21
+ end
22
+
23
+ def test_add_category
24
+ @classifier.add_category 'Test'
25
+ assert_equal ['Interesting', 'Test', 'Uninteresting'], @classifier.categories
26
+ end
14
27
 
15
28
  def test_classification
16
29
  @classifier.train_interesting "here are some good words. I hope you love them"
@@ -1,8 +1,12 @@
1
1
  require File.dirname(__FILE__) + '/../test_helper'
2
2
  class StringExtensionsTest < Test::Unit::TestCase
3
3
  def test_word_hash
4
- hash = {:some=>1, :good=>1, :hope=>1, :word=>1, :you=>1, :here=>1, :love=>1, :ar=>1, :them=>1, :"."=>1, :"!"=>1}
5
-
6
- assert_equal hash, "here are some good words. I hope you love them!".word_hash
4
+ hash = {:good=>1, :"!"=>1, :hope=>1, :"'."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
5
+ assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
6
+ end
7
+
8
+ def test_without_punctuation
9
+ sample = "Hello thats welcome to no punctuation"
10
+ assert_equal sample, "Hello, that's welcome () to ! no *& punctuation".without_punctuation
7
11
  end
8
12
  end
metadata CHANGED
@@ -3,7 +3,7 @@ rubygems_version: 0.8.6
3
3
  specification_version: 1
4
4
  name: classifier
5
5
  version: !ruby/object:Gem::Version
6
- version: "1.1"
6
+ version: 1.1.1
7
7
  date: 2005-04-11
8
8
  summary: A general classifier module to allow Bayesian and other types of classifications.
9
9
  require_paths:
@@ -42,6 +42,41 @@ files:
42
42
  - test/string_extensions/word_hash_test.rb
43
43
  - Rakefile
44
44
  - README
45
+ - doc/classes
46
+ - doc/created.rid
47
+ - doc/files
48
+ - doc/fr_class_index.html
49
+ - doc/fr_file_index.html
50
+ - doc/fr_method_index.html
51
+ - doc/index.html
52
+ - doc/rdoc-style.css
53
+ - doc/classes/Classifier
54
+ - doc/classes/Classifier.html
55
+ - doc/classes/String.html
56
+ - doc/classes/Classifier/Bayes.html
57
+ - doc/classes/Classifier/Bayes.src
58
+ - doc/classes/Classifier/Stemmable.html
59
+ - doc/classes/Classifier/Stemmable.src
60
+ - doc/classes/Classifier/WordHash.html
61
+ - doc/classes/Classifier/WordHash.src
62
+ - doc/classes/Classifier/Bayes.src/M000005.html
63
+ - doc/classes/Classifier/Bayes.src/M000006.html
64
+ - doc/classes/Classifier/Bayes.src/M000007.html
65
+ - doc/classes/Classifier/Bayes.src/M000008.html
66
+ - doc/classes/Classifier/Bayes.src/M000009.html
67
+ - doc/classes/Classifier/Bayes.src/M000010.html
68
+ - doc/classes/Classifier/Stemmable.src/M000003.html
69
+ - doc/classes/Classifier/WordHash.src/M000001.html
70
+ - doc/classes/Classifier/WordHash.src/M000002.html
71
+ - doc/files/lib
72
+ - doc/files/README.html
73
+ - doc/files/lib/classifier
74
+ - doc/files/lib/classifier_rb.html
75
+ - doc/files/lib/classifier/bayes_rb.html
76
+ - doc/files/lib/classifier/string_extensions
77
+ - doc/files/lib/classifier/string_extensions_rb.html
78
+ - doc/files/lib/classifier/string_extensions/porter_stemmer_rb.html
79
+ - doc/files/lib/classifier/string_extensions/word_hash_rb.html
45
80
  test_files: []
46
81
  rdoc_options: []
47
82
  extra_rdoc_files: []