classifier 1.1 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. data/README +2 -2
  2. data/Rakefile +2 -4
  3. data/doc/classes/Classifier.html +135 -0
  4. data/doc/classes/Classifier/Bayes.html +287 -0
  5. data/doc/classes/Classifier/Bayes.src/M000005.html +20 -0
  6. data/doc/classes/Classifier/Bayes.src/M000006.html +23 -0
  7. data/doc/classes/Classifier/Bayes.src/M000007.html +27 -0
  8. data/doc/classes/Classifier/Bayes.src/M000008.html +18 -0
  9. data/doc/classes/Classifier/Bayes.src/M000009.html +25 -0
  10. data/doc/classes/Classifier/Bayes.src/M000010.html +18 -0
  11. data/doc/classes/Classifier/Stemmable.html +243 -0
  12. data/doc/classes/Classifier/Stemmable.src/M000003.html +102 -0
  13. data/doc/classes/Classifier/WordHash.html +178 -0
  14. data/doc/classes/Classifier/WordHash.src/M000001.html +18 -0
  15. data/doc/classes/Classifier/WordHash.src/M000002.html +28 -0
  16. data/doc/classes/String.html +119 -0
  17. data/doc/created.rid +1 -0
  18. data/doc/files/README.html +156 -0
  19. data/doc/files/lib/classifier/bayes_rb.html +115 -0
  20. data/doc/files/lib/classifier/string_extensions/porter_stemmer_rb.html +112 -0
  21. data/doc/files/lib/classifier/string_extensions/word_hash_rb.html +115 -0
  22. data/doc/files/lib/classifier/string_extensions_rb.html +123 -0
  23. data/doc/files/lib/classifier_rb.html +123 -0
  24. data/doc/fr_class_index.html +31 -0
  25. data/doc/fr_file_index.html +32 -0
  26. data/doc/fr_method_index.html +37 -0
  27. data/doc/index.html +24 -0
  28. data/doc/rdoc-style.css +208 -0
  29. data/lib/classifier/bayes.rb +63 -12
  30. data/lib/classifier/string_extensions/porter_stemmer.rb +18 -15
  31. data/lib/classifier/string_extensions/word_hash.rb +96 -3
  32. data/test/bayes/bayesian_test.rb +13 -0
  33. data/test/string_extensions/word_hash_test.rb +7 -3
  34. metadata +36 -1
@@ -5,16 +5,36 @@
5
5
  module Classifier
6
6
 
7
7
  class Bayes
8
+ # The class can be created with one or more categories, each of which will be
9
+ # initialized and given a training method. E.g.,
10
+ # b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
8
11
  def initialize(*categories)
9
12
  @categories = Hash.new
10
- categories.each { |category| @categories[category.capitalize.intern] = Hash.new }
13
+ categories.each { |category| @categories[category.to_s.gsub("_"," ").capitalize.intern] = Hash.new }
11
14
  @total_words = 0
12
15
  end
13
16
 
14
- def classify(text)
15
- (classifications(text).sort_by { |a| -a[1] })[0][0]
17
+ #
18
+ # Provides a general training method for all categories specified in Bayes#new
19
+ # For example:
20
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
21
+ # b.train :this, "This text"
22
+ # b.train "that", "That text"
23
+ # b.train "The other", "The other text"
24
+ def train(category, text)
25
+ category = category.to_s.gsub("_"," ").capitalize.intern
26
+ text.word_hash.each do |word, count|
27
+ @categories[category][word] ||= 0
28
+ @categories[category][word] += count
29
+ @total_words += count
30
+ end
16
31
  end
17
32
 
33
+ #
34
+ # Returns the scores in each category for the provided +text+. E.g.,
35
+ # b.classifications "I hate bad words and you"
36
+ # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
37
+ # The largest of these scores (the one closest to 0) is the one picked out by #classify
18
38
  def classifications(text)
19
39
  score = Hash.new
20
40
  @categories.each do |category, category_words|
@@ -28,6 +48,22 @@ class Bayes
28
48
  return score
29
49
  end
30
50
 
51
+ #
52
+ # Returns the classification of the provided +text+, which is one of the
53
+ # categories given in the initializer. E.g.,
54
+ # b.classify "I hate bad words and you"
55
+ # => 'Uninteresting'
56
+ def classify(text)
57
+ (classifications(text).sort_by { |a| -a[1] })[0][0]
58
+ end
59
+
60
+ #
61
+ # Provides training methods for the categories specified in Bayes#new
62
+ # For example:
63
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
64
+ # b.train_this "This text"
65
+ # b.train_that "That text"
66
+ # b.train_the_other "The other text"
31
67
  def method_missing(name, *args)
32
68
  category = name.to_s.gsub(/train_([\w]+)/, '\1').gsub("_"," ").capitalize.intern
33
69
  if @categories.has_key? category
@@ -35,18 +71,33 @@ class Bayes
35
71
  elsif name.to_s =~ /train_([\w]+)/
36
72
  raise StandardError, "No such category: #{category}"
37
73
  else
38
- raise StandardError, "No such method: #{name}"
74
+ super #raise StandardError, "No such method: #{name}"
39
75
  end
40
76
  end
41
-
42
- def train(category, text)
43
- category = category.to_s.gsub("_"," ").capitalize.intern
44
- text.word_hash.each do |word, count|
45
- @categories[category][word] ||= 0
46
- @categories[category][word] += count
47
- @total_words += count
48
- end
77
+
78
+ #
79
+ # Provides a list of category names
80
+ # For example:
81
+ # b.categories
82
+ # => ['This', 'That', 'The other']
83
+ def categories # :nodoc:
84
+ @categories.keys.collect {|c| c.to_s}
49
85
  end
86
+
87
+ #
88
+ # Allows you to add categories to the classifier.
89
+ # For example:
90
+ # b.add_category "Not spam"
91
+ #
92
+ # WARNING: Adding categories to a trained classifier will
93
+ # result in an undertrained category that will tend to match
94
+ # more criteria than the trained selective categories. In short,
95
+ # try to specify all of your categories at initialization.
96
+ def add_category(category)
97
+ @categories[category.to_s.gsub("_"," ").capitalize.intern] = Hash.new
98
+ end
99
+
100
+ alias append_category add_category # :nodoc:
50
101
  end
51
102
 
52
103
  end
@@ -7,7 +7,21 @@
7
7
  # Copyright 2005 Greg Fast <gdf@speakeasy.net>
8
8
 
9
9
  module Classifier
10
-
10
+
11
+ #
12
+ # Porter stemmer in Ruby.
13
+ #
14
+ # This is the Porter stemming algorithm, ported to Ruby from the
15
+ # version coded up in Perl. It's easy to follow against the rules
16
+ # in the original paper in:
17
+ #
18
+ # Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
19
+ # no. 3, pp 130-137,
20
+ #
21
+ # See also http://www.tartarus.org/~martin/PorterStemmer
22
+ #
23
+ # Send comments to raypereda@hotmail.com
24
+ #
11
25
  module Stemmable
12
26
 
13
27
  STEP_2_LIST = {
@@ -82,20 +96,9 @@ module Stemmable
82
96
  VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
83
97
 
84
98
  #
85
- # Porter stemmer in Ruby.
86
- #
87
- # This is the Porter stemming algorithm, ported to Ruby from the
88
- # version coded up in Perl. It's easy to follow against the rules
89
- # in the original paper in:
90
- #
91
- # Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
92
- # no. 3, pp 130-137,
93
- #
94
- # See also http://www.tartarus.org/~martin/PorterStemmer
95
- #
96
- # Send comments to raypereda@hotmail.com
97
- #
98
-
99
+ # Stems the word contained in the current object. E.g.,
100
+ # "actually".stem_porter
101
+ # => "actual"
99
102
  def stem_porter
100
103
 
101
104
  # make a copy of the given object and convert it to a string.
@@ -4,23 +4,116 @@
4
4
 
5
5
  module Classifier
6
6
 
7
+ # This module is mixed into String to provide convenience
8
+ # methods for the Classifier package.
7
9
  module WordHash
10
+
11
+ # Removes common punctuation symbols, returning a new string. E.g.,
12
+ # "Hello (greeting's), with {braces} < >...?".without_punctuation
13
+ # => "Hello greetings with braces "
8
14
  def without_punctuation
9
- tr( ',?.!;:\'"@#$%^&*()_=+[]{}\|<>/`~', " " )
15
+ tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
10
16
  end
11
17
 
18
+ # Return a Hash of symbols => ints. Each word in the string is stemmed,
19
+ # interned, and mapped to its frequency in the document.
12
20
  def word_hash
13
21
  d = Hash.new
14
22
  corpus = without_punctuation
15
23
  (corpus.split + gsub(/[\w+]/,"").split).each do |word|
16
- key = word.downcase.stem.intern
24
+ item = word.downcase
25
+ key = item.stem.intern
17
26
  if !(word =~ /[\w+]/) || word.length > 2
18
27
  d[key] ||= 0
19
28
  d[key] += 1
20
- end
29
+ end unless CORPUS_SKIP_WORDS[item]
21
30
  end
22
31
  return d
23
32
  end
33
+
34
+ private
35
+ CORPUS_SKIP_WORDS = {
36
+ "a" => 1,
37
+ "again" => 1,
38
+ "all" => 1,
39
+ "along" => 1,
40
+ "are" => 1,
41
+ "also" => 1,
42
+ "an" => 1,
43
+ "and" => 1,
44
+ "as" => 1,
45
+ "at" => 1,
46
+ "but" => 1,
47
+ "by" => 1,
48
+ "came" => 1,
49
+ "can" => 1,
50
+ "cant" => 1,
51
+ "couldnt" => 1,
52
+ "did" => 1,
53
+ "didn" => 1,
54
+ "didnt" => 1,
55
+ "do" => 1,
56
+ "doesnt" => 1,
57
+ "dont" => 1,
58
+ "ever" => 1,
59
+ "first" => 1,
60
+ "from" => 1,
61
+ "have" => 1,
62
+ "her" => 1,
63
+ "here" => 1,
64
+ "him" => 1,
65
+ "how" => 1,
66
+ "i" => 1,
67
+ "if" => 1,
68
+ "in" => 1,
69
+ "into" => 1,
70
+ "is" => 1,
71
+ "isnt" => 1,
72
+ "it" => 1,
73
+ "itll" => 1,
74
+ "just" => 1,
75
+ "last" => 1,
76
+ "least" => 1,
77
+ "like" => 1,
78
+ "most" => 1,
79
+ "my" => 1,
80
+ "new" => 1,
81
+ "no" => 1,
82
+ "not" => 1,
83
+ "now" => 1,
84
+ "of" => 1,
85
+ "on" => 1,
86
+ "or" => 1,
87
+ "should" => 1,
88
+ "sinc" => 1,
89
+ "so" => 1,
90
+ "some" => 1,
91
+ "th" => 1,
92
+ "than" => 1,
93
+ "this" => 1,
94
+ "that" => 1,
95
+ "the" => 1,
96
+ "their" => 1,
97
+ "then" => 1,
98
+ "those" => 1,
99
+ "to" => 1,
100
+ "told" => 1,
101
+ "too" => 1,
102
+ "true" => 1,
103
+ "try" => 1,
104
+ "until" => 1,
105
+ "url" => 1,
106
+ "us" => 1,
107
+ "were" => 1,
108
+ "when" => 1,
109
+ "whether" => 1,
110
+ "while" => 1,
111
+ "with" => 1,
112
+ "within" => 1,
113
+ "yes" => 1,
114
+ "you" => 1,
115
+ "youll" => 1,
116
+ }
24
117
  end
25
118
 
26
119
  end
@@ -11,6 +11,19 @@ class BayesianTest < Test::Unit::TestCase
11
11
  def test_bad_training
12
12
  assert_raise(StandardError) { @classifier.train_no_category "words" }
13
13
  end
14
+
15
+ def test_bad_method
16
+ assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
17
+ end
18
+
19
+ def test_categories
20
+ assert_equal ['Interesting', 'Uninteresting'], @classifier.categories
21
+ end
22
+
23
+ def test_add_category
24
+ @classifier.add_category 'Test'
25
+ assert_equal ['Interesting', 'Test', 'Uninteresting'], @classifier.categories
26
+ end
14
27
 
15
28
  def test_classification
16
29
  @classifier.train_interesting "here are some good words. I hope you love them"
@@ -1,8 +1,12 @@
1
1
  require File.dirname(__FILE__) + '/../test_helper'
2
2
  class StringExtensionsTest < Test::Unit::TestCase
3
3
  def test_word_hash
4
- hash = {:some=>1, :good=>1, :hope=>1, :word=>1, :you=>1, :here=>1, :love=>1, :ar=>1, :them=>1, :"."=>1, :"!"=>1}
5
-
6
- assert_equal hash, "here are some good words. I hope you love them!".word_hash
4
+ hash = {:good=>1, :"!"=>1, :hope=>1, :"'."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
5
+ assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
6
+ end
7
+
8
+ def test_without_punctuation
9
+ sample = "Hello thats welcome to no punctuation"
10
+ assert_equal sample, "Hello, that's welcome () to ! no *& punctuation".without_punctuation
7
11
  end
8
12
  end
metadata CHANGED
@@ -3,7 +3,7 @@ rubygems_version: 0.8.6
3
3
  specification_version: 1
4
4
  name: classifier
5
5
  version: !ruby/object:Gem::Version
6
- version: "1.1"
6
+ version: 1.1.1
7
7
  date: 2005-04-11
8
8
  summary: A general classifier module to allow Bayesian and other types of classifications.
9
9
  require_paths:
@@ -42,6 +42,41 @@ files:
42
42
  - test/string_extensions/word_hash_test.rb
43
43
  - Rakefile
44
44
  - README
45
+ - doc/classes
46
+ - doc/created.rid
47
+ - doc/files
48
+ - doc/fr_class_index.html
49
+ - doc/fr_file_index.html
50
+ - doc/fr_method_index.html
51
+ - doc/index.html
52
+ - doc/rdoc-style.css
53
+ - doc/classes/Classifier
54
+ - doc/classes/Classifier.html
55
+ - doc/classes/String.html
56
+ - doc/classes/Classifier/Bayes.html
57
+ - doc/classes/Classifier/Bayes.src
58
+ - doc/classes/Classifier/Stemmable.html
59
+ - doc/classes/Classifier/Stemmable.src
60
+ - doc/classes/Classifier/WordHash.html
61
+ - doc/classes/Classifier/WordHash.src
62
+ - doc/classes/Classifier/Bayes.src/M000005.html
63
+ - doc/classes/Classifier/Bayes.src/M000006.html
64
+ - doc/classes/Classifier/Bayes.src/M000007.html
65
+ - doc/classes/Classifier/Bayes.src/M000008.html
66
+ - doc/classes/Classifier/Bayes.src/M000009.html
67
+ - doc/classes/Classifier/Bayes.src/M000010.html
68
+ - doc/classes/Classifier/Stemmable.src/M000003.html
69
+ - doc/classes/Classifier/WordHash.src/M000001.html
70
+ - doc/classes/Classifier/WordHash.src/M000002.html
71
+ - doc/files/lib
72
+ - doc/files/README.html
73
+ - doc/files/lib/classifier
74
+ - doc/files/lib/classifier_rb.html
75
+ - doc/files/lib/classifier/bayes_rb.html
76
+ - doc/files/lib/classifier/string_extensions
77
+ - doc/files/lib/classifier/string_extensions_rb.html
78
+ - doc/files/lib/classifier/string_extensions/porter_stemmer_rb.html
79
+ - doc/files/lib/classifier/string_extensions/word_hash_rb.html
45
80
  test_files: []
46
81
  rdoc_options: []
47
82
  extra_rdoc_files: []