reclassifier 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -1,7 +1,4 @@
1
1
  require "bundler/gem_tasks"
2
- require 'rake/testtask'
2
+ require 'rspec/core/rake_task'
3
3
 
4
- Rake::TestTask.new do |t|
5
- t.libs << 'test'
6
- t.test_files = FileList['test/**/*_test.rb']
7
- end
4
+ RSpec::Core::RakeTask.new(:spec)
@@ -1,129 +1,137 @@
1
+ #
2
+ # Bayesian classifier for arbitrary text.
3
+ #
4
+ # Implementation is translated from
5
+ # Introduction to Information Retrieval by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
6
+ # Cambridge University Press. 2008, ISBN 0521865719.
7
+ #
1
8
  module Reclassifier
2
9
  class Bayes
3
- # The class can be created with one or more categories, each of which will be
4
- # initialized and given a training method. E.g.,
5
- # b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
6
- def initialize(*categories)
7
- @categories = Hash.new
8
- categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
9
- @total_words = 0
10
- @category_counts = Hash.new(0)
10
+ # Can be created with zero or more classifications, each of which will be
11
+ # initialized and given a training method. The classifications are specified as
12
+ # symbols. E.g.,
13
+ # b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
14
+ def initialize(*classifications)
15
+ @classifications = {}
16
+ classifications.each {|classification| @classifications[classification] = {}}
17
+
18
+ @docs_in_classification_count = {}
11
19
  end
12
20
 
13
21
  #
14
- # Provides a general training method for all categories specified in Bayes#new
22
+ # Provides a general training method for all classifications specified in Bayes#new
15
23
  # For example:
16
- # b = Classifier::Bayes.new 'This', 'That', 'the_other'
24
+ # b = Reclassifier::Bayes.new :this, :that
17
25
  # b.train :this, "This text"
18
- # b.train "that", "That text"
19
- # b.train "The other", "The other text"
20
- def train(category, text)
21
- category = category.prepare_category_name
22
- @category_counts[category] += 1
26
+ # b.train :that, "That text"
27
+ def train(classification, text)
28
+ ensure_classification_exists(classification)
29
+
30
+ @docs_in_classification_count[classification] ||= 0
31
+ @docs_in_classification_count[classification] += 1
32
+
23
33
  text.word_hash.each do |word, count|
24
- @categories[category][word] ||= 0
25
- @categories[category][word] += count
26
- @total_words += count
34
+ @classifications[classification][word] ||= 0
35
+
36
+ @classifications[classification][word] += count
27
37
  end
28
38
  end
29
39
 
30
40
  #
31
- # Provides a untraining method for all categories specified in Bayes#new
41
+ # Untrain a (classification, text) pair.
32
42
  # Be very careful with this method.
33
43
  #
34
44
  # For example:
35
- # b = Classifier::Bayes.new 'This', 'That', 'the_other'
45
+ # b = Reclassifier::Bayes.new :this, :that, :the_other
36
46
  # b.train :this, "This text"
37
47
  # b.untrain :this, "This text"
38
- def untrain(category, text)
39
- category = category.prepare_category_name
40
- @category_counts[category] -= 1
48
+ def untrain(classification, text)
49
+ ensure_classification_exists(classification)
50
+
51
+ @docs_in_classification_count[classification] -= 1
52
+
41
53
  text.word_hash.each do |word, count|
42
- if @total_words >= 0
43
- orig = @categories[category][word]
44
- @categories[category][word] ||= 0
45
- @categories[category][word] -= count
46
- if @categories[category][word] <= 0
47
- @categories[category].delete(word)
48
- count = orig
49
- end
50
- @total_words -= count
51
- end
54
+ @classifications[classification][word] -= count if @classifications[classification].include?(word)
52
55
  end
53
56
  end
54
57
 
55
58
  #
56
- # Returns the scores in each category the provided +text+. E.g.,
59
+ # Returns the scores of the specified text for each classification. E.g.,
57
60
  # b.classifications "I hate bad words and you"
58
61
  # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
59
62
  # The largest of these scores (the one closest to 0) is the one picked out by #classify
60
- def classifications(text)
61
- score = Hash.new
62
- training_count = @category_counts.values.inject { |x,y| x+y }.to_f
63
- @categories.each do |category, category_words|
64
- score[category.to_s] = 0
65
- total = category_words.values.inject(0) {|sum, element| sum+element}
63
+ def calculate_scores(text)
64
+ scores = {}
65
+
66
+ @classifications.each do |classification, classification_word_counts|
67
+ # prior
68
+ scores[classification] = Math.log(@docs_in_classification_count[classification])
69
+ scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
70
+
71
+ # likelihood
66
72
  text.word_hash.each do |word, count|
67
- s = category_words.has_key?(word) ? category_words[word] : 0.1
68
- score[category.to_s] += Math.log(s/total.to_f)
73
+ if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
74
+ scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
75
+
76
+ scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
77
+ end
69
78
  end
70
- # now add prior probability for the category
71
- s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
72
- score[category.to_s] += Math.log(s / training_count)
73
79
  end
74
- return score
80
+
81
+ scores
75
82
  end
76
83
 
77
84
  #
78
- # Returns the classification of the provided +text+, which is one of the
79
- # categories given in the initializer. E.g.,
85
+ # Returns the classification of the specified text, which is one of the
86
+ # classifications given in the initializer. E.g.,
80
87
  # b.classify "I hate bad words and you"
81
- # => 'Uninteresting'
88
+ # => :uninteresting
82
89
  def classify(text)
83
- (classifications(text).sort_by { |a| -a[1] })[0][0]
90
+ calculate_scores(text).max_by {|classification| classification[1]}[0]
84
91
  end
85
92
 
86
93
  #
87
- # Provides training and untraining methods for the categories specified in Bayes#new
94
+ # Provides a list of classification names
88
95
  # For example:
89
- # b = Classifier::Bayes.new 'This', 'That', 'the_other'
90
- # b.train_this "This text"
91
- # b.train_that "That text"
92
- # b.untrain_that "That text"
93
- # b.train_the_other "The other text"
94
- def method_missing(name, *args)
95
- category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
96
- if @categories.has_key? category
97
- args.each { |text| eval("#{$1}train(category, text)") }
98
- elsif name.to_s =~ /(un)?train_([\w]+)/
99
- raise StandardError, "No such category: #{category}"
100
- else
101
- super #raise StandardError, "No such method: #{name}"
102
- end
96
+ # b.classifications
97
+ # => [:this, :that, :the_other]
98
+ def classifications
99
+ @classifications.keys
103
100
  end
104
101
 
105
102
  #
106
- # Provides a list of category names
103
+ # Adds the classification to the classifier.
104
+ # Has no effect if the classification already existed.
105
+ # Returns the classification.
107
106
  # For example:
108
- # b.categories
109
- # => ['This', 'That', 'the_other']
110
- def categories # :nodoc:
111
- @categories.keys.collect {|c| c.to_s}
107
+ # b.add_classification(:not_spam)
108
+ def add_classification(classification)
109
+ @classifications[classification] ||= {}
110
+
111
+ classification
112
112
  end
113
113
 
114
114
  #
115
- # Allows you to add categories to the classifier.
115
+ # Removes the classification from the classifier.
116
+ # Returns the classification if it existed, else nil.
116
117
  # For example:
117
- # b.add_category "Not spam"
118
- #
119
- # WARNING: Adding categories to a trained classifier will
120
- # result in an undertrained category that will tend to match
121
- # more criteria than the trained selective categories. In short,
122
- # try to initialize your categories at initialization.
123
- def add_category(category)
124
- @categories[category.prepare_category_name] = Hash.new
118
+ # b.remove_classification(:not_spam)
119
+ def remove_classification(classification)
120
+ return_value = if @classifications.include?(classification)
121
+ classification
122
+ else
123
+ nil
124
+ end
125
+
126
+ @classifications.delete(classification)
127
+
128
+ return_value
125
129
  end
126
130
 
127
- alias append_category add_category
131
+ private
132
+
133
+ def ensure_classification_exists(classification)
134
+ raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
135
+ end
128
136
  end
129
137
  end
@@ -0,0 +1,2 @@
1
+ class Reclassifier::UnknownClassificationError < StandardError
2
+ end
@@ -1,3 +1,3 @@
1
1
  module Reclassifier
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
data/lib/reclassifier.rb CHANGED
@@ -7,13 +7,13 @@ require 'gsl'
7
7
  require 'reclassifier/version'
8
8
  require 'reclassifier/core_ext/array'
9
9
  require 'reclassifier/core_ext/matrix'
10
- require 'reclassifier/core_ext/object'
11
10
  require 'reclassifier/core_ext/string'
12
11
  require 'gsl/vector'
13
12
 
14
13
  module Reclassifier
15
- autoload :Bayes, 'reclassifier/bayes'
16
- autoload :LSI, 'reclassifier/lsi'
17
- autoload :ContentNode, 'reclassifier/content_node'
18
- autoload :WordList, 'reclassifier/word_list'
14
+ autoload :Bayes, 'reclassifier/bayes'
15
+ autoload :LSI, 'reclassifier/lsi'
16
+ autoload :ContentNode, 'reclassifier/content_node'
17
+ autoload :WordList, 'reclassifier/word_list'
18
+ autoload :UnknownClassificationError, 'reclassifier/unknown_classification_error'
19
19
  end
data/reclassifier.gemspec CHANGED
@@ -20,7 +20,7 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency 'bundler', '~> 1.3'
22
22
  spec.add_development_dependency 'rake'
23
- spec.add_development_dependency 'test-unit'
23
+ spec.add_development_dependency 'rspec'
24
24
 
25
25
  spec.add_dependency 'fast-stemmer'
26
26
  spec.add_dependency 'gsl'
@@ -0,0 +1,97 @@
1
+ require 'spec_helper'
2
+
3
+ describe Reclassifier::Bayes do
4
+ describe "classifications" do
5
+ it "should return the classifications" do
6
+ subject = described_class.new(:interesting, :uninteresting)
7
+
8
+ subject.classifications.sort.should eq([:interesting, :uninteresting])
9
+ end
10
+ end
11
+
12
+ describe "train" do
13
+ it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
14
+ expect {subject.train(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
15
+ end
16
+
17
+ it "should train the classifier to the (classification, document) pair" do
18
+ subject = described_class.new(:in_china, :not_in_china)
19
+
20
+ subject.train(:in_china, 'Chinese Beijing Chinese')
21
+ subject.train(:in_china, 'Chinese Chinese Shanghai')
22
+ subject.train(:in_china, 'Chinese Macao')
23
+ subject.train(:not_in_china, 'Tokyo Japan Chinese')
24
+
25
+ subject.classify('Chinese Chinese Chinese Tokyo Japan').should eq(:in_china)
26
+ end
27
+ end
28
+
29
+ describe "untrain" do
30
+ it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
31
+ expect {subject.untrain(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
32
+ end
33
+
34
+ it "should untrain the classifier against the (classification, document) pair" do
35
+ subject = described_class.new(:in_china, :not_in_china)
36
+
37
+ subject.train(:in_china, 'Chinese Chinese')
38
+ subject.train(:not_in_china, 'Chinese Macao')
39
+
40
+ subject.classify('Chinese').should eq(:in_china)
41
+
42
+ subject.untrain(:in_china, 'Chinese Chinese')
43
+
44
+ subject.classify('Chinese').should eq(:not_in_china)
45
+ end
46
+ end
47
+
48
+ describe "calculate_scores" do
49
+ it "should return a score hash with the correct scores" do
50
+ subject = described_class.new(:in_china, :not_in_china)
51
+
52
+ subject.train(:in_china, 'Chinese Beijing Chinese')
53
+ subject.train(:in_china, 'Chinese Chinese Shanghai')
54
+ subject.train(:in_china, 'Chinese Macao')
55
+ subject.train(:not_in_china, 'Tokyo Japan Chinese')
56
+
57
+ scores = subject.calculate_scores('Chinese Chinese Chinese Tokyo Japan')
58
+
59
+ scores[:in_china].should eq(-8.107690312843907)
60
+ scores[:not_in_china].should eq(-8.906681345001262)
61
+ end
62
+ end
63
+
64
+ describe "add_classification" do
65
+ it "should add the classification to the set of classifications" do
66
+ subject.classifications.should be_empty
67
+
68
+ subject.add_classification(:niner)
69
+
70
+ subject.classifications.should eq([:niner])
71
+ end
72
+
73
+ it "should return the classification" do
74
+ subject.add_classification(:niner).should eq(:niner)
75
+ end
76
+ end
77
+
78
+ describe "remove_classification" do
79
+ it "should remove the classification from the set of classifications" do
80
+ subject.add_classification(:niner)
81
+
82
+ subject.remove_classification(:niner)
83
+
84
+ subject.classifications.should be_empty
85
+ end
86
+
87
+ it "should return the classification" do
88
+ subject.add_classification(:niner)
89
+
90
+ subject.remove_classification(:niner).should eq(:niner)
91
+ end
92
+
93
+ it "should return nil if the classification didn't exist" do
94
+ subject.remove_classification(:niner).should be(nil)
95
+ end
96
+ end
97
+ end
@@ -0,0 +1,13 @@
1
+ require 'spec_helper'
2
+
3
+ describe Array do
4
+ describe "sum_with_identity" do
5
+ it "should sum the array" do
6
+ [1,2,3].sum_with_identity.should eq(6)
7
+ end
8
+
9
+ it "should return 0 when it encounters an empty array" do
10
+ [].sum_with_identity.should eq(0)
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,19 @@
1
+ require 'spec_helper'
2
+
3
+ describe String do
4
+ describe "word_hash" do
5
+ it "should hash text" do
6
+ hash = {:good => 1, :"!" => 1, :hope => 1, :"'" => 1, :"." => 1, :love => 1, :word => 1, :them => 1, :test => 1}
7
+
8
+ "here are some good words of test's. I hope you love them!".word_hash.should eq(hash)
9
+ end
10
+ end
11
+
12
+ describe "clean_word_hash" do
13
+ it "should clean and hash text" do
14
+ hash = {:good => 1, :word => 1, :hope => 1, :love => 1, :them => 1, :test => 1}
15
+
16
+ "here are some good words of test's. I hope you love them!".clean_word_hash.should eq(hash)
17
+ end
18
+ end
19
+ end
data/spec/lsi_spec.rb ADDED
@@ -0,0 +1,123 @@
1
+ require 'spec_helper'
2
+
3
+ describe Reclassifier::LSI do
4
+ before do
5
+ # we repeat principal words to help weight them.
6
+ # This test is rather delicate, since this system is mostly noise.
7
+ @str1 = "This text deals with dogs. Dogs."
8
+ @str2 = "This text involves dogs too. Dogs! "
9
+ @str3 = "This text revolves around cats. Cats."
10
+ @str4 = "This text also involves cats. Cats!"
11
+ @str5 = "This text involves birds. Birds."
12
+ end
13
+
14
+ it "should do basic indexing" do
15
+ [@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
16
+ subject.needs_rebuild?.should be(false)
17
+
18
+ # note that the closest match to str1 is str2, even though it is not
19
+ # the closest text match.
20
+ subject.find_related(@str1, 3).should eq([@str2, @str5, @str3])
21
+ end
22
+
23
+ it "should not auto rebuild when it's specified as false" do
24
+ subject = described_class.new(:auto_rebuild => false)
25
+
26
+ subject.add_item @str1, "Dog"
27
+ subject.add_item @str2, "Dog"
28
+
29
+ subject.needs_rebuild?.should be(true)
30
+
31
+ subject.build_index
32
+
33
+ subject.needs_rebuild?.should be(false)
34
+ end
35
+
36
+ it "should do basic classifying" do
37
+ subject.add_item(@str2, "Dog")
38
+ subject.add_item(@str3, "Cat")
39
+ subject.add_item(@str4, "Cat")
40
+ subject.add_item(@str5, "Bird")
41
+
42
+ subject.classify(@str1).should eq("Dog")
43
+ subject.classify(@str3).should eq("Cat")
44
+ subject.classify(@str5).should eq("Bird")
45
+ end
46
+
47
+ it "should perform better than Bayes" do
48
+ bayes = Reclassifier::Bayes.new :dog, :cat, :bird
49
+
50
+ [[@str1, "Dog"],
51
+ [@str2, "Dog"],
52
+ [@str3, "Cat"],
53
+ [@str4, "Cat"],
54
+ [@str5, "Bird"]].each do |str, classification|
55
+ subject.add_item(str, classification)
56
+
57
+ bayes.train(classification.downcase.to_sym, str)
58
+ end
59
+
60
+ # We're talking about dogs, even though the text matches the corpus on
61
+ # cats better. Dogs have more semantic weight than cats, so the LSI
62
+ # recognizes the content; Bayes is asserted to pick :dog here as well.
63
+ tricky_case = "This text revolves around dogs."
64
+ subject.classify(tricky_case).should eq("Dog")
65
+ bayes.classify(tricky_case).should eq(:dog)
66
+ end
67
+
68
+ it "should recategorize as needed" do
69
+ subject.add_item(@str1, "Dog")
70
+ subject.add_item(@str2, "Dog")
71
+ subject.add_item(@str3, "Cat")
72
+ subject.add_item(@str4, "Cat")
73
+ subject.add_item(@str5, "Bird")
74
+
75
+ tricky_case = "This text revolves around dogs."
76
+ subject.classify(tricky_case).should eq("Dog")
77
+
78
+ # Recategorize as needed.
79
+ subject.categories_for(@str1).clear.push("Cow")
80
+ subject.categories_for(@str2).clear.push("Cow")
81
+
82
+ subject.needs_rebuild?.should be(false)
83
+ subject.classify(tricky_case).should eq("Cow")
84
+ end
85
+
86
+ it "should search correctly" do
87
+ [@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
88
+
89
+ # Searching by content and text, note that @str2 comes up first, because
90
+ # both "dog" and "involve" are present. But, the next match is @str1 instead
91
+ # of @str4, because "dog" carries more weight than involves.
92
+ subject.search("dog involves", 100).should eq([@str2, @str1, @str4, @str5, @str3])
93
+
94
+ # Keyword search shows how the space is mapped out in relation to
95
+ # dog when magnitude is removed. Note the relations. We move from dog
96
+ # through involve and then finally to other words.
97
+ subject.search("dog", 5).should eq([@str1, @str2, @str4, @str5, @str3])
98
+ end
99
+
100
+ it "should serialize correctly" do
101
+ [@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
102
+
103
+ subject_md = Marshal.dump(subject)
104
+ subject_m = Marshal.load(subject_md)
105
+
106
+ subject_m.search("cat", 3).should eq(subject.search("cat", 3))
107
+ subject_m.find_related(@str1, 3).should eq(subject.find_related(@str1, 3))
108
+ end
109
+
110
+ it "should keyword search correctly" do
111
+ subject.add_item(@str1, "Dog")
112
+ subject.add_item(@str2, "Dog")
113
+ subject.add_item(@str3, "Cat")
114
+ subject.add_item(@str4, "Cat")
115
+ subject.add_item(@str5, "Bird")
116
+
117
+ subject.highest_ranked_stems(@str1).should eq([:dog, :text, :deal])
118
+ end
119
+
120
+ it "should summarize correctly" do
121
+ [@str1, @str2, @str3, @str4, @str5].join.summary(2).should eq("This text involves dogs too [...] This text also involves cats")
122
+ end
123
+ end
@@ -0,0 +1,5 @@
1
+ require File.join(Dir.pwd, 'lib', 'reclassifier.rb')
2
+
3
+ RSpec.configure do |config|
4
+ config.color = true
5
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: reclassifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-18 00:00:00.000000000 Z
12
+ date: 2013-04-22 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -44,7 +44,7 @@ dependencies:
44
44
  - !ruby/object:Gem::Version
45
45
  version: '0'
46
46
  - !ruby/object:Gem::Dependency
47
- name: test-unit
47
+ name: rspec
48
48
  requirement: !ruby/object:Gem::Requirement
49
49
  none: false
50
50
  requirements:
@@ -109,18 +109,18 @@ files:
109
109
  - lib/reclassifier/content_node.rb
110
110
  - lib/reclassifier/core_ext/array.rb
111
111
  - lib/reclassifier/core_ext/matrix.rb
112
- - lib/reclassifier/core_ext/object.rb
113
112
  - lib/reclassifier/core_ext/string.rb
114
113
  - lib/reclassifier/core_ext/vector.rb
115
114
  - lib/reclassifier/lsi.rb
115
+ - lib/reclassifier/unknown_classification_error.rb
116
116
  - lib/reclassifier/version.rb
117
117
  - lib/reclassifier/word_list.rb
118
118
  - reclassifier.gemspec
119
- - test/bayes_test.rb
120
- - test/core_ext/array_test.rb
121
- - test/core_ext/string_test.rb
122
- - test/lsi_test.rb
123
- - test/test_helper.rb
119
+ - spec/bayes_spec.rb
120
+ - spec/core_ext/array_spec.rb
121
+ - spec/core_ext/string_spec.rb
122
+ - spec/lsi_spec.rb
123
+ - spec/spec_helper.rb
124
124
  homepage: https://github.com/saveup/reclassifier
125
125
  licenses:
126
126
  - LGPL
@@ -147,8 +147,8 @@ signing_key:
147
147
  specification_version: 3
148
148
  summary: Bayesian and Latent Semantic Indexing classification of text.
149
149
  test_files:
150
- - test/bayes_test.rb
151
- - test/core_ext/array_test.rb
152
- - test/core_ext/string_test.rb
153
- - test/lsi_test.rb
154
- - test/test_helper.rb
150
+ - spec/bayes_spec.rb
151
+ - spec/core_ext/array_spec.rb
152
+ - spec/core_ext/string_spec.rb
153
+ - spec/lsi_spec.rb
154
+ - spec/spec_helper.rb
@@ -1,3 +0,0 @@
1
- class Object
2
- def prepare_category_name; to_s.gsub("_"," ").capitalize.intern end
3
- end
data/test/bayes_test.rb DELETED
@@ -1,34 +0,0 @@
1
- require File.join(File.dirname(__FILE__), 'test_helper')
2
-
3
- class BayesTest < Test::Unit::TestCase
4
- def setup
5
- @classifier = Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
6
- end
7
-
8
- def test_good_training
9
- assert_nothing_raised { @classifier.train_interesting "love" }
10
- end
11
-
12
- def test_bad_training
13
- assert_raise(StandardError) { @classifier.train_no_category "words" }
14
- end
15
-
16
- def test_bad_method
17
- assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
18
- end
19
-
20
- def test_categories
21
- assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
22
- end
23
-
24
- def test_add_category
25
- @classifier.add_category 'Test'
26
- assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
27
- end
28
-
29
- def test_classification
30
- @classifier.train_interesting "here are some good words. I hope you love them"
31
- @classifier.train_uninteresting "here are some bad words, I hate you"
32
- assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
33
- end
34
- end
@@ -1,15 +0,0 @@
1
- require File.join(File.dirname(__FILE__), '..', 'test_helper')
2
-
3
- class ArrayTest < Test::Unit::TestCase
4
- def test_monkey_path_array_sum
5
- assert_equal [1,2,3].sum_with_identity, 6
6
- end
7
-
8
- def test_summing_an_empty_array
9
- assert_equal [nil].sum_with_identity, 0
10
- end
11
-
12
- def test_summing_an_empty_array
13
- assert_equal Array[].sum_with_identity, 0
14
- end
15
- end
@@ -1,13 +0,0 @@
1
- require File.join(File.dirname(__FILE__), '..', 'test_helper')
2
-
3
- class StringTest < Test::Unit::TestCase
4
- def test_word_hash
5
- hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
6
- assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
7
- end
8
-
9
- def test_clean_word_hash
10
- hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
11
- assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
12
- end
13
- end
data/test/lsi_test.rb DELETED
@@ -1,123 +0,0 @@
1
- require File.join(File.dirname(__FILE__), 'test_helper')
2
-
3
- class LSITest < Test::Unit::TestCase
4
- def setup
5
- # we repeat principle words to help weight them.
6
- # This test is rather delicate, since this system is mostly noise.
7
- @str1 = "This text deals with dogs. Dogs."
8
- @str2 = "This text involves dogs too. Dogs! "
9
- @str3 = "This text revolves around cats. Cats."
10
- @str4 = "This text also involves cats. Cats!"
11
- @str5 = "This text involves birds. Birds."
12
- end
13
-
14
- def test_basic_indexing
15
- lsi = Reclassifier::LSI.new
16
- [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
17
- assert ! lsi.needs_rebuild?
18
-
19
- # note that the closest match to str1 is str2, even though it is not
20
- # the closest text match.
21
- assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
22
- end
23
-
24
- def test_not_auto_rebuild
25
- lsi = Reclassifier::LSI.new :auto_rebuild => false
26
- lsi.add_item @str1, "Dog"
27
- lsi.add_item @str2, "Dog"
28
- assert lsi.needs_rebuild?
29
- lsi.build_index
30
- assert ! lsi.needs_rebuild?
31
- end
32
-
33
- def test_basic_categorizing
34
- lsi = Reclassifier::LSI.new
35
- lsi.add_item @str2, "Dog"
36
- lsi.add_item @str3, "Cat"
37
- lsi.add_item @str4, "Cat"
38
- lsi.add_item @str5, "Bird"
39
-
40
- assert_equal "Dog", lsi.classify( @str1 )
41
- assert_equal "Cat", lsi.classify( @str3 )
42
- assert_equal "Bird", lsi.classify( @str5 )
43
- end
44
-
45
- def test_external_classifying
46
- lsi = Reclassifier::LSI.new
47
- bayes = Reclassifier::Bayes.new 'Dog', 'Cat', 'Bird'
48
- lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
49
- lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
50
- lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
51
- lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
52
- lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
53
-
54
- # We're talking about dogs. Even though the text matches the corpus on
55
- # cats better. Dogs have more semantic weight than cats. So bayes
56
- # will fail here, but the LSI recognizes content.
57
- tricky_case = "This text revolves around dogs."
58
- assert_equal "Dog", lsi.classify( tricky_case )
59
- assert_not_equal "Dog", bayes.classify( tricky_case )
60
- end
61
-
62
- def test_recategorize_interface
63
- lsi = Reclassifier::LSI.new
64
- lsi.add_item @str1, "Dog"
65
- lsi.add_item @str2, "Dog"
66
- lsi.add_item @str3, "Cat"
67
- lsi.add_item @str4, "Cat"
68
- lsi.add_item @str5, "Bird"
69
-
70
- tricky_case = "This text revolves around dogs."
71
- assert_equal "Dog", lsi.classify( tricky_case )
72
-
73
- # Recategorize as needed.
74
- lsi.categories_for(@str1).clear.push "Cow"
75
- lsi.categories_for(@str2).clear.push "Cow"
76
-
77
- assert !lsi.needs_rebuild?
78
- assert_equal "Cow", lsi.classify( tricky_case )
79
- end
80
-
81
- def test_search
82
- lsi = Reclassifier::LSI.new
83
- [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
84
-
85
- # Searching by content and text, note that @str2 comes up first, because
86
- # both "dog" and "involve" are present. But, the next match is @str1 instead
87
- # of @str4, because "dog" carries more weight than involves.
88
- assert_equal( [@str2, @str1, @str4, @str5, @str3],
89
- lsi.search("dog involves", 100) )
90
-
91
- # Keyword search shows how the space is mapped out in relation to
92
- # dog when magnitude is remove. Note the relations. We move from dog
93
- # through involve and then finally to other words.
94
- assert_equal( [@str1, @str2, @str4, @str5, @str3],
95
- lsi.search("dog", 5) )
96
- end
97
-
98
- def test_serialize_safe
99
- lsi = Reclassifier::LSI.new
100
- [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
101
-
102
- lsi_md = Marshal.dump lsi
103
- lsi_m = Marshal.load lsi_md
104
-
105
- assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
106
- assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
107
- end
108
-
109
- def test_keyword_search
110
- lsi = Reclassifier::LSI.new
111
- lsi.add_item @str1, "Dog"
112
- lsi.add_item @str2, "Dog"
113
- lsi.add_item @str3, "Cat"
114
- lsi.add_item @str4, "Cat"
115
- lsi.add_item @str5, "Bird"
116
-
117
- assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
118
- end
119
-
120
- def test_summary
121
- assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
122
- end
123
- end
data/test/test_helper.rb DELETED
@@ -1,4 +0,0 @@
1
- $:.unshift(File.dirname(__FILE__) + '/../lib')
2
-
3
- require 'test/unit'
4
- require 'reclassifier'