reclassifier 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -1,7 +1,4 @@
1
1
  require "bundler/gem_tasks"
2
- require 'rake/testtask'
2
+ require 'rspec/core/rake_task'
3
3
 
4
- Rake::TestTask.new do |t|
5
- t.libs << 'test'
6
- t.test_files = FileList['test/**/*_test.rb']
7
- end
4
+ RSpec::Core::RakeTask.new(:spec)
@@ -1,129 +1,137 @@
1
+ #
2
+ # Bayesian classifier for arbitrary text.
3
+ #
4
+ # Implementation is translated from
5
+ # Introduction to Information Retrieval by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
6
+ # Cambridge University Press. 2008, ISBN 0521865719.
7
+ #
1
8
  module Reclassifier
2
9
  class Bayes
3
- # The class can be created with one or more categories, each of which will be
4
- # initialized and given a training method. E.g.,
5
- # b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
6
- def initialize(*categories)
7
- @categories = Hash.new
8
- categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
9
- @total_words = 0
10
- @category_counts = Hash.new(0)
10
+ # Can be created with zero or more classifications, each of which will be
11
+ # initialized and given a training method. The classifications are specified as
12
+ # symbols. E.g.,
13
+ # b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
14
+ def initialize(*classifications)
15
+ @classifications = {}
16
+ classifications.each {|classification| @classifications[classification] = {}}
17
+
18
+ @docs_in_classification_count = {}
11
19
  end
12
20
 
13
21
  #
14
- # Provides a general training method for all categories specified in Bayes#new
22
+ # Provides a general training method for all classifications specified in Bayes#new
15
23
  # For example:
16
- # b = Classifier::Bayes.new 'This', 'That', 'the_other'
24
+ # b = Reclassifier::Bayes.new :this, :that
17
25
  # b.train :this, "This text"
18
- # b.train "that", "That text"
19
- # b.train "The other", "The other text"
20
- def train(category, text)
21
- category = category.prepare_category_name
22
- @category_counts[category] += 1
26
+ # b.train :that, "That text"
27
+ def train(classification, text)
28
+ ensure_classification_exists(classification)
29
+
30
+ @docs_in_classification_count[classification] ||= 0
31
+ @docs_in_classification_count[classification] += 1
32
+
23
33
  text.word_hash.each do |word, count|
24
- @categories[category][word] ||= 0
25
- @categories[category][word] += count
26
- @total_words += count
34
+ @classifications[classification][word] ||= 0
35
+
36
+ @classifications[classification][word] += count
27
37
  end
28
38
  end
29
39
 
30
40
  #
31
- # Provides a untraining method for all categories specified in Bayes#new
41
+ # Untrain a (classification, text) pair.
32
42
  # Be very careful with this method.
33
43
  #
34
44
  # For example:
35
- # b = Classifier::Bayes.new 'This', 'That', 'the_other'
45
+ # b = Reclassifier::Bayes.new :this, :that, :the_other
36
46
  # b.train :this, "This text"
37
47
  # b.untrain :this, "This text"
38
- def untrain(category, text)
39
- category = category.prepare_category_name
40
- @category_counts[category] -= 1
48
+ def untrain(classification, text)
49
+ ensure_classification_exists(classification)
50
+
51
+ @docs_in_classification_count[classification] -= 1
52
+
41
53
  text.word_hash.each do |word, count|
42
- if @total_words >= 0
43
- orig = @categories[category][word]
44
- @categories[category][word] ||= 0
45
- @categories[category][word] -= count
46
- if @categories[category][word] <= 0
47
- @categories[category].delete(word)
48
- count = orig
49
- end
50
- @total_words -= count
51
- end
54
+ @classifications[classification][word] -= count if @classifications[classification].include?(word)
52
55
  end
53
56
  end
54
57
 
55
58
  #
56
- # Returns the scores in each category the provided +text+. E.g.,
59
+ # Returns the scores of the specified text for each classification. E.g.,
57
60
  # b.calculate_scores "I hate bad words and you"
58
61
  # => {:uninteresting=>-12.6997928013932, :interesting=>-18.4206807439524}
59
62
  # The largest of these scores (the one closest to 0) is the one picked out by #classify
60
- def classifications(text)
61
- score = Hash.new
62
- training_count = @category_counts.values.inject { |x,y| x+y }.to_f
63
- @categories.each do |category, category_words|
64
- score[category.to_s] = 0
65
- total = category_words.values.inject(0) {|sum, element| sum+element}
63
+ def calculate_scores(text)
64
+ scores = {}
65
+
66
+ @classifications.each do |classification, classification_word_counts|
67
+ # prior
68
+ scores[classification] = Math.log(@docs_in_classification_count[classification])
69
+ scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
70
+
71
+ # likelihood
66
72
  text.word_hash.each do |word, count|
67
- s = category_words.has_key?(word) ? category_words[word] : 0.1
68
- score[category.to_s] += Math.log(s/total.to_f)
73
+ if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
74
+ scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
75
+
76
+ scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
77
+ end
69
78
  end
70
- # now add prior probability for the category
71
- s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
72
- score[category.to_s] += Math.log(s / training_count)
73
79
  end
74
- return score
80
+
81
+ scores
75
82
  end
76
83
 
77
84
  #
78
- # Returns the classification of the provided +text+, which is one of the
79
- # categories given in the initializer. E.g.,
85
+ # Returns the classification of the specified text, which is one of the
86
+ # classifications given in the initializer. E.g.,
80
87
  # b.classify "I hate bad words and you"
81
- # => 'Uninteresting'
88
+ # => :uninteresting
82
89
  def classify(text)
83
- (classifications(text).sort_by { |a| -a[1] })[0][0]
90
+ calculate_scores(text).max_by {|classification| classification[1]}[0]
84
91
  end
85
92
 
86
93
  #
87
- # Provides training and untraining methods for the categories specified in Bayes#new
94
+ # Provides a list of classification names
88
95
  # For example:
89
- # b = Classifier::Bayes.new 'This', 'That', 'the_other'
90
- # b.train_this "This text"
91
- # b.train_that "That text"
92
- # b.untrain_that "That text"
93
- # b.train_the_other "The other text"
94
- def method_missing(name, *args)
95
- category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
96
- if @categories.has_key? category
97
- args.each { |text| eval("#{$1}train(category, text)") }
98
- elsif name.to_s =~ /(un)?train_([\w]+)/
99
- raise StandardError, "No such category: #{category}"
100
- else
101
- super #raise StandardError, "No such method: #{name}"
102
- end
96
+ # b.classifications
97
+ # => [:this, :that, :the_other]
98
+ def classifications
99
+ @classifications.keys
103
100
  end
104
101
 
105
102
  #
106
- # Provides a list of category names
103
+ # Adds the classification to the classifier.
104
+ # Has no effect if the classification already existed.
105
+ # Returns the classification.
107
106
  # For example:
108
- # b.categories
109
- # => ['This', 'That', 'the_other']
110
- def categories # :nodoc:
111
- @categories.keys.collect {|c| c.to_s}
107
+ # b.add_classification(:not_spam)
108
+ def add_classification(classification)
109
+ @classifications[classification] ||= {}
110
+
111
+ classification
112
112
  end
113
113
 
114
114
  #
115
- # Allows you to add categories to the classifier.
115
+ # Removes the classification from the classifier.
116
+ # Returns the classification if it existed, else nil.
116
117
  # For example:
117
- # b.add_category "Not spam"
118
- #
119
- # WARNING: Adding categories to a trained classifier will
120
- # result in an undertrained category that will tend to match
121
- # more criteria than the trained selective categories. In short,
122
- # try to initialize your categories at initialization.
123
- def add_category(category)
124
- @categories[category.prepare_category_name] = Hash.new
118
+ # b.remove_classification(:not_spam)
119
+ def remove_classification(classification)
120
+ return_value = if @classifications.include?(classification)
121
+ classification
122
+ else
123
+ nil
124
+ end
125
+
126
+ @classifications.delete(classification)
127
+
128
+ return_value
125
129
  end
126
130
 
127
- alias append_category add_category
131
+ private
132
+
133
+ def ensure_classification_exists(classification)
134
+ raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
135
+ end
128
136
  end
129
137
  end
@@ -0,0 +1,2 @@
1
+ class Reclassifier::UnknownClassificationError < StandardError
2
+ end
@@ -1,3 +1,3 @@
1
1
  module Reclassifier
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
data/lib/reclassifier.rb CHANGED
@@ -7,13 +7,13 @@ require 'gsl'
7
7
  require 'reclassifier/version'
8
8
  require 'reclassifier/core_ext/array'
9
9
  require 'reclassifier/core_ext/matrix'
10
- require 'reclassifier/core_ext/object'
11
10
  require 'reclassifier/core_ext/string'
12
11
  require 'gsl/vector'
13
12
 
14
13
  module Reclassifier
15
- autoload :Bayes, 'reclassifier/bayes'
16
- autoload :LSI, 'reclassifier/lsi'
17
- autoload :ContentNode, 'reclassifier/content_node'
18
- autoload :WordList, 'reclassifier/word_list'
14
+ autoload :Bayes, 'reclassifier/bayes'
15
+ autoload :LSI, 'reclassifier/lsi'
16
+ autoload :ContentNode, 'reclassifier/content_node'
17
+ autoload :WordList, 'reclassifier/word_list'
18
+ autoload :UnknownClassificationError, 'reclassifier/unknown_classification_error'
19
19
  end
data/reclassifier.gemspec CHANGED
@@ -20,7 +20,7 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency 'bundler', '~> 1.3'
22
22
  spec.add_development_dependency 'rake'
23
- spec.add_development_dependency 'test-unit'
23
+ spec.add_development_dependency 'rspec'
24
24
 
25
25
  spec.add_dependency 'fast-stemmer'
26
26
  spec.add_dependency 'gsl'
@@ -0,0 +1,97 @@
1
+ require 'spec_helper'
2
+
3
+ describe Reclassifier::Bayes do
4
+ describe "classifications" do
5
+ it "should return the classifications" do
6
+ subject = described_class.new(:interesting, :uninteresting)
7
+
8
+ subject.classifications.sort.should eq([:interesting, :uninteresting])
9
+ end
10
+ end
11
+
12
+ describe "train" do
13
+ it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
14
+ expect {subject.train(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
15
+ end
16
+
17
+ it "should train the classifier to the (classification, document) pair" do
18
+ subject = described_class.new(:in_china, :not_in_china)
19
+
20
+ subject.train(:in_china, 'Chinese Beijing Chinese')
21
+ subject.train(:in_china, 'Chinese Chinese Shanghai')
22
+ subject.train(:in_china, 'Chinese Macao')
23
+ subject.train(:not_in_china, 'Tokyo Japan Chinese')
24
+
25
+ subject.classify('Chinese Chinese Chinese Tokyo Japan').should eq(:in_china)
26
+ end
27
+ end
28
+
29
+ describe "untrain" do
30
+ it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
31
+ expect {subject.untrain(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
32
+ end
33
+
34
+ it "should untrain the classifier against the (classification, document) pair" do
35
+ subject = described_class.new(:in_china, :not_in_china)
36
+
37
+ subject.train(:in_china, 'Chinese Chinese')
38
+ subject.train(:not_in_china, 'Chinese Macao')
39
+
40
+ subject.classify('Chinese').should eq(:in_china)
41
+
42
+ subject.untrain(:in_china, 'Chinese Chinese')
43
+
44
+ subject.classify('Chinese').should eq(:not_in_china)
45
+ end
46
+ end
47
+
48
+ describe "calculate_scores" do
49
+ it "should return a score hash with the correct scores" do
50
+ subject = described_class.new(:in_china, :not_in_china)
51
+
52
+ subject.train(:in_china, 'Chinese Beijing Chinese')
53
+ subject.train(:in_china, 'Chinese Chinese Shanghai')
54
+ subject.train(:in_china, 'Chinese Macao')
55
+ subject.train(:not_in_china, 'Tokyo Japan Chinese')
56
+
57
+ scores = subject.calculate_scores('Chinese Chinese Chinese Tokyo Japan')
58
+
59
+ scores[:in_china].should eq(-8.107690312843907)
60
+ scores[:not_in_china].should eq(-8.906681345001262)
61
+ end
62
+ end
63
+
64
+ describe "add_classification" do
65
+ it "should add the classification to the set of classifications" do
66
+ subject.classifications.should be_empty
67
+
68
+ subject.add_classification(:niner)
69
+
70
+ subject.classifications.should eq([:niner])
71
+ end
72
+
73
+ it "should return the classification" do
74
+ subject.add_classification(:niner).should eq(:niner)
75
+ end
76
+ end
77
+
78
+ describe "remove_classification" do
79
+ it "should remove the classification from the set of classifications" do
80
+ subject.add_classification(:niner)
81
+
82
+ subject.remove_classification(:niner)
83
+
84
+ subject.classifications.should be_empty
85
+ end
86
+
87
+ it "should return the classification" do
88
+ subject.add_classification(:niner)
89
+
90
+ subject.remove_classification(:niner).should eq(:niner)
91
+ end
92
+
93
+ it "should return nil if the classification didn't exist" do
94
+ subject.remove_classification(:niner).should be(nil)
95
+ end
96
+ end
97
+ end
@@ -0,0 +1,13 @@
1
+ require 'spec_helper'
2
+
3
+ describe Array do
4
+ describe "sum_with_identity" do
5
+ it "should sum the array" do
6
+ [1,2,3].sum_with_identity.should eq(6)
7
+ end
8
+
9
+ it "should return 0 when it encounters an empty array" do
10
+ [].sum_with_identity.should eq(0)
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,19 @@
1
+ require 'spec_helper'
2
+
3
+ describe String do
4
+ describe "word_hash" do
5
+ it "should hash text" do
6
+ hash = {:good => 1, :"!" => 1, :hope => 1, :"'" => 1, :"." => 1, :love => 1, :word => 1, :them => 1, :test => 1}
7
+
8
+ "here are some good words of test's. I hope you love them!".word_hash.should eq(hash)
9
+ end
10
+ end
11
+
12
+ describe "clean_word_hash" do
13
+ it "should clean and hash text" do
14
+ hash = {:good => 1, :word => 1, :hope => 1, :love => 1, :them => 1, :test => 1}
15
+
16
+ "here are some good words of test's. I hope you love them!".clean_word_hash.should eq(hash)
17
+ end
18
+ end
19
+ end
data/spec/lsi_spec.rb ADDED
@@ -0,0 +1,123 @@
1
+ require 'spec_helper'
2
+
3
+ describe Reclassifier::LSI do
4
+ before do
5
+ # we repeat principal words to help weight them.
6
+ # This test is rather delicate, since this system is mostly noise.
7
+ @str1 = "This text deals with dogs. Dogs."
8
+ @str2 = "This text involves dogs too. Dogs! "
9
+ @str3 = "This text revolves around cats. Cats."
10
+ @str4 = "This text also involves cats. Cats!"
11
+ @str5 = "This text involves birds. Birds."
12
+ end
13
+
14
+ it "should do basic indexing" do
15
+ [@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
16
+ subject.needs_rebuild?.should be(false)
17
+
18
+ # note that the closest match to str1 is str2, even though it is not
19
+ # the closest text match.
20
+ subject.find_related(@str1, 3).should eq([@str2, @str5, @str3])
21
+ end
22
+
23
+ it "should not auto rebuild when it's specified as false" do
24
+ subject = described_class.new(:auto_rebuild => false)
25
+
26
+ subject.add_item @str1, "Dog"
27
+ subject.add_item @str2, "Dog"
28
+
29
+ subject.needs_rebuild?.should be(true)
30
+
31
+ subject.build_index
32
+
33
+ subject.needs_rebuild?.should be(false)
34
+ end
35
+
36
+ it "should do basic classifying" do
37
+ subject.add_item(@str2, "Dog")
38
+ subject.add_item(@str3, "Cat")
39
+ subject.add_item(@str4, "Cat")
40
+ subject.add_item(@str5, "Bird")
41
+
42
+ subject.classify(@str1).should eq("Dog")
43
+ subject.classify(@str3).should eq("Cat")
44
+ subject.classify(@str5).should eq("Bird")
45
+ end
46
+
47
+ it "should perform better than Bayes" do
48
+ bayes = Reclassifier::Bayes.new :dog, :cat, :bird
49
+
50
+ [[@str1, "Dog"],
51
+ [@str2, "Dog"],
52
+ [@str3, "Cat"],
53
+ [@str4, "Cat"],
54
+ [@str5, "Bird"]].each do |str, classification|
55
+ subject.add_item(str, classification)
56
+
57
+ bayes.train(classification.downcase.to_sym, str)
58
+ end
59
+
60
+ # We're talking about dogs. Even though the text matches the corpus on
61
+ # cats better. Dogs have more semantic weight than cats. LSI recognizes
62
+ # that content, and the assertions below expect Bayes to pick :dog as well.
63
+ tricky_case = "This text revolves around dogs."
64
+ subject.classify(tricky_case).should eq("Dog")
65
+ bayes.classify(tricky_case).should eq(:dog)
66
+ end
67
+
68
+ it "should recategorize as needed" do
69
+ subject.add_item(@str1, "Dog")
70
+ subject.add_item(@str2, "Dog")
71
+ subject.add_item(@str3, "Cat")
72
+ subject.add_item(@str4, "Cat")
73
+ subject.add_item(@str5, "Bird")
74
+
75
+ tricky_case = "This text revolves around dogs."
76
+ subject.classify(tricky_case).should eq("Dog")
77
+
78
+ # Recategorize as needed.
79
+ subject.categories_for(@str1).clear.push("Cow")
80
+ subject.categories_for(@str2).clear.push("Cow")
81
+
82
+ subject.needs_rebuild?.should be(false)
83
+ subject.classify(tricky_case).should eq("Cow")
84
+ end
85
+
86
+ it "should search correctly" do
87
+ [@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
88
+
89
+ # Searching by content and text, note that @str2 comes up first, because
90
+ # both "dog" and "involve" are present. But, the next match is @str1 instead
91
+ # of @str4, because "dog" carries more weight than involves.
92
+ subject.search("dog involves", 100).should eq([@str2, @str1, @str4, @str5, @str3])
93
+
94
+ # Keyword search shows how the space is mapped out in relation to
95
+ # dog when magnitude is removed. Note the relations. We move from dog
96
+ # through involve and then finally to other words.
97
+ subject.search("dog", 5).should eq([@str1, @str2, @str4, @str5, @str3])
98
+ end
99
+
100
+ it "should serialize correctly" do
101
+ [@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
102
+
103
+ subject_md = Marshal.dump(subject)
104
+ subject_m = Marshal.load(subject_md)
105
+
106
+ subject_m.search("cat", 3).should eq(subject.search("cat", 3))
107
+ subject_m.find_related(@str1, 3).should eq(subject.find_related(@str1, 3))
108
+ end
109
+
110
+ it "should keyword search correctly" do
111
+ subject.add_item(@str1, "Dog")
112
+ subject.add_item(@str2, "Dog")
113
+ subject.add_item(@str3, "Cat")
114
+ subject.add_item(@str4, "Cat")
115
+ subject.add_item(@str5, "Bird")
116
+
117
+ subject.highest_ranked_stems(@str1).should eq([:dog, :text, :deal])
118
+ end
119
+
120
+ it "should summarize correctly" do
121
+ [@str1, @str2, @str3, @str4, @str5].join.summary(2).should eq("This text involves dogs too [...] This text also involves cats")
122
+ end
123
+ end
@@ -0,0 +1,5 @@
1
+ require File.join(Dir.pwd, 'lib', 'reclassifier.rb')
2
+
3
+ RSpec.configure do |config|
4
+ config.color = true
5
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: reclassifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-18 00:00:00.000000000 Z
12
+ date: 2013-04-22 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -44,7 +44,7 @@ dependencies:
44
44
  - !ruby/object:Gem::Version
45
45
  version: '0'
46
46
  - !ruby/object:Gem::Dependency
47
- name: test-unit
47
+ name: rspec
48
48
  requirement: !ruby/object:Gem::Requirement
49
49
  none: false
50
50
  requirements:
@@ -109,18 +109,18 @@ files:
109
109
  - lib/reclassifier/content_node.rb
110
110
  - lib/reclassifier/core_ext/array.rb
111
111
  - lib/reclassifier/core_ext/matrix.rb
112
- - lib/reclassifier/core_ext/object.rb
113
112
  - lib/reclassifier/core_ext/string.rb
114
113
  - lib/reclassifier/core_ext/vector.rb
115
114
  - lib/reclassifier/lsi.rb
115
+ - lib/reclassifier/unknown_classification_error.rb
116
116
  - lib/reclassifier/version.rb
117
117
  - lib/reclassifier/word_list.rb
118
118
  - reclassifier.gemspec
119
- - test/bayes_test.rb
120
- - test/core_ext/array_test.rb
121
- - test/core_ext/string_test.rb
122
- - test/lsi_test.rb
123
- - test/test_helper.rb
119
+ - spec/bayes_spec.rb
120
+ - spec/core_ext/array_spec.rb
121
+ - spec/core_ext/string_spec.rb
122
+ - spec/lsi_spec.rb
123
+ - spec/spec_helper.rb
124
124
  homepage: https://github.com/saveup/reclassifier
125
125
  licenses:
126
126
  - LGPL
@@ -147,8 +147,8 @@ signing_key:
147
147
  specification_version: 3
148
148
  summary: Bayesian and Latent Semantic Indexing classification of text.
149
149
  test_files:
150
- - test/bayes_test.rb
151
- - test/core_ext/array_test.rb
152
- - test/core_ext/string_test.rb
153
- - test/lsi_test.rb
154
- - test/test_helper.rb
150
+ - spec/bayes_spec.rb
151
+ - spec/core_ext/array_spec.rb
152
+ - spec/core_ext/string_spec.rb
153
+ - spec/lsi_spec.rb
154
+ - spec/spec_helper.rb
@@ -1,3 +0,0 @@
1
- class Object
2
- def prepare_category_name; to_s.gsub("_"," ").capitalize.intern end
3
- end
data/test/bayes_test.rb DELETED
@@ -1,34 +0,0 @@
1
- require File.join(File.dirname(__FILE__), 'test_helper')
2
-
3
- class BayesTest < Test::Unit::TestCase
4
- def setup
5
- @classifier = Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
6
- end
7
-
8
- def test_good_training
9
- assert_nothing_raised { @classifier.train_interesting "love" }
10
- end
11
-
12
- def test_bad_training
13
- assert_raise(StandardError) { @classifier.train_no_category "words" }
14
- end
15
-
16
- def test_bad_method
17
- assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
18
- end
19
-
20
- def test_categories
21
- assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
22
- end
23
-
24
- def test_add_category
25
- @classifier.add_category 'Test'
26
- assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
27
- end
28
-
29
- def test_classification
30
- @classifier.train_interesting "here are some good words. I hope you love them"
31
- @classifier.train_uninteresting "here are some bad words, I hate you"
32
- assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
33
- end
34
- end
@@ -1,15 +0,0 @@
1
- require File.join(File.dirname(__FILE__), '..', 'test_helper')
2
-
3
- class ArrayTest < Test::Unit::TestCase
4
- def test_monkey_path_array_sum
5
- assert_equal [1,2,3].sum_with_identity, 6
6
- end
7
-
8
- def test_summing_an_empty_array
9
- assert_equal [nil].sum_with_identity, 0
10
- end
11
-
12
- def test_summing_an_empty_array
13
- assert_equal Array[].sum_with_identity, 0
14
- end
15
- end
@@ -1,13 +0,0 @@
1
- require File.join(File.dirname(__FILE__), '..', 'test_helper')
2
-
3
- class StringTest < Test::Unit::TestCase
4
- def test_word_hash
5
- hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
6
- assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
7
- end
8
-
9
- def test_clean_word_hash
10
- hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
11
- assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
12
- end
13
- end
data/test/lsi_test.rb DELETED
@@ -1,123 +0,0 @@
1
- require File.join(File.dirname(__FILE__), 'test_helper')
2
-
3
- class LSITest < Test::Unit::TestCase
4
- def setup
5
- # we repeat principle words to help weight them.
6
- # This test is rather delicate, since this system is mostly noise.
7
- @str1 = "This text deals with dogs. Dogs."
8
- @str2 = "This text involves dogs too. Dogs! "
9
- @str3 = "This text revolves around cats. Cats."
10
- @str4 = "This text also involves cats. Cats!"
11
- @str5 = "This text involves birds. Birds."
12
- end
13
-
14
- def test_basic_indexing
15
- lsi = Reclassifier::LSI.new
16
- [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
17
- assert ! lsi.needs_rebuild?
18
-
19
- # note that the closest match to str1 is str2, even though it is not
20
- # the closest text match.
21
- assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
22
- end
23
-
24
- def test_not_auto_rebuild
25
- lsi = Reclassifier::LSI.new :auto_rebuild => false
26
- lsi.add_item @str1, "Dog"
27
- lsi.add_item @str2, "Dog"
28
- assert lsi.needs_rebuild?
29
- lsi.build_index
30
- assert ! lsi.needs_rebuild?
31
- end
32
-
33
- def test_basic_categorizing
34
- lsi = Reclassifier::LSI.new
35
- lsi.add_item @str2, "Dog"
36
- lsi.add_item @str3, "Cat"
37
- lsi.add_item @str4, "Cat"
38
- lsi.add_item @str5, "Bird"
39
-
40
- assert_equal "Dog", lsi.classify( @str1 )
41
- assert_equal "Cat", lsi.classify( @str3 )
42
- assert_equal "Bird", lsi.classify( @str5 )
43
- end
44
-
45
- def test_external_classifying
46
- lsi = Reclassifier::LSI.new
47
- bayes = Reclassifier::Bayes.new 'Dog', 'Cat', 'Bird'
48
- lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
49
- lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
50
- lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
51
- lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
52
- lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
53
-
54
- # We're talking about dogs. Even though the text matches the corpus on
55
- # cats better. Dogs have more semantic weight than cats. So bayes
56
- # will fail here, but the LSI recognizes content.
57
- tricky_case = "This text revolves around dogs."
58
- assert_equal "Dog", lsi.classify( tricky_case )
59
- assert_not_equal "Dog", bayes.classify( tricky_case )
60
- end
61
-
62
- def test_recategorize_interface
63
- lsi = Reclassifier::LSI.new
64
- lsi.add_item @str1, "Dog"
65
- lsi.add_item @str2, "Dog"
66
- lsi.add_item @str3, "Cat"
67
- lsi.add_item @str4, "Cat"
68
- lsi.add_item @str5, "Bird"
69
-
70
- tricky_case = "This text revolves around dogs."
71
- assert_equal "Dog", lsi.classify( tricky_case )
72
-
73
- # Recategorize as needed.
74
- lsi.categories_for(@str1).clear.push "Cow"
75
- lsi.categories_for(@str2).clear.push "Cow"
76
-
77
- assert !lsi.needs_rebuild?
78
- assert_equal "Cow", lsi.classify( tricky_case )
79
- end
80
-
81
- def test_search
82
- lsi = Reclassifier::LSI.new
83
- [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
84
-
85
- # Searching by content and text, note that @str2 comes up first, because
86
- # both "dog" and "involve" are present. But, the next match is @str1 instead
87
- # of @str4, because "dog" carries more weight than involves.
88
- assert_equal( [@str2, @str1, @str4, @str5, @str3],
89
- lsi.search("dog involves", 100) )
90
-
91
- # Keyword search shows how the space is mapped out in relation to
92
- # dog when magnitude is remove. Note the relations. We move from dog
93
- # through involve and then finally to other words.
94
- assert_equal( [@str1, @str2, @str4, @str5, @str3],
95
- lsi.search("dog", 5) )
96
- end
97
-
98
- def test_serialize_safe
99
- lsi = Reclassifier::LSI.new
100
- [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
101
-
102
- lsi_md = Marshal.dump lsi
103
- lsi_m = Marshal.load lsi_md
104
-
105
- assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
106
- assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
107
- end
108
-
109
- def test_keyword_search
110
- lsi = Reclassifier::LSI.new
111
- lsi.add_item @str1, "Dog"
112
- lsi.add_item @str2, "Dog"
113
- lsi.add_item @str3, "Cat"
114
- lsi.add_item @str4, "Cat"
115
- lsi.add_item @str5, "Bird"
116
-
117
- assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
118
- end
119
-
120
- def test_summary
121
- assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
122
- end
123
- end
data/test/test_helper.rb DELETED
@@ -1,4 +0,0 @@
1
- $:.unshift(File.dirname(__FILE__) + '/../lib')
2
-
3
- require 'test/unit'
4
- require 'reclassifier'