reclassifier 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +2 -5
- data/lib/reclassifier/bayes.rb +88 -80
- data/lib/reclassifier/unknown_classification_error.rb +2 -0
- data/lib/reclassifier/version.rb +1 -1
- data/lib/reclassifier.rb +5 -5
- data/reclassifier.gemspec +1 -1
- data/spec/bayes_spec.rb +97 -0
- data/spec/core_ext/array_spec.rb +13 -0
- data/spec/core_ext/string_spec.rb +19 -0
- data/spec/lsi_spec.rb +123 -0
- data/spec/spec_helper.rb +5 -0
- metadata +14 -14
- data/lib/reclassifier/core_ext/object.rb +0 -3
- data/test/bayes_test.rb +0 -34
- data/test/core_ext/array_test.rb +0 -15
- data/test/core_ext/string_test.rb +0 -13
- data/test/lsi_test.rb +0 -123
- data/test/test_helper.rb +0 -4
data/Rakefile
CHANGED
data/lib/reclassifier/bayes.rb
CHANGED
@@ -1,129 +1,137 @@
|
|
1
|
+
#
|
2
|
+
# Bayesian classifier for arbitrary text.
|
3
|
+
#
|
4
|
+
# Implementation is translated from
|
5
|
+
# Introduction to Information Retrieval by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
|
6
|
+
# Cambridge University Press. 2008, ISBN 0521865719.
|
7
|
+
#
|
1
8
|
module Reclassifier
|
2
9
|
class Bayes
|
3
|
-
#
|
4
|
-
# initialized and given a training method.
|
5
|
-
#
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
@
|
10
|
-
|
10
|
+
# Can be created with zero or more classifications, each of which will be
|
11
|
+
# initialized and given a training method. The classifications are specified as
|
12
|
+
# symbols. E.g.,
|
13
|
+
# b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
|
14
|
+
def initialize(*classifications)
|
15
|
+
@classifications = {}
|
16
|
+
classifications.each {|classification| @classifications[classification] = {}}
|
17
|
+
|
18
|
+
@docs_in_classification_count = {}
|
11
19
|
end
|
12
20
|
|
13
21
|
#
|
14
|
-
# Provides a general training method for all
|
22
|
+
# Provides a general training method for all classifications specified in Bayes#new
|
15
23
|
# For example:
|
16
|
-
# b =
|
24
|
+
# b = Reclassifier::Bayes.new :this, :that
|
17
25
|
# b.train :this, "This text"
|
18
|
-
# b.train
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
@
|
26
|
+
# b.train :that, "That text"
|
27
|
+
def train(classification, text)
|
28
|
+
ensure_classification_exists(classification)
|
29
|
+
|
30
|
+
@docs_in_classification_count[classification] ||= 0
|
31
|
+
@docs_in_classification_count[classification] += 1
|
32
|
+
|
23
33
|
text.word_hash.each do |word, count|
|
24
|
-
@
|
25
|
-
|
26
|
-
@
|
34
|
+
@classifications[classification][word] ||= 0
|
35
|
+
|
36
|
+
@classifications[classification][word] += count
|
27
37
|
end
|
28
38
|
end
|
29
39
|
|
30
40
|
#
|
31
|
-
#
|
41
|
+
# Untrain a (classification, text) pair.
|
32
42
|
# Be very careful with this method.
|
33
43
|
#
|
34
44
|
# For example:
|
35
|
-
# b =
|
45
|
+
# b = Reclassifier::Bayes.new :this, :that, :the_other
|
36
46
|
# b.train :this, "This text"
|
37
47
|
# b.untrain :this, "This text"
|
38
|
-
def untrain(
|
39
|
-
|
40
|
-
|
48
|
+
def untrain(classification, text)
|
49
|
+
ensure_classification_exists(classification)
|
50
|
+
|
51
|
+
@docs_in_classification_count[classification] -= 1
|
52
|
+
|
41
53
|
text.word_hash.each do |word, count|
|
42
|
-
if @
|
43
|
-
orig = @categories[category][word]
|
44
|
-
@categories[category][word] ||= 0
|
45
|
-
@categories[category][word] -= count
|
46
|
-
if @categories[category][word] <= 0
|
47
|
-
@categories[category].delete(word)
|
48
|
-
count = orig
|
49
|
-
end
|
50
|
-
@total_words -= count
|
51
|
-
end
|
54
|
+
@classifications[classification][word] -= count if @classifications[classification].include?(word)
|
52
55
|
end
|
53
56
|
end
|
54
57
|
|
55
58
|
#
|
56
|
-
# Returns the scores
|
59
|
+
# Returns the scores of the specified text for each classification. E.g.,
|
57
60
|
# b.calculate_scores "I hate bad words and you"
|
58
61
|
# => {:uninteresting=>-12.6997928013932, :interesting=>-18.4206807439524}
|
59
62
|
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
60
|
-
def
|
61
|
-
|
62
|
-
|
63
|
-
@
|
64
|
-
|
65
|
-
|
63
|
+
def calculate_scores(text)
|
64
|
+
scores = {}
|
65
|
+
|
66
|
+
@classifications.each do |classification, classification_word_counts|
|
67
|
+
# prior
|
68
|
+
scores[classification] = Math.log(@docs_in_classification_count[classification])
|
69
|
+
scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
|
70
|
+
|
71
|
+
# likelihood
|
66
72
|
text.word_hash.each do |word, count|
|
67
|
-
|
68
|
-
|
73
|
+
if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
|
74
|
+
scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
|
75
|
+
|
76
|
+
scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
|
77
|
+
end
|
69
78
|
end
|
70
|
-
# now add prior probability for the category
|
71
|
-
s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
|
72
|
-
score[category.to_s] += Math.log(s / training_count)
|
73
79
|
end
|
74
|
-
|
80
|
+
|
81
|
+
scores
|
75
82
|
end
|
76
83
|
|
77
84
|
#
|
78
|
-
# Returns the classification of the
|
79
|
-
#
|
85
|
+
# Returns the classification of the specified text, which is one of the
|
86
|
+
# classifications given in the initializer. E.g.,
|
80
87
|
# b.classify "I hate bad words and you"
|
81
|
-
# =>
|
88
|
+
# => :uninteresting
|
82
89
|
def classify(text)
|
83
|
-
(
|
90
|
+
calculate_scores(text).max_by {|classification| classification[1]}[0]
|
84
91
|
end
|
85
92
|
|
86
93
|
#
|
87
|
-
# Provides
|
94
|
+
# Provides a list of classification names
|
88
95
|
# For example:
|
89
|
-
# b
|
90
|
-
#
|
91
|
-
|
92
|
-
|
93
|
-
# b.train_the_other "The other text"
|
94
|
-
def method_missing(name, *args)
|
95
|
-
category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
|
96
|
-
if @categories.has_key? category
|
97
|
-
args.each { |text| eval("#{$1}train(category, text)") }
|
98
|
-
elsif name.to_s =~ /(un)?train_([\w]+)/
|
99
|
-
raise StandardError, "No such category: #{category}"
|
100
|
-
else
|
101
|
-
super #raise StandardError, "No such method: #{name}"
|
102
|
-
end
|
96
|
+
# b.classifications
|
97
|
+
# => [:this, :that, :the_other]
|
98
|
+
def classifications
|
99
|
+
@classifications.keys
|
103
100
|
end
|
104
101
|
|
105
102
|
#
|
106
|
-
#
|
103
|
+
# Adds the classification to the classifier.
|
104
|
+
# Has no effect if the classification already existed.
|
105
|
+
# Returns the classification.
|
107
106
|
# For example:
|
108
|
-
# b.
|
109
|
-
|
110
|
-
|
111
|
-
|
107
|
+
# b.add_classification(:not_spam)
|
108
|
+
def add_classification(classification)
|
109
|
+
@classifications[classification] ||= {}
|
110
|
+
|
111
|
+
classification
|
112
112
|
end
|
113
113
|
|
114
114
|
#
|
115
|
-
#
|
115
|
+
# Removes the classification from the classifier.
|
116
|
+
# Returns the classification if it existed, else nil.
|
116
117
|
# For example:
|
117
|
-
# b.
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
118
|
+
# b.remove_classification(:not_spam)
|
119
|
+
def remove_classification(classification)
|
120
|
+
return_value = if @classifications.include?(classification)
|
121
|
+
classification
|
122
|
+
else
|
123
|
+
nil
|
124
|
+
end
|
125
|
+
|
126
|
+
@classifications.delete(classification)
|
127
|
+
|
128
|
+
return_value
|
125
129
|
end
|
126
130
|
|
127
|
-
|
131
|
+
private
|
132
|
+
|
133
|
+
def ensure_classification_exists(classification)
|
134
|
+
raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
|
135
|
+
end
|
128
136
|
end
|
129
137
|
end
|
data/lib/reclassifier/version.rb
CHANGED
data/lib/reclassifier.rb
CHANGED
@@ -7,13 +7,13 @@ require 'gsl'
|
|
7
7
|
require 'reclassifier/version'
|
8
8
|
require 'reclassifier/core_ext/array'
|
9
9
|
require 'reclassifier/core_ext/matrix'
|
10
|
-
require 'reclassifier/core_ext/object'
|
11
10
|
require 'reclassifier/core_ext/string'
|
12
11
|
require 'gsl/vector'
|
13
12
|
|
14
13
|
module Reclassifier
|
15
|
-
autoload :Bayes,
|
16
|
-
autoload :LSI,
|
17
|
-
autoload :ContentNode,
|
18
|
-
autoload :WordList,
|
14
|
+
autoload :Bayes, 'reclassifier/bayes'
|
15
|
+
autoload :LSI, 'reclassifier/lsi'
|
16
|
+
autoload :ContentNode, 'reclassifier/content_node'
|
17
|
+
autoload :WordList, 'reclassifier/word_list'
|
18
|
+
autoload :UnknownClassificationError, 'reclassifier/unknown_classification_error'
|
19
19
|
end
|
data/reclassifier.gemspec
CHANGED
@@ -20,7 +20,7 @@ Gem::Specification.new do |spec|
|
|
20
20
|
|
21
21
|
spec.add_development_dependency 'bundler', '~> 1.3'
|
22
22
|
spec.add_development_dependency 'rake'
|
23
|
-
spec.add_development_dependency '
|
23
|
+
spec.add_development_dependency 'rspec'
|
24
24
|
|
25
25
|
spec.add_dependency 'fast-stemmer'
|
26
26
|
spec.add_dependency 'gsl'
|
data/spec/bayes_spec.rb
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Reclassifier::Bayes do
|
4
|
+
describe "classifications" do
|
5
|
+
it "should return the classifications" do
|
6
|
+
subject = described_class.new(:interesting, :uninteresting)
|
7
|
+
|
8
|
+
subject.classifications.sort.should eq([:interesting, :uninteresting])
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
describe "train" do
|
13
|
+
it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
|
14
|
+
expect {subject.train(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should train the classifier to the (classification, document) pair" do
|
18
|
+
subject = described_class.new(:in_china, :not_in_china)
|
19
|
+
|
20
|
+
subject.train(:in_china, 'Chinese Beijing Chinese')
|
21
|
+
subject.train(:in_china, 'Chinese Chinese Shanghai')
|
22
|
+
subject.train(:in_china, 'Chinese Macao')
|
23
|
+
subject.train(:not_in_china, 'Tokyo Japan Chinese')
|
24
|
+
|
25
|
+
subject.classify('Chinese Chinese Chinese Tokyo Japan').should eq(:in_china)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
describe "untrain" do
|
30
|
+
it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
|
31
|
+
expect {subject.untrain(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should untrain the classifier against the (classification, document) pair" do
|
35
|
+
subject = described_class.new(:in_china, :not_in_china)
|
36
|
+
|
37
|
+
subject.train(:in_china, 'Chinese Chinese')
|
38
|
+
subject.train(:not_in_china, 'Chinese Macao')
|
39
|
+
|
40
|
+
subject.classify('Chinese').should eq(:in_china)
|
41
|
+
|
42
|
+
subject.untrain(:in_china, 'Chinese Chinese')
|
43
|
+
|
44
|
+
subject.classify('Chinese').should eq(:not_in_china)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
describe "calculate_scores" do
|
49
|
+
it "should return a score hash with the correct scores" do
|
50
|
+
subject = described_class.new(:in_china, :not_in_china)
|
51
|
+
|
52
|
+
subject.train(:in_china, 'Chinese Beijing Chinese')
|
53
|
+
subject.train(:in_china, 'Chinese Chinese Shanghai')
|
54
|
+
subject.train(:in_china, 'Chinese Macao')
|
55
|
+
subject.train(:not_in_china, 'Tokyo Japan Chinese')
|
56
|
+
|
57
|
+
scores = subject.calculate_scores('Chinese Chinese Chinese Tokyo Japan')
|
58
|
+
|
59
|
+
scores[:in_china].should eq(-8.107690312843907)
|
60
|
+
scores[:not_in_china].should eq(-8.906681345001262)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
describe "add_classification" do
|
65
|
+
it "should add the classification to the set of classifications" do
|
66
|
+
subject.classifications.should be_empty
|
67
|
+
|
68
|
+
subject.add_classification(:niner)
|
69
|
+
|
70
|
+
subject.classifications.should eq([:niner])
|
71
|
+
end
|
72
|
+
|
73
|
+
it "should return the classification" do
|
74
|
+
subject.add_classification(:niner).should eq(:niner)
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
describe "remove_classification" do
|
79
|
+
it "should remove the classification from the set of classifications" do
|
80
|
+
subject.add_classification(:niner)
|
81
|
+
|
82
|
+
subject.remove_classification(:niner)
|
83
|
+
|
84
|
+
subject.classifications.should be_empty
|
85
|
+
end
|
86
|
+
|
87
|
+
it "should return the classification" do
|
88
|
+
subject.add_classification(:niner)
|
89
|
+
|
90
|
+
subject.remove_classification(:niner).should eq(:niner)
|
91
|
+
end
|
92
|
+
|
93
|
+
it "should return nil if the classification didn't exist" do
|
94
|
+
subject.remove_classification(:niner).should be(nil)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Array do
|
4
|
+
describe "sum_with_identity" do
|
5
|
+
it "should sum the array" do
|
6
|
+
[1,2,3].sum_with_identity.should eq(6)
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should return 0 when it encounters an empty array" do
|
10
|
+
[].sum_with_identity.should eq(0)
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe String do
|
4
|
+
describe "word_hash" do
|
5
|
+
it "should hash text" do
|
6
|
+
hash = {:good => 1, :"!" => 1, :hope => 1, :"'" => 1, :"." => 1, :love => 1, :word => 1, :them => 1, :test => 1}
|
7
|
+
|
8
|
+
"here are some good words of test's. I hope you love them!".word_hash.should eq(hash)
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
describe "clean_word_hash" do
|
13
|
+
it "should clean and hash text" do
|
14
|
+
hash = {:good => 1, :word => 1, :hope => 1, :love => 1, :them => 1, :test => 1}
|
15
|
+
|
16
|
+
"here are some good words of test's. I hope you love them!".clean_word_hash.should eq(hash)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/spec/lsi_spec.rb
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Reclassifier::LSI do
|
4
|
+
before do
|
5
|
+
# We repeat the principal words to help weight them.
|
6
|
+
# This test is rather delicate, since this system is mostly noise.
|
7
|
+
@str1 = "This text deals with dogs. Dogs."
|
8
|
+
@str2 = "This text involves dogs too. Dogs! "
|
9
|
+
@str3 = "This text revolves around cats. Cats."
|
10
|
+
@str4 = "This text also involves cats. Cats!"
|
11
|
+
@str5 = "This text involves birds. Birds."
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should do basic indexing" do
|
15
|
+
[@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
|
16
|
+
subject.needs_rebuild?.should be(false)
|
17
|
+
|
18
|
+
# note that the closest match to str1 is str2, even though it is not
|
19
|
+
# the closest text match.
|
20
|
+
subject.find_related(@str1, 3).should eq([@str2, @str5, @str3])
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should not auto rebuild when it's specified as false" do
|
24
|
+
subject = described_class.new(:auto_rebuild => false)
|
25
|
+
|
26
|
+
subject.add_item @str1, "Dog"
|
27
|
+
subject.add_item @str2, "Dog"
|
28
|
+
|
29
|
+
subject.needs_rebuild?.should be(true)
|
30
|
+
|
31
|
+
subject.build_index
|
32
|
+
|
33
|
+
subject.needs_rebuild?.should be(false)
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should do basic classifying" do
|
37
|
+
subject.add_item(@str2, "Dog")
|
38
|
+
subject.add_item(@str3, "Cat")
|
39
|
+
subject.add_item(@str4, "Cat")
|
40
|
+
subject.add_item(@str5, "Bird")
|
41
|
+
|
42
|
+
subject.classify(@str1).should eq("Dog")
|
43
|
+
subject.classify(@str3).should eq("Cat")
|
44
|
+
subject.classify(@str5).should eq("Bird")
|
45
|
+
end
|
46
|
+
|
47
|
+
it "should perform better than Bayes" do
|
48
|
+
bayes = Reclassifier::Bayes.new :dog, :cat, :bird
|
49
|
+
|
50
|
+
[[@str1, "Dog"],
|
51
|
+
[@str2, "Dog"],
|
52
|
+
[@str3, "Cat"],
|
53
|
+
[@str4, "Cat"],
|
54
|
+
[@str5, "Bird"]].each do |str, classification|
|
55
|
+
subject.add_item(str, classification)
|
56
|
+
|
57
|
+
bayes.train(classification.downcase.to_sym, str)
|
58
|
+
end
|
59
|
+
|
60
|
+
# We're talking about dogs. Even though the text matches the corpus on
|
61
|
+
# cats better. Dogs have more semantic weight than cats. So bayes
|
62
|
+
# was expected to fail here in the 0.0.2 test, but this spec asserts that both classifiers pick dog.
|
63
|
+
tricky_case = "This text revolves around dogs."
|
64
|
+
subject.classify(tricky_case).should eq("Dog")
|
65
|
+
bayes.classify(tricky_case).should eq(:dog)
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should recategorize as needed" do
|
69
|
+
subject.add_item(@str1, "Dog")
|
70
|
+
subject.add_item(@str2, "Dog")
|
71
|
+
subject.add_item(@str3, "Cat")
|
72
|
+
subject.add_item(@str4, "Cat")
|
73
|
+
subject.add_item(@str5, "Bird")
|
74
|
+
|
75
|
+
tricky_case = "This text revolves around dogs."
|
76
|
+
subject.classify(tricky_case).should eq("Dog")
|
77
|
+
|
78
|
+
# Recategorize as needed.
|
79
|
+
subject.categories_for(@str1).clear.push("Cow")
|
80
|
+
subject.categories_for(@str2).clear.push("Cow")
|
81
|
+
|
82
|
+
subject.needs_rebuild?.should be(false)
|
83
|
+
subject.classify(tricky_case).should eq("Cow")
|
84
|
+
end
|
85
|
+
|
86
|
+
it "should search correctly" do
|
87
|
+
[@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
|
88
|
+
|
89
|
+
# Searching by content and text, note that @str2 comes up first, because
|
90
|
+
# both "dog" and "involve" are present. But, the next match is @str1 instead
|
91
|
+
# of @str4, because "dog" carries more weight than involves.
|
92
|
+
subject.search("dog involves", 100).should eq([@str2, @str1, @str4, @str5, @str3])
|
93
|
+
|
94
|
+
# Keyword search shows how the space is mapped out in relation to
|
95
|
+
# dog when magnitude is removed. Note the relations. We move from dog
|
96
|
+
# through involve and then finally to other words.
|
97
|
+
subject.search("dog", 5).should eq([@str1, @str2, @str4, @str5, @str3])
|
98
|
+
end
|
99
|
+
|
100
|
+
it "should serialize correctly" do
|
101
|
+
[@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
|
102
|
+
|
103
|
+
subject_md = Marshal.dump(subject)
|
104
|
+
subject_m = Marshal.load(subject_md)
|
105
|
+
|
106
|
+
subject_m.search("cat", 3).should eq(subject.search("cat", 3))
|
107
|
+
subject_m.find_related(@str1, 3).should eq(subject.find_related(@str1, 3))
|
108
|
+
end
|
109
|
+
|
110
|
+
it "should keyword search correctly" do
|
111
|
+
subject.add_item(@str1, "Dog")
|
112
|
+
subject.add_item(@str2, "Dog")
|
113
|
+
subject.add_item(@str3, "Cat")
|
114
|
+
subject.add_item(@str4, "Cat")
|
115
|
+
subject.add_item(@str5, "Bird")
|
116
|
+
|
117
|
+
subject.highest_ranked_stems(@str1).should eq([:dog, :text, :deal])
|
118
|
+
end
|
119
|
+
|
120
|
+
it "should summarize correctly" do
|
121
|
+
[@str1, @str2, @str3, @str4, @str5].join.summary(2).should eq("This text involves dogs too [...] This text also involves cats")
|
122
|
+
end
|
123
|
+
end
|
data/spec/spec_helper.rb
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: reclassifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04-
|
12
|
+
date: 2013-04-22 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -44,7 +44,7 @@ dependencies:
|
|
44
44
|
- !ruby/object:Gem::Version
|
45
45
|
version: '0'
|
46
46
|
- !ruby/object:Gem::Dependency
|
47
|
-
name:
|
47
|
+
name: rspec
|
48
48
|
requirement: !ruby/object:Gem::Requirement
|
49
49
|
none: false
|
50
50
|
requirements:
|
@@ -109,18 +109,18 @@ files:
|
|
109
109
|
- lib/reclassifier/content_node.rb
|
110
110
|
- lib/reclassifier/core_ext/array.rb
|
111
111
|
- lib/reclassifier/core_ext/matrix.rb
|
112
|
-
- lib/reclassifier/core_ext/object.rb
|
113
112
|
- lib/reclassifier/core_ext/string.rb
|
114
113
|
- lib/reclassifier/core_ext/vector.rb
|
115
114
|
- lib/reclassifier/lsi.rb
|
115
|
+
- lib/reclassifier/unknown_classification_error.rb
|
116
116
|
- lib/reclassifier/version.rb
|
117
117
|
- lib/reclassifier/word_list.rb
|
118
118
|
- reclassifier.gemspec
|
119
|
-
-
|
120
|
-
-
|
121
|
-
-
|
122
|
-
-
|
123
|
-
-
|
119
|
+
- spec/bayes_spec.rb
|
120
|
+
- spec/core_ext/array_spec.rb
|
121
|
+
- spec/core_ext/string_spec.rb
|
122
|
+
- spec/lsi_spec.rb
|
123
|
+
- spec/spec_helper.rb
|
124
124
|
homepage: https://github.com/saveup/reclassifier
|
125
125
|
licenses:
|
126
126
|
- LGPL
|
@@ -147,8 +147,8 @@ signing_key:
|
|
147
147
|
specification_version: 3
|
148
148
|
summary: Bayesian and Latent Semantic Indexing classification of text.
|
149
149
|
test_files:
|
150
|
-
-
|
151
|
-
-
|
152
|
-
-
|
153
|
-
-
|
154
|
-
-
|
150
|
+
- spec/bayes_spec.rb
|
151
|
+
- spec/core_ext/array_spec.rb
|
152
|
+
- spec/core_ext/string_spec.rb
|
153
|
+
- spec/lsi_spec.rb
|
154
|
+
- spec/spec_helper.rb
|
data/test/bayes_test.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), 'test_helper')
|
2
|
-
|
3
|
-
class BayesTest < Test::Unit::TestCase
|
4
|
-
def setup
|
5
|
-
@classifier = Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
|
6
|
-
end
|
7
|
-
|
8
|
-
def test_good_training
|
9
|
-
assert_nothing_raised { @classifier.train_interesting "love" }
|
10
|
-
end
|
11
|
-
|
12
|
-
def test_bad_training
|
13
|
-
assert_raise(StandardError) { @classifier.train_no_category "words" }
|
14
|
-
end
|
15
|
-
|
16
|
-
def test_bad_method
|
17
|
-
assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
|
18
|
-
end
|
19
|
-
|
20
|
-
def test_categories
|
21
|
-
assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
|
22
|
-
end
|
23
|
-
|
24
|
-
def test_add_category
|
25
|
-
@classifier.add_category 'Test'
|
26
|
-
assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
|
27
|
-
end
|
28
|
-
|
29
|
-
def test_classification
|
30
|
-
@classifier.train_interesting "here are some good words. I hope you love them"
|
31
|
-
@classifier.train_uninteresting "here are some bad words, I hate you"
|
32
|
-
assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
|
33
|
-
end
|
34
|
-
end
|
data/test/core_ext/array_test.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', 'test_helper')
|
2
|
-
|
3
|
-
class ArrayTest < Test::Unit::TestCase
|
4
|
-
def test_monkey_path_array_sum
|
5
|
-
assert_equal [1,2,3].sum_with_identity, 6
|
6
|
-
end
|
7
|
-
|
8
|
-
def test_summing_an_empty_array
|
9
|
-
assert_equal [nil].sum_with_identity, 0
|
10
|
-
end
|
11
|
-
|
12
|
-
def test_summing_an_empty_array
|
13
|
-
assert_equal Array[].sum_with_identity, 0
|
14
|
-
end
|
15
|
-
end
|
@@ -1,13 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', 'test_helper')
|
2
|
-
|
3
|
-
class StringTest < Test::Unit::TestCase
|
4
|
-
def test_word_hash
|
5
|
-
hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
|
6
|
-
assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
|
7
|
-
end
|
8
|
-
|
9
|
-
def test_clean_word_hash
|
10
|
-
hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
|
11
|
-
assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
|
12
|
-
end
|
13
|
-
end
|
data/test/lsi_test.rb
DELETED
@@ -1,123 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), 'test_helper')
|
2
|
-
|
3
|
-
class LSITest < Test::Unit::TestCase
|
4
|
-
def setup
|
5
|
-
# we repeat principle words to help weight them.
|
6
|
-
# This test is rather delicate, since this system is mostly noise.
|
7
|
-
@str1 = "This text deals with dogs. Dogs."
|
8
|
-
@str2 = "This text involves dogs too. Dogs! "
|
9
|
-
@str3 = "This text revolves around cats. Cats."
|
10
|
-
@str4 = "This text also involves cats. Cats!"
|
11
|
-
@str5 = "This text involves birds. Birds."
|
12
|
-
end
|
13
|
-
|
14
|
-
def test_basic_indexing
|
15
|
-
lsi = Reclassifier::LSI.new
|
16
|
-
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
17
|
-
assert ! lsi.needs_rebuild?
|
18
|
-
|
19
|
-
# note that the closest match to str1 is str2, even though it is not
|
20
|
-
# the closest text match.
|
21
|
-
assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
|
22
|
-
end
|
23
|
-
|
24
|
-
def test_not_auto_rebuild
|
25
|
-
lsi = Reclassifier::LSI.new :auto_rebuild => false
|
26
|
-
lsi.add_item @str1, "Dog"
|
27
|
-
lsi.add_item @str2, "Dog"
|
28
|
-
assert lsi.needs_rebuild?
|
29
|
-
lsi.build_index
|
30
|
-
assert ! lsi.needs_rebuild?
|
31
|
-
end
|
32
|
-
|
33
|
-
def test_basic_categorizing
|
34
|
-
lsi = Reclassifier::LSI.new
|
35
|
-
lsi.add_item @str2, "Dog"
|
36
|
-
lsi.add_item @str3, "Cat"
|
37
|
-
lsi.add_item @str4, "Cat"
|
38
|
-
lsi.add_item @str5, "Bird"
|
39
|
-
|
40
|
-
assert_equal "Dog", lsi.classify( @str1 )
|
41
|
-
assert_equal "Cat", lsi.classify( @str3 )
|
42
|
-
assert_equal "Bird", lsi.classify( @str5 )
|
43
|
-
end
|
44
|
-
|
45
|
-
def test_external_classifying
|
46
|
-
lsi = Reclassifier::LSI.new
|
47
|
-
bayes = Reclassifier::Bayes.new 'Dog', 'Cat', 'Bird'
|
48
|
-
lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
|
49
|
-
lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
|
50
|
-
lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
|
51
|
-
lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
|
52
|
-
lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
|
53
|
-
|
54
|
-
# We're talking about dogs. Even though the text matches the corpus on
|
55
|
-
# cats better. Dogs have more semantic weight than cats. So bayes
|
56
|
-
# will fail here, but the LSI recognizes content.
|
57
|
-
tricky_case = "This text revolves around dogs."
|
58
|
-
assert_equal "Dog", lsi.classify( tricky_case )
|
59
|
-
assert_not_equal "Dog", bayes.classify( tricky_case )
|
60
|
-
end
|
61
|
-
|
62
|
-
def test_recategorize_interface
|
63
|
-
lsi = Reclassifier::LSI.new
|
64
|
-
lsi.add_item @str1, "Dog"
|
65
|
-
lsi.add_item @str2, "Dog"
|
66
|
-
lsi.add_item @str3, "Cat"
|
67
|
-
lsi.add_item @str4, "Cat"
|
68
|
-
lsi.add_item @str5, "Bird"
|
69
|
-
|
70
|
-
tricky_case = "This text revolves around dogs."
|
71
|
-
assert_equal "Dog", lsi.classify( tricky_case )
|
72
|
-
|
73
|
-
# Recategorize as needed.
|
74
|
-
lsi.categories_for(@str1).clear.push "Cow"
|
75
|
-
lsi.categories_for(@str2).clear.push "Cow"
|
76
|
-
|
77
|
-
assert !lsi.needs_rebuild?
|
78
|
-
assert_equal "Cow", lsi.classify( tricky_case )
|
79
|
-
end
|
80
|
-
|
81
|
-
def test_search
|
82
|
-
lsi = Reclassifier::LSI.new
|
83
|
-
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
84
|
-
|
85
|
-
# Searching by content and text, note that @str2 comes up first, because
|
86
|
-
# both "dog" and "involve" are present. But, the next match is @str1 instead
|
87
|
-
# of @str4, because "dog" carries more weight than involves.
|
88
|
-
assert_equal( [@str2, @str1, @str4, @str5, @str3],
|
89
|
-
lsi.search("dog involves", 100) )
|
90
|
-
|
91
|
-
# Keyword search shows how the space is mapped out in relation to
|
92
|
-
# dog when magnitude is remove. Note the relations. We move from dog
|
93
|
-
# through involve and then finally to other words.
|
94
|
-
assert_equal( [@str1, @str2, @str4, @str5, @str3],
|
95
|
-
lsi.search("dog", 5) )
|
96
|
-
end
|
97
|
-
|
98
|
-
def test_serialize_safe
|
99
|
-
lsi = Reclassifier::LSI.new
|
100
|
-
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
101
|
-
|
102
|
-
lsi_md = Marshal.dump lsi
|
103
|
-
lsi_m = Marshal.load lsi_md
|
104
|
-
|
105
|
-
assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
|
106
|
-
assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
|
107
|
-
end
|
108
|
-
|
109
|
-
def test_keyword_search
|
110
|
-
lsi = Reclassifier::LSI.new
|
111
|
-
lsi.add_item @str1, "Dog"
|
112
|
-
lsi.add_item @str2, "Dog"
|
113
|
-
lsi.add_item @str3, "Cat"
|
114
|
-
lsi.add_item @str4, "Cat"
|
115
|
-
lsi.add_item @str5, "Bird"
|
116
|
-
|
117
|
-
assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
|
118
|
-
end
|
119
|
-
|
120
|
-
def test_summary
|
121
|
-
assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
|
122
|
-
end
|
123
|
-
end
|
data/test/test_helper.rb
DELETED