reclassifier 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +2 -5
- data/lib/reclassifier/bayes.rb +88 -80
- data/lib/reclassifier/unknown_classification_error.rb +2 -0
- data/lib/reclassifier/version.rb +1 -1
- data/lib/reclassifier.rb +5 -5
- data/reclassifier.gemspec +1 -1
- data/spec/bayes_spec.rb +97 -0
- data/spec/core_ext/array_spec.rb +13 -0
- data/spec/core_ext/string_spec.rb +19 -0
- data/spec/lsi_spec.rb +123 -0
- data/spec/spec_helper.rb +5 -0
- metadata +14 -14
- data/lib/reclassifier/core_ext/object.rb +0 -3
- data/test/bayes_test.rb +0 -34
- data/test/core_ext/array_test.rb +0 -15
- data/test/core_ext/string_test.rb +0 -13
- data/test/lsi_test.rb +0 -123
- data/test/test_helper.rb +0 -4
data/Rakefile
CHANGED
data/lib/reclassifier/bayes.rb
CHANGED
@@ -1,129 +1,137 @@
|
|
1
|
+
#
|
2
|
+
# Bayesian classifier for arbitrary text.
|
3
|
+
#
|
4
|
+
# Implementation is translated from
|
5
|
+
# Introduction to Information Retrieval by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
|
6
|
+
# Cambridge University Press. 2008, ISBN 0521865719.
|
7
|
+
#
|
1
8
|
module Reclassifier
|
2
9
|
class Bayes
|
3
|
-
#
|
4
|
-
# initialized and given a training method.
|
5
|
-
#
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
@
|
10
|
-
|
10
|
+
# Can be created with zero or more classifications, each of which will be
|
11
|
+
# initialized and given a training method. The classifications are specified as
|
12
|
+
# symbols. E.g.,
|
13
|
+
# b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
|
14
|
+
def initialize(*classifications)
|
15
|
+
@classifications = {}
|
16
|
+
classifications.each {|classification| @classifications[classification] = {}}
|
17
|
+
|
18
|
+
@docs_in_classification_count = {}
|
11
19
|
end
|
12
20
|
|
13
21
|
#
|
14
|
-
# Provides a general training method for all
|
22
|
+
# Provides a general training method for all classifications specified in Bayes#new
|
15
23
|
# For example:
|
16
|
-
# b =
|
24
|
+
# b = Reclassifier::Bayes.new :this, :that
|
17
25
|
# b.train :this, "This text"
|
18
|
-
# b.train
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
@
|
26
|
+
# b.train :that, "That text"
|
27
|
+
def train(classification, text)
|
28
|
+
ensure_classification_exists(classification)
|
29
|
+
|
30
|
+
@docs_in_classification_count[classification] ||= 0
|
31
|
+
@docs_in_classification_count[classification] += 1
|
32
|
+
|
23
33
|
text.word_hash.each do |word, count|
|
24
|
-
@
|
25
|
-
|
26
|
-
@
|
34
|
+
@classifications[classification][word] ||= 0
|
35
|
+
|
36
|
+
@classifications[classification][word] += count
|
27
37
|
end
|
28
38
|
end
|
29
39
|
|
30
40
|
#
|
31
|
-
#
|
41
|
+
# Untrain a (classification, text) pair.
|
32
42
|
# Be very careful with this method.
|
33
43
|
#
|
34
44
|
# For example:
|
35
|
-
# b =
|
45
|
+
# b = Reclassifier::Bayes.new :this, :that, :the_other
|
36
46
|
# b.train :this, "This text"
|
37
47
|
# b.untrain :this, "This text"
|
38
|
-
def untrain(
|
39
|
-
|
40
|
-
|
48
|
+
def untrain(classification, text)
|
49
|
+
ensure_classification_exists(classification)
|
50
|
+
|
51
|
+
@docs_in_classification_count[classification] -= 1
|
52
|
+
|
41
53
|
text.word_hash.each do |word, count|
|
42
|
-
if @
|
43
|
-
orig = @categories[category][word]
|
44
|
-
@categories[category][word] ||= 0
|
45
|
-
@categories[category][word] -= count
|
46
|
-
if @categories[category][word] <= 0
|
47
|
-
@categories[category].delete(word)
|
48
|
-
count = orig
|
49
|
-
end
|
50
|
-
@total_words -= count
|
51
|
-
end
|
54
|
+
@classifications[classification][word] -= count if @classifications[classification].include?(word)
|
52
55
|
end
|
53
56
|
end
|
54
57
|
|
55
58
|
#
|
56
|
-
# Returns the scores
|
59
|
+
# Returns the scores of the specified text for each classification. E.g.,
|
57
60
|
# b.calculate_scores "I hate bad words and you"
|
58
61
|
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
59
62
|
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
60
|
-
def
|
61
|
-
|
62
|
-
|
63
|
-
@
|
64
|
-
|
65
|
-
|
63
|
+
def calculate_scores(text)
|
64
|
+
scores = {}
|
65
|
+
|
66
|
+
@classifications.each do |classification, classification_word_counts|
|
67
|
+
# prior
|
68
|
+
scores[classification] = Math.log(@docs_in_classification_count[classification])
|
69
|
+
scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
|
70
|
+
|
71
|
+
# likelihood
|
66
72
|
text.word_hash.each do |word, count|
|
67
|
-
|
68
|
-
|
73
|
+
if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
|
74
|
+
scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
|
75
|
+
|
76
|
+
scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
|
77
|
+
end
|
69
78
|
end
|
70
|
-
# now add prior probability for the category
|
71
|
-
s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
|
72
|
-
score[category.to_s] += Math.log(s / training_count)
|
73
79
|
end
|
74
|
-
|
80
|
+
|
81
|
+
scores
|
75
82
|
end
|
76
83
|
|
77
84
|
#
|
78
|
-
# Returns the classification of the
|
79
|
-
#
|
85
|
+
# Returns the classification of the specified text, which is one of the
|
86
|
+
# classifications given in the initializer. E.g.,
|
80
87
|
# b.classify "I hate bad words and you"
|
81
|
-
# =>
|
88
|
+
# => :uninteresting
|
82
89
|
def classify(text)
|
83
|
-
(
|
90
|
+
calculate_scores(text).max_by {|classification| classification[1]}[0]
|
84
91
|
end
|
85
92
|
|
86
93
|
#
|
87
|
-
# Provides
|
94
|
+
# Provides a list of classification names
|
88
95
|
# For example:
|
89
|
-
# b
|
90
|
-
#
|
91
|
-
|
92
|
-
|
93
|
-
# b.train_the_other "The other text"
|
94
|
-
def method_missing(name, *args)
|
95
|
-
category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
|
96
|
-
if @categories.has_key? category
|
97
|
-
args.each { |text| eval("#{$1}train(category, text)") }
|
98
|
-
elsif name.to_s =~ /(un)?train_([\w]+)/
|
99
|
-
raise StandardError, "No such category: #{category}"
|
100
|
-
else
|
101
|
-
super #raise StandardError, "No such method: #{name}"
|
102
|
-
end
|
96
|
+
# b.classifications
|
97
|
+
# => [:this, :that, :the_other]
|
98
|
+
def classifications
|
99
|
+
@classifications.keys
|
103
100
|
end
|
104
101
|
|
105
102
|
#
|
106
|
-
#
|
103
|
+
# Adds the classification to the classifier.
|
104
|
+
# Has no effect if the classification already existed.
|
105
|
+
# Returns the classification.
|
107
106
|
# For example:
|
108
|
-
# b.
|
109
|
-
|
110
|
-
|
111
|
-
|
107
|
+
# b.add_classification(:not_spam)
|
108
|
+
def add_classification(classification)
|
109
|
+
@classifications[classification] ||= {}
|
110
|
+
|
111
|
+
classification
|
112
112
|
end
|
113
113
|
|
114
114
|
#
|
115
|
-
#
|
115
|
+
# Removes the classification from the classifier.
|
116
|
+
# Returns the classification if it existed, else nil.
|
116
117
|
# For example:
|
117
|
-
# b.
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
118
|
+
# b.remove_classification(:not_spam)
|
119
|
+
def remove_classification(classification)
|
120
|
+
return_value = if @classifications.include?(classification)
|
121
|
+
classification
|
122
|
+
else
|
123
|
+
nil
|
124
|
+
end
|
125
|
+
|
126
|
+
@classifications.delete(classification)
|
127
|
+
|
128
|
+
return_value
|
125
129
|
end
|
126
130
|
|
127
|
-
|
131
|
+
private
|
132
|
+
|
133
|
+
def ensure_classification_exists(classification)
|
134
|
+
raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
|
135
|
+
end
|
128
136
|
end
|
129
137
|
end
|
data/lib/reclassifier/version.rb
CHANGED
data/lib/reclassifier.rb
CHANGED
@@ -7,13 +7,13 @@ require 'gsl'
|
|
7
7
|
require 'reclassifier/version'
|
8
8
|
require 'reclassifier/core_ext/array'
|
9
9
|
require 'reclassifier/core_ext/matrix'
|
10
|
-
require 'reclassifier/core_ext/object'
|
11
10
|
require 'reclassifier/core_ext/string'
|
12
11
|
require 'gsl/vector'
|
13
12
|
|
14
13
|
module Reclassifier
|
15
|
-
autoload :Bayes,
|
16
|
-
autoload :LSI,
|
17
|
-
autoload :ContentNode,
|
18
|
-
autoload :WordList,
|
14
|
+
autoload :Bayes, 'reclassifier/bayes'
|
15
|
+
autoload :LSI, 'reclassifier/lsi'
|
16
|
+
autoload :ContentNode, 'reclassifier/content_node'
|
17
|
+
autoload :WordList, 'reclassifier/word_list'
|
18
|
+
autoload :UnknownClassificationError, 'reclassifier/unknown_classification_error'
|
19
19
|
end
|
data/reclassifier.gemspec
CHANGED
@@ -20,7 +20,7 @@ Gem::Specification.new do |spec|
|
|
20
20
|
|
21
21
|
spec.add_development_dependency 'bundler', '~> 1.3'
|
22
22
|
spec.add_development_dependency 'rake'
|
23
|
-
spec.add_development_dependency '
|
23
|
+
spec.add_development_dependency 'rspec'
|
24
24
|
|
25
25
|
spec.add_dependency 'fast-stemmer'
|
26
26
|
spec.add_dependency 'gsl'
|
data/spec/bayes_spec.rb
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Reclassifier::Bayes do
|
4
|
+
describe "classifications" do
|
5
|
+
it "should return the classifications" do
|
6
|
+
subject = described_class.new(:interesting, :uninteresting)
|
7
|
+
|
8
|
+
subject.classifications.sort.should eq([:interesting, :uninteresting])
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
describe "train" do
|
13
|
+
it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
|
14
|
+
expect {subject.train(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should train the classifier to the (classification, document) pair" do
|
18
|
+
subject = described_class.new(:in_china, :not_in_china)
|
19
|
+
|
20
|
+
subject.train(:in_china, 'Chinese Beijing Chinese')
|
21
|
+
subject.train(:in_china, 'Chinese Chinese Shanghai')
|
22
|
+
subject.train(:in_china, 'Chinese Macao')
|
23
|
+
subject.train(:not_in_china, 'Tokyo Japan Chinese')
|
24
|
+
|
25
|
+
subject.classify('Chinese Chinese Chinese Tokyo Japan').should eq(:in_china)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
describe "untrain" do
|
30
|
+
it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
|
31
|
+
expect {subject.untrain(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should untrain the classifier against the (classification, document) pair" do
|
35
|
+
subject = described_class.new(:in_china, :not_in_china)
|
36
|
+
|
37
|
+
subject.train(:in_china, 'Chinese Chinese')
|
38
|
+
subject.train(:not_in_china, 'Chinese Macao')
|
39
|
+
|
40
|
+
subject.classify('Chinese').should eq(:in_china)
|
41
|
+
|
42
|
+
subject.untrain(:in_china, 'Chinese Chinese')
|
43
|
+
|
44
|
+
subject.classify('Chinese').should eq(:not_in_china)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
describe "calculate_scores" do
|
49
|
+
it "should return a score hash with the correct scores" do
|
50
|
+
subject = described_class.new(:in_china, :not_in_china)
|
51
|
+
|
52
|
+
subject.train(:in_china, 'Chinese Beijing Chinese')
|
53
|
+
subject.train(:in_china, 'Chinese Chinese Shanghai')
|
54
|
+
subject.train(:in_china, 'Chinese Macao')
|
55
|
+
subject.train(:not_in_china, 'Tokyo Japan Chinese')
|
56
|
+
|
57
|
+
scores = subject.calculate_scores('Chinese Chinese Chinese Tokyo Japan')
|
58
|
+
|
59
|
+
scores[:in_china].should eq(-8.107690312843907)
|
60
|
+
scores[:not_in_china].should eq(-8.906681345001262)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
describe "add_classification" do
|
65
|
+
it "should add the classification to the set of classifications" do
|
66
|
+
subject.classifications.should be_empty
|
67
|
+
|
68
|
+
subject.add_classification(:niner)
|
69
|
+
|
70
|
+
subject.classifications.should eq([:niner])
|
71
|
+
end
|
72
|
+
|
73
|
+
it "should return the classification" do
|
74
|
+
subject.add_classification(:niner).should eq(:niner)
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
describe "remove_classification" do
|
79
|
+
it "should remove the classification from the set of classifications" do
|
80
|
+
subject.add_classification(:niner)
|
81
|
+
|
82
|
+
subject.remove_classification(:niner)
|
83
|
+
|
84
|
+
subject.classifications.should be_empty
|
85
|
+
end
|
86
|
+
|
87
|
+
it "should return the classification" do
|
88
|
+
subject.add_classification(:niner)
|
89
|
+
|
90
|
+
subject.remove_classification(:niner).should eq(:niner)
|
91
|
+
end
|
92
|
+
|
93
|
+
it "should return nil if the classification didn't exist" do
|
94
|
+
subject.remove_classification(:niner).should be(nil)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Array do
|
4
|
+
describe "sum_with_identity" do
|
5
|
+
it "should sum the array" do
|
6
|
+
[1,2,3].sum_with_identity.should eq(6)
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should return 0 when it encounters an empty array" do
|
10
|
+
[].sum_with_identity.should eq(0)
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe String do
|
4
|
+
describe "word_hash" do
|
5
|
+
it "should hash text" do
|
6
|
+
hash = {:good => 1, :"!" => 1, :hope => 1, :"'" => 1, :"." => 1, :love => 1, :word => 1, :them => 1, :test => 1}
|
7
|
+
|
8
|
+
"here are some good words of test's. I hope you love them!".word_hash.should eq(hash)
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
describe "clean_word_hash" do
|
13
|
+
it "should clean and hash text" do
|
14
|
+
hash = {:good => 1, :word => 1, :hope => 1, :love => 1, :them => 1, :test => 1}
|
15
|
+
|
16
|
+
"here are some good words of test's. I hope you love them!".clean_word_hash.should eq(hash)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/spec/lsi_spec.rb
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Reclassifier::LSI do
|
4
|
+
before do
|
5
|
+
# We repeat principal words to help weight them.
|
6
|
+
# This test is rather delicate, since this system is mostly noise.
|
7
|
+
@str1 = "This text deals with dogs. Dogs."
|
8
|
+
@str2 = "This text involves dogs too. Dogs! "
|
9
|
+
@str3 = "This text revolves around cats. Cats."
|
10
|
+
@str4 = "This text also involves cats. Cats!"
|
11
|
+
@str5 = "This text involves birds. Birds."
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should do basic indexing" do
|
15
|
+
[@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
|
16
|
+
subject.needs_rebuild?.should be(false)
|
17
|
+
|
18
|
+
# note that the closest match to str1 is str2, even though it is not
|
19
|
+
# the closest text match.
|
20
|
+
subject.find_related(@str1, 3).should eq([@str2, @str5, @str3])
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should not auto rebuild when it's specified as false" do
|
24
|
+
subject = described_class.new(:auto_rebuild => false)
|
25
|
+
|
26
|
+
subject.add_item @str1, "Dog"
|
27
|
+
subject.add_item @str2, "Dog"
|
28
|
+
|
29
|
+
subject.needs_rebuild?.should be(true)
|
30
|
+
|
31
|
+
subject.build_index
|
32
|
+
|
33
|
+
subject.needs_rebuild?.should be(false)
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should do basic classifying" do
|
37
|
+
subject.add_item(@str2, "Dog")
|
38
|
+
subject.add_item(@str3, "Cat")
|
39
|
+
subject.add_item(@str4, "Cat")
|
40
|
+
subject.add_item(@str5, "Bird")
|
41
|
+
|
42
|
+
subject.classify(@str1).should eq("Dog")
|
43
|
+
subject.classify(@str3).should eq("Cat")
|
44
|
+
subject.classify(@str5).should eq("Bird")
|
45
|
+
end
|
46
|
+
|
47
|
+
it "should perform better than Bayes" do
|
48
|
+
bayes = Reclassifier::Bayes.new :dog, :cat, :bird
|
49
|
+
|
50
|
+
[[@str1, "Dog"],
|
51
|
+
[@str2, "Dog"],
|
52
|
+
[@str3, "Cat"],
|
53
|
+
[@str4, "Cat"],
|
54
|
+
[@str5, "Bird"]].each do |str, classification|
|
55
|
+
subject.add_item(str, classification)
|
56
|
+
|
57
|
+
bayes.train(classification.downcase.to_sym, str)
|
58
|
+
end
|
59
|
+
|
60
|
+
# We're talking about dogs. Even though the text matches the corpus on
|
61
|
+
# cats better. Dogs have more semantic weight than cats. The previous Bayes
|
62
|
+
# implementation failed here; this spec expects both Bayes and the LSI to answer dog.
|
63
|
+
tricky_case = "This text revolves around dogs."
|
64
|
+
subject.classify(tricky_case).should eq("Dog")
|
65
|
+
bayes.classify(tricky_case).should eq(:dog)
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should recategorize as needed" do
|
69
|
+
subject.add_item(@str1, "Dog")
|
70
|
+
subject.add_item(@str2, "Dog")
|
71
|
+
subject.add_item(@str3, "Cat")
|
72
|
+
subject.add_item(@str4, "Cat")
|
73
|
+
subject.add_item(@str5, "Bird")
|
74
|
+
|
75
|
+
tricky_case = "This text revolves around dogs."
|
76
|
+
subject.classify(tricky_case).should eq("Dog")
|
77
|
+
|
78
|
+
# Recategorize as needed.
|
79
|
+
subject.categories_for(@str1).clear.push("Cow")
|
80
|
+
subject.categories_for(@str2).clear.push("Cow")
|
81
|
+
|
82
|
+
subject.needs_rebuild?.should be(false)
|
83
|
+
subject.classify(tricky_case).should eq("Cow")
|
84
|
+
end
|
85
|
+
|
86
|
+
it "should search correctly" do
|
87
|
+
[@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
|
88
|
+
|
89
|
+
# Searching by content and text, note that @str2 comes up first, because
|
90
|
+
# both "dog" and "involve" are present. But, the next match is @str1 instead
|
91
|
+
# of @str4, because "dog" carries more weight than involves.
|
92
|
+
subject.search("dog involves", 100).should eq([@str2, @str1, @str4, @str5, @str3])
|
93
|
+
|
94
|
+
# Keyword search shows how the space is mapped out in relation to
|
95
|
+
# dog when magnitude is removed. Note the relations. We move from dog
|
96
|
+
# through involve and then finally to other words.
|
97
|
+
subject.search("dog", 5).should eq([@str1, @str2, @str4, @str5, @str3])
|
98
|
+
end
|
99
|
+
|
100
|
+
it "should serialize correctly" do
|
101
|
+
[@str1, @str2, @str3, @str4, @str5].each { |x| subject << x }
|
102
|
+
|
103
|
+
subject_md = Marshal.dump(subject)
|
104
|
+
subject_m = Marshal.load(subject_md)
|
105
|
+
|
106
|
+
subject_m.search("cat", 3).should eq(subject.search("cat", 3))
|
107
|
+
subject_m.find_related(@str1, 3).should eq(subject.find_related(@str1, 3))
|
108
|
+
end
|
109
|
+
|
110
|
+
it "should keyword search correctly" do
|
111
|
+
subject.add_item(@str1, "Dog")
|
112
|
+
subject.add_item(@str2, "Dog")
|
113
|
+
subject.add_item(@str3, "Cat")
|
114
|
+
subject.add_item(@str4, "Cat")
|
115
|
+
subject.add_item(@str5, "Bird")
|
116
|
+
|
117
|
+
subject.highest_ranked_stems(@str1).should eq([:dog, :text, :deal])
|
118
|
+
end
|
119
|
+
|
120
|
+
it "should summarize correctly" do
|
121
|
+
[@str1, @str2, @str3, @str4, @str5].join.summary(2).should eq("This text involves dogs too [...] This text also involves cats")
|
122
|
+
end
|
123
|
+
end
|
data/spec/spec_helper.rb
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: reclassifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04-
|
12
|
+
date: 2013-04-22 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -44,7 +44,7 @@ dependencies:
|
|
44
44
|
- !ruby/object:Gem::Version
|
45
45
|
version: '0'
|
46
46
|
- !ruby/object:Gem::Dependency
|
47
|
-
name:
|
47
|
+
name: rspec
|
48
48
|
requirement: !ruby/object:Gem::Requirement
|
49
49
|
none: false
|
50
50
|
requirements:
|
@@ -109,18 +109,18 @@ files:
|
|
109
109
|
- lib/reclassifier/content_node.rb
|
110
110
|
- lib/reclassifier/core_ext/array.rb
|
111
111
|
- lib/reclassifier/core_ext/matrix.rb
|
112
|
-
- lib/reclassifier/core_ext/object.rb
|
113
112
|
- lib/reclassifier/core_ext/string.rb
|
114
113
|
- lib/reclassifier/core_ext/vector.rb
|
115
114
|
- lib/reclassifier/lsi.rb
|
115
|
+
- lib/reclassifier/unknown_classification_error.rb
|
116
116
|
- lib/reclassifier/version.rb
|
117
117
|
- lib/reclassifier/word_list.rb
|
118
118
|
- reclassifier.gemspec
|
119
|
-
-
|
120
|
-
-
|
121
|
-
-
|
122
|
-
-
|
123
|
-
-
|
119
|
+
- spec/bayes_spec.rb
|
120
|
+
- spec/core_ext/array_spec.rb
|
121
|
+
- spec/core_ext/string_spec.rb
|
122
|
+
- spec/lsi_spec.rb
|
123
|
+
- spec/spec_helper.rb
|
124
124
|
homepage: https://github.com/saveup/reclassifier
|
125
125
|
licenses:
|
126
126
|
- LGPL
|
@@ -147,8 +147,8 @@ signing_key:
|
|
147
147
|
specification_version: 3
|
148
148
|
summary: Bayesian and Latent Semantic Indexing classification of text.
|
149
149
|
test_files:
|
150
|
-
-
|
151
|
-
-
|
152
|
-
-
|
153
|
-
-
|
154
|
-
-
|
150
|
+
- spec/bayes_spec.rb
|
151
|
+
- spec/core_ext/array_spec.rb
|
152
|
+
- spec/core_ext/string_spec.rb
|
153
|
+
- spec/lsi_spec.rb
|
154
|
+
- spec/spec_helper.rb
|
data/test/bayes_test.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), 'test_helper')
|
2
|
-
|
3
|
-
class BayesTest < Test::Unit::TestCase
|
4
|
-
def setup
|
5
|
-
@classifier = Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
|
6
|
-
end
|
7
|
-
|
8
|
-
def test_good_training
|
9
|
-
assert_nothing_raised { @classifier.train_interesting "love" }
|
10
|
-
end
|
11
|
-
|
12
|
-
def test_bad_training
|
13
|
-
assert_raise(StandardError) { @classifier.train_no_category "words" }
|
14
|
-
end
|
15
|
-
|
16
|
-
def test_bad_method
|
17
|
-
assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
|
18
|
-
end
|
19
|
-
|
20
|
-
def test_categories
|
21
|
-
assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
|
22
|
-
end
|
23
|
-
|
24
|
-
def test_add_category
|
25
|
-
@classifier.add_category 'Test'
|
26
|
-
assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
|
27
|
-
end
|
28
|
-
|
29
|
-
def test_classification
|
30
|
-
@classifier.train_interesting "here are some good words. I hope you love them"
|
31
|
-
@classifier.train_uninteresting "here are some bad words, I hate you"
|
32
|
-
assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
|
33
|
-
end
|
34
|
-
end
|
data/test/core_ext/array_test.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', 'test_helper')
|
2
|
-
|
3
|
-
class ArrayTest < Test::Unit::TestCase
|
4
|
-
def test_monkey_path_array_sum
|
5
|
-
assert_equal [1,2,3].sum_with_identity, 6
|
6
|
-
end
|
7
|
-
|
8
|
-
def test_summing_an_empty_array
|
9
|
-
assert_equal [nil].sum_with_identity, 0
|
10
|
-
end
|
11
|
-
|
12
|
-
def test_summing_an_empty_array
|
13
|
-
assert_equal Array[].sum_with_identity, 0
|
14
|
-
end
|
15
|
-
end
|
@@ -1,13 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', 'test_helper')
|
2
|
-
|
3
|
-
class StringTest < Test::Unit::TestCase
|
4
|
-
def test_word_hash
|
5
|
-
hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
|
6
|
-
assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
|
7
|
-
end
|
8
|
-
|
9
|
-
def test_clean_word_hash
|
10
|
-
hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
|
11
|
-
assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
|
12
|
-
end
|
13
|
-
end
|
data/test/lsi_test.rb
DELETED
@@ -1,123 +0,0 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), 'test_helper')
|
2
|
-
|
3
|
-
class LSITest < Test::Unit::TestCase
|
4
|
-
def setup
|
5
|
-
# we repeat principle words to help weight them.
|
6
|
-
# This test is rather delicate, since this system is mostly noise.
|
7
|
-
@str1 = "This text deals with dogs. Dogs."
|
8
|
-
@str2 = "This text involves dogs too. Dogs! "
|
9
|
-
@str3 = "This text revolves around cats. Cats."
|
10
|
-
@str4 = "This text also involves cats. Cats!"
|
11
|
-
@str5 = "This text involves birds. Birds."
|
12
|
-
end
|
13
|
-
|
14
|
-
def test_basic_indexing
|
15
|
-
lsi = Reclassifier::LSI.new
|
16
|
-
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
17
|
-
assert ! lsi.needs_rebuild?
|
18
|
-
|
19
|
-
# note that the closest match to str1 is str2, even though it is not
|
20
|
-
# the closest text match.
|
21
|
-
assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
|
22
|
-
end
|
23
|
-
|
24
|
-
def test_not_auto_rebuild
|
25
|
-
lsi = Reclassifier::LSI.new :auto_rebuild => false
|
26
|
-
lsi.add_item @str1, "Dog"
|
27
|
-
lsi.add_item @str2, "Dog"
|
28
|
-
assert lsi.needs_rebuild?
|
29
|
-
lsi.build_index
|
30
|
-
assert ! lsi.needs_rebuild?
|
31
|
-
end
|
32
|
-
|
33
|
-
def test_basic_categorizing
|
34
|
-
lsi = Reclassifier::LSI.new
|
35
|
-
lsi.add_item @str2, "Dog"
|
36
|
-
lsi.add_item @str3, "Cat"
|
37
|
-
lsi.add_item @str4, "Cat"
|
38
|
-
lsi.add_item @str5, "Bird"
|
39
|
-
|
40
|
-
assert_equal "Dog", lsi.classify( @str1 )
|
41
|
-
assert_equal "Cat", lsi.classify( @str3 )
|
42
|
-
assert_equal "Bird", lsi.classify( @str5 )
|
43
|
-
end
|
44
|
-
|
45
|
-
def test_external_classifying
|
46
|
-
lsi = Reclassifier::LSI.new
|
47
|
-
bayes = Reclassifier::Bayes.new 'Dog', 'Cat', 'Bird'
|
48
|
-
lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
|
49
|
-
lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
|
50
|
-
lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
|
51
|
-
lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
|
52
|
-
lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
|
53
|
-
|
54
|
-
# We're talking about dogs. Even though the text matches the corpus on
|
55
|
-
# cats better. Dogs have more semantic weight than cats. So bayes
|
56
|
-
# will fail here, but the LSI recognizes content.
|
57
|
-
tricky_case = "This text revolves around dogs."
|
58
|
-
assert_equal "Dog", lsi.classify( tricky_case )
|
59
|
-
assert_not_equal "Dog", bayes.classify( tricky_case )
|
60
|
-
end
|
61
|
-
|
62
|
-
def test_recategorize_interface
|
63
|
-
lsi = Reclassifier::LSI.new
|
64
|
-
lsi.add_item @str1, "Dog"
|
65
|
-
lsi.add_item @str2, "Dog"
|
66
|
-
lsi.add_item @str3, "Cat"
|
67
|
-
lsi.add_item @str4, "Cat"
|
68
|
-
lsi.add_item @str5, "Bird"
|
69
|
-
|
70
|
-
tricky_case = "This text revolves around dogs."
|
71
|
-
assert_equal "Dog", lsi.classify( tricky_case )
|
72
|
-
|
73
|
-
# Recategorize as needed.
|
74
|
-
lsi.categories_for(@str1).clear.push "Cow"
|
75
|
-
lsi.categories_for(@str2).clear.push "Cow"
|
76
|
-
|
77
|
-
assert !lsi.needs_rebuild?
|
78
|
-
assert_equal "Cow", lsi.classify( tricky_case )
|
79
|
-
end
|
80
|
-
|
81
|
-
def test_search
|
82
|
-
lsi = Reclassifier::LSI.new
|
83
|
-
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
84
|
-
|
85
|
-
# Searching by content and text, note that @str2 comes up first, because
|
86
|
-
# both "dog" and "involve" are present. But, the next match is @str1 instead
|
87
|
-
# of @str4, because "dog" carries more weight than involves.
|
88
|
-
assert_equal( [@str2, @str1, @str4, @str5, @str3],
|
89
|
-
lsi.search("dog involves", 100) )
|
90
|
-
|
91
|
-
# Keyword search shows how the space is mapped out in relation to
|
92
|
-
# dog when magnitude is remove. Note the relations. We move from dog
|
93
|
-
# through involve and then finally to other words.
|
94
|
-
assert_equal( [@str1, @str2, @str4, @str5, @str3],
|
95
|
-
lsi.search("dog", 5) )
|
96
|
-
end
|
97
|
-
|
98
|
-
def test_serialize_safe
|
99
|
-
lsi = Reclassifier::LSI.new
|
100
|
-
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
101
|
-
|
102
|
-
lsi_md = Marshal.dump lsi
|
103
|
-
lsi_m = Marshal.load lsi_md
|
104
|
-
|
105
|
-
assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
|
106
|
-
assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
|
107
|
-
end
|
108
|
-
|
109
|
-
def test_keyword_search
|
110
|
-
lsi = Reclassifier::LSI.new
|
111
|
-
lsi.add_item @str1, "Dog"
|
112
|
-
lsi.add_item @str2, "Dog"
|
113
|
-
lsi.add_item @str3, "Cat"
|
114
|
-
lsi.add_item @str4, "Cat"
|
115
|
-
lsi.add_item @str5, "Bird"
|
116
|
-
|
117
|
-
assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
|
118
|
-
end
|
119
|
-
|
120
|
-
def test_summary
|
121
|
-
assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
|
122
|
-
end
|
123
|
-
end
|
data/test/test_helper.rb
DELETED