classifier 1.3.4 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/LICENSE +2 -2
- data/lib/classifier/bayes.rb +132 -124
- data/lib/classifier/extensions/string.rb +1 -1
- data/lib/classifier/extensions/vector.rb +72 -78
- data/lib/classifier/extensions/vector_serialize.rb +8 -10
- data/lib/classifier/extensions/word_hash.rb +114 -120
- data/lib/classifier/lsi/content_node.rb +39 -37
- data/lib/classifier/lsi/summary.rb +24 -24
- data/lib/classifier/lsi/word_list.rb +7 -8
- data/lib/classifier/lsi.rb +174 -151
- data/lib/classifier.rb +2 -1
- data/test/test_helper.rb +3 -2
- metadata +60 -27
- data/Gemfile +0 -5
- data/Gemfile.lock +0 -26
- data/README.markdown +0 -97
- data/Rakefile +0 -84
- data/test/bayes/bayesian_test.rb +0 -33
- data/test/extensions/word_hash_test.rb +0 -35
- data/test/lsi/lsi_test.rb +0 -123
data/README.markdown
DELETED
@@ -1,97 +0,0 @@
|
|
1
|
-
## Welcome to Classifier
|
2
|
-
|
3
|
-
Classifier is a general module to allow Bayesian and other types of classifications.
|
4
|
-
|
5
|
-
## Download
|
6
|
-
|
7
|
-
* https://github.com/cardmagic/classifier
|
8
|
-
* gem install classifier
|
9
|
-
* git clone https://github.com/cardmagic/classifier.git
|
10
|
-
|
11
|
-
## Dependencies
|
12
|
-
|
13
|
-
If you install Classifier from source, you'll need to install Roman Shterenzon's fast-stemmer gem with RubyGems as follows:
|
14
|
-
|
15
|
-
gem install fast-stemmer
|
16
|
-
|
17
|
-
If you would like to speed up LSI classification by at least 10x, please install the following libraries:
|
18
|
-
GNU GSL:: http://www.gnu.org/software/gsl
|
19
|
-
rb-gsl:: http://rb-gsl.rubyforge.org
|
20
|
-
|
21
|
-
Notice that LSI will work without these libraries, but as soon as they are installed, Classifier will make use of them. No configuration changes are needed, we like to keep things ridiculously easy for you.
|
22
|
-
|
23
|
-
## Bayes
|
24
|
-
|
25
|
-
A Bayesian classifier by Lucas Carlson. Bayesian Classifiers are accurate, fast, and have modest memory requirements.
|
26
|
-
|
27
|
-
### Usage
|
28
|
-
|
29
|
-
require 'classifier'
|
30
|
-
b = Classifier::Bayes.new 'Interesting', 'Uninteresting'
|
31
|
-
b.train_interesting "here are some good words. I hope you love them"
|
32
|
-
b.train_uninteresting "here are some bad words, I hate you"
|
33
|
-
b.classify "I hate bad words and you" # returns 'Uninteresting'
|
34
|
-
|
35
|
-
require 'madeleine'
|
36
|
-
m = SnapshotMadeleine.new("bayes_data") {
|
37
|
-
Classifier::Bayes.new 'Interesting', 'Uninteresting'
|
38
|
-
}
|
39
|
-
m.system.train_interesting "here are some good words. I hope you love them"
|
40
|
-
m.system.train_uninteresting "here are some bad words, I hate you"
|
41
|
-
m.take_snapshot
|
42
|
-
m.system.classify "I love you" # returns 'Interesting'
|
43
|
-
|
44
|
-
Using Madeleine, your application can persist the learned data over time.
|
45
|
-
|
46
|
-
### Bayesian Classification
|
47
|
-
|
48
|
-
* http://www.process.com/precisemail/bayesian_filtering.htm
|
49
|
-
* http://en.wikipedia.org/wiki/Bayesian_filtering
|
50
|
-
* http://www.paulgraham.com/spam.html
|
51
|
-
|
52
|
-
## LSI
|
53
|
-
|
54
|
-
A Latent Semantic Indexer by David Fayram. Latent Semantic Indexing engines
|
55
|
-
are not as fast or as small as Bayesian classifiers, but are more flexible, providing
|
56
|
-
fast search and clustering detection as well as semantic analysis of the text that
|
57
|
-
theoretically simulates human learning.
|
58
|
-
|
59
|
-
### Usage
|
60
|
-
|
61
|
-
require 'classifier'
|
62
|
-
lsi = Classifier::LSI.new
|
63
|
-
strings = [ ["This text deals with dogs. Dogs.", :dog],
|
64
|
-
["This text involves dogs too. Dogs! ", :dog],
|
65
|
-
["This text revolves around cats. Cats.", :cat],
|
66
|
-
["This text also involves cats. Cats!", :cat],
|
67
|
-
["This text involves birds. Birds.",:bird ]]
|
68
|
-
strings.each {|x| lsi.add_item x.first, x.last}
|
69
|
-
|
70
|
-
lsi.search("dog", 3)
|
71
|
-
# returns => ["This text deals with dogs. Dogs.", "This text involves dogs too. Dogs! ",
|
72
|
-
# "This text also involves cats. Cats!"]
|
73
|
-
|
74
|
-
lsi.find_related(strings[2], 2)
|
75
|
-
# returns => ["This text revolves around cats. Cats.", "This text also involves cats. Cats!"]
|
76
|
-
|
77
|
-
lsi.classify "This text is also about dogs!"
|
78
|
-
# returns => :dog
|
79
|
-
|
80
|
-
Please see the Classifier::LSI documentation for more information. It is possible to index, search and classify
|
81
|
-
with more than just simple strings.
|
82
|
-
|
83
|
-
### Latent Semantic Indexing
|
84
|
-
|
85
|
-
* http://www.c2.com/cgi/wiki?LatentSemanticIndexing
|
86
|
-
* http://www.chadfowler.com/index.cgi/Computing/LatentSemanticIndexing.rdoc
|
87
|
-
* http://en.wikipedia.org/wiki/Latent_semantic_analysis
|
88
|
-
|
89
|
-
## Authors
|
90
|
-
|
91
|
-
* Lucas Carlson (lucas@rufy.com)
|
92
|
-
* David Fayram II (dfayram@gmail.com)
|
93
|
-
* Cameron McBride (cameron.mcbride@gmail.com)
|
94
|
-
* Ivan Acosta-Rubio (ivan@softwarecriollo.com)
|
95
|
-
|
96
|
-
This library is released under the terms of the GNU LGPL. See LICENSE for more details.
|
97
|
-
|
data/Rakefile
DELETED
@@ -1,84 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'rake'
|
3
|
-
require 'rake/testtask'
|
4
|
-
require 'rdoc/task'
|
5
|
-
require 'rake/contrib/rubyforgepublisher'
|
6
|
-
|
7
|
-
desc "Default Task"
|
8
|
-
task :default => [ :test ]
|
9
|
-
|
10
|
-
# Run the unit tests
|
11
|
-
desc "Run all unit tests"
|
12
|
-
Rake::TestTask.new("test") { |t|
|
13
|
-
t.libs << "lib"
|
14
|
-
t.pattern = 'test/*/*_test.rb'
|
15
|
-
t.verbose = true
|
16
|
-
}
|
17
|
-
|
18
|
-
# Make a console, useful when working on tests
|
19
|
-
desc "Generate a test console"
|
20
|
-
task :console do
|
21
|
-
verbose( false ) { sh "irb -I lib/ -r 'classifier'" }
|
22
|
-
end
|
23
|
-
|
24
|
-
# Generate the RDoc documentation
|
25
|
-
desc "Create documentation"
|
26
|
-
Rake::RDocTask.new("doc") { |rdoc|
|
27
|
-
rdoc.title = "Ruby Classifier - Bayesian and LSI classification library"
|
28
|
-
rdoc.rdoc_dir = 'html'
|
29
|
-
rdoc.rdoc_files.include('README')
|
30
|
-
rdoc.rdoc_files.include('lib/**/*.rb')
|
31
|
-
}
|
32
|
-
|
33
|
-
# Generate the package
|
34
|
-
spec = Gem::Specification.new do |s|
|
35
|
-
|
36
|
-
#### Basic information.
|
37
|
-
|
38
|
-
s.name = 'classifier'
|
39
|
-
s.version = PKG_VERSION
|
40
|
-
s.summary = <<-EOF
|
41
|
-
A general classifier module to allow Bayesian and other types of classifications.
|
42
|
-
EOF
|
43
|
-
s.description = <<-EOF
|
44
|
-
A general classifier module to allow Bayesian and other types of classifications.
|
45
|
-
EOF
|
46
|
-
|
47
|
-
#### Which files are to be included in this gem? Everything! (Except CVS directories.)
|
48
|
-
|
49
|
-
s.files = PKG_FILES
|
50
|
-
|
51
|
-
#### Load-time details: library and application (you will need one or both).
|
52
|
-
|
53
|
-
s.require_path = 'lib'
|
54
|
-
s.autorequire = 'classifier'
|
55
|
-
|
56
|
-
#### Documentation and testing.
|
57
|
-
|
58
|
-
s.has_rdoc = true
|
59
|
-
|
60
|
-
#### Dependencies and requirements.
|
61
|
-
|
62
|
-
s.add_dependency('fast-stemmer', '>= 1.0.0')
|
63
|
-
s.requirements << "A porter-stemmer module to split word stems."
|
64
|
-
|
65
|
-
#### Author and project details.
|
66
|
-
s.author = "Lucas Carlson"
|
67
|
-
s.email = "lucas@rufy.com"
|
68
|
-
s.homepage = "http://classifier.rufy.com/"
|
69
|
-
end
|
70
|
-
|
71
|
-
desc "Report code statistics (KLOCs, etc) from the application"
|
72
|
-
task :stats do
|
73
|
-
require 'code_statistics'
|
74
|
-
CodeStatistics.new(
|
75
|
-
["Library", "lib"],
|
76
|
-
["Units", "test"]
|
77
|
-
).to_s
|
78
|
-
end
|
79
|
-
|
80
|
-
desc "Publish new documentation"
|
81
|
-
task :publish do
|
82
|
-
`ssh rufy update-classifier-doc`
|
83
|
-
Rake::RubyForgePublisher.new('classifier', 'cardmagic').upload
|
84
|
-
end
|
data/test/bayes/bayesian_test.rb
DELETED
@@ -1,33 +0,0 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/../test_helper'
|
2
|
-
class BayesianTest < Test::Unit::TestCase
|
3
|
-
def setup
|
4
|
-
@classifier = Classifier::Bayes.new 'Interesting', 'Uninteresting'
|
5
|
-
end
|
6
|
-
|
7
|
-
def test_good_training
|
8
|
-
assert_nothing_raised { @classifier.train_interesting "love" }
|
9
|
-
end
|
10
|
-
|
11
|
-
def test_bad_training
|
12
|
-
assert_raise(StandardError) { @classifier.train_no_category "words" }
|
13
|
-
end
|
14
|
-
|
15
|
-
def test_bad_method
|
16
|
-
assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
|
17
|
-
end
|
18
|
-
|
19
|
-
def test_categories
|
20
|
-
assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
|
21
|
-
end
|
22
|
-
|
23
|
-
def test_add_category
|
24
|
-
@classifier.add_category 'Test'
|
25
|
-
assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
|
26
|
-
end
|
27
|
-
|
28
|
-
def test_classification
|
29
|
-
@classifier.train_interesting "here are some good words. I hope you love them"
|
30
|
-
@classifier.train_uninteresting "here are some bad words, I hate you"
|
31
|
-
assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
|
32
|
-
end
|
33
|
-
end
|
@@ -1,35 +0,0 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/../test_helper'
|
2
|
-
class StringExtensionsTest < Test::Unit::TestCase
|
3
|
-
def test_word_hash
|
4
|
-
hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
|
5
|
-
assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
|
6
|
-
end
|
7
|
-
|
8
|
-
|
9
|
-
def test_clean_word_hash
|
10
|
-
hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
|
11
|
-
assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
|
12
|
-
end
|
13
|
-
|
14
|
-
end
|
15
|
-
|
16
|
-
|
17
|
-
class ArrayExtensionsTest < Test::Unit::TestCase
|
18
|
-
|
19
|
-
def test_plays_nicely_with_any_array
|
20
|
-
assert_equal [Array].sum, Array
|
21
|
-
end
|
22
|
-
|
23
|
-
def test_monkey_path_array_sum
|
24
|
-
assert_equal [1,2,3].sum, 6
|
25
|
-
end
|
26
|
-
|
27
|
-
def test_summing_an_empty_array
|
28
|
-
assert_equal [nil].sum, 0
|
29
|
-
end
|
30
|
-
|
31
|
-
def test_summing_an_empty_array
|
32
|
-
assert_equal Array[].sum, 0
|
33
|
-
end
|
34
|
-
|
35
|
-
end
|
data/test/lsi/lsi_test.rb
DELETED
@@ -1,123 +0,0 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/../test_helper'
|
2
|
-
class LSITest < Test::Unit::TestCase
|
3
|
-
def setup
|
4
|
-
# we repeat principal words to help weight them.
|
5
|
-
# This test is rather delicate, since this system is mostly noise.
|
6
|
-
@str1 = "This text deals with dogs. Dogs."
|
7
|
-
@str2 = "This text involves dogs too. Dogs! "
|
8
|
-
@str3 = "This text revolves around cats. Cats."
|
9
|
-
@str4 = "This text also involves cats. Cats!"
|
10
|
-
@str5 = "This text involves birds. Birds."
|
11
|
-
end
|
12
|
-
|
13
|
-
def test_basic_indexing
|
14
|
-
lsi = Classifier::LSI.new
|
15
|
-
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
16
|
-
assert ! lsi.needs_rebuild?
|
17
|
-
|
18
|
-
# note that the closest match to str1 is str2, even though it is not
|
19
|
-
# the closest text match.
|
20
|
-
assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
|
21
|
-
end
|
22
|
-
|
23
|
-
def test_not_auto_rebuild
|
24
|
-
lsi = Classifier::LSI.new :auto_rebuild => false
|
25
|
-
lsi.add_item @str1, "Dog"
|
26
|
-
lsi.add_item @str2, "Dog"
|
27
|
-
assert lsi.needs_rebuild?
|
28
|
-
lsi.build_index
|
29
|
-
assert ! lsi.needs_rebuild?
|
30
|
-
end
|
31
|
-
|
32
|
-
def test_basic_categorizing
|
33
|
-
lsi = Classifier::LSI.new
|
34
|
-
lsi.add_item @str2, "Dog"
|
35
|
-
lsi.add_item @str3, "Cat"
|
36
|
-
lsi.add_item @str4, "Cat"
|
37
|
-
lsi.add_item @str5, "Bird"
|
38
|
-
|
39
|
-
assert_equal "Dog", lsi.classify( @str1 )
|
40
|
-
assert_equal "Cat", lsi.classify( @str3 )
|
41
|
-
assert_equal "Bird", lsi.classify( @str5 )
|
42
|
-
end
|
43
|
-
|
44
|
-
def test_external_classifying
|
45
|
-
lsi = Classifier::LSI.new
|
46
|
-
bayes = Classifier::Bayes.new 'Dog', 'Cat', 'Bird'
|
47
|
-
lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
|
48
|
-
lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
|
49
|
-
lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
|
50
|
-
lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
|
51
|
-
lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
|
52
|
-
|
53
|
-
# We're talking about dogs. Even though the text matches the corpus on
|
54
|
-
# cats better. Dogs have more semantic weight than cats. So bayes
|
55
|
-
# will fail here, but the LSI recognizes content.
|
56
|
-
tricky_case = "This text revolves around dogs."
|
57
|
-
assert_equal "Dog", lsi.classify( tricky_case )
|
58
|
-
assert_not_equal "Dog", bayes.classify( tricky_case )
|
59
|
-
end
|
60
|
-
|
61
|
-
def test_recategorize_interface
|
62
|
-
lsi = Classifier::LSI.new
|
63
|
-
lsi.add_item @str1, "Dog"
|
64
|
-
lsi.add_item @str2, "Dog"
|
65
|
-
lsi.add_item @str3, "Cat"
|
66
|
-
lsi.add_item @str4, "Cat"
|
67
|
-
lsi.add_item @str5, "Bird"
|
68
|
-
|
69
|
-
tricky_case = "This text revolves around dogs."
|
70
|
-
assert_equal "Dog", lsi.classify( tricky_case )
|
71
|
-
|
72
|
-
# Recategorize as needed.
|
73
|
-
lsi.categories_for(@str1).clear.push "Cow"
|
74
|
-
lsi.categories_for(@str2).clear.push "Cow"
|
75
|
-
|
76
|
-
assert !lsi.needs_rebuild?
|
77
|
-
assert_equal "Cow", lsi.classify( tricky_case )
|
78
|
-
end
|
79
|
-
|
80
|
-
def test_search
|
81
|
-
lsi = Classifier::LSI.new
|
82
|
-
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
83
|
-
|
84
|
-
# Searching by content and text, note that @str2 comes up first, because
|
85
|
-
# both "dog" and "involve" are present. But, the next match is @str1 instead
|
86
|
-
# of @str4, because "dog" carries more weight than involves.
|
87
|
-
assert_equal( [@str2, @str1, @str4, @str5, @str3],
|
88
|
-
lsi.search("dog involves", 100) )
|
89
|
-
|
90
|
-
# Keyword search shows how the space is mapped out in relation to
|
91
|
-
# dog when magnitude is removed. Note the relations. We move from dog
|
92
|
-
# through involve and then finally to other words.
|
93
|
-
assert_equal( [@str1, @str2, @str4, @str5, @str3],
|
94
|
-
lsi.search("dog", 5) )
|
95
|
-
end
|
96
|
-
|
97
|
-
def test_serialize_safe
|
98
|
-
lsi = Classifier::LSI.new
|
99
|
-
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
100
|
-
|
101
|
-
lsi_md = Marshal.dump lsi
|
102
|
-
lsi_m = Marshal.load lsi_md
|
103
|
-
|
104
|
-
assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
|
105
|
-
assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
|
106
|
-
end
|
107
|
-
|
108
|
-
def test_keyword_search
|
109
|
-
lsi = Classifier::LSI.new
|
110
|
-
lsi.add_item @str1, "Dog"
|
111
|
-
lsi.add_item @str2, "Dog"
|
112
|
-
lsi.add_item @str3, "Cat"
|
113
|
-
lsi.add_item @str4, "Cat"
|
114
|
-
lsi.add_item @str5, "Bird"
|
115
|
-
|
116
|
-
assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
|
117
|
-
end
|
118
|
-
|
119
|
-
def test_summary
|
120
|
-
assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
|
121
|
-
end
|
122
|
-
|
123
|
-
end
|