reclassifier 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,123 @@
1
+ require File.join(File.dirname(__FILE__), 'test_helper')
2
+
3
+ class LSITest < Test::Unit::TestCase
4
+ def setup
5
+ # We repeat the principal words to help weight them.
6
+ # This test is rather delicate, since this system is mostly noise.
7
+ @str1 = "This text deals with dogs. Dogs."
8
+ @str2 = "This text involves dogs too. Dogs! "
9
+ @str3 = "This text revolves around cats. Cats."
10
+ @str4 = "This text also involves cats. Cats!"
11
+ @str5 = "This text involves birds. Birds."
12
+ end
13
+
14
+ def test_basic_indexing
15
+ lsi = Reclassifier::LSI.new
16
+ [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
17
+ assert ! lsi.needs_rebuild?
18
+
19
+ # note that the closest match to str1 is str2, even though it is not
20
+ # the closest text match.
21
+ assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
22
+ end
23
+
24
+ def test_not_auto_rebuild
25
+ lsi = Reclassifier::LSI.new :auto_rebuild => false
26
+ lsi.add_item @str1, "Dog"
27
+ lsi.add_item @str2, "Dog"
28
+ assert lsi.needs_rebuild?
29
+ lsi.build_index
30
+ assert ! lsi.needs_rebuild?
31
+ end
32
+
33
+ def test_basic_categorizing
34
+ lsi = Reclassifier::LSI.new
35
+ lsi.add_item @str2, "Dog"
36
+ lsi.add_item @str3, "Cat"
37
+ lsi.add_item @str4, "Cat"
38
+ lsi.add_item @str5, "Bird"
39
+
40
+ assert_equal "Dog", lsi.classify( @str1 )
41
+ assert_equal "Cat", lsi.classify( @str3 )
42
+ assert_equal "Bird", lsi.classify( @str5 )
43
+ end
44
+
45
+ def test_external_classifying
46
+ lsi = Reclassifier::LSI.new
47
+ bayes = Reclassifier::Bayes.new 'Dog', 'Cat', 'Bird'
48
+ lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
49
+ lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
50
+ lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
51
+ lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
52
+ lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
53
+
54
+ # We're talking about dogs, even though the text matches the corpus on
55
+ # cats better. Dogs have more semantic weight than cats. So bayes
56
+ # will fail here, but the LSI recognizes content.
57
+ tricky_case = "This text revolves around dogs."
58
+ assert_equal "Dog", lsi.classify( tricky_case )
59
+ assert_not_equal "Dog", bayes.classify( tricky_case )
60
+ end
61
+
62
+ def test_recategorize_interface
63
+ lsi = Reclassifier::LSI.new
64
+ lsi.add_item @str1, "Dog"
65
+ lsi.add_item @str2, "Dog"
66
+ lsi.add_item @str3, "Cat"
67
+ lsi.add_item @str4, "Cat"
68
+ lsi.add_item @str5, "Bird"
69
+
70
+ tricky_case = "This text revolves around dogs."
71
+ assert_equal "Dog", lsi.classify( tricky_case )
72
+
73
+ # Recategorize as needed.
74
+ lsi.categories_for(@str1).clear.push "Cow"
75
+ lsi.categories_for(@str2).clear.push "Cow"
76
+
77
+ assert !lsi.needs_rebuild?
78
+ assert_equal "Cow", lsi.classify( tricky_case )
79
+ end
80
+
81
+ def test_search
82
+ lsi = Reclassifier::LSI.new
83
+ [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
84
+
85
+ # Searching by content and text, note that @str2 comes up first, because
86
+ # both "dog" and "involve" are present. But, the next match is @str1 instead
87
+ # of @str4, because "dog" carries more weight than "involves".
88
+ assert_equal( [@str2, @str1, @str4, @str5, @str3],
89
+ lsi.search("dog involves", 100) )
90
+
91
+ # Keyword search shows how the space is mapped out in relation to
92
+ # dog when magnitude is removed. Note the relations. We move from dog
93
+ # through involve and then finally to other words.
94
+ assert_equal( [@str1, @str2, @str4, @str5, @str3],
95
+ lsi.search("dog", 5) )
96
+ end
97
+
98
+ def test_serialize_safe
99
+ lsi = Reclassifier::LSI.new
100
+ [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
101
+
102
+ lsi_md = Marshal.dump lsi
103
+ lsi_m = Marshal.load lsi_md
104
+
105
+ assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
106
+ assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
107
+ end
108
+
109
+ def test_keyword_search
110
+ lsi = Reclassifier::LSI.new
111
+ lsi.add_item @str1, "Dog"
112
+ lsi.add_item @str2, "Dog"
113
+ lsi.add_item @str3, "Cat"
114
+ lsi.add_item @str4, "Cat"
115
+ lsi.add_item @str5, "Bird"
116
+
117
+ assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
118
+ end
119
+
120
+ def test_summary
121
+ assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
122
+ end
123
+ end
@@ -0,0 +1,4 @@
1
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
2
+
3
+ require 'test/unit'
4
+ require 'reclassifier'
metadata ADDED
@@ -0,0 +1,154 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: reclassifier
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ryan Oblak
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-04-18 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '1.3'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '1.3'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: test-unit
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: fast-stemmer
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: gsl
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ description: Bayesian and Latent Semantic Indexing classification of text.
95
+ email:
96
+ - rroblak@gmail.com
97
+ executables: []
98
+ extensions: []
99
+ extra_rdoc_files: []
100
+ files:
101
+ - .gitignore
102
+ - Gemfile
103
+ - LICENSE.txt
104
+ - README.md
105
+ - Rakefile
106
+ - lib/gsl/vector.rb
107
+ - lib/reclassifier.rb
108
+ - lib/reclassifier/bayes.rb
109
+ - lib/reclassifier/content_node.rb
110
+ - lib/reclassifier/core_ext/array.rb
111
+ - lib/reclassifier/core_ext/matrix.rb
112
+ - lib/reclassifier/core_ext/object.rb
113
+ - lib/reclassifier/core_ext/string.rb
114
+ - lib/reclassifier/core_ext/vector.rb
115
+ - lib/reclassifier/lsi.rb
116
+ - lib/reclassifier/version.rb
117
+ - lib/reclassifier/word_list.rb
118
+ - reclassifier.gemspec
119
+ - test/bayes_test.rb
120
+ - test/core_ext/array_test.rb
121
+ - test/core_ext/string_test.rb
122
+ - test/lsi_test.rb
123
+ - test/test_helper.rb
124
+ homepage: https://github.com/saveup/reclassifier
125
+ licenses:
126
+ - LGPL
127
+ post_install_message:
128
+ rdoc_options: []
129
+ require_paths:
130
+ - lib
131
+ required_ruby_version: !ruby/object:Gem::Requirement
132
+ none: false
133
+ requirements:
134
+ - - ! '>='
135
+ - !ruby/object:Gem::Version
136
+ version: '0'
137
+ required_rubygems_version: !ruby/object:Gem::Requirement
138
+ none: false
139
+ requirements:
140
+ - - ! '>='
141
+ - !ruby/object:Gem::Version
142
+ version: '0'
143
+ requirements: []
144
+ rubyforge_project:
145
+ rubygems_version: 1.8.25
146
+ signing_key:
147
+ specification_version: 3
148
+ summary: Bayesian and Latent Semantic Indexing classification of text.
149
+ test_files:
150
+ - test/bayes_test.rb
151
+ - test/core_ext/array_test.rb
152
+ - test/core_ext/string_test.rb
153
+ - test/lsi_test.rb
154
+ - test/test_helper.rb