reclassifier 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,123 @@
1
+ require File.join(File.dirname(__FILE__), 'test_helper')
2
+
3
+ class LSITest < Test::Unit::TestCase
4
+ def setup
5
+ # we repeat principal words to help weight them.
6
+ # This test is rather delicate, since this system is mostly noise.
7
+ @str1 = "This text deals with dogs. Dogs."
8
+ @str2 = "This text involves dogs too. Dogs! "
9
+ @str3 = "This text revolves around cats. Cats."
10
+ @str4 = "This text also involves cats. Cats!"
11
+ @str5 = "This text involves birds. Birds."
12
+ end
13
+
14
+ def test_basic_indexing
15
+ lsi = Reclassifier::LSI.new
16
+ [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
17
+ assert ! lsi.needs_rebuild?
18
+
19
+ # note that the closest match to str1 is str2, even though it is not
20
+ # the closest text match.
21
+ assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
22
+ end
23
+
24
+ def test_not_auto_rebuild
25
+ lsi = Reclassifier::LSI.new :auto_rebuild => false
26
+ lsi.add_item @str1, "Dog"
27
+ lsi.add_item @str2, "Dog"
28
+ assert lsi.needs_rebuild?
29
+ lsi.build_index
30
+ assert ! lsi.needs_rebuild?
31
+ end
32
+
33
+ def test_basic_categorizing
34
+ lsi = Reclassifier::LSI.new
35
+ lsi.add_item @str2, "Dog"
36
+ lsi.add_item @str3, "Cat"
37
+ lsi.add_item @str4, "Cat"
38
+ lsi.add_item @str5, "Bird"
39
+
40
+ assert_equal "Dog", lsi.classify( @str1 )
41
+ assert_equal "Cat", lsi.classify( @str3 )
42
+ assert_equal "Bird", lsi.classify( @str5 )
43
+ end
44
+
45
+ def test_external_classifying
46
+ lsi = Reclassifier::LSI.new
47
+ bayes = Reclassifier::Bayes.new 'Dog', 'Cat', 'Bird'
48
+ lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
49
+ lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
50
+ lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
51
+ lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
52
+ lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
53
+
54
+ # We're talking about dogs, even though the text matches the corpus on
55
+ # cats better. Dogs have more semantic weight than cats, so bayes
56
+ # will fail here, but the LSI recognizes the content.
57
+ tricky_case = "This text revolves around dogs."
58
+ assert_equal "Dog", lsi.classify( tricky_case )
59
+ assert_not_equal "Dog", bayes.classify( tricky_case )
60
+ end
61
+
62
+ def test_recategorize_interface
63
+ lsi = Reclassifier::LSI.new
64
+ lsi.add_item @str1, "Dog"
65
+ lsi.add_item @str2, "Dog"
66
+ lsi.add_item @str3, "Cat"
67
+ lsi.add_item @str4, "Cat"
68
+ lsi.add_item @str5, "Bird"
69
+
70
+ tricky_case = "This text revolves around dogs."
71
+ assert_equal "Dog", lsi.classify( tricky_case )
72
+
73
+ # Recategorize as needed.
74
+ lsi.categories_for(@str1).clear.push "Cow"
75
+ lsi.categories_for(@str2).clear.push "Cow"
76
+
77
+ assert !lsi.needs_rebuild?
78
+ assert_equal "Cow", lsi.classify( tricky_case )
79
+ end
80
+
81
+ def test_search
82
+ lsi = Reclassifier::LSI.new
83
+ [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
84
+
85
+ # Searching by content and text, note that @str2 comes up first, because
86
+ # both "dog" and "involve" are present. But, the next match is @str1 instead
87
+ # of @str4, because "dog" carries more weight than "involve".
88
+ assert_equal( [@str2, @str1, @str4, @str5, @str3],
89
+ lsi.search("dog involves", 100) )
90
+
91
+ # Keyword search shows how the space is mapped out in relation to
92
+ # dog when magnitude is removed. Note the relations. We move from dog
93
+ # through involve and then finally to other words.
94
+ assert_equal( [@str1, @str2, @str4, @str5, @str3],
95
+ lsi.search("dog", 5) )
96
+ end
97
+
98
+ def test_serialize_safe
99
+ lsi = Reclassifier::LSI.new
100
+ [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
101
+
102
+ lsi_md = Marshal.dump lsi
103
+ lsi_m = Marshal.load lsi_md
104
+
105
+ assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
106
+ assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
107
+ end
108
+
109
+ def test_keyword_search
110
+ lsi = Reclassifier::LSI.new
111
+ lsi.add_item @str1, "Dog"
112
+ lsi.add_item @str2, "Dog"
113
+ lsi.add_item @str3, "Cat"
114
+ lsi.add_item @str4, "Cat"
115
+ lsi.add_item @str5, "Bird"
116
+
117
+ assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
118
+ end
119
+
120
+ def test_summary
121
+ assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
122
+ end
123
+ end
@@ -0,0 +1,4 @@
1
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
2
+
3
+ require 'test/unit'
4
+ require 'reclassifier'
metadata ADDED
@@ -0,0 +1,154 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: reclassifier
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ryan Oblak
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-04-18 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '1.3'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '1.3'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: test-unit
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: fast-stemmer
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: gsl
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ description: Bayesian and Latent Semantic Indexing classification of text.
95
+ email:
96
+ - rroblak@gmail.com
97
+ executables: []
98
+ extensions: []
99
+ extra_rdoc_files: []
100
+ files:
101
+ - .gitignore
102
+ - Gemfile
103
+ - LICENSE.txt
104
+ - README.md
105
+ - Rakefile
106
+ - lib/gsl/vector.rb
107
+ - lib/reclassifier.rb
108
+ - lib/reclassifier/bayes.rb
109
+ - lib/reclassifier/content_node.rb
110
+ - lib/reclassifier/core_ext/array.rb
111
+ - lib/reclassifier/core_ext/matrix.rb
112
+ - lib/reclassifier/core_ext/object.rb
113
+ - lib/reclassifier/core_ext/string.rb
114
+ - lib/reclassifier/core_ext/vector.rb
115
+ - lib/reclassifier/lsi.rb
116
+ - lib/reclassifier/version.rb
117
+ - lib/reclassifier/word_list.rb
118
+ - reclassifier.gemspec
119
+ - test/bayes_test.rb
120
+ - test/core_ext/array_test.rb
121
+ - test/core_ext/string_test.rb
122
+ - test/lsi_test.rb
123
+ - test/test_helper.rb
124
+ homepage: https://github.com/saveup/reclassifier
125
+ licenses:
126
+ - LGPL
127
+ post_install_message:
128
+ rdoc_options: []
129
+ require_paths:
130
+ - lib
131
+ required_ruby_version: !ruby/object:Gem::Requirement
132
+ none: false
133
+ requirements:
134
+ - - ! '>='
135
+ - !ruby/object:Gem::Version
136
+ version: '0'
137
+ required_rubygems_version: !ruby/object:Gem::Requirement
138
+ none: false
139
+ requirements:
140
+ - - ! '>='
141
+ - !ruby/object:Gem::Version
142
+ version: '0'
143
+ requirements: []
144
+ rubyforge_project:
145
+ rubygems_version: 1.8.25
146
+ signing_key:
147
+ specification_version: 3
148
+ summary: Bayesian and Latent Semantic Indexing classification of text.
149
+ test_files:
150
+ - test/bayes_test.rb
151
+ - test/core_ext/array_test.rb
152
+ - test/core_ext/string_test.rb
153
+ - test/lsi_test.rb
154
+ - test/test_helper.rb