reclassifier 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
+ class Vector
+   def magnitude
+     sumsqs = 0.0
+     self.size.times do |i|
+       sumsqs += self[i] ** 2.0
+     end
+     Math.sqrt(sumsqs)
+   end
+
+   def normalize
+     nv = []
+     mag = self.magnitude
+     self.size.times do |i|
+       nv << (self[i] / mag)
+     end
+     Vector[*nv]
+   end
+ end
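+
+ # Illustrative sketch (not part of the original source): with the patch
+ # above, the stdlib Vector (require 'matrix') gains both methods:
+ #
+ #   v = Vector[3.0, 4.0]
+ #   v.magnitude    # => 5.0
+ #   v.normalize    # => Vector[0.6, 0.8]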
@@ -0,0 +1,300 @@
+ $GSL = true
+
+ module Reclassifier
+
+   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
+   # data based on underlying semantic relations. For more information on the algorithms used,
+   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
+   class LSI
+
+     attr_reader :word_list
+     attr_accessor :auto_rebuild
+
+     # Creates a fresh index.
+     # If you want to call #build_index manually, use
+     #   Reclassifier::LSI.new :auto_rebuild => false
+     #
+     def initialize(options = {})
+       @auto_rebuild = true unless options[:auto_rebuild] == false
+       @word_list, @items = WordList.new, {}
+       @version, @built_at_version = 0, -1
+     end
+
+     # Returns true if the index needs to be rebuilt. The index needs
+     # to be built after all information is added, but before you start
+     # using it for search, classification and cluster detection.
+     def needs_rebuild?
+       (@items.keys.size > 1) && (@version != @built_at_version)
+     end
+
+     # Adds an item to the index. item is assumed to be a string, but
+     # any item may be indexed so long as it responds to #to_s or if
+     # you provide an optional block explaining how the indexer can
+     # fetch fresh string data. This optional block is passed the item,
+     # so the item may only be a reference to a URL or file name.
+     #
+     # For example:
+     #   lsi = Reclassifier::LSI.new
+     #   lsi.add_item "This is just plain text"
+     #   lsi.add_item("/home/me/filename.txt") { |x| File.read x }
+     #   ar = ActiveRecordObject.find( :all )
+     #   lsi.add_item(ar, *ar.categories) { |x| x.content }
+     #
+     def add_item( item, *categories, &block )
+       clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+       @items[item] = ContentNode.new(clean_word_hash, *categories)
+       @version += 1
+       build_index if @auto_rebuild
+     end
+
+     # A less flexible shorthand for add_item that assumes
+     # you are passing in a string with no categories. item
+     # will be duck typed via to_s .
+     #
+     def <<( item )
+       add_item item
+     end
+
+     # Returns the categories for a given indexed item. You are free to add and remove
+     # items from this as you see fit. It does not invalidate an index to change its categories.
+     def categories_for(item)
+       return [] unless @items[item]
+       return @items[item].categories
+     end
+
+     # Removes an item from the database, if it is indexed.
+     #
+     def remove_item( item )
+       if @items.has_key? item
+         @items.delete item
+         @version += 1
+       end
+     end
+
+     # Returns an array of items that are indexed.
+     def items
+       @items.keys
+     end
+
+     # This function rebuilds the index if needs_rebuild? returns true.
+     # For very large document spaces, this indexing operation may take some
+     # time to complete, so it may be wise to place the operation in another
+     # thread.
+     #
+     # As a rule, indexing will be fairly swift on modern machines until
+     # you have well over 500 documents indexed, or have an incredibly diverse
+     # vocabulary for your documents.
+     #
+     # The optional parameter "cutoff" is a tuning parameter. When the index is
+     # built, a certain number of s-values are discarded from the system. The
+     # cutoff parameter tells the indexer how many of these values to keep.
+     # A value of 1 for cutoff means that no semantic analysis will take place,
+     # turning the LSI class into a simple vector search engine.
+     def build_index( cutoff=0.75 )
+       return unless needs_rebuild?
+       make_word_list
+
+       doc_list = @items.values
+       tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
+
+       if $GSL
+         tdm = GSL::Matrix.alloc(*tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+
+         ntdm.size[1].times do |col|
+           vec = GSL::Vector.alloc( ntdm.column(col) ).row
+           doc_list[col].lsi_vector = vec
+           doc_list[col].lsi_norm = vec.normalize
+         end
+       else
+         tdm = Matrix.rows(tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+
+         # Each column of the reduced term-document matrix is one document.
+         ntdm.column_size.times do |col|
+           doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
+           doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
+         end
+       end
+
+       @built_at_version = @version
+     end
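+
+     # Illustrative sketch (not part of the original source), showing a manual
+     # rebuild when auto_rebuild is disabled; 'documents' is hypothetical:
+     #
+     #   lsi = Reclassifier::LSI.new :auto_rebuild => false
+     #   documents.each { |doc| lsi.add_item doc }
+     #   lsi.build_index        # one SVD for the whole batch
+     #   lsi.build_index(1.0)   # keeps every s-value: plain vector search, no LSI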
+
+     # This method returns max_chunks entries, ordered by their average semantic rating.
+     # Essentially, the average distance of each entry from all other entries is calculated,
+     # and the highest are returned.
+     #
+     # This can be used to build a summary service, or to provide more information about
+     # your dataset's general content. For example, if you were to use categorize on the
+     # results of this data, you could gather information on what your dataset is generally
+     # about.
+     def highest_relative_content( max_chunks=10 )
+       return [] if needs_rebuild?
+
+       avg_density = Hash.new
+       @items.each_key { |item| avg_density[item] = proximity_array_for_content(item).inject(0.0) { |sum, pair| sum + pair[1] } }
+
+       avg_density.keys.sort_by { |item| avg_density[item] }.reverse[0..max_chunks-1]
+     end
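+
+     # Illustrative sketch (not part of the original source):
+     #
+     #   lsi.highest_relative_content(5)   # => the five most "central" indexed items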
+
+     # This function is the primitive that find_related and classify
+     # build upon. It returns an array of 2-element arrays. The first element
+     # of this array is a document, and the second is its "score", defining
+     # how "close" it is to other indexed items.
+     #
+     # These values are somewhat arbitrary, having to do with the vector space
+     # created by your content, so the magnitude is interpretable but not always
+     # meaningful between indexes.
+     #
+     # The parameter doc is the content to compare. If that content is not
+     # indexed, you can pass an optional block to define how to create the
+     # text data. See add_item for examples of how this works.
+     def proximity_array_for_content( doc, &block )
+       return [] if needs_rebuild?
+
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           if $GSL
+             val = content_node.search_vector * @items[item].search_vector.col
+           else
+             val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
+           end
+           [item, val]
+         end
+       result.sort_by { |x| x[1] }.reverse
+     end
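+
+     # Illustrative sketch (not part of the original source), assuming lsi is a
+     # built Reclassifier::LSI index; documents and scores are made up:
+     #
+     #   lsi.proximity_array_for_content("ruby")
+     #   # => [["ruby is dynamic", 0.92], ["python is dynamic", 0.38], ...]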
+
+     # Similar to proximity_array_for_content, this function takes similar
+     # arguments and returns a similar array. However, it uses the normalized
+     # calculated vectors instead of their full versions. This is useful when
+     # you're trying to perform operations on content that is much smaller than
+     # the text you're working with. search uses this primitive.
+     def proximity_norms_for_content( doc, &block )
+       return [] if needs_rebuild?
+
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           if $GSL
+             val = content_node.search_norm * @items[item].search_norm.col
+           else
+             val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
+           end
+           [item, val]
+         end
+       result.sort_by { |x| x[1] }.reverse
+     end
+
+     # This function allows for text-based search of your index. Unlike other functions
+     # like find_related and classify, search only takes short strings. It will also ignore
+     # factors like repeated words. It is best for short, Google-like search terms.
+     # A search will first prioritize lexical relationships, then semantic ones.
+     #
+     # While this may seem backwards compared to the other functions that LSI supports,
+     # it is actually the same algorithm, just applied on a smaller document.
+     def search( string, max_nearest=3 )
+       return [] if needs_rebuild?
+       carry = proximity_norms_for_content( string )
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
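+
+     # Illustrative sketch (not part of the original source):
+     #
+     #   lsi.search("love", 2)   # => the two indexed items nearest to "love"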
+
+     # This function takes content and finds other documents
+     # that are semantically "close", returning an array of documents sorted
+     # from most to least relevant.
+     # max_nearest specifies the number of documents to return. A value of
+     # 0 means that it returns all the indexed documents, sorted by relevance.
+     #
+     # This is particularly useful for identifying clusters in your document space.
+     # For example you may want to identify several "What's Related" items for weblog
+     # articles, or find paragraphs that relate to each other in an essay.
+     def find_related( doc, max_nearest=3, &block )
+       carry =
+         proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
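+
+     # Illustrative sketch (not part of the original source):
+     #
+     #   lsi.find_related("This doc is already indexed", 2)
+     #   # => the two most semantically similar indexed documents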
+
+     # This function uses a voting system to categorize documents, based on
+     # the categories of other documents. It uses the same logic as the
+     # find_related function to find related documents, then returns the
+     # most obvious category from this list.
+     #
+     # cutoff signifies the fraction of documents to consider when classifying
+     # text. A cutoff of 1 means that every document in the index votes on
+     # what category the document is in. This may not always make sense.
+     #
+     def classify( doc, cutoff=0.30, &block )
+       icutoff = (@items.size * cutoff).round
+       carry = proximity_array_for_content( doc, &block )
+       carry = carry[0..icutoff-1]
+       votes = {}
+       carry.each do |pair|
+         categories = @items[pair[0]].categories
+         categories.each do |category|
+           votes[category] ||= 0.0
+           votes[category] += pair[1]
+         end
+       end
+
+       ranking = votes.keys.sort_by { |x| votes[x] }
+       return ranking[-1]
+     end
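+
+     # Illustrative sketch (not part of the original source); the documents and
+     # categories are made up, and the result depends on the indexed data:
+     #
+     #   lsi = Reclassifier::LSI.new
+     #   lsi.add_item "Dogs are loyal pets", :pets
+     #   lsi.add_item "Cats are independent pets", :pets
+     #   lsi.add_item "Stocks fell sharply today", :finance
+     #   lsi.classify "My dog chased a cat"   # => :pets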
+
+     # Prototype, only works on indexed documents.
+     # I have no clue if this is going to work, but in theory
+     # it's supposed to.
+     def highest_ranked_stems( doc, count=3 )
+       raise "Requested stem ranking on non-indexed content!" unless @items[doc]
+       arr = node_for_content(doc).lsi_vector.to_a
+       top_n = arr.sort.reverse[0..count-1]
+       return top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
+     end
+
+     private
+
+     def build_reduced_matrix( matrix, cutoff=0.75 )
+       # TODO: Check that M>=N on these dimensions! Transpose helps assure this
+       u, v, s = matrix.SV_decomp
+
+       # TODO: Better than 75% term, please. :\
+       s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
+       s.size.times do |ord|
+         s[ord] = 0.0 if s[ord] < s_cutoff
+       end
+       # Reconstruct the term document matrix, only with reduced rank
+       u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
+     end
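+
+     # Editor's sketch (not part of the original source): the method above is a
+     # truncated SVD. With the term-document matrix factored as M = U * S * V^T,
+     # zeroing the smallest singular values in S before recomposing yields the
+     # closest lower-rank approximation of M, which is what folds co-occurring
+     # terms onto shared "semantic" dimensions.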
+
+     def node_for_content(item, &block)
+       return @items[item] if @items[item]
+
+       clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+
+       cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
+
+       unless needs_rebuild?
+         cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
+       end
+
+       return cn
+     end
+
+     def make_word_list
+       @word_list = WordList.new
+       @items.each_value do |node|
+         node.word_hash.each_key { |key| @word_list.add_word key }
+       end
+     end
+
+   end
+ end
+
@@ -0,0 +1,3 @@
+ module Reclassifier
+   VERSION = "0.0.2"
+ end
@@ -0,0 +1,32 @@
+ module Reclassifier
+   # This class keeps a word => index mapping. It is used to map stemmed words
+   # to dimensions of a vector.
+   class WordList
+     def initialize
+       @location_table = Hash.new
+     end
+
+     # Adds a word (if it is new) and assigns it a unique dimension.
+     def add_word(word)
+       @location_table[word] = @location_table.size unless @location_table[word]
+     end
+
+     # Returns the dimension of the word or nil if the word is not in the space.
+     def [](lookup)
+       @location_table[lookup]
+     end
+
+     # Returns the word mapped to the given dimension, or nil.
+     def word_for_index(ind)
+       @location_table.invert[ind]
+     end
+
+     # Returns the number of words mapped.
+     def size
+       @location_table.size
+     end
+   end
+ end
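+
+ # Illustrative sketch (not part of the original source):
+ #
+ #   list = Reclassifier::WordList.new
+ #   list.add_word :cat        # first word gets dimension 0
+ #   list.add_word :dog
+ #   list[:dog]                # => 1
+ #   list.word_for_index(0)    # => :cat
+ #   list.size                 # => 2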
@@ -0,0 +1,27 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'reclassifier/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "reclassifier"
+   spec.version       = Reclassifier::VERSION
+   spec.authors       = ["Ryan Oblak"]
+   spec.email         = ["rroblak@gmail.com"]
+   spec.description   = %q{Bayesian and Latent Semantic Indexing classification of text.}
+   spec.summary       = %q{Bayesian and Latent Semantic Indexing classification of text.}
+   spec.homepage      = "https://github.com/saveup/reclassifier"
+   spec.license       = "LGPL"
+
+   spec.files         = `git ls-files`.split($/)
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency 'bundler', '~> 1.3'
+   spec.add_development_dependency 'rake'
+   spec.add_development_dependency 'test-unit'
+
+   spec.add_dependency 'fast-stemmer'
+   spec.add_dependency 'gsl'
+ end
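+
+ # Editor's note (assumption, not from the gemspec itself): the hard 'gsl'
+ # dependency means the GNU Scientific Library and its headers must be present
+ # before the gem will install, e.g.:
+ #
+ #   apt-get install libgsl-dev   # package name varies by platform
+ #   gem install reclassifier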
@@ -0,0 +1,34 @@
+ require File.join(File.dirname(__FILE__), 'test_helper')
+
+ class BayesTest < Test::Unit::TestCase
+   def setup
+     @classifier = Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
+   end
+
+   def test_good_training
+     assert_nothing_raised { @classifier.train_interesting "love" }
+   end
+
+   def test_bad_training
+     assert_raise(StandardError) { @classifier.train_no_category "words" }
+   end
+
+   def test_bad_method
+     assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
+   end
+
+   def test_categories
+     assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
+   end
+
+   def test_add_category
+     @classifier.add_category 'Test'
+     assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
+   end
+
+   def test_classification
+     @classifier.train_interesting "here are some good words. I hope you love them"
+     @classifier.train_uninteresting "here are some bad words, I hate you"
+     assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
+   end
+ end
@@ -0,0 +1,15 @@
+ require File.join(File.dirname(__FILE__), '..', 'test_helper')
+
+ class ArrayTest < Test::Unit::TestCase
+   def test_monkey_patch_array_sum
+     assert_equal [1,2,3].sum_with_identity, 6
+   end
+
+   def test_summing_an_array_of_nils
+     assert_equal [nil].sum_with_identity, 0
+   end
+
+   def test_summing_an_empty_array
+     assert_equal Array[].sum_with_identity, 0
+   end
+ end
@@ -0,0 +1,13 @@
+ require File.join(File.dirname(__FILE__), '..', 'test_helper')
+
+ class StringTest < Test::Unit::TestCase
+   def test_word_hash
+     hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
+     assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
+   end
+
+   def test_clean_word_hash
+     hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
+     assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
+   end
+ end