reclassifier 0.0.2

@@ -0,0 +1,20 @@
+ class Vector
+   def magnitude
+     sumsqs = 0.0
+     self.size.times do |i|
+       sumsqs += self[i] ** 2.0
+     end
+     Math.sqrt(sumsqs)
+   end
+
+   def normalize
+     nv = []
+     mag = self.magnitude
+     self.size.times do |i|
+       nv << (self[i] / mag)
+     end
+     Vector[*nv]
+   end
+ end
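For orientation, a minimal sketch of what these Vector helpers compute (using the stdlib Vector class they extend; the values follow from the formulas above):

    require 'matrix'

    v = Vector[3.0, 4.0]
    v.magnitude   # => 5.0, the Euclidean length sqrt(3.0**2 + 4.0**2)
    v.normalize   # => Vector[0.6, 0.8], i.e. v scaled by 1 / v.magnitude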
@@ -0,0 +1,300 @@
+ $GSL = true
+
+ module Reclassifier
+
+   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
+   # data based on underlying semantic relations. For more information on the algorithms used,
+   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
+   class LSI
+
+     attr_reader :word_list
+     attr_accessor :auto_rebuild
+
+     # Creates a fresh index.
+     # If you want to call #build_index manually, use
+     #   Reclassifier::LSI.new :auto_rebuild => false
+     #
+     def initialize(options = {})
+       @auto_rebuild = true unless options[:auto_rebuild] == false
+       @word_list, @items = WordList.new, {}
+       @version, @built_at_version = 0, -1
+     end
+
+     # Returns true if the index needs to be rebuilt. The index needs
+     # to be built after all information is added, but before you start
+     # using it for search, classification and cluster detection.
+     def needs_rebuild?
+       (@items.keys.size > 1) && (@version != @built_at_version)
+     end
+
+     # Adds an item to the index. item is assumed to be a string, but
+     # any item may be indexed so long as it responds to #to_s or if
+     # you provide an optional block explaining how the indexer can
+     # fetch fresh string data. This optional block is passed the item,
+     # so the item may only be a reference to a URL or file name.
+     #
+     # For example:
+     #   lsi = Reclassifier::LSI.new
+     #   lsi.add_item "This is just plain text"
+     #   lsi.add_item("/home/me/filename.txt") { |x| File.read(x) }
+     #   ar = ActiveRecordObject.find(:all)
+     #   lsi.add_item(ar, *ar.categories) { |x| ar.content }
+     #
+     def add_item( item, *categories, &block )
+       clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+       @items[item] = ContentNode.new(clean_word_hash, *categories)
+       @version += 1
+       build_index if @auto_rebuild
+     end
+
+     # A less flexible shorthand for add_item that assumes
+     # you are passing in a string with no categories. item
+     # will be duck typed via to_s .
+     #
+     def <<( item )
+       add_item item
+     end
+
+     # Returns the categories for a given indexed item. You are free to add and remove
+     # items from this as you see fit. Changing an item's categories does not invalidate
+     # the index.
+     def categories_for(item)
+       return [] unless @items[item]
+       return @items[item].categories
+     end
+
+     # Removes an item from the database, if it is indexed.
+     #
+     def remove_item( item )
+       if @items.has_key? item
+         @items.delete item
+         @version += 1
+       end
+     end
+
+     # Returns an array of items that are indexed.
+     def items
+       @items.keys
+     end
+
+     # This function rebuilds the index if needs_rebuild? returns true.
+     # For very large document spaces, this indexing operation may take some
+     # time to complete, so it may be wise to place the operation in another
+     # thread.
+     #
+     # As a rule, indexing will be fairly swift on modern machines until
+     # you have well over 500 documents indexed, or have an incredibly diverse
+     # vocabulary for your documents.
+     #
+     # The optional parameter "cutoff" is a tuning parameter. When the index is
+     # built, a certain number of s-values are discarded from the system. The
+     # cutoff parameter tells the indexer how many of these values to keep.
+     # A value of 1 for cutoff means that no semantic analysis will take place,
+     # turning the LSI class into a simple vector search engine.
+     def build_index( cutoff=0.75 )
+       return unless needs_rebuild?
+       make_word_list
+
+       doc_list = @items.values
+       tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
+
+       if $GSL
+         tdm = GSL::Matrix.alloc(*tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+
+         ntdm.size[1].times do |col|
+           vec = GSL::Vector.alloc( ntdm.column(col) ).row
+           doc_list[col].lsi_vector = vec
+           doc_list[col].lsi_norm = vec.normalize
+         end
+       else
+         tdm = Matrix.rows(tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+
+         # Iterate over the columns (one per document), not the rows.
+         ntdm.column_size.times do |col|
+           doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
+           doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
+         end
+       end
+
+       @built_at_version = @version
+     end
+
+     # This method returns max_chunks entries, ordered by their average semantic rating.
+     # Essentially, the average distance of each entry from all other entries is calculated,
+     # and the highest are returned.
+     #
+     # This can be used to build a summary service, or to provide more information about
+     # your dataset's general content. For example, if you were to use categorize on the
+     # results of this data, you could gather information on what your dataset is generally
+     # about.
+     def highest_relative_content( max_chunks=10 )
+       return [] if needs_rebuild?
+
+       avg_density = Hash.new
+       @items.each_key { |item| avg_density[item] = proximity_array_for_content(item).inject(0.0) { |sum, pair| sum + pair[1] } }
+
+       avg_density.keys.sort_by { |item| avg_density[item] }.reverse[0..max_chunks-1]
+     end
+
+     # This function is the primitive that find_related and classify
+     # build upon. It returns an array of 2-element arrays. The first element
+     # of this array is a document, and the second is its "score", defining
+     # how "close" it is to other indexed items.
+     #
+     # These values are somewhat arbitrary, having to do with the vector space
+     # created by your content, so the magnitude is interpretable but not always
+     # meaningful between indexes.
+     #
+     # The parameter doc is the content to compare. If that content is not
+     # indexed, you can pass an optional block to define how to create the
+     # text data. See add_item for examples of how this works.
+     def proximity_array_for_content( doc, &block )
+       return [] if needs_rebuild?
+
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           if $GSL
+             val = content_node.search_vector * @items[item].search_vector.col
+           else
+             val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
+           end
+           [item, val]
+         end
+       result.sort_by { |x| x[1] }.reverse
+     end
+
+     # Similar to proximity_array_for_content, this function takes similar
+     # arguments and returns a similar array. However, it uses the normalized
+     # calculated vectors instead of their full versions. This is useful when
+     # you're trying to perform operations on content that is much smaller than
+     # the text you're working with. search uses this primitive.
+     def proximity_norms_for_content( doc, &block )
+       return [] if needs_rebuild?
+
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           if $GSL
+             val = content_node.search_norm * @items[item].search_norm.col
+           else
+             val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
+           end
+           [item, val]
+         end
+       result.sort_by { |x| x[1] }.reverse
+     end
+
+     # This function allows for text-based search of your index. Unlike other functions
+     # like find_related and classify, search only takes short strings. It will also ignore
+     # factors like repeated words. It is best for short, Google-like search terms.
+     # A search will first prioritize lexical relationships, then semantic ones.
+     #
+     # While this may seem backwards compared to the other functions that LSI supports,
+     # it is actually the same algorithm, just applied on a smaller document.
+     def search( string, max_nearest=3 )
+       return [] if needs_rebuild?
+       carry = proximity_norms_for_content( string )
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
+
+     # This function takes content and finds other documents
+     # that are semantically "close", returning an array of documents sorted
+     # from most to least relevant.
+     # max_nearest specifies the number of documents to return. A value of
+     # 0 means that it returns all the indexed documents, sorted by relevance.
+     #
+     # This is particularly useful for identifying clusters in your document space.
+     # For example you may want to identify several "What's Related" items for weblog
+     # articles, or find paragraphs that relate to each other in an essay.
+     def find_related( doc, max_nearest=3, &block )
+       carry =
+         proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
+
+     # This function uses a voting system to categorize documents, based on
+     # the categories of other documents. It uses the same logic as the
+     # find_related function to find related documents, then returns the
+     # most obvious category from this list.
+     #
+     # cutoff signifies the fraction of documents to consider when classifying
+     # text. A cutoff of 1 means that every document in the index votes on
+     # what category the document is in. This may not always make sense.
+     #
+     def classify( doc, cutoff=0.30, &block )
+       icutoff = (@items.size * cutoff).round
+       carry = proximity_array_for_content( doc, &block )
+       carry = carry[0..icutoff-1]
+       votes = {}
+       carry.each do |pair|
+         categories = @items[pair[0]].categories
+         categories.each do |category|
+           votes[category] ||= 0.0
+           votes[category] += pair[1]
+         end
+       end
+
+       ranking = votes.keys.sort_by { |x| votes[x] }
+       return ranking[-1]
+     end
+
+     # Prototype, only works on indexed documents.
+     # I have no clue if this is going to work, but in theory
+     # it's supposed to.
+     def highest_ranked_stems( doc, count=3 )
+       raise "Requested stem ranking on non-indexed content!" unless @items[doc]
+       arr = node_for_content(doc).lsi_vector.to_a
+       top_n = arr.sort.reverse[0..count-1]
+       return top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
+     end
+
+     private
+
+     def build_reduced_matrix( matrix, cutoff=0.75 )
+       # TODO: Check that M>=N on these dimensions! Transpose helps assure this
+       u, v, s = matrix.SV_decomp
+
+       # TODO: Better than 75% term, please. :\
+       s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
+       s.size.times do |ord|
+         s[ord] = 0.0 if s[ord] < s_cutoff
+       end
+       # Reconstruct the term document matrix, only with reduced rank
+       u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
+     end
+
+     def node_for_content(item, &block)
+       return @items[item] if @items[item]
+
+       clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+
+       cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
+
+       unless needs_rebuild?
+         cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
+       end
+
+       return cn
+     end
+
+     def make_word_list
+       @word_list = WordList.new
+       @items.each_value do |node|
+         node.word_hash.each_key { |key| @word_list.add_word key }
+       end
+     end
+
+   end
+ end
+
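Taken together, a minimal usage sketch of this LSI class (the documents, categories and expected results here are illustrative, not from the gem; with auto_rebuild on, the index rebuilds after each add_item):

    lsi = Reclassifier::LSI.new
    lsi.add_item "Dogs are a man's best friend.", :animals
    lsi.add_item "Cats purr when they are happy.", :animals
    lsi.add_item "Ruby is a dynamic programming language.", :code

    lsi.search("ruby language", 2)            # up to 2 nearest items, lexical matches first
    lsi.find_related("Cats purr when they are happy.", 1)
    lsi.classify("My dog fetched the ball.")  # expected :animals, by weighted category vote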
@@ -0,0 +1,3 @@
+ module Reclassifier
+   VERSION = "0.0.2"
+ end
@@ -0,0 +1,32 @@
+ module Reclassifier
+   # This class keeps a word => index mapping. It is used to map stemmed words
+   # to dimensions of a vector.
+   class WordList
+     def initialize
+       @location_table = Hash.new
+     end
+
+     # Adds a word (if it is new) and assigns it a unique dimension.
+     def add_word(word)
+       @location_table[word] = @location_table.size unless @location_table[word]
+     end
+
+     # Returns the dimension of the word or nil if the word is not in the space.
+     def [](lookup)
+       @location_table[lookup]
+     end
+
+     # Returns the word mapped to the given dimension, or nil.
+     def word_for_index(ind)
+       @location_table.invert[ind]
+     end
+
+     # Returns the number of words mapped.
+     def size
+       @location_table.size
+     end
+   end
+ end
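A short sketch of how WordList hands out dimensions; the values follow directly from the code above:

    list = Reclassifier::WordList.new
    list.add_word :cat
    list.add_word :dog
    list.add_word :cat        # already mapped, so no new dimension

    list[:cat]                # => 0
    list[:dog]                # => 1
    list.word_for_index(1)    # => :dog
    list.size                 # => 2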
@@ -0,0 +1,27 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'reclassifier/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "reclassifier"
+   spec.version       = Reclassifier::VERSION
+   spec.authors       = ["Ryan Oblak"]
+   spec.email         = ["rroblak@gmail.com"]
+   spec.description   = %q{Bayesian and Latent Semantic Indexing classification of text.}
+   spec.summary       = %q{Bayesian and Latent Semantic Indexing classification of text.}
+   spec.homepage      = "https://github.com/saveup/reclassifier"
+   spec.license       = "LGPL"
+
+   spec.files         = `git ls-files`.split($/)
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency 'bundler', '~> 1.3'
+   spec.add_development_dependency 'rake'
+   spec.add_development_dependency 'test-unit'
+
+   spec.add_dependency 'fast-stemmer'
+   spec.add_dependency 'gsl'
+ end
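To pull this release into a project, a Gemfile entry is enough; the fast-stemmer and gsl runtime dependencies declared above are installed alongside it:

    # Gemfile
    gem 'reclassifier', '0.0.2'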
@@ -0,0 +1,34 @@
+ require File.join(File.dirname(__FILE__), 'test_helper')
+
+ class BayesTest < Test::Unit::TestCase
+   def setup
+     @classifier = Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
+   end
+
+   def test_good_training
+     assert_nothing_raised { @classifier.train_interesting "love" }
+   end
+
+   def test_bad_training
+     assert_raise(StandardError) { @classifier.train_no_category "words" }
+   end
+
+   def test_bad_method
+     assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
+   end
+
+   def test_categories
+     assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
+   end
+
+   def test_add_category
+     @classifier.add_category 'Test'
+     assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
+   end
+
+   def test_classification
+     @classifier.train_interesting "here are some good words. I hope you love them"
+     @classifier.train_uninteresting "here are some bad words, I hate you"
+     assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
+   end
+ end
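Outside the test harness, the Bayes API exercised above reduces to the following (training text and categories taken from the tests; note that train_<category> raises a StandardError for a category the classifier does not know, per test_bad_training):

    classifier = Reclassifier::Bayes.new 'Interesting', 'Uninteresting'
    classifier.train_interesting   "here are some good words. I hope you love them"
    classifier.train_uninteresting "here are some bad words, I hate you"

    classifier.classify("I hate bad words and you")  # => 'Uninteresting'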
@@ -0,0 +1,15 @@
+ require File.join(File.dirname(__FILE__), '..', 'test_helper')
+
+ class ArrayTest < Test::Unit::TestCase
+   def test_monkey_patch_array_sum
+     assert_equal [1,2,3].sum_with_identity, 6
+   end
+
+   def test_summing_an_array_of_nils
+     assert_equal [nil].sum_with_identity, 0
+   end
+
+   def test_summing_an_empty_array
+     assert_equal Array[].sum_with_identity, 0
+   end
+ end
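The Array#sum_with_identity extension itself is not part of this diff; a minimal hypothetical reconstruction consistent with the three assertions above (nil elements count as 0, and an empty array sums to 0) would be:

    class Array
      # Hypothetical sketch -- the gem's real extension may differ.
      def sum_with_identity
        compact.inject(0) { |sum, x| sum + x }
      end
    end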
@@ -0,0 +1,13 @@
+ require File.join(File.dirname(__FILE__), '..', 'test_helper')
+
+ class StringTest < Test::Unit::TestCase
+   def test_word_hash
+     hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
+     assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
+   end
+
+   def test_clean_word_hash
+     hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
+     assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
+   end
+ end
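These word hashes are what feed the indexer: LSI#add_item calls clean_word_hash on each item, and make_word_list registers every stemmed key with the WordList. A small sketch of that hand-off:

    list = Reclassifier::WordList.new
    "here are some good words".clean_word_hash.each_key { |stem| list.add_word(stem) }
    list[:word]   # the stem of "words" now has a dimension, per test_clean_word_hash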