classifier-reborn 2.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,317 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ begin
6
+ raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
+
8
+ require 'gsl' # requires http://rb-gsl.rubyforge.org/
9
+ require 'classifier-reborn/extensions/vector_serialize'
10
+ $GSL = true
11
+
12
+ rescue LoadError
13
+ require 'classifier-reborn/extensions/vector'
14
+ end
15
+
16
+ require 'classifier-reborn/lsi/word_list'
17
+ require 'classifier-reborn/lsi/content_node'
18
+ require 'classifier-reborn/lsi/summary'
19
+
20
+ module ClassifierReborn
21
+
22
+ # This class implements a Latent Semantic Indexer, which can search, classify and cluster
23
+ # data based on underlying semantic relations. For more information on the algorithms used,
24
+ # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
25
+ class LSI
26
+
27
+ attr_reader :word_list
28
+ attr_accessor :auto_rebuild
29
+
30
+ # Create a fresh index.
31
+ # If you want to call #build_index manually, use
32
+ # ClassifierReborn::LSI.new :auto_rebuild => false
33
+ #
34
+ def initialize(options = {})
35
+ @auto_rebuild = true unless options[:auto_rebuild] == false
36
+ @word_list, @items = WordList.new, {}
37
+ @version, @built_at_version = 0, -1
38
+ end
39
+
40
+ # Returns true if the index needs to be rebuilt. The index needs
41
+ # to be built after all information is added, but before you start
42
+ # using it for search, classification and cluster detection.
43
+ def needs_rebuild?
44
+ (@items.keys.size > 1) && (@version != @built_at_version)
45
+ end
46
+
47
+ # Adds an item to the index. item is assumed to be a string, but
48
+ # any item may be indexed so long as it responds to #to_s or if
49
+ # you provide an optional block explaining how the indexer can
50
+ # fetch fresh string data. This optional block is passed the item,
51
+ # so the item may only be a reference to a URL or file name.
52
+ #
53
+ # For example:
54
+ # lsi = ClassifierReborn::LSI.new
55
+ # lsi.add_item "This is just plain text"
56
+ # lsi.add_item "/home/me/filename.txt" { |x| File.read x }
57
+ # ar = ActiveRecordObject.find( :all )
58
+ # lsi.add_item ar, *ar.categories { |x| ar.content }
59
+ #
60
+ def add_item( item, *categories, &block )
61
+ clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
62
+ @items[item] = ContentNode.new(clean_word_hash, *categories)
63
+ @version += 1
64
+ build_index if @auto_rebuild
65
+ end
66
+
67
+ # A less flexible shorthand for add_item that assumes
68
+ # you are passing in a string with no categories. item
69
+ # will be duck typed via to_s .
70
+ #
71
+ def <<( item )
72
+ add_item item
73
+ end
74
+
75
+ # Returns the categories for a given indexed items. You are free to add and remove
76
+ # items from this as you see fit. It does not invalidate an index to change its categories.
77
+ def categories_for(item)
78
+ return [] unless @items[item]
79
+ return @items[item].categories
80
+ end
81
+
82
+ # Removes an item from the database, if it is indexed.
83
+ #
84
+ def remove_item( item )
85
+ if @items.keys.contain? item
86
+ @items.remove item
87
+ @version += 1
88
+ end
89
+ end
90
+
91
+ # Returns an array of items that are indexed.
92
+ def items
93
+ @items.keys
94
+ end
95
+
96
+ # Returns the categories for a given indexed items. You are free to add and remove
97
+ # items from this as you see fit. It does not invalidate an index to change its categories.
98
+ def categories_for(item)
99
+ return [] unless @items[item]
100
+ return @items[item].categories
101
+ end
102
+
103
+ # This function rebuilds the index if needs_rebuild? returns true.
104
+ # For very large document spaces, this indexing operation may take some
105
+ # time to complete, so it may be wise to place the operation in another
106
+ # thread.
107
+ #
108
+ # As a rule, indexing will be fairly swift on modern machines until
109
+ # you have well over 500 documents indexed, or have an incredibly diverse
110
+ # vocabulary for your documents.
111
+ #
112
+ # The optional parameter "cutoff" is a tuning parameter. When the index is
113
+ # built, a certain number of s-values are discarded from the system. The
114
+ # cutoff parameter tells the indexer how many of these values to keep.
115
+ # A value of 1 for cutoff means that no semantic analysis will take place,
116
+ # turning the LSI class into a simple vector search engine.
117
+ def build_index( cutoff=0.75 )
118
+ return unless needs_rebuild?
119
+ make_word_list
120
+
121
+ doc_list = @items.values
122
+ tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
123
+
124
+ if $GSL
125
+ tdm = GSL::Matrix.alloc(*tda).trans
126
+ ntdm = build_reduced_matrix(tdm, cutoff)
127
+
128
+ ntdm.size[1].times do |col|
129
+ vec = GSL::Vector.alloc( ntdm.column(col) ).row
130
+ doc_list[col].lsi_vector = vec
131
+ doc_list[col].lsi_norm = vec.normalize
132
+ end
133
+ else
134
+ tdm = Matrix.rows(tda).trans
135
+ ntdm = build_reduced_matrix(tdm, cutoff)
136
+
137
+ ntdm.row_size.times do |col|
138
+ doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
139
+ doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
140
+ end
141
+ end
142
+
143
+ @built_at_version = @version
144
+ end
145
+
146
+ # This method returns max_chunks entries, ordered by their average semantic rating.
147
+ # Essentially, the average distance of each entry from all other entries is calculated,
148
+ # the highest are returned.
149
+ #
150
+ # This can be used to build a summary service, or to provide more information about
151
+ # your dataset's general content. For example, if you were to use categorize on the
152
+ # results of this data, you could gather information on what your dataset is generally
153
+ # about.
154
+ def highest_relative_content( max_chunks=10 )
155
+ return [] if needs_rebuild?
156
+
157
+ avg_density = Hash.new
158
+ @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
159
+
160
+ avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
161
+ end
162
+
163
+ # This function is the primitive that find_related and classify
164
+ # build upon. It returns an array of 2-element arrays. The first element
165
+ # of this array is a document, and the second is its "score", defining
166
+ # how "close" it is to other indexed items.
167
+ #
168
+ # These values are somewhat arbitrary, having to do with the vector space
169
+ # created by your content, so the magnitude is interpretable but not always
170
+ # meaningful between indexes.
171
+ #
172
+ # The parameter doc is the content to compare. If that content is not
173
+ # indexed, you can pass an optional block to define how to create the
174
+ # text data. See add_item for examples of how this works.
175
+ def proximity_array_for_content( doc, &block )
176
+ return [] if needs_rebuild?
177
+
178
+ content_node = node_for_content( doc, &block )
179
+ result =
180
+ @items.keys.collect do |item|
181
+ if $GSL
182
+ val = content_node.search_vector * @items[item].search_vector.col
183
+ else
184
+ val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
185
+ end
186
+ [item, val]
187
+ end
188
+ result.sort_by { |x| x[1] }.reverse
189
+ end
190
+
191
+ # Similar to proximity_array_for_content, this function takes similar
192
+ # arguments and returns a similar array. However, it uses the normalized
193
+ # calculated vectors instead of their full versions. This is useful when
194
+ # you're trying to perform operations on content that is much smaller than
195
+ # the text you're working with. search uses this primitive.
196
+ def proximity_norms_for_content( doc, &block )
197
+ return [] if needs_rebuild?
198
+
199
+ content_node = node_for_content( doc, &block )
200
+ result =
201
+ @items.keys.collect do |item|
202
+ if $GSL
203
+ val = content_node.search_norm * @items[item].search_norm.col
204
+ else
205
+ val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
206
+ end
207
+ [item, val]
208
+ end
209
+ result.sort_by { |x| x[1] }.reverse
210
+ end
211
+
212
+ # This function allows for text-based search of your index. Unlike other functions
213
+ # like find_related and classify, search only takes short strings. It will also ignore
214
+ # factors like repeated words. It is best for short, google-like search terms.
215
+ # A search will first prioritize lexical relationships, then semantic ones.
216
+ #
217
+ # While this may seem backwards compared to the other functions that LSI supports,
218
+ # it is actually the same algorithm, just applied on a smaller document.
219
+ def search( string, max_nearest=3 )
220
+ return [] if needs_rebuild?
221
+ carry = proximity_norms_for_content( string )
222
+ result = carry.collect { |x| x[0] }
223
+ return result[0..max_nearest-1]
224
+ end
225
+
226
+ # This function takes content and finds other documents
227
+ # that are semantically "close", returning an array of documents sorted
228
+ # from most to least relevant.
229
+ # max_nearest specifies the number of documents to return. A value of
230
+ # 0 means that it returns all the indexed documents, sorted by relevance.
231
+ #
232
+ # This is particularly useful for identifying clusters in your document space.
233
+ # For example you may want to identify several "What's Related" items for weblog
234
+ # articles, or find paragraphs that relate to each other in an essay.
235
+ def find_related( doc, max_nearest=3, &block )
236
+ carry =
237
+ proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
238
+ result = carry.collect { |x| x[0] }
239
+ return result[0..max_nearest-1]
240
+ end
241
+
242
+ # This function uses a voting system to categorize documents, based on
243
+ # the categories of other documents. It uses the same logic as the
244
+ # find_related function to find related documents, then returns the
245
+ # most obvious category from this list.
246
+ #
247
+ # cutoff signifies the number of documents to consider when classifying
248
+ # text. A cutoff of 1 means that every document in the index votes on
249
+ # what category the document is in. This may not always make sense.
250
+ #
251
+ def classify( doc, cutoff=0.30, &block )
252
+ icutoff = (@items.size * cutoff).round
253
+ carry = proximity_array_for_content( doc, &block )
254
+ carry = carry[0..icutoff-1]
255
+ votes = {}
256
+ carry.each do |pair|
257
+ categories = @items[pair[0]].categories
258
+ categories.each do |category|
259
+ votes[category] ||= 0.0
260
+ votes[category] += pair[1]
261
+ end
262
+ end
263
+
264
+ ranking = votes.keys.sort_by { |x| votes[x] }
265
+ return ranking[-1]
266
+ end
267
+
268
+ # Prototype, only works on indexed documents.
269
+ # I have no clue if this is going to work, but in theory
270
+ # it's supposed to.
271
+ def highest_ranked_stems( doc, count=3 )
272
+ raise "Requested stem ranking on non-indexed content!" unless @items[doc]
273
+ arr = node_for_content(doc).lsi_vector.to_a
274
+ top_n = arr.sort.reverse[0..count-1]
275
+ return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
276
+ end
277
+
278
+ private
279
+ def build_reduced_matrix( matrix, cutoff=0.75 )
280
+ # TODO: Check that M>=N on these dimensions! Transpose helps assure this
281
+ u, v, s = matrix.SV_decomp
282
+
283
+ # TODO: Better than 75% term, please. :\
284
+ s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
285
+ s.size.times do |ord|
286
+ s[ord] = 0.0 if s[ord] < s_cutoff
287
+ end
288
+ # Reconstruct the term document matrix, only with reduced rank
289
+ u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
290
+ end
291
+
292
+ def node_for_content(item, &block)
293
+ if @items[item]
294
+ return @items[item]
295
+ else
296
+ clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
297
+
298
+ cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
299
+
300
+ unless needs_rebuild?
301
+ cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
302
+ end
303
+ end
304
+
305
+ return cn
306
+ end
307
+
308
+ def make_word_list
309
+ @word_list = WordList.new
310
+ @items.each_value do |node|
311
+ node.word_hash.each_key { |key| @word_list.add_word key }
312
+ end
313
+ end
314
+
315
+ end
316
+ end
317
+
@@ -0,0 +1,72 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ module ClassifierReborn
6
+
7
+ # This is an internal data structure class for the LSI node. Save for
8
+ # raw_vector_with, it should be fairly straightforward to understand.
9
+ # You should never have to use it directly.
10
+ class ContentNode
11
+ attr_accessor :raw_vector, :raw_norm,
12
+ :lsi_vector, :lsi_norm,
13
+ :categories
14
+
15
+ attr_reader :word_hash
16
+ # If text_proc is not specified, the source will be duck-typed
17
+ # via source.to_s
18
+ def initialize( word_hash, *categories )
19
+ @categories = categories || []
20
+ @word_hash = word_hash
21
+ end
22
+
23
+ # Use this to fetch the appropriate search vector.
24
+ def search_vector
25
+ @lsi_vector || @raw_vector
26
+ end
27
+
28
+ # Use this to fetch the appropriate search vector in normalized form.
29
+ def search_norm
30
+ @lsi_norm || @raw_norm
31
+ end
32
+
33
+ # Creates the raw vector out of word_hash using word_list as the
34
+ # key for mapping the vector space.
35
+ def raw_vector_with( word_list )
36
+ if $GSL
37
+ vec = GSL::Vector.alloc(word_list.size)
38
+ else
39
+ vec = Array.new(word_list.size, 0)
40
+ end
41
+
42
+ @word_hash.each_key do |word|
43
+ vec[word_list[word]] = @word_hash[word] if word_list[word]
44
+ end
45
+
46
+ # Perform the scaling transform
47
+ total_words = vec.sum
48
+
49
+ # Perform first-order association transform if this vector has more
50
+ # than one word in it.
51
+ if total_words > 1.0
52
+ weighted_total = 0.0
53
+ vec.each do |term|
54
+ if ( term > 0 )
55
+ weighted_total += (( term / total_words ) * Math.log( term / total_words ))
56
+ end
57
+ end
58
+ vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
59
+ end
60
+
61
+ if $GSL
62
+ @raw_norm = vec.normalize
63
+ @raw_vector = vec
64
+ else
65
+ @raw_norm = Vector[*vec].normalize
66
+ @raw_vector = Vector[*vec]
67
+ end
68
+ end
69
+
70
+ end
71
+
72
+ end
@@ -0,0 +1,31 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
class String
  # Returns a summary of up to count sentences, joined by separator.
  def summary( count=10, separator=" [...] " )
    perform_lsi split_sentences, count, separator
  end

  # Returns a summary of up to count paragraphs, joined by separator.
  def paragraph_summary( count=1, separator=" [...] " )
    perform_lsi split_paragraphs, count, separator
  end

  def split_sentences
    split /(\.|\!|\?)/ # TODO: make this less primitive
  end

  def split_paragraphs
    split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
  end

  private

  # Indexes the non-trivial chunks, then joins the highest-rated ones.
  # Skips empty chunks and one-word fragments (e.g. the punctuation
  # tokens captured by split_sentences' groups).
  #
  # Fixes the original, which ended with
  # summaries.reject { |chunk| !summaries.include? chunk } — a no-op,
  # since every element of summaries trivially includes itself.
  def perform_lsi(chunks, count, separator)
    lsi = ClassifierReborn::LSI.new :auto_rebuild => false
    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
    lsi.build_index
    summaries = lsi.highest_relative_content count
    summaries.map { |chunk| chunk.strip }.join(separator)
  end
end
@@ -0,0 +1,36 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ module ClassifierReborn
6
+ # This class keeps a word => index mapping. It is used to map stemmed words
7
+ # to dimensions of a vector.
8
+
9
+ class WordList
10
+ def initialize
11
+ @location_table = Hash.new
12
+ end
13
+
14
+ # Adds a word (if it is new) and assigns it a unique dimension.
15
+ def add_word(word)
16
+ term = word
17
+ @location_table[term] = @location_table.size unless @location_table[term]
18
+ end
19
+
20
+ # Returns the dimension of the word or nil if the word is not in the space.
21
+ def [](lookup)
22
+ term = lookup
23
+ @location_table[term]
24
+ end
25
+
26
+ def word_for_index(ind)
27
+ @location_table.invert[ind]
28
+ end
29
+
30
+ # Returns the number of words mapped.
31
+ def size
32
+ @location_table.size
33
+ end
34
+
35
+ end
36
+ end