classifier-reborn 2.0.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,317 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ begin
6
+ raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
+
8
+ require 'gsl' # requires http://rb-gsl.rubyforge.org/
9
+ require 'classifier-reborn/extensions/vector_serialize'
10
+ $GSL = true
11
+
12
+ rescue LoadError
13
+ require 'classifier-reborn/extensions/vector'
14
+ end
15
+
16
+ require 'classifier-reborn/lsi/word_list'
17
+ require 'classifier-reborn/lsi/content_node'
18
+ require 'classifier-reborn/lsi/summary'
19
+
20
+ module ClassifierReborn
21
+
22
+ # This class implements a Latent Semantic Indexer, which can search, classify and cluster
23
+ # data based on underlying semantic relations. For more information on the algorithms used,
24
+ # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
25
+ class LSI
26
+
27
+ attr_reader :word_list
28
+ attr_accessor :auto_rebuild
29
+
30
+ # Create a fresh index.
31
+ # If you want to call #build_index manually, use
32
+ # ClassifierReborn::LSI.new :auto_rebuild => false
33
+ #
34
+ def initialize(options = {})
35
+ @auto_rebuild = true unless options[:auto_rebuild] == false
36
+ @word_list, @items = WordList.new, {}
37
+ @version, @built_at_version = 0, -1
38
+ end
39
+
40
+ # Returns true if the index needs to be rebuilt. The index needs
41
+ # to be built after all informaton is added, but before you start
42
+ # using it for search, classification and cluster detection.
43
+ def needs_rebuild?
44
+ (@items.keys.size > 1) && (@version != @built_at_version)
45
+ end
46
+
47
+ # Adds an item to the index. item is assumed to be a string, but
48
+ # any item may be indexed so long as it responds to #to_s or if
49
+ # you provide an optional block explaining how the indexer can
50
+ # fetch fresh string data. This optional block is passed the item,
51
+ # so the item may only be a reference to a URL or file name.
52
+ #
53
+ # For example:
54
+ # lsi = ClassifierReborn::LSI.new
55
+ # lsi.add_item "This is just plain text"
56
+ # lsi.add_item "/home/me/filename.txt" { |x| File.read x }
57
+ # ar = ActiveRecordObject.find( :all )
58
+ # lsi.add_item ar, *ar.categories { |x| ar.content }
59
+ #
60
+ def add_item( item, *categories, &block )
61
+ clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
62
+ @items[item] = ContentNode.new(clean_word_hash, *categories)
63
+ @version += 1
64
+ build_index if @auto_rebuild
65
+ end
66
+
67
+ # A less flexible shorthand for add_item that assumes
68
+ # you are passing in a string with no categorries. item
69
+ # will be duck typed via to_s .
70
+ #
71
+ def <<( item )
72
+ add_item item
73
+ end
74
+
75
+ # Returns the categories for a given indexed items. You are free to add and remove
76
+ # items from this as you see fit. It does not invalide an index to change its categories.
77
+ def categories_for(item)
78
+ return [] unless @items[item]
79
+ return @items[item].categories
80
+ end
81
+
82
+ # Removes an item from the database, if it is indexed.
83
+ #
84
+ def remove_item( item )
85
+ if @items.keys.contain? item
86
+ @items.remove item
87
+ @version += 1
88
+ end
89
+ end
90
+
91
+ # Returns an array of items that are indexed.
92
+ def items
93
+ @items.keys
94
+ end
95
+
96
+ # Returns the categories for a given indexed items. You are free to add and remove
97
+ # items from this as you see fit. It does not invalide an index to change its categories.
98
    # NOTE(review): this is an exact duplicate of categories_for defined
    # earlier in this class; being later, it silently redefines the former.
    # One of the two definitions should be removed.
    def categories_for(item)
      return [] unless @items[item] # unindexed items have no categories
      return @items[item].categories
    end
102
+
103
+ # This function rebuilds the index if needs_rebuild? returns true.
104
+ # For very large document spaces, this indexing operation may take some
105
+ # time to complete, so it may be wise to place the operation in another
106
+ # thread.
107
+ #
108
+ # As a rule, indexing will be fairly swift on modern machines until
109
+ # you have well over 500 documents indexed, or have an incredibly diverse
110
+ # vocabulary for your documents.
111
+ #
112
+ # The optional parameter "cutoff" is a tuning parameter. When the index is
113
+ # built, a certain number of s-values are discarded from the system. The
114
+ # cutoff parameter tells the indexer how many of these values to keep.
115
+ # A value of 1 for cutoff means that no semantic analysis will take place,
116
+ # turning the LSI class into a simple vector search engine.
117
    # Rebuilds the LSI index if needs_rebuild? is true (no-op otherwise).
    # Regenerates the word list, builds the term-document matrix from each
    # item's raw vector, rank-reduces it via build_reduced_matrix, and stores
    # the resulting lsi_vector / lsi_norm on every ContentNode.
    # cutoff (0..1) is the fraction of singular values kept; 1 disables the
    # semantic reduction, leaving a plain vector search engine.
    def build_index( cutoff=0.75 )
      return unless needs_rebuild?
      make_word_list

      doc_list = @items.values
      # One column per document, one row per word (after the transpose below).
      tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }

      if $GSL
        tdm = GSL::Matrix.alloc(*tda).trans
        ntdm = build_reduced_matrix(tdm, cutoff)

        # GSL matrices report [rows, cols]; walk the document columns.
        ntdm.size[1].times do |col|
          vec = GSL::Vector.alloc( ntdm.column(col) ).row
          doc_list[col].lsi_vector = vec
          doc_list[col].lsi_norm = vec.normalize
        end
      else
        # Pure-Ruby path: ::Matrix plus the gem's vector extensions.
        tdm = Matrix.rows(tda).trans
        ntdm = build_reduced_matrix(tdm, cutoff)

        # NOTE(review): iterates row_size (not column_size) here — presumably
        # relies on the matrix being at least as tall as it is wide; the
        # `if doc_list[col]` guards cover the mismatch. Confirm intent.
        ntdm.row_size.times do |col|
          doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
          doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
        end
      end

      # Mark the index fresh for this data version.
      @built_at_version = @version
    end
145
+
146
+ # This method returns max_chunks entries, ordered by their average semantic rating.
147
+ # Essentially, the average distance of each entry from all other entries is calculated,
148
+ # the highest are returned.
149
+ #
150
+ # This can be used to build a summary service, or to provide more information about
151
+ # your dataset's general content. For example, if you were to use categorize on the
152
+ # results of this data, you could gather information on what your dataset is generally
153
+ # about.
154
+ def highest_relative_content( max_chunks=10 )
155
+ return [] if needs_rebuild?
156
+
157
+ avg_density = Hash.new
158
+ @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
159
+
160
+ avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
161
+ end
162
+
163
+ # This function is the primitive that find_related and classify
164
+ # build upon. It returns an array of 2-element arrays. The first element
165
+ # of this array is a document, and the second is its "score", defining
166
+ # how "close" it is to other indexed items.
167
+ #
168
+ # These values are somewhat arbitrary, having to do with the vector space
169
+ # created by your content, so the magnitude is interpretable but not always
170
+ # meaningful between indexes.
171
+ #
172
+ # The parameter doc is the content to compare. If that content is not
173
+ # indexed, you can pass an optional block to define how to create the
174
+ # text data. See add_item for examples of how this works.
175
+ def proximity_array_for_content( doc, &block )
176
+ return [] if needs_rebuild?
177
+
178
+ content_node = node_for_content( doc, &block )
179
+ result =
180
+ @items.keys.collect do |item|
181
+ if $GSL
182
+ val = content_node.search_vector * @items[item].search_vector.col
183
+ else
184
+ val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
185
+ end
186
+ [item, val]
187
+ end
188
+ result.sort_by { |x| x[1] }.reverse
189
+ end
190
+
191
+ # Similar to proximity_array_for_content, this function takes similar
192
+ # arguments and returns a similar array. However, it uses the normalized
193
+ # calculated vectors instead of their full versions. This is useful when
194
+ # you're trying to perform operations on content that is much smaller than
195
+ # the text you're working with. search uses this primitive.
196
+ def proximity_norms_for_content( doc, &block )
197
+ return [] if needs_rebuild?
198
+
199
+ content_node = node_for_content( doc, &block )
200
+ result =
201
+ @items.keys.collect do |item|
202
+ if $GSL
203
+ val = content_node.search_norm * @items[item].search_norm.col
204
+ else
205
+ val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
206
+ end
207
+ [item, val]
208
+ end
209
+ result.sort_by { |x| x[1] }.reverse
210
+ end
211
+
212
+ # This function allows for text-based search of your index. Unlike other functions
213
+ # like find_related and classify, search only takes short strings. It will also ignore
214
+ # factors like repeated words. It is best for short, google-like search terms.
215
+ # A search will first priortize lexical relationships, then semantic ones.
216
+ #
217
+ # While this may seem backwards compared to the other functions that LSI supports,
218
+ # it is actually the same algorithm, just applied on a smaller document.
219
+ def search( string, max_nearest=3 )
220
+ return [] if needs_rebuild?
221
+ carry = proximity_norms_for_content( string )
222
+ result = carry.collect { |x| x[0] }
223
+ return result[0..max_nearest-1]
224
+ end
225
+
226
+ # This function takes content and finds other documents
227
+ # that are semantically "close", returning an array of documents sorted
228
+ # from most to least relavant.
229
+ # max_nearest specifies the number of documents to return. A value of
230
+ # 0 means that it returns all the indexed documents, sorted by relavence.
231
+ #
232
+ # This is particularly useful for identifing clusters in your document space.
233
+ # For example you may want to identify several "What's Related" items for weblog
234
+ # articles, or find paragraphs that relate to each other in an essay.
235
+ def find_related( doc, max_nearest=3, &block )
236
+ carry =
237
+ proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
238
+ result = carry.collect { |x| x[0] }
239
+ return result[0..max_nearest-1]
240
+ end
241
+
242
+ # This function uses a voting system to categorize documents, based on
243
+ # the categories of other documents. It uses the same logic as the
244
+ # find_related function to find related documents, then returns the
245
+ # most obvious category from this list.
246
+ #
247
+ # cutoff signifies the number of documents to consider when clasifying
248
+ # text. A cutoff of 1 means that every document in the index votes on
249
+ # what category the document is in. This may not always make sense.
250
+ #
251
+ def classify( doc, cutoff=0.30, &block )
252
+ icutoff = (@items.size * cutoff).round
253
+ carry = proximity_array_for_content( doc, &block )
254
+ carry = carry[0..icutoff-1]
255
+ votes = {}
256
+ carry.each do |pair|
257
+ categories = @items[pair[0]].categories
258
+ categories.each do |category|
259
+ votes[category] ||= 0.0
260
+ votes[category] += pair[1]
261
+ end
262
+ end
263
+
264
+ ranking = votes.keys.sort_by { |x| votes[x] }
265
+ return ranking[-1]
266
+ end
267
+
268
+ # Prototype, only works on indexed documents.
269
+ # I have no clue if this is going to work, but in theory
270
+ # it's supposed to.
271
+ def highest_ranked_stems( doc, count=3 )
272
+ raise "Requested stem ranking on non-indexed content!" unless @items[doc]
273
+ arr = node_for_content(doc).lsi_vector.to_a
274
+ top_n = arr.sort.reverse[0..count-1]
275
+ return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
276
+ end
277
+
278
+ private
279
    # Rank-reduces a term-document matrix via singular value decomposition:
    # the smallest singular values (below the cutoff fraction) are zeroed and
    # the matrix is reconstructed, which is the core LSI approximation.
    # NOTE(review): SV_decomp on plain ::Matrix is supplied by the gem's
    # 'classifier-reborn/extensions/vector' required at the top of the file.
    def build_reduced_matrix( matrix, cutoff=0.75 )
      # TODO: Check that M>=N on these dimensions! Transpose helps assure this
      u, v, s = matrix.SV_decomp

      # TODO: Better than 75% term, please. :\
      # Keep the largest (size * cutoff) singular values; zero the rest.
      s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
      s.size.times do |ord|
        s[ord] = 0.0 if s[ord] < s_cutoff
      end
      # Reconstruct the term document matrix, only with reduced rank
      u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
    end
291
+
292
+ def node_for_content(item, &block)
293
+ if @items[item]
294
+ return @items[item]
295
+ else
296
+ clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
297
+
298
+ cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
299
+
300
+ unless needs_rebuild?
301
+ cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
302
+ end
303
+ end
304
+
305
+ return cn
306
+ end
307
+
308
+ def make_word_list
309
+ @word_list = WordList.new
310
+ @items.each_value do |node|
311
+ node.word_hash.each_key { |key| @word_list.add_word key }
312
+ end
313
+ end
314
+
315
+ end
316
+ end
317
+
@@ -0,0 +1,72 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ module ClassifierReborn
6
+
7
+ # This is an internal data structure class for the LSI node. Save for
8
+ # raw_vector_with, it should be fairly straightforward to understand.
9
+ # You should never have to use it directly.
10
+ class ContentNode
11
+ attr_accessor :raw_vector, :raw_norm,
12
+ :lsi_vector, :lsi_norm,
13
+ :categories
14
+
15
+ attr_reader :word_hash
16
+ # If text_proc is not specified, the source will be duck-typed
17
+ # via source.to_s
18
+ def initialize( word_hash, *categories )
19
+ @categories = categories || []
20
+ @word_hash = word_hash
21
+ end
22
+
23
+ # Use this to fetch the appropriate search vector.
24
+ def search_vector
25
+ @lsi_vector || @raw_vector
26
+ end
27
+
28
+ # Use this to fetch the appropriate search vector in normalized form.
29
+ def search_norm
30
+ @lsi_norm || @raw_norm
31
+ end
32
+
33
+ # Creates the raw vector out of word_hash using word_list as the
34
+ # key for mapping the vector space.
35
+ def raw_vector_with( word_list )
36
+ if $GSL
37
+ vec = GSL::Vector.alloc(word_list.size)
38
+ else
39
+ vec = Array.new(word_list.size, 0)
40
+ end
41
+
42
+ @word_hash.each_key do |word|
43
+ vec[word_list[word]] = @word_hash[word] if word_list[word]
44
+ end
45
+
46
+ # Perform the scaling transform
47
+ total_words = vec.sum
48
+
49
+ # Perform first-order association transform if this vector has more
50
+ # than one word in it.
51
+ if total_words > 1.0
52
+ weighted_total = 0.0
53
+ vec.each do |term|
54
+ if ( term > 0 )
55
+ weighted_total += (( term / total_words ) * Math.log( term / total_words ))
56
+ end
57
+ end
58
+ vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
59
+ end
60
+
61
+ if $GSL
62
+ @raw_norm = vec.normalize
63
+ @raw_vector = vec
64
+ else
65
+ @raw_norm = Vector[*vec].normalize
66
+ @raw_vector = Vector[*vec]
67
+ end
68
+ end
69
+
70
+ end
71
+
72
+ end
@@ -0,0 +1,31 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
# LSI-backed summarization helpers monkey-patched onto String.
class String
  # Returns a summary of up to count sentences, joined by separator.
  def summary( count=10, separator=" [...] " )
    perform_lsi split_sentences, count, separator
  end

  # Returns a summary of up to count paragraphs, joined by separator.
  def paragraph_summary( count=1, separator=" [...] " )
    perform_lsi split_paragraphs, count, separator
  end

  # NOTE: the capture group makes split keep each punctuation mark as its
  # own array entry; perform_lsi discards those one-token chunks.
  def split_sentences
    split /(\.|\!|\?)/ # TODO: make this less primitive
  end

  def split_paragraphs
    split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
  end

  private

  # Indexes every non-trivial chunk in a throwaway LSI, then joins the
  # highest-rated chunks into the summary string.
  # Fix: the original filtered with
  # `summaries.reject { |chunk| !summaries.include? chunk }` — a provable
  # no-op (every element is included in its own array) — now removed.
  def perform_lsi(chunks, count, separator)
    lsi = ClassifierReborn::LSI.new :auto_rebuild => false
    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
    lsi.build_index
    summaries = lsi.highest_relative_content count
    summaries.map { |chunk| chunk.strip }.join(separator)
  end
end
@@ -0,0 +1,36 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ module ClassifierReborn
6
+ # This class keeps a word => index mapping. It is used to map stemmed words
7
+ # to dimensions of a vector.
8
+
9
+ class WordList
10
+ def initialize
11
+ @location_table = Hash.new
12
+ end
13
+
14
+ # Adds a word (if it is new) and assigns it a unique dimension.
15
+ def add_word(word)
16
+ term = word
17
+ @location_table[term] = @location_table.size unless @location_table[term]
18
+ end
19
+
20
+ # Returns the dimension of the word or nil if the word is not in the space.
21
+ def [](lookup)
22
+ term = lookup
23
+ @location_table[term]
24
+ end
25
+
26
+ def word_for_index(ind)
27
+ @location_table.invert[ind]
28
+ end
29
+
30
+ # Returns the number of words mapped.
31
+ def size
32
+ @location_table.size
33
+ end
34
+
35
+ end
36
+ end