classifier-fork 1.3.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,20 @@
1
module GSL

  class Vector
    # Marshal support: serialize this vector by dumping its Array form.
    # (The depth argument required by the Marshal protocol is unused.)
    def _dump(v)
      Marshal.dump(to_a)
    end

    # Marshal support: rebuild a Vector from the Array produced by _dump.
    def self._load(arr)
      elements = Marshal.load(arr)
      GSL::Vector.alloc(elements)
    end
  end

  class Matrix
    class << self
      # Short alias mirroring Ruby's Matrix.diagonal naming.
      alias_method :diag, :diagonal
    end
  end
end
@@ -0,0 +1,136 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ require "set"
6
+
7
+ # These are extensions to the String class to provide convenience
8
+ # methods for the Classifier package.
9
# These are extensions to the String class to provide convenience
# methods for the Classifier package.
class String

  # Common English words (plus a few stemming artifacts such as "sinc"
  # and "th") that carry no classification signal and are skipped when
  # building word hashes.
  CORPUS_SKIP_WORDS = Set.new(%w[
    a again all along are also an and as at but by came can cant couldnt
    did didn didnt do doesnt dont ever first from have her here him how
    i if in into is isnt it itll just last least like most my new no not
    now of on or should sinc so some th than this that the their then
    those to told too true try until url us were when whether while with
    within yes you youll
  ])

  # Removes common punctuation symbols, returning a new string.
  # E.g.,
  #   "Hello (greeting's), with {braces} < >...?".without_punctuation
  #   => "Hello greetings with braces "
  def without_punctuation
    tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " ").tr("'\-", "")
  end

  # Return a Hash of strings => ints. Each word in the string is stemmed,
  # interned, and indexes to its frequency in the document.
  def word_hash
    clean_word_hash.merge(word_hash_for_symbols(gsub(/[\w]/, " ").split))
  end

  # Return a word hash without extra punctuation or short symbols, just stemmed words.
  def clean_word_hash
    word_hash_for_words gsub(/[^\w\s]/, "").split
  end

  private

  # Count stemmed, downcased words, skipping stop words and words of
  # two characters or fewer. Note: downcase! mutates the given strings
  # in place, matching the original behavior.
  def word_hash_for_words(words)
    words.each_with_object(Hash.new(0)) do |word, counts|
      word.downcase!
      next if CORPUS_SKIP_WORDS.include?(word) || word.length <= 2
      counts[word.stem.intern] += 1
    end
  end

  # Count occurrences of each symbol token, keyed by interned symbol.
  def word_hash_for_symbols(words)
    words.each_with_object(Hash.new(0)) do |word, counts|
      counts[word.intern] += 1
    end
  end

end
@@ -0,0 +1,318 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
+ begin
6
+ raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
+
8
+ require 'gsl' # requires http://rb-gsl.rubyforge.org/
9
+ require 'classifier/extensions/vector_serialize'
10
+ $GSL = true
11
+
12
+ rescue LoadError
13
+ warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
14
+ require 'classifier/extensions/vector'
15
+ end
16
+
17
+ require 'classifier/lsi/word_list'
18
+ require 'classifier/lsi/content_node'
19
+ require 'classifier/lsi/summary'
20
+
21
module Classifier

  # This class implements a Latent Semantic Indexer, which can search, classify and cluster
  # data based on underlying semantic relations. For more information on the algorithms used,
  # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
  class LSI

    attr_reader :word_list
    attr_accessor :auto_rebuild

    # Create a fresh index.
    # If you want to call #build_index manually, use
    #      Classifier::LSI.new :auto_rebuild => false
    #
    def initialize(options = {})
      @auto_rebuild = true unless options[:auto_rebuild] == false
      @word_list, @items = WordList.new, {}
      @version, @built_at_version = 0, -1
    end

    # Returns true if the index needs to be rebuilt. The index needs
    # to be built after all information is added, but before you start
    # using it for search, classification and cluster detection.
    def needs_rebuild?
      (@items.keys.size > 1) && (@version != @built_at_version)
    end

    # Adds an item to the index. item is assumed to be a string, but
    # any item may be indexed so long as it responds to #to_s or if
    # you provide an optional block explaining how the indexer can
    # fetch fresh string data. This optional block is passed the item,
    # so the item may only be a reference to a URL or file name.
    #
    # For example:
    #   lsi = Classifier::LSI.new
    #   lsi.add_item "This is just plain text"
    #   lsi.add_item "/home/me/filename.txt" { |x| File.read x }
    #   ar = ActiveRecordObject.find( :all )
    #   lsi.add_item ar, *ar.categories { |x| ar.content }
    #
    def add_item( item, *categories, &block )
      clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
      @items[item] = ContentNode.new(clean_word_hash, *categories)
      @version += 1
      build_index if @auto_rebuild
    end

    # A less flexible shorthand for add_item that assumes
    # you are passing in a string with no categories. item
    # will be duck typed via to_s .
    #
    def <<( item )
      add_item item
    end

    # Returns the categories for a given indexed item. You are free to add and remove
    # items from this as you see fit. It does not invalidate an index to change its categories.
    # (A second, byte-identical definition of this method later in the class was removed.)
    def categories_for(item)
      return [] unless @items[item]
      return @items[item].categories
    end

    # Removes an item from the database, if it is indexed.
    # Fixed: the original called Array#contain? and Hash#remove, neither of
    # which exists in Ruby, so every call raised NoMethodError. It now uses
    # Hash#key? and Hash#delete.
    def remove_item( item )
      if @items.key? item
        @items.delete item
        @version += 1
      end
    end

    # Returns an array of items that are indexed.
    def items
      @items.keys
    end

    # This function rebuilds the index if needs_rebuild? returns true.
    # For very large document spaces, this indexing operation may take some
    # time to complete, so it may be wise to place the operation in another
    # thread.
    #
    # As a rule, indexing will be fairly swift on modern machines until
    # you have well over 500 documents indexed, or have an incredibly diverse
    # vocabulary for your documents.
    #
    # The optional parameter "cutoff" is a tuning parameter. When the index is
    # built, a certain number of s-values are discarded from the system. The
    # cutoff parameter tells the indexer how many of these values to keep.
    # A value of 1 for cutoff means that no semantic analysis will take place,
    # turning the LSI class into a simple vector search engine.
    def build_index( cutoff=0.75 )
      return unless needs_rebuild?
      make_word_list

      doc_list = @items.values
      tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }

      if $GSL
        tdm = GSL::Matrix.alloc(*tda).trans
        ntdm = build_reduced_matrix(tdm, cutoff)

        ntdm.size[1].times do |col|
          vec = GSL::Vector.alloc( ntdm.column(col) ).row
          doc_list[col].lsi_vector = vec
          doc_list[col].lsi_norm = vec.normalize
        end
      else
        tdm = Matrix.rows(tda).trans
        ntdm = build_reduced_matrix(tdm, cutoff)

        ntdm.row_size.times do |col|
          doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
          doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
        end
      end

      @built_at_version = @version
    end

    # This method returns max_chunks entries, ordered by their average semantic rating.
    # Essentially, the average distance of each entry from all other entries is calculated,
    # the highest are returned.
    #
    # This can be used to build a summary service, or to provide more information about
    # your dataset's general content. For example, if you were to use categorize on the
    # results of this data, you could gather information on what your dataset is generally
    # about.
    #
    # Fixed: the original pipeline ended with a blockless .map, which returned
    # an Enumerator rather than the array of items.
    def highest_relative_content( max_chunks=10 )
      return [] if needs_rebuild?

      avg_density = Hash.new
      @items.each_key do |item|
        avg_density[item] = proximity_array_for_content(item).inject(0.0) { |sum, pair| sum + pair[1] }
      end

      avg_density.keys.sort_by { |item| avg_density[item] }.reverse[0..max_chunks-1]
    end

    # This function is the primitive that find_related and classify
    # build upon. It returns an array of 2-element arrays. The first element
    # of this array is a document, and the second is its "score", defining
    # how "close" it is to other indexed items.
    #
    # These values are somewhat arbitrary, having to do with the vector space
    # created by your content, so the magnitude is interpretable but not always
    # meaningful between indexes.
    #
    # The parameter doc is the content to compare. If that content is not
    # indexed, you can pass an optional block to define how to create the
    # text data. See add_item for examples of how this works.
    def proximity_array_for_content( doc, &block )
      return [] if needs_rebuild?

      content_node = node_for_content( doc, &block )
      result =
        @items.keys.collect do |item|
          if $GSL
            val = content_node.search_vector * @items[item].search_vector.col
          else
            val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
          end
          [item, val]
        end
      result.sort_by { |pair| pair[1] }.reverse
    end

    # Similar to proximity_array_for_content, this function takes similar
    # arguments and returns a similar array. However, it uses the normalized
    # calculated vectors instead of their full versions. This is useful when
    # you're trying to perform operations on content that is much smaller than
    # the text you're working with. search uses this primitive.
    def proximity_norms_for_content( doc, &block )
      return [] if needs_rebuild?

      content_node = node_for_content( doc, &block )
      result =
        @items.keys.collect do |item|
          if $GSL
            val = content_node.search_norm * @items[item].search_norm.col
          else
            val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
          end
          [item, val]
        end
      result.sort_by { |pair| pair[1] }.reverse
    end

    # This function allows for text-based search of your index. Unlike other functions
    # like find_related and classify, search only takes short strings. It will also ignore
    # factors like repeated words. It is best for short, google-like search terms.
    # A search will first prioritize lexical relationships, then semantic ones.
    #
    # While this may seem backwards compared to the other functions that LSI supports,
    # it is actually the same algorithm, just applied on a smaller document.
    def search( string, max_nearest=3 )
      return [] if needs_rebuild?
      carry = proximity_norms_for_content( string )
      result = carry.collect { |pair| pair[0] }
      return result[0..max_nearest-1]
    end

    # This function takes content and finds other documents
    # that are semantically "close", returning an array of documents sorted
    # from most to least relevant.
    # max_nearest specifies the number of documents to return. A value of
    # 0 means that it returns all the indexed documents, sorted by relevance.
    #
    # This is particularly useful for identifying clusters in your document space.
    # For example you may want to identify several "What's Related" items for weblog
    # articles, or find paragraphs that relate to each other in an essay.
    def find_related( doc, max_nearest=3, &block )
      carry =
        proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
      result = carry.collect { |pair| pair[0] }
      return result[0..max_nearest-1]
    end

    # This function uses a voting system to categorize documents, based on
    # the categories of other documents. It uses the same logic as the
    # find_related function to find related documents, then returns the
    # most obvious category from this list.
    #
    # cutoff signifies the number of documents to consider when classifying
    # text. A cutoff of 1 means that every document in the index votes on
    # what category the document is in. This may not always make sense.
    #
    def classify( doc, cutoff=0.30, &block )
      icutoff = (@items.size * cutoff).round
      carry = proximity_array_for_content( doc, &block )
      carry = carry[0..icutoff-1]
      votes = {}
      carry.each do |pair|
        categories = @items[pair[0]].categories
        categories.each do |category|
          votes[category] ||= 0.0
          votes[category] += pair[1]
        end
      end

      ranking = votes.keys.sort_by { |x| votes[x] }
      return ranking[-1]
    end

    # Prototype, only works on indexed documents.
    # I have no clue if this is going to work, but in theory
    # it's supposed to.
    def highest_ranked_stems( doc, count=3 )
      raise "Requested stem ranking on non-indexed content!" unless @items[doc]
      arr = node_for_content(doc).lsi_vector.to_a
      top_n = arr.sort.reverse[0..count-1]
      return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
    end

    private

    # Zero out the smallest singular values (below the cutoff quantile) and
    # reconstruct the term-document matrix at the resulting reduced rank.
    def build_reduced_matrix( matrix, cutoff=0.75 )
      # TODO: Check that M>=N on these dimensions! Transpose helps assure this
      u, v, s = matrix.SV_decomp

      # TODO: Better than 75% term, please. :\
      s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
      s.size.times do |ord|
        s[ord] = 0.0 if s[ord] < s_cutoff
      end
      # Reconstruct the term document matrix, only with reduced rank
      u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
    end

    # Return the indexed node for item, or build a transient ContentNode
    # for unindexed content (using the optional block to fetch text data).
    def node_for_content(item, &block)
      if @items[item]
        return @items[item]
      else
        clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash

        cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data

        unless needs_rebuild?
          cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
        end
      end

      return cn
    end

    # Rebuild @word_list from the word hashes of every indexed node.
    def make_word_list
      @word_list = WordList.new
      @items.each_value do |node|
        node.word_hash.each_key { |key| @word_list.add_word key }
      end
    end

  end
end
318
+