otherinbox-classifier 1.3.1.1

@@ -0,0 +1,20 @@
+ module GSL
+
+   class Vector
+     def _dump(depth)
+       Marshal.dump( self.to_a )
+     end
+
+     def self._load(arr)
+       arry = Marshal.load(arr)
+       return GSL::Vector.alloc(arry)
+     end
+
+   end
+
+   class Matrix
+     class << self
+       alias :diag :diagonal
+     end
+   end
+ end
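
A quick round-trip check of the Marshal hooks above, as a minimal sketch; it assumes the rb-gsl gem is installed and this extension file is on the load path:

    require 'gsl'
    require 'classifier/extensions/vector_serialize'

    v = GSL::Vector.alloc([1.0, 2.0, 3.0])
    copy = Marshal.load(Marshal.dump(v)) # _dump writes self.to_a; _load rebuilds via GSL::Vector.alloc
    copy.to_a                            # => [1.0, 2.0, 3.0]

The `diag` alias gives GSL::Matrix a constructor named like the stdlib Matrix.diag, so code later in this gem (build_reduced_matrix in lsi.rb) can call a `diag` class method under either backend.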
@@ -0,0 +1,156 @@
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
+ # License:: LGPL
+
+ require 'set' # Array#to_set (used below for CORPUS_SKIP_WORDS) lives in the Set library
+
+ # These are extensions to the String class to provide convenience
+ # methods for the Classifier package.
+ class String
+
+   # Removes common punctuation symbols, returning a new string.
+   # E.g.,
+   #   "Hello (greeting's), with {braces} < >...?".without_punctuation
+   #   => "Hello greetings with braces "
+   def without_punctuation
+     tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ).tr( "'\-", "" )
+   end
+
+   # Return a Hash of symbols => ints. Each word in the string is
+   # interned and mapped to its frequency in the document.
+   def word_hash
+     word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
+   end
+
+   # Return a word hash without extra punctuation or short symbols, just the remaining words
+   def clean_word_hash
+     word_hash_for_words gsub(/[^\w\s]/,"").split
+   end
+
+   private
+
+   def word_hash_for_words(words)
+     d = Hash.new
+     words.each do |word|
+       word.downcase! if word =~ /[\w]+/
+       # Don't stem words -- most sources I've read say that it's more hurtful than helpful and my tests found the same thing
+       #key = word.stem.intern
+       # Ignore a word if it has no word chars, contains a digit, is 2 characters or shorter, or is in the skip list
+       if word =~ /\w/ && word !~ /\d+/ && word.length > 2 && !CORPUS_SKIP_WORDS.include?(word)
+         key = word.intern
+         d[key] ||= 0
+         d[key] += 1
+       end
+     end
+     return d
+   end
+
+   CORPUS_SKIP_WORDS = [
+     # Hopefully all of the HTML is gone by this point, but broken HTML is out there
+     # and we can't easily strip it all. Stray markup really confuses the classifier,
+     # so as a last line of defense we remove common HTML artifacts here.
+     # TODO: replace with a definitive and less haphazard list
+     "href",
+     "http",
+     "https",
+     "alt",
+     "coords",
+     "nbsp",
+     "target",
+     "com",
+     "net",
+     "org",
+     "shape",
+     "rect",
+     "apos",
+     "quot",
+     "bull",
+     "html",
+     "www",
+     # These are OI specific but are still showing up as a result of broken HTML
+     "otherinbox",
+     "beta",
+     "blank",
+
+     "a",
+     "again",
+     "all",
+     "along",
+     "are",
+     "also",
+     "an",
+     "and",
+     "as",
+     "at",
+     "but",
+     "by",
+     "came",
+     "can",
+     "cant",
+     "couldnt",
+     "did",
+     "didn",
+     "didnt",
+     "do",
+     "doesnt",
+     "dont",
+     "ever",
+     "first",
+     "from",
+     "have",
+     "her",
+     "here",
+     "him",
+     "how",
+     "i",
+     "if",
+     "in",
+     "into",
+     "is",
+     "isnt",
+     "it",
+     "itll",
+     "just",
+     "last",
+     "least",
+     "like",
+     "most",
+     "my",
+     "new",
+     "no",
+     "not",
+     "now",
+     "of",
+     "on",
+     "or",
+     "should",
+     "sinc",
+     "so",
+     "some",
+     "th",
+     "than",
+     "this",
+     "that",
+     "the",
+     "their",
+     "then",
+     "those",
+     "to",
+     "told",
+     "too",
+     "true",
+     "try",
+     "until",
+     "url",
+     "us",
+     "were",
+     "when",
+     "whether",
+     "while",
+     "with",
+     "within",
+     "yes",
+     "you",
+     "youll",
+   ].to_set
+ end
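
What the String extensions above do in practice, as a small sketch; it assumes `require 'classifier'` loads the gem, including these extensions:

    require 'classifier'

    "The quick brown fox jumped over the fence".clean_word_hash
    # => {:quick=>1, :brown=>1, :fox=>1, :jumped=>1, :over=>1, :fence=>1}
    # "the" is in CORPUS_SKIP_WORDS, so it never reaches the hash

    "Fox fox FOX!".word_hash
    # => {:fox=>3}, since case is folded before counting and "!" has no word characters

Note that word_hash keeps punctuation tokens as candidate words while clean_word_hash drops them first; both then route through word_hash_for_words for the digit, length, and skip-list filtering.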
@@ -0,0 +1,311 @@
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
+ # Copyright:: Copyright (c) 2005 David Fayram II
+ # License:: LGPL
+
+ begin
+   raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
+
+   require 'gsl' # requires http://rb-gsl.rubyforge.org/
+   require 'classifier/extensions/vector_serialize'
+   $GSL = true
+
+ rescue LoadError
+   warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
+   require 'classifier/extensions/vector'
+ end
+
+ require 'classifier/lsi/word_list'
+ require 'classifier/lsi/content_node'
+ require 'classifier/lsi/summary'
+
+ module Classifier
+
+   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
+   # data based on underlying semantic relations. For more information on the algorithms used,
+   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
+   class LSI
+
+     attr_reader :word_list
+     attr_accessor :auto_rebuild
+
+     # Create a fresh index.
+     # If you want to call #build_index manually, use
+     #   Classifier::LSI.new :auto_rebuild => false
+     #
+     def initialize(options = {})
+       @auto_rebuild = true unless options[:auto_rebuild] == false
+       @word_list, @items = WordList.new, {}
+       @version, @built_at_version = 0, -1
+     end
+
+     # Returns true if the index needs to be rebuilt. The index needs
+     # to be built after all information is added, but before you start
+     # using it for search, classification and cluster detection.
+     def needs_rebuild?
+       (@items.keys.size > 1) && (@version != @built_at_version)
+     end
+
+     # Adds an item to the index. item is assumed to be a string, but
+     # any item may be indexed so long as it responds to #to_s or if
+     # you provide an optional block explaining how the indexer can
+     # fetch fresh string data. This optional block is passed the item,
+     # so the item may only be a reference to a URL or file name.
+     #
+     # For example:
+     #   lsi = Classifier::LSI.new
+     #   lsi.add_item "This is just plain text"
+     #   lsi.add_item("/home/me/filename.txt") { |x| File.read x }
+     #   ar = ActiveRecordObject.find( :all )
+     #   lsi.add_item(ar, *ar.categories) { |x| ar.content }
+     #
+     def add_item( item, *categories, &block )
+       clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+       @items[item] = ContentNode.new(clean_word_hash, *categories)
+       @version += 1
+       build_index if @auto_rebuild
+     end
+
+     # A less flexible shorthand for add_item that assumes
+     # you are passing in a string with no categories. item
+     # will be duck typed via #to_s .
+     #
+     def <<( item )
+       add_item item
+     end
+
+     # Returns the categories for a given indexed item. You are free to add and remove
+     # items from this as you see fit. Changing an item's categories does not invalidate the index.
+     def categories_for(item)
+       return [] unless @items[item]
+       return @items[item].categories
+     end
+
+     # Removes an item from the database, if it is indexed.
+     #
+     def remove_item( item )
+       if @items.key? item
+         @items.delete item
+         @version += 1
+       end
+     end
+
+     # Returns an array of items that are indexed.
+     def items
+       @items.keys
+     end
+
+     # This function rebuilds the index if needs_rebuild? returns true.
+     # For very large document spaces, this indexing operation may take some
+     # time to complete, so it may be wise to place the operation in another
+     # thread.
+     #
+     # As a rule, indexing will be fairly swift on modern machines until
+     # you have well over 500 documents indexed, or have an incredibly diverse
+     # vocabulary for your documents.
+     #
+     # The optional parameter "cutoff" is a tuning parameter. When the index is
+     # built, a certain number of s-values are discarded from the system. The
+     # cutoff parameter tells the indexer how many of these values to keep.
+     # A value of 1 for cutoff means that no semantic analysis will take place,
+     # turning the LSI class into a simple vector search engine.
+     def build_index( cutoff=0.75 )
+       return unless needs_rebuild?
+       make_word_list
+
+       doc_list = @items.values
+       tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
+
+       if $GSL
+         tdm = GSL::Matrix.alloc(*tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+
+         ntdm.size[1].times do |col|
+           vec = GSL::Vector.alloc( ntdm.column(col) ).row
+           doc_list[col].lsi_vector = vec
+           doc_list[col].lsi_norm = vec.normalize
+         end
+       else
+         tdm = Matrix.rows(tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+
+         ntdm.row_size.times do |col|
+           doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
+           doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
+         end
+       end
+
+       @built_at_version = @version
+     end
+
+     # This method returns max_chunks entries, ordered by their average semantic rating.
+     # Essentially, the average distance of each entry from all other entries is calculated,
+     # and the highest are returned.
+     #
+     # This can be used to build a summary service, or to provide more information about
+     # your dataset's general content. For example, if you were to use categorize on the
+     # results of this data, you could gather information on what your dataset is generally
+     # about.
+     def highest_relative_content( max_chunks=10 )
+       return [] if needs_rebuild?
+
+       avg_density = Hash.new
+       @items.each_key { |item| avg_density[item] = proximity_array_for_content(item).inject(0.0) { |sum, pair| sum + pair[1] } }
+
+       avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1]
+     end
+
+     # This function is the primitive that find_related and classify
+     # build upon. It returns an array of 2-element arrays. The first element
+     # of this array is a document, and the second is its "score", defining
+     # how "close" it is to other indexed items.
+     #
+     # These values are somewhat arbitrary, having to do with the vector space
+     # created by your content, so the magnitude is interpretable but not always
+     # meaningful between indexes.
+     #
+     # The parameter doc is the content to compare. If that content is not
+     # indexed, you can pass an optional block to define how to create the
+     # text data. See add_item for examples of how this works.
+     def proximity_array_for_content( doc, &block )
+       return [] if needs_rebuild?
+
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           if $GSL
+             val = content_node.search_vector * @items[item].search_vector.col
+           else
+             val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
+           end
+           [item, val]
+         end
+       result.sort_by { |x| x[1] }.reverse
+     end
+
+     # Similar to proximity_array_for_content, this function takes similar
+     # arguments and returns a similar array. However, it uses the normalized
+     # calculated vectors instead of their full versions. This is useful when
+     # you're trying to perform operations on content that is much smaller than
+     # the text you're working with. search uses this primitive.
+     def proximity_norms_for_content( doc, &block )
+       return [] if needs_rebuild?
+
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           if $GSL
+             val = content_node.search_norm * @items[item].search_norm.col
+           else
+             val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
+           end
+           [item, val]
+         end
+       result.sort_by { |x| x[1] }.reverse
+     end
+
+     # This function allows for text-based search of your index. Unlike other functions
+     # like find_related and classify, search only takes short strings. It will also ignore
+     # factors like repeated words. It is best for short, google-like search terms.
+     # A search will first prioritize lexical relationships, then semantic ones.
+     #
+     # While this may seem backwards compared to the other functions that LSI supports,
+     # it is actually the same algorithm, just applied on a smaller document.
+     def search( string, max_nearest=3 )
+       return [] if needs_rebuild?
+       carry = proximity_norms_for_content( string )
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
+
+     # This function takes content and finds other documents
+     # that are semantically "close", returning an array of documents sorted
+     # from most to least relevant.
+     # max_nearest specifies the number of documents to return. A value of
+     # 0 means that it returns all the indexed documents, sorted by relevance.
+     #
+     # This is particularly useful for identifying clusters in your document space.
+     # For example, you may want to identify several "What's Related" items for weblog
+     # articles, or find paragraphs that relate to each other in an essay.
+     def find_related( doc, max_nearest=3, &block )
+       carry =
+         proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
+
+     # This function uses a voting system to categorize documents, based on
+     # the categories of other documents. It uses the same logic as the
+     # find_related function to find related documents, then returns the
+     # most obvious category from this list.
+     #
+     # cutoff signifies the fraction of documents to consider when classifying
+     # text. A cutoff of 1 means that every document in the index votes on
+     # what category the document is in. This may not always make sense.
+     #
+     def classify( doc, cutoff=0.30, &block )
+       icutoff = (@items.size * cutoff).round
+       carry = proximity_array_for_content( doc, &block )
+       carry = carry[0..icutoff-1]
+       votes = {}
+       carry.each do |pair|
+         categories = @items[pair[0]].categories
+         categories.each do |category|
+           votes[category] ||= 0.0
+           votes[category] += pair[1]
+         end
+       end
+
+       ranking = votes.keys.sort_by { |x| votes[x] }
+       return ranking[-1]
+     end
+
+     # Prototype, only works on indexed documents.
+     # I have no clue if this is going to work, but in theory
+     # it's supposed to.
+     def highest_ranked_stems( doc, count=3 )
+       raise "Requested stem ranking on non-indexed content!" unless @items[doc]
+       arr = node_for_content(doc).lsi_vector.to_a
+       top_n = arr.sort.reverse[0..count-1]
+       return top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
+     end
+
+     private
+     def build_reduced_matrix( matrix, cutoff=0.75 )
+       # TODO: Check that M>=N on these dimensions! Transpose helps assure this
+       u, v, s = matrix.SV_decomp
+
+       # TODO: Better than 75% term, please. :\
+       s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
+       s.size.times do |ord|
+         s[ord] = 0.0 if s[ord] < s_cutoff
+       end
+       # Reconstruct the term document matrix, only with reduced rank
+       u * Matrix.diag( s ) * v.trans
+     end
+
+     def node_for_content(item, &block)
+       if @items[item]
+         return @items[item]
+       else
+         clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+
+         cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
+
+         unless needs_rebuild?
+           cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
+         end
+       end
+
+       return cn
+     end
+
+     def make_word_list
+       @word_list = WordList.new
+       @items.each_value do |node|
+         node.word_hash.each_key { |key| @word_list.add_word key }
+       end
+     end
+
+   end
+ end
+
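End-to-end usage of the LSI class, a sketch along the lines of the examples in the comments above; it assumes `require 'classifier'` makes Classifier::LSI available:

    require 'classifier'

    lsi = Classifier::LSI.new
    lsi.add_item "This text deals with dogs. Dogs.", :dog
    lsi.add_item "This text involves dogs too. Dogs!", :dog
    lsi.add_item "This text revolves around cats. Cats.", :cat
    lsi.add_item "This text also involves cats. Cats!", :cat

    lsi.search("dog", 2)                 # the two dog documents, nearest first
    lsi.find_related(lsi.items.first, 2) # other indexed items close to the first document
    lsi.classify "This text is also about dogs!"
    # => :dog, because the dog documents' proximity votes outweigh the cats'

Because auto_rebuild defaults to true, every add_item call above rebuilds the index; for bulk loads, pass :auto_rebuild => false to new and call build_index once at the end.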