otherinbox-classifier 1.3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
+ module GSL
+ 
+   class Vector
+     def _dump(v)
+       Marshal.dump( self.to_a )
+     end
+ 
+     def self._load(arr)
+       arry = Marshal.load(arr)
+       return GSL::Vector.alloc(arry)
+     end
+ 
+   end
+ 
+   class Matrix
+     class << self
+       alias :diag :diagonal
+     end
+   end
+ end
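
These hooks let GSL::Vector round-trip through Marshal by flattening to a plain Ruby array, and give GSL::Matrix a diag alias that the LSI code further down relies on. A minimal sketch of the effect, assuming rb-gsl is installed and the extension is loaded via the path this package uses:

    require 'gsl'
    require 'classifier/extensions/vector_serialize'

    v = GSL::Vector.alloc(1.0, 2.0, 3.0)
    copy = Marshal.load(Marshal.dump(v))  # _dump / _load above do the conversion
    copy.to_a                             # => [1.0, 2.0, 3.0]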
@@ -0,0 +1,154 @@
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
+ # License:: LGPL
+ 
+ # These are extensions to the String class to provide convenience
+ # methods for the Classifier package.
+ class String
+ 
+   # Removes common punctuation symbols, returning a new string.
+   # E.g.,
+   #   "Hello (greeting's), with {braces} < >...?".without_punctuation
+   #   => "Hello greetings with braces "
+   def without_punctuation
+     tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ).tr( "'\-", "" )
+   end
+ 
+   # Returns a Hash of strings => ints. Each word in the string is
+   # interned and indexed to its frequency in the document.
+   def word_hash
+     word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
+   end
+ 
+   # Returns a word hash without extra punctuation or short symbols, just the words themselves.
+   def clean_word_hash
+     word_hash_for_words gsub(/[^\w\s]/,"").split
+   end
+ 
+   private
+ 
+   def word_hash_for_words(words)
+     d = Hash.new
+     words.each do |word|
+       word.downcase! if word =~ /[\w]+/
+       # Don't stem words -- most sources I've read say that it's more hurtful than helpful, and my tests found the same thing.
+       #key = word.stem.intern
+       # Ignore a word if it has no word chars, contains digits, has length <= 2, or is in the skip list.
+       if word =~ /\w/ && word !~ /\d+/ && word.length > 2 && !CORPUS_SKIP_WORDS.include?(word)
+         key = word.intern
+         d[key] ||= 0
+         d[key] += 1
+       end
+     end
+     return d
+   end
+ 
+   CORPUS_SKIP_WORDS = [
+     # Hopefully all of the HTML is gone by this point, but because broken HTML is out there,
+     # we're not able to easily get rid of it all. This really messes up the classifier,
+     # so we're making a last stand and removing HTML artifacts here.
+     # TODO: replace with a definitive and less haphazard list
+     "href",
+     "http",
+     "https",
+     "alt",
+     "coords",
+     "nbsp",
+     "target",
+     "com",
+     "net",
+     "org",
+     "shape",
+     "rect",
+     "apos",
+     "quot",
+     "bull",
+     "html",
+     "www",
+     # These are OI-specific but are still showing up as a result of broken HTML.
+     "otherinbox",
+     "beta",
+     "blank",
+ 
+     "a",
+     "again",
+     "all",
+     "along",
+     "are",
+     "also",
+     "an",
+     "and",
+     "as",
+     "at",
+     "but",
+     "by",
+     "came",
+     "can",
+     "cant",
+     "couldnt",
+     "did",
+     "didn",
+     "didnt",
+     "do",
+     "doesnt",
+     "dont",
+     "ever",
+     "first",
+     "from",
+     "have",
+     "her",
+     "here",
+     "him",
+     "how",
+     "i",
+     "if",
+     "in",
+     "into",
+     "is",
+     "isnt",
+     "it",
+     "itll",
+     "just",
+     "last",
+     "least",
+     "like",
+     "most",
+     "my",
+     "new",
+     "no",
+     "not",
+     "now",
+     "of",
+     "on",
+     "or",
+     "should",
+     "sinc",
+     "so",
+     "some",
+     "th",
+     "than",
+     "this",
+     "that",
+     "the",
+     "their",
+     "then",
+     "those",
+     "to",
+     "told",
+     "too",
+     "true",
+     "try",
+     "until",
+     "url",
+     "us",
+     "were",
+     "when",
+     "whether",
+     "while",
+     "with",
+     "within",
+     "yes",
+     "you",
+     "youll",
+   ].to_set
+ end
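
A quick sketch of what these helpers produce; keys are interned symbols, values are term counts, and the exact output depends on the skip list above:

    "The quick brown fox jumped!!".clean_word_hash
    # => {:quick=>1, :brown=>1, :fox=>1, :jumped=>1}
    # "The" is dropped via CORPUS_SKIP_WORDS after downcasing, "!!" is stripped
    # by the punctuation gsub, and any token of length <= 2 is ignored.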
@@ -0,0 +1,318 @@
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
+ # Copyright:: Copyright (c) 2005 David Fayram II
+ # License:: LGPL
+ 
+ begin
+   raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
+ 
+   require 'gsl' # requires http://rb-gsl.rubyforge.org/
+   require 'classifier/extensions/vector_serialize'
+   $GSL = true
+ 
+ rescue LoadError
+   warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
+   require 'classifier/extensions/vector'
+ end
+ 
+ require 'classifier/lsi/word_list'
+ require 'classifier/lsi/content_node'
+ require 'classifier/lsi/summary'
+ 
+ module Classifier
+ 
+   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
+   # data based on underlying semantic relations. For more information on the algorithms used,
+   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
+   class LSI
+ 
+     attr_reader :word_list
+     attr_accessor :auto_rebuild
+ 
+     # Creates a fresh index.
+     # If you want to call #build_index manually, use
+     #   Classifier::LSI.new :auto_rebuild => false
+     #
+     def initialize(options = {})
+       @auto_rebuild = true unless options[:auto_rebuild] == false
+       @word_list, @items = WordList.new, {}
+       @version, @built_at_version = 0, -1
+     end
+ 
+     # Returns true if the index needs to be rebuilt. The index needs
+     # to be built after all information is added, but before you start
+     # using it for search, classification and cluster detection.
+     def needs_rebuild?
+       (@items.keys.size > 1) && (@version != @built_at_version)
+     end
+ 
+     # Adds an item to the index. item is assumed to be a string, but
+     # any item may be indexed so long as it responds to #to_s, or if
+     # you provide an optional block explaining how the indexer can
+     # fetch fresh string data. This optional block is passed the item,
+     # so the item may only be a reference to a URL or file name.
+     #
+     # For example:
+     #   lsi = Classifier::LSI.new
+     #   lsi.add_item "This is just plain text"
+     #   lsi.add_item("/home/me/filename.txt") { |x| File.read x }
+     #   ar = ActiveRecordObject.find( :all )
+     #   lsi.add_item(ar, *ar.categories) { |x| ar.content }
+     #
+     def add_item( item, *categories, &block )
+       clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+       @items[item] = ContentNode.new(clean_word_hash, *categories)
+       @version += 1
+       build_index if @auto_rebuild
+     end
+ 
+     # A less flexible shorthand for add_item that assumes
+     # you are passing in a string with no categories. item
+     # will be duck typed via to_s.
+     #
+     def <<( item )
+       add_item item
+     end
+ 
+     # Returns the categories for a given indexed item. You are free to add and remove
+     # items from this as you see fit. Changing an item's categories does not invalidate the index.
+     def categories_for(item)
+       return [] unless @items[item]
+       return @items[item].categories
+     end
+ 
+     # Removes an item from the database, if it is indexed.
+     #
+     def remove_item( item )
+       if @items.include? item
+         @items.delete item
+         @version += 1
+       end
+     end
+ 
+     # Returns an array of items that are indexed.
+     def items
+       @items.keys
+     end
+ 
+     # Returns the categories for a given indexed item. You are free to add and remove
+     # items from this as you see fit. Changing an item's categories does not invalidate the index.
+     def categories_for(item)
+       return [] unless @items[item]
+       return @items[item].categories
+     end
+ 
+     # This function rebuilds the index if needs_rebuild? returns true.
+     # For very large document spaces, this indexing operation may take some
+     # time to complete, so it may be wise to place the operation in another
+     # thread.
+     #
+     # As a rule, indexing will be fairly swift on modern machines until
+     # you have well over 500 documents indexed, or have an incredibly diverse
+     # vocabulary for your documents.
+     #
+     # The optional parameter "cutoff" is a tuning parameter. When the index is
+     # built, a certain number of s-values are discarded from the system. The
+     # cutoff parameter tells the indexer what fraction of these values to keep.
+     # A value of 1 for cutoff means that no semantic analysis will take place,
+     # turning the LSI class into a simple vector search engine.
+     def build_index( cutoff=0.75 )
+       return unless needs_rebuild?
+       make_word_list
+ 
+       doc_list = @items.values
+       tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
+ 
+       if $GSL
+         tdm = GSL::Matrix.alloc(*tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+ 
+         ntdm.size[1].times do |col|
+           vec = GSL::Vector.alloc( ntdm.column(col) ).row
+           doc_list[col].lsi_vector = vec
+           doc_list[col].lsi_norm = vec.normalize
+         end
+       else
+         tdm = Matrix.rows(tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+ 
+         ntdm.row_size.times do |col|
+           doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
+           doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
+         end
+       end
+ 
+       @built_at_version = @version
+     end
+ 
+     # This method returns max_chunks entries, ordered by their average semantic rating.
+     # Essentially, the average distance of each entry from all other entries is calculated,
+     # and the highest are returned.
+     #
+     # This can be used to build a summary service, or to provide more information about
+     # your dataset's general content. For example, if you were to use categorize on the
+     # results of this data, you could gather information on what your dataset is generally
+     # about.
+     def highest_relative_content( max_chunks=10 )
+       return [] if needs_rebuild?
+ 
+       avg_density = Hash.new
+       @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |sum, y| sum + y[1] } }
+ 
+       avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1]
+     end
+ 
+     # This function is the primitive that find_related and classify
+     # build upon. It returns an array of 2-element arrays. The first element
+     # of this array is a document, and the second is its "score", defining
+     # how "close" it is to other indexed items.
+     #
+     # These values are somewhat arbitrary, having to do with the vector space
+     # created by your content, so the magnitude is interpretable but not always
+     # meaningful between indexes.
+     #
+     # The parameter doc is the content to compare. If that content is not
+     # indexed, you can pass an optional block to define how to create the
+     # text data. See add_item for examples of how this works.
+     def proximity_array_for_content( doc, &block )
+       return [] if needs_rebuild?
+ 
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           if $GSL
+             val = content_node.search_vector * @items[item].search_vector.col
+           else
+             val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
+           end
+           [item, val]
+         end
+       result.sort_by { |x| x[1] }.reverse
+     end
+ 
+     # Similar to proximity_array_for_content, this function takes similar
+     # arguments and returns a similar array. However, it uses the normalized
+     # calculated vectors instead of their full versions. This is useful when
+     # you're trying to perform operations on content that is much smaller than
+     # the text you're working with. search uses this primitive.
+     def proximity_norms_for_content( doc, &block )
+       return [] if needs_rebuild?
+ 
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           if $GSL
+             val = content_node.search_norm * @items[item].search_norm.col
+           else
+             val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
+           end
+           [item, val]
+         end
+       result.sort_by { |x| x[1] }.reverse
+     end
+ 
+     # This function allows for text-based search of your index. Unlike other functions
+     # like find_related and classify, search only takes short strings. It will also ignore
+     # factors like repeated words. It is best for short, google-like search terms.
+     # A search will first prioritize lexical relationships, then semantic ones.
+     #
+     # While this may seem backwards compared to the other functions that LSI supports,
+     # it is actually the same algorithm, just applied on a smaller document.
+     def search( string, max_nearest=3 )
+       return [] if needs_rebuild?
+       carry = proximity_norms_for_content( string )
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
+ 
+     # This function takes content and finds other documents
+     # that are semantically "close", returning an array of documents sorted
+     # from most to least relevant.
+     # max_nearest specifies the number of documents to return. A value of
+     # 0 means that it returns all the indexed documents, sorted by relevance.
+     #
+     # This is particularly useful for identifying clusters in your document space.
+     # For example, you may want to identify several "What's Related" items for weblog
+     # articles, or find paragraphs that relate to each other in an essay.
+     def find_related( doc, max_nearest=3, &block )
+       carry =
+         proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
+ 
+     # This function uses a voting system to categorize documents, based on
+     # the categories of other documents. It uses the same logic as the
+     # find_related function to find related documents, then returns the
+     # most obvious category from this list.
+     #
+     # cutoff signifies the fraction of indexed documents to consider when classifying
+     # text. A cutoff of 1 means that every document in the index votes on
+     # what category the document is in. This may not always make sense.
+     #
+     def classify( doc, cutoff=0.30, &block )
+       icutoff = (@items.size * cutoff).round
+       carry = proximity_array_for_content( doc, &block )
+       carry = carry[0..icutoff-1]
+       votes = {}
+       carry.each do |pair|
+         categories = @items[pair[0]].categories
+         categories.each do |category|
+           votes[category] ||= 0.0
+           votes[category] += pair[1]
+         end
+       end
+ 
+       ranking = votes.keys.sort_by { |x| votes[x] }
+       return ranking[-1]
+     end
+ 
+     # Prototype, only works on indexed documents.
+     # I have no clue if this is going to work, but in theory
+     # it's supposed to.
+     def highest_ranked_stems( doc, count=3 )
+       raise "Requested stem ranking on non-indexed content!" unless @items[doc]
+       arr = node_for_content(doc).lsi_vector.to_a
+       top_n = arr.sort.reverse[0..count-1]
+       return top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
+     end
+ 
+     private
+     def build_reduced_matrix( matrix, cutoff=0.75 )
+       # TODO: Check that M >= N on these dimensions! Transpose helps assure this.
+       u, v, s = matrix.SV_decomp
+ 
+       # TODO: Better than the 75% term, please. :\
+       s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
+       s.size.times do |ord|
+         s[ord] = 0.0 if s[ord] < s_cutoff
+       end
+       # Reconstruct the term-document matrix, only with reduced rank
+       u * Matrix.diag( s ) * v.trans
+     end
+ 
+     def node_for_content(item, &block)
+       if @items[item]
+         return @items[item]
+       else
+         clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
+ 
+         cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
+ 
+         unless needs_rebuild?
+           cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
+         end
+       end
+ 
+       return cn
+     end
+ 
+     def make_word_list
+       @word_list = WordList.new
+       @items.each_value do |node|
+         node.word_hash.each_key { |key| @word_list.add_word key }
+       end
+     end
+ 
+   end
+ end
+ 
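
Putting the pieces together, a minimal usage sketch of the LSI API above, assuming the gem is loaded via require 'classifier'; the items and categories are illustrative, and actual rankings depend on the indexed corpus:

    require 'classifier'

    lsi = Classifier::LSI.new
    lsi.add_item "This text deals with dogs. Dogs.", :dog
    lsi.add_item "This text involves birds. Birds.", :bird
    lsi.add_item "This text involves dogs and birds.", :dog, :bird

    lsi.search("dog", 2)              # the two items closest to "dog"
    lsi.classify("I love my dog.")    # => :dog, by weighted vote of nearby items
    lsi.find_related("This text deals with dogs. Dogs.", 1)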