logankoester-classifier 1.4.3

data/Rakefile ADDED
@@ -0,0 +1,21 @@
+ require 'rubygems'
+ require 'rake'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |s|
+     s.name = "logankoester-classifier"
+     s.summary = "A general classifier module to allow Bayesian and other types of classifications."
+     s.description = "Bayesian classifier and others."
+     s.homepage = "http://github.com/logankoester/classifier"
+     s.author = "Luis Parravicini"
+     s.email = "lparravi@gmail.com"
+
+     s.add_dependency "activesupport", ">= 2.2.2"
+     s.add_dependency "ruby-stemmer", ">= 0.5.1"
+   end
+ rescue LoadError
+   puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
+ end
+
+ Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each { |ext| load ext }
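With Jeweler available, this Rakefile picks up Jeweler's standard packaging tasks. A sketch of the usual workflow (the exact task list depends on the Jeweler version installed):

    rake gemspec   # regenerate the logankoester-classifier.gemspec
    rake build     # build the gem from that gemspec
    rake install   # build and install it locally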
data/VERSION.yml ADDED
@@ -0,0 +1,5 @@
+ ---
+ :major: 1
+ :minor: 4
+ :patch: 3
+ :build:
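Jeweler derives the gem's version string from these keys, which is how this release comes out as 1.4.3. A minimal sketch of the mapping, runnable against this file:

    require 'yaml'

    v = YAML.load_file('VERSION.yml')
    [v[:major], v[:minor], v[:patch]].join('.')  # => "1.4.3"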
data/lib/classifier.rb ADDED
@@ -0,0 +1,31 @@
1
+ #--
2
+ # Copyright (c) 2005 Lucas Carlson
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
24
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
25
+ # License:: LGPL
26
+
27
+ require 'active_support'
28
+ require 'lingua/stemmer'
29
+ require 'classifier/base'
30
+ require 'classifier/bayes'
31
+ require 'classifier/lsi'
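A quick smoke test of this entry point (a sketch; it assumes the activesupport and ruby-stemmer dependencies declared in the Rakefile are installed):

    require 'rubygems'
    require 'classifier'

    Classifier.constants.sort
    # => includes "Base", "Bayes" and "LSI", plus support classes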
data/lib/classifier/base.rb ADDED
@@ -0,0 +1,65 @@
+ module Classifier
+   class Base
+
+     def initialize(options = {})
+       options.reverse_merge!(:language => 'en')
+       options.reverse_merge!(:encoding => 'UTF_8')
+
+       @options = options
+     end
+
+     def prepare_category_name(val)
+       val.to_s.gsub("_", " ").capitalize
+     end
+
+     # Removes common punctuation symbols, returning a new string.
+     # E.g.,
+     #   without_punctuation "Hello (greeting's), with {braces} < >...?"
+     #   => "Hello greetings with braces "
+     def without_punctuation(str)
+       str.tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " ").tr("'\-", "")
+     end
+
+     # Returns a Hash of strings => ints. Each word in the string is
+     # stemmed and mapped to its frequency in the document.
+     def word_hash(str)
+       word_hash_for_words(str.gsub(/[^\w\s]/, "").split + str.gsub(/[\w]/, " ").split)
+     end
+
+     # Returns a word hash without extra punctuation or short symbols, just stemmed words.
+     def clean_word_hash(str)
+       word_hash_for_words str.gsub(/[^\w\s]/, "").split
+     end
+
+     # When a Classifier instance is serialized, it is saved with an instance
+     # of Lingua::Stemmer that may not be initialized when deserialized later,
+     # raising a "RuntimeError: Stemmer is not initialized".
+     #
+     # You can call remove_stemmer to force a new Stemmer to be initialized.
+     def remove_stemmer
+       @stemmer = nil
+     end
+
+     private
+
+     def stemmer
+       @stemmer ||= Lingua::Stemmer.new(@options)
+     end
+
+     def word_hash_for_words(words)
+       d = Hash.new
+       skip_words = StopWords.for(@options[:language], @options[:lang_dir])
+       encoding_name = @options[:encoding].gsub(/_/, '-')
+       words.each do |word|
+         word = word.mb_chars.downcase.to_s if word =~ /[\w]+/
+         key = stemmer.stem(word)
+         key.force_encoding(encoding_name) if defined?(Encoding) && key && key.respond_to?(:force_encoding)
+         if word =~ /[^\w]/ || (!skip_words.include?(word) && word.length > 2)
+           d[key] ||= 0
+           d[key] += 1
+         end
+       end
+       return d
+     end
+   end
+ end
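A short sketch of the word-hashing helpers above (stems come from ruby-stemmer, so the exact keys depend on its Snowball data):

    require 'classifier'

    c = Classifier::Bayes.new :categories => ['One']
    c.clean_word_hash "Running runs runner"
    # => e.g. {"run"=>2, "runner"=>1} (downcased, stemmed, stopwords and short tokens dropped)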
data/lib/classifier/bayes.rb ADDED
@@ -0,0 +1,145 @@
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
+ # License:: LGPL
+
+ require 'classifier/stopwords'
+
+ module Classifier
+
+   class Bayes < Classifier::Base
+
+     # The class can be created with one or more categories, each of which will be
+     # initialized and given a training method. E.g.,
+     #   b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam']
+     # You can also specify language and encoding parameters for the stemmer
+     # (defaults: :language => 'en', :encoding => 'UTF_8'):
+     #   b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam'], :language => 'ru'
+     def initialize(options = {})
+       @categories = Hash.new
+       options.reverse_merge!(:categories => [])
+       options[:categories].each { |category| @categories[prepare_category_name(category)] = Hash.new }
+       @total_words = 0
+       super
+     end
+
+     #
+     # Provides a general training method for all categories specified in Bayes#new.
+     # For example:
+     #   b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
+     #   b.train :this, "This text"
+     #   b.train "that", "That text"
+     #   b.train "The other", "The other text"
+     def train(category, text)
+       category = prepare_category_name(category)
+       word_hash(text).each do |word, count|
+         @categories[category][word] ||= 0
+         @categories[category][word] += count
+         @total_words += count
+       end
+     end
+
+     #
+     # Provides an untraining method for all categories specified in Bayes#new.
+     # Be very careful with this method.
+     #
+     # For example:
+     #   b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
+     #   b.train :this, "This text"
+     #   b.untrain :this, "This text"
+     def untrain(category, text)
+       category = prepare_category_name(category)
+       word_hash(text).each do |word, count|
+         if @total_words >= 0
+           orig = @categories[category][word] || 0
+           @categories[category][word] ||= 0
+           @categories[category][word] -= count
+           if @categories[category][word] <= 0
+             @categories[category].delete(word)
+             count = orig
+           end
+           @total_words -= count
+         end
+       end
+     end
+
+     #
+     # Returns the scores in each category for the provided +text+. E.g.,
+     #   b.classifications "I hate bad words and you"
+     #   => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
+     # The largest of these scores (the one closest to 0) is the one picked out by #classify.
+     def classifications(text)
+       score = Hash.new
+       @categories.each do |category, category_words|
+         score[category.to_s] = 0
+         total = category_words.values.sum
+         word_hash(text).each do |word, count|
+           s = category_words.has_key?(word) ? category_words[word] : 0.1
+           score[category.to_s] += Math.log(s / total.to_f)
+         end
+       end
+       return score
+     end
+
+     #
+     # Returns the classification of the provided +text+, which is one of the
+     # categories given in the initializer. E.g.,
+     #   b.classify "I hate bad words and you"
+     #   => 'Uninteresting'
+     def classify(text)
+       (classifications(text).sort_by { |a| -a[1] })[0][0]
+     end
+
+     #
+     # Provides training and untraining methods for the categories specified in Bayes#new.
+     # For example:
+     #   b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
+     #   b.train_this "This text"
+     #   b.train_that "That text"
+     #   b.untrain_that "That text"
+     #   b.train_the_other "The other text"
+     def method_missing(name, *args)
+       category = prepare_category_name(name.to_s.gsub(/(un)?train_([\w]+)/, '\2'))
+       if @categories.has_key? category
+         # $1 is "un" or nil, captured by the gsub match above
+         args.each { |text| eval("#{$1}train(category, text)") }
+       elsif name.to_s =~ /(un)?train_([\w]+)/
+         raise StandardError, "No such category: #{category}"
+       else
+         super # raise StandardError, "No such method: #{name}"
+       end
+     end
+
+     #
+     # Provides a list of category names.
+     # For example:
+     #   b.categories
+     #   => ['This', 'That', 'the_other']
+     def categories # :nodoc:
+       @categories.keys.collect { |c| c.to_s }
+     end
+
+     #
+     # Allows you to add categories to the classifier.
+     # For example:
+     #   b.add_category "Not spam"
+     #
+     # WARNING: Adding categories to a trained classifier will
+     # result in an undertrained category that will tend to match
+     # more criteria than the trained selective categories. In short,
+     # try to set up all your categories at initialization.
+     def add_category(category)
+       @categories[prepare_category_name(category)] = Hash.new
+     end
+
+     alias append_category add_category
+
+     def marshal_dump
+       [@categories, @total_words, @options]
+     end
+
+     def marshal_load(data)
+       @categories, @total_words, @options = data
+     end
+
+   end
+
+ end
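A minimal end-to-end sketch of the Bayes API above (training text is made up for illustration):

    require 'classifier'

    b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting']
    b.train :interesting, "Here are some good words. I hope you love them."
    b.train :uninteresting, "Here are some bad words, I hate you."
    b.classify "I hate bad words and you"   # => "Uninteresting"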
data/lib/classifier/extensions/vector.rb ADDED
@@ -0,0 +1,100 @@
+ # Author:: Ernest Ellingson
+ # Copyright:: Copyright (c) 2005
+
+ # These are extensions to the std-lib 'matrix' to allow an all-Ruby SVD.
+
+ require 'matrix'
+ require 'mathn'
+
+ class Vector
+   def magnitude
+     sumsqs = 0.0
+     self.size.times do |i|
+       sumsqs += self[i] ** 2.0
+     end
+     Math.sqrt(sumsqs)
+   end
+
+   def normalize
+     nv = []
+     mag = self.magnitude
+     self.size.times do |i|
+       nv << (self[i] / mag)
+     end
+     Vector[*nv]
+   end
+ end
+
+ class Matrix
+   def Matrix.diag(s)
+     Matrix.diagonal(*s)
+   end
+
+   alias :trans :transpose
+
+   # Jacobi-rotation singular value decomposition; returns [u, v, s].
+   def SV_decomp(maxSweeps = 20)
+     if self.row_size >= self.column_size
+       q = self.trans * self
+     else
+       q = self * self.trans
+     end
+
+     qrot = q.dup
+     v = Matrix.identity(q.row_size)
+     mzrot = nil
+     cnt = 0
+     s_old = nil
+     mu = nil
+
+     while true do
+       cnt += 1
+       for row in (0...qrot.row_size-1) do
+         for col in (1..qrot.row_size-1) do
+           next if row == col
+           h = Math.atan((2 * qrot[row,col]) / (qrot[row,row] - qrot[col,col])) / 2.0
+           hcos = Math.cos(h)
+           hsin = Math.sin(h)
+           mzrot = Matrix.identity(qrot.row_size)
+           mzrot[row,row] = hcos
+           mzrot[row,col] = -hsin
+           mzrot[col,row] = hsin
+           mzrot[col,col] = hcos
+           qrot = mzrot.trans * qrot * mzrot
+           v = v * mzrot
+         end
+       end
+       s_old = qrot.dup if cnt == 1
+       sum_qrot = 0.0
+       if cnt > 1
+         qrot.row_size.times do |r|
+           sum_qrot += (qrot[r,r] - s_old[r,r]).abs if (qrot[r,r] - s_old[r,r]).abs > 0.001
+         end
+         s_old = qrot.dup
+       end
+       break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
+     end # of do while true
+
+     s = []
+     qrot.row_size.times do |r|
+       s << Math.sqrt(qrot[r,r])
+     end
+
+     if self.row_size >= self.column_size
+       mu = self * v * Matrix.diagonal(*s).inverse
+       return [mu, v, s]
+     else
+       mu = (self.trans * v * Matrix.diagonal(*s).inverse)
+       return [mu, v, s]
+     end
+   end
+
+   def []=(i,j,val)
+     @rows[i][j] = val
+   end
+ end
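A small sketch of the decomposition in use, on the pure-Ruby path (loading the extension directly):

    require 'classifier/extensions/vector'

    m = Matrix[[1, 0], [0, 2], [0, 0]]
    u, v, s = m.SV_decomp
    # u * Matrix.diag(s) * v.trans approximately reconstructs m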
data/lib/classifier/extensions/vector_serialize.rb ADDED
@@ -0,0 +1,20 @@
+ module GSL
+
+   class Vector
+     # Marshal hook: dump the vector as a plain Ruby array.
+     def _dump(depth)
+       Marshal.dump(self.to_a)
+     end
+
+     def self._load(str)
+       arry = Marshal.load(str)
+       return GSL::Vector.alloc(arry)
+     end
+   end
+
+   class Matrix
+     class << self
+       alias :diag :diagonal
+     end
+   end
+ end
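A round-trip sketch for this serialization hook (assumes the rb-gsl bindings are installed):

    require 'gsl'
    require 'classifier/extensions/vector_serialize'

    v = GSL::Vector.alloc([1.0, 2.0, 3.0])
    restored = Marshal.load(Marshal.dump(v))
    restored.to_a   # => [1.0, 2.0, 3.0]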
data/lib/classifier/lsi.rb ADDED
@@ -0,0 +1,348 @@
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
+ # Copyright:: Copyright (c) 2005 David Fayram II
+ # License:: LGPL
+
+ begin
+   raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
+
+   require 'gsl' # requires http://rb-gsl.rubyforge.org/
+   require 'classifier/extensions/vector_serialize'
+   $GSL = true
+
+ rescue LoadError
+   warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
+   require 'classifier/extensions/vector'
+ end
+
+ require 'classifier/lsi/word_list'
+ require 'classifier/lsi/content_node'
+ require 'classifier/lsi/summary'
+
+ module Classifier
+
+   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
+   # data based on underlying semantic relations. For more information on the algorithms used,
+   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
+   class LSI < Classifier::Base
+
+     attr_reader :word_list
+     attr_accessor :auto_rebuild
+
+     # Creates a fresh index.
+     # If you want to call #build_index manually, use
+     #   Classifier::LSI.new :auto_rebuild => false
+     #
+     def initialize(options = {})
+       @auto_rebuild = true unless options[:auto_rebuild] == false
+       @word_list, @items = WordList.new, {}
+       @version, @built_at_version = 0, -1
+       super
+     end
+
+     # Returns true if the index needs to be rebuilt. The index needs
+     # to be built after all information is added, but before you start
+     # using it for search, classification and cluster detection.
+     def needs_rebuild?
+       (@items.keys.size > 1) && (@version != @built_at_version)
+     end
+
+     # Adds an item to the index. item is assumed to be a string, but
+     # any item may be indexed so long as it responds to #to_s, or if
+     # you provide an optional block explaining how the indexer can
+     # fetch fresh string data. The block is passed the item,
+     # so the item may be only a reference to a URL or file name.
+     #
+     # For example:
+     #   lsi = Classifier::LSI.new
+     #   lsi.add_item "This is just plain text"
+     #   lsi.add_item("/home/me/filename.txt") { |x| File.read x }
+     #   ar = ActiveRecordObject.find( :all )
+     #   lsi.add_item ar, *ar.categories { |x| ar.content }
+     #
+     def add_item( item, *categories, &block )
+       clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
+       @items[item] = ContentNode.new(clean_word_hash, *categories)
+       @version += 1
+       build_index if @auto_rebuild
+     end
+
+     # A less flexible shorthand for add_item that assumes
+     # you are passing in a string with no categories. item
+     # will be duck-typed via #to_s.
+     #
+     def <<( item )
+       add_item item
+     end
+
+     # Returns the categories for a given indexed item. You are free to add and remove
+     # items from this as you see fit. Changing an item's categories does not invalidate
+     # the index.
+     def categories_for(item)
+       return [] unless @items[item]
+       return @items[item].categories
+     end
+
+     # Removes an item from the database, if it is indexed.
+     #
+     def remove_item( item )
+       if @items.key? item
+         @items.delete item
+         @version += 1
+       end
+     end
+
+     # Returns an array of items that are indexed.
+     def items
+       @items.keys
+     end
+
+     # This function rebuilds the index if needs_rebuild? returns true.
+     # For very large document spaces, this indexing operation may take some
+     # time to complete, so it may be wise to place the operation in another
+     # thread.
+     #
+     # As a rule, indexing will be fairly swift on modern machines until
+     # you have well over 500 documents indexed, or have an incredibly diverse
+     # vocabulary for your documents.
+     #
+     # The optional parameter "cutoff" is a tuning parameter. When the index is
+     # built, a certain number of s-values are discarded from the system. The
+     # cutoff parameter tells the indexer how many of these values to keep.
+     # A value of 1 for cutoff means that no semantic analysis will take place,
+     # turning the LSI class into a simple vector search engine.
+     def build_index( cutoff=0.75 )
+       return unless needs_rebuild?
+       make_word_list
+
+       doc_list = @items.values
+       tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
+
+       if $GSL
+         tdm = GSL::Matrix.alloc(*tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+
+         ntdm.size[1].times do |col|
+           vec = GSL::Vector.alloc( ntdm.column(col) ).row
+           doc_list[col].lsi_vector = vec
+           doc_list[col].lsi_norm = vec.normalize
+         end
+       else
+         tdm = Matrix.rows(tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+
+         ntdm.row_size.times do |col|
+           doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
+           doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
+         end
+       end
+
+       @built_at_version = @version
+     end
+
+     # This method returns max_chunks entries, ordered by their average semantic rating.
+     # Essentially, the average distance of each entry from all other entries is calculated,
+     # and the highest are returned.
+     #
+     # This can be used to build a summary service, or to provide more information about
+     # your dataset's general content. For example, if you were to use categorize on the
+     # results of this data, you could gather information on what your dataset is generally
+     # about.
+     def highest_relative_content( max_chunks=10 )
+       return [] if needs_rebuild?
+
+       avg_density = Hash.new
+       @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |sum, pair| sum + pair[1] } }
+
+       avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1]
+     end
+
+     # This function is the primitive that find_related and classify
+     # build upon. It returns an array of 2-element arrays. The first element
+     # of each pair is a document, and the second is its "score", defining
+     # how "close" it is to other indexed items.
+     #
+     # These values are somewhat arbitrary, having to do with the vector space
+     # created by your content, so the magnitude is interpretable but not always
+     # meaningful between indexes.
+     #
+     # The parameter doc is the content to compare. If that content is not
+     # indexed, you can pass an optional block to define how to create the
+     # text data. See add_item for examples of how this works.
+     def proximity_array_for_content( doc, &block )
+       return [] if needs_rebuild?
+
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           next if @items[item].search_vector.blank? # not enough data
+           if $GSL
+             val = content_node.search_vector * @items[item].search_vector.col
+           else
+             val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
+           end
+           [item, val]
+         end
+       result.compact.sort_by { |x| x[1] }.reverse
+     end
+
+     # Similar to proximity_array_for_content, this function takes similar
+     # arguments and returns a similar array. However, it uses the normalized
+     # calculated vectors instead of their full versions. This is useful when
+     # you're trying to perform operations on content that is much smaller than
+     # the text you're working with. search uses this primitive.
+     def proximity_norms_for_content( doc, &block )
+       return [] if needs_rebuild?
+
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           next if @items[item].search_norm.blank? # not enough data
+           if $GSL
+             val = content_node.search_norm * @items[item].search_norm.col
+           else
+             val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
+           end
+           [item, val]
+         end
+       result.compact.sort_by { |x| x[1] }.reverse
+     end
+
+     # This function allows for text-based search of your index. Unlike other functions
+     # like find_related and classify, search only takes short strings. It will also ignore
+     # factors like repeated words. It is best for short, Google-like search terms.
+     # A search will first prioritize lexical relationships, then semantic ones.
+     #
+     # While this may seem backwards compared to the other functions that LSI supports,
+     # it is actually the same algorithm, just applied to a smaller document.
+     def search( string, max_nearest=3 )
+       return [] if needs_rebuild?
+       carry = proximity_norms_for_content( string )
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
+
+     # This function takes content and finds other documents
+     # that are semantically "close", returning an array of documents sorted
+     # from most to least relevant.
+     # max_nearest specifies the number of documents to return. A value of
+     # 0 means that it returns all the indexed documents, sorted by relevance.
+     #
+     # This is particularly useful for identifying clusters in your document space.
+     # For example, you may want to identify several "What's Related" items for weblog
+     # articles, or find paragraphs that relate to each other in an essay.
+     def find_related( doc, max_nearest=3, &block )
+       carry =
+         proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
+
+     # This function uses a voting system to categorize documents, based on
+     # the categories of other documents. It uses the same logic as the
+     # find_related function to find related documents, then returns the
+     # most obvious category from this list.
+     #
+     # cutoff signifies the fraction of documents to consider when classifying
+     # text. A cutoff of 1 means that every document in the index votes on
+     # what category the document is in. This may not always make sense.
+     #
+     def classify( doc, cutoff=0.30, &block )
+       icutoff = (@items.size * cutoff).round
+       carry = proximity_array_for_content( doc, &block )
+       carry = carry[0..icutoff-1]
+       votes = {}
+       carry.each do |pair|
+         categories = @items[pair[0]].categories
+         categories.each do |category|
+           votes[category] ||= 0.0
+           votes[category] += pair[1]
+         end
+       end
+
+       ranking = votes.keys.sort_by { |x| votes[x] }
+       return ranking[-1]
+     end
+
+     # Same as classify, but returns all matching categories; also more permissive
+     # in its default cutoff.
+     def classify_multiple( doc, cutoff=0.50, &block )
+       icutoff = (@items.size * cutoff).round
+       carry = proximity_array_for_content( doc, &block )
+       carry = carry[0..icutoff-1]
+       votes = {}
+       carry.each do |pair|
+         categories = @items[pair[0]].categories
+         categories.each do |category|
+           votes[category] ||= 0.0
+           votes[category] += pair[1]
+         end
+       end
+       votes.delete_if { |key, value| value < 1 }.keys.sort_by { |x| -votes[x] }
+     end
+
+     # Prototype, only works on indexed documents.
+     # I have no clue if this is going to work, but in theory
+     # it's supposed to.
+     def highest_ranked_stems( doc, count=3 )
+       raise "Requested stem ranking on non-indexed content!" unless @items[doc]
+       arr = node_for_content(doc).lsi_vector.to_a
+       top_n = arr.sort.reverse[0..count-1]
+       return top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
+     end
+
+     def marshal_dump
+       [ @auto_rebuild, @word_list, @items, @version, @built_at_version,
+         @options,
+       ]
+     end
+
+     def marshal_load(data)
+       @auto_rebuild, @word_list, @items, @version, @built_at_version,
+         @options = data
+     end
+
+     private
+
+     def build_reduced_matrix( matrix, cutoff=0.75 )
+       # TODO: Check that M>=N on these dimensions! Transpose helps assure this
+       u, v, s = matrix.SV_decomp
+
+       # TODO: Better than 75% term, please. :\
+       s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
+       s.size.times do |ord|
+         s[ord] = 0.0 if s[ord] < s_cutoff
+       end
+       # Reconstruct the term-document matrix, only with reduced rank
+       u * ($GSL ? GSL::Matrix : Matrix).diag( s ) * v.trans
+     end
+
+     def node_for_content(item, &block)
+       if @items[item]
+         return @items[item]
+       else
+         clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
+
+         cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
+
+         unless needs_rebuild?
+           cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
+         end
+       end
+
+       return cn
+     end
+
+     def make_word_list
+       @word_list = WordList.new
+       @items.each_value do |node|
+         node.word_hash.each_key { |key| @word_list.add_word key }
+       end
+     end
+
+   end
+ end
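A short end-to-end sketch of the LSI API above (example strings are made up; the pure-Ruby vector path is used when GSL is absent):

    require 'classifier'

    lsi = Classifier::LSI.new
    lsi.add_item "This text deals with dogs. Dogs.", :dog
    lsi.add_item "This text involves dogs too. Dogs!", :dog
    lsi.add_item "This text revolves around cats. Cats.", :cat
    lsi.add_item "This text also involves cats. Cats!", :cat

    lsi.search("dog", 3)                          # => dog-related items first
    lsi.classify "This text is also about dogs!"  # => :dog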