logankoester-classifier 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile ADDED
@@ -0,0 +1,21 @@
+ require 'rubygems'
+ require 'rake'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |s|
+     s.name = "logankoester-classifier"
+     s.summary = "A general classifier module to allow Bayesian and other types of classifications."
+     s.description = "Bayesian classifier and others."
+     s.homepage = "http://github.com/logankoester/classifier"
+     s.author = "Luis Parravicini"
+     s.email = "lparravi@gmail.com"
+
+     s.add_dependency "activesupport", ">= 2.2.2"
+     s.add_dependency "ruby-stemmer", ">= 0.5.1"
+   end
+ rescue LoadError
+   puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
+ end
+
+ Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each { |ext| load ext }
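For reference, the Jeweler block above corresponds roughly to this hand-written gemspec (a sketch only; Jeweler additionally computes the version from VERSION.yml and fills in the file list):

    Gem::Specification.new do |s|
      s.name        = "logankoester-classifier"
      s.summary     = "A general classifier module to allow Bayesian and other types of classifications."
      s.description = "Bayesian classifier and others."
      s.homepage    = "http://github.com/logankoester/classifier"
      s.author      = "Luis Parravicini"
      s.email       = "lparravi@gmail.com"
      s.add_dependency "activesupport", ">= 2.2.2"
      s.add_dependency "ruby-stemmer",  ">= 0.5.1"
    end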
data/VERSION.yml ADDED
@@ -0,0 +1,5 @@
+ ---
+ :major: 1
+ :minor: 4
+ :patch: 3
+ :build:
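Jeweler derives the gem version from this file. A minimal sketch (not part of the package) of how the components combine into the "1.4.3" version string:

    require 'yaml'
    v = YAML.load_file('VERSION.yml')
    version = [v[:major], v[:minor], v[:patch]].compact.join('.')
    version << ".#{v[:build]}" if v[:build]   # :build is empty here, so nothing is appended
    version   # => "1.4.3"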
data/lib/classifier.rb ADDED
@@ -0,0 +1,31 @@
+ #--
+ # Copyright (c) 2005 Lucas Carlson
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining
+ # a copy of this software and associated documentation files (the
+ # "Software"), to deal in the Software without restriction, including
+ # without limitation the rights to use, copy, modify, merge, publish,
+ # distribute, sublicense, and/or sell copies of the Software, and to
+ # permit persons to whom the Software is furnished to do so, subject to
+ # the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be
+ # included in all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #++
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
+ # License:: LGPL
+
+ require 'active_support'
+ require 'lingua/stemmer'
+ require 'classifier/base'
+ require 'classifier/bayes'
+ require 'classifier/lsi'
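This file is the package's entry point: it pulls in the activesupport and ruby-stemmer dependencies, then the classifier classes themselves. A usage sketch, assuming the gem and its dependencies are installed:

    require 'rubygems'
    require 'classifier'

    b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting']
    b.train :interesting, "Here are some good words. I hope you love them."
    b.train :uninteresting, "Here are some bad words, I hate you."
    b.classify "I hate bad words and you"   # => "Uninteresting" (illustrative)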
data/lib/classifier/base.rb ADDED
@@ -0,0 +1,65 @@
+ module Classifier
+   class Base
+
+     def initialize(options = {})
+       options.reverse_merge!(:language => 'en')
+       options.reverse_merge!(:encoding => 'UTF_8')
+
+       @options = options
+     end
+
+     def prepare_category_name val
+       val.to_s.gsub("_"," ").capitalize
+     end
+
+     # Removes common punctuation symbols, returning a new string.
+     # E.g.,
+     #   without_punctuation "Hello (greeting's), with {braces} < >...?"
+     #   => "Hello greetings with braces "
+     def without_punctuation str
+       str.tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ).tr( "'\-", "")
+     end
+
+     # Returns a Hash of strings => ints. Each word in the string is stemmed
+     # and mapped to its frequency in the document.
+     def word_hash str
+       word_hash_for_words(str.gsub(/[^\w\s]/,"").split + str.gsub(/[\w]/," ").split)
+     end
+
+     # Returns a word hash without extra punctuation or short symbols, just stemmed words.
+     def clean_word_hash str
+       word_hash_for_words str.gsub(/[^\w\s]/,"").split
+     end
+
+     # When a Classifier instance is serialized, it is saved with an instance
+     # of Lingua::Stemmer that may not be initialized when deserialized later,
+     # raising a "RuntimeError: Stemmer is not initialized".
+     #
+     # You can run remove_stemmer to force a new Stemmer to be initialized.
+     def remove_stemmer
+       @stemmer = nil
+     end
+
+     private
+
+     def stemmer
+       @stemmer ||= Lingua::Stemmer.new(@options)
+     end
+
+     def word_hash_for_words(words)
+       d = Hash.new
+       skip_words = StopWords.for(@options[:language], @options[:lang_dir])
+       encoding_name = @options[:encoding].gsub(/_/, '-')
+       words.each do |word|
+         word = word.mb_chars.downcase.to_s if word =~ /[\w]+/
+         key = stemmer.stem(word)
+         key.force_encoding(encoding_name) if defined?(Encoding) && key && key.respond_to?(:force_encoding)
+         if word =~ /[^\w]/ || ! skip_words.include?(word) && word.length > 2
+           d[key] ||= 0
+           d[key] += 1
+         end
+       end
+       return d
+     end
+   end
+ end
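A sketch of what the word-hash helpers above produce (Base is abstract, so this goes through Bayes, which inherits them; exact keys depend on the stemmer and stopword list):

    b = Classifier::Bayes.new :categories => ['Any']
    b.clean_word_hash("The quick brown foxes are jumping")
    # => e.g. {"quick"=>1, "brown"=>1, "fox"=>1, "jump"=>1}
    # stopwords and words shorter than three characters are dropped; the rest are stemmed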
data/lib/classifier/bayes.rb ADDED
@@ -0,0 +1,145 @@
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
+ # License:: LGPL
+
+ require 'classifier/stopwords'
+
+ module Classifier
+
+   class Bayes < Classifier::Base
+
+     # The class can be created with one or more categories, each of which will be
+     # initialized and given a training method. E.g.,
+     #   b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam']
+     # You can specify language and encoding parameters for the stemmer
+     # (default values: :language => 'en', :encoding => 'UTF_8')
+     #   b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting', 'Spam'], :language => 'ru'
+     def initialize(options = {})
+       @categories = Hash.new
+       options.reverse_merge!(:categories => [])
+       options[:categories].each { |category| @categories[prepare_category_name(category)] = Hash.new }
+       @total_words = 0
+       super
+     end
+
+     #
+     # Provides a general training method for all categories specified in Bayes#new
+     # For example:
+     #   b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
+     #   b.train :this, "This text"
+     #   b.train "that", "That text"
+     #   b.train "The other", "The other text"
+     def train(category, text)
+       category = prepare_category_name(category)
+       word_hash(text).each do |word, count|
+         @categories[category][word] ||= 0
+         @categories[category][word] += count
+         @total_words += count
+       end
+     end
+
+     #
+     # Provides an untraining method for all categories specified in Bayes#new.
+     # Be very careful with this method.
+     #
+     # For example:
+     #   b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
+     #   b.train :this, "This text"
+     #   b.untrain :this, "This text"
+     def untrain(category, text)
+       category = prepare_category_name(category)
+       word_hash(text).each do |word, count|
+         if @total_words >= 0
+           orig = @categories[category][word] || 0
+           @categories[category][word] ||= 0
+           @categories[category][word] -= count
+           if @categories[category][word] <= 0
+             @categories[category].delete(word)
+             count = orig
+           end
+           @total_words -= count
+         end
+       end
+     end
+
+     #
+     # Returns the scores in each category for the provided +text+. E.g.,
+     #   b.classifications "I hate bad words and you"
+     #   => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
+     # The largest of these scores (the one closest to 0) is the one picked out by #classify
+     def classifications(text)
+       score = Hash.new
+       @categories.each do |category, category_words|
+         score[category.to_s] = 0
+         total = category_words.values.sum
+         word_hash(text).each do |word, count|
+           s = category_words.has_key?(word) ? category_words[word] : 0.1
+           score[category.to_s] += Math.log(s/total.to_f)
+         end
+       end
+       return score
+     end
+
+     #
+     # Returns the classification of the provided +text+, which is one of the
+     # categories given in the initializer. E.g.,
+     #   b.classify "I hate bad words and you"
+     #   => 'Uninteresting'
+     def classify(text)
+       (classifications(text).sort_by { |a| -a[1] })[0][0]
+     end
+
+     #
+     # Provides training and untraining methods for the categories specified in Bayes#new
+     # For example:
+     #   b = Classifier::Bayes.new :categories => ['This', 'That', 'the_other']
+     #   b.train_this "This text"
+     #   b.train_that "That text"
+     #   b.untrain_that "That text"
+     #   b.train_the_other "The other text"
+     def method_missing(name, *args)
+       category = prepare_category_name(name.to_s.gsub(/(un)?train_([\w]+)/, '\2'))
+       if @categories.has_key? category
+         args.each { |text| eval("#{$1}train(category, text)") }
+       elsif name.to_s =~ /(un)?train_([\w]+)/
+         raise StandardError, "No such category: #{category}"
+       else
+         super #raise StandardError, "No such method: #{name}"
+       end
+     end
+
+     #
+     # Provides a list of category names
+     # For example:
+     #   b.categories
+     #   => ['This', 'That', 'the_other']
+     def categories # :nodoc:
+       @categories.keys.collect {|c| c.to_s}
+     end
+
+     #
+     # Allows you to add categories to the classifier.
+     # For example:
+     #   b.add_category "Not spam"
+     #
+     # WARNING: Adding categories to a trained classifier will
+     # result in an undertrained category that will tend to match
+     # more criteria than the trained selective categories. In short,
+     # try to initialize your categories at initialization.
+     def add_category(category)
+       @categories[prepare_category_name(category)] = Hash.new
+     end
+
+     alias append_category add_category
+
+     def marshal_dump
+       [@categories, @total_words, @options]
+     end
+
+     def marshal_load(data)
+       @categories, @total_words, @options = data
+     end
+
+   end
+
+ end
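Beyond the explicit train/untrain calls, method_missing above synthesizes per-category helpers at dispatch time. A sketch:

    b = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting']
    b.train_interesting "Some interesting text"     # dispatched via method_missing
    b.untrain_interesting "Some interesting text"   # reverses those counts
    b.train_spam "..."                              # => StandardError: No such category: Spam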
data/lib/classifier/extensions/vector.rb ADDED
@@ -0,0 +1,100 @@
+ # Author:: Ernest Ellingson
+ # Copyright:: Copyright (c) 2005
+
+ # These are extensions to the std-lib 'matrix' to allow an all-Ruby SVD
+
+ require 'matrix'
+ require 'mathn'
+
+ class Vector
+   def magnitude
+     sumsqs = 0.0
+     self.size.times do |i|
+       sumsqs += self[i] ** 2.0
+     end
+     Math.sqrt(sumsqs)
+   end
+
+   def normalize
+     nv = []
+     mag = self.magnitude
+     self.size.times do |i|
+       nv << (self[i] / mag)
+     end
+     Vector[*nv]
+   end
+ end
+
+ class Matrix
+   def Matrix.diag(s)
+     Matrix.diagonal(*s)
+   end
+
+   alias :trans :transpose
+
+   def SV_decomp(maxSweeps = 20)
+     if self.row_size >= self.column_size
+       q = self.trans * self
+     else
+       q = self * self.trans
+     end
+
+     qrot = q.dup
+     v = Matrix.identity(q.row_size)
+     azrot = nil
+     mzrot = nil
+     cnt = 0
+     s_old = nil
+     mu = nil
+
+     while true do
+       cnt += 1
+       for row in (0...qrot.row_size-1) do
+         for col in (1..qrot.row_size-1) do
+           next if row == col
+           h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
+           hcos = Math.cos(h)
+           hsin = Math.sin(h)
+           mzrot = Matrix.identity(qrot.row_size)
+           mzrot[row,row] = hcos
+           mzrot[row,col] = -hsin
+           mzrot[col,row] = hsin
+           mzrot[col,col] = hcos
+           qrot = mzrot.trans * qrot * mzrot
+           v = v * mzrot
+         end
+       end
+       s_old = qrot.dup if cnt == 1
+       sum_qrot = 0.0
+       if cnt > 1
+         qrot.row_size.times do |r|
+           sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
+         end
+         s_old = qrot.dup
+       end
+       break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
+     end # of do while true
+
+     s = []
+     qrot.row_size.times do |r|
+       s << Math.sqrt(qrot[r,r])
+     end
+     #puts "cnt = #{cnt}"
+     if self.row_size >= self.column_size
+       mu = self * v * Matrix.diagonal(*s).inverse
+       return [mu, v, s]
+     else
+       mu = (self.trans * v * Matrix.diagonal(*s).inverse)
+       return [mu, v, s]
+     end
+   end
+
+   def []=(i,j,val)
+     @rows[i][j] = val
+   end
+ end
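A sketch of the round trip these extensions enable on the pure-Ruby path (assumes an older Ruby that still ships the std-lib mathn required above, and that GSL is absent so this file gets loaded):

    require 'classifier'                    # loads these extensions when GSL is missing

    m = Matrix[[1.0, 0.0], [0.0, 2.0], [1.0, 1.0]]
    u, v, s = m.SV_decomp                   # all-Ruby Jacobi SVD; s is an Array of singular values
    approx = u * Matrix.diag(s) * v.trans   # reconstructs m, up to numerical error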
data/lib/classifier/extensions/vector_serialize.rb ADDED
@@ -0,0 +1,20 @@
+ module GSL
+
+   class Vector
+     def _dump(v)
+       Marshal.dump( self.to_a )
+     end
+
+     def self._load(arr)
+       arry = Marshal.load(arr)
+       return GSL::Vector.alloc(arry)
+     end
+   end
+
+   class Matrix
+     class << self
+       alias :diag :diagonal
+     end
+   end
+ end
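These hooks exist so serialized LSI indexes can carry GSL vectors. A round-trip sketch, assuming the rb-gsl gem is installed:

    require 'gsl'
    require 'classifier/extensions/vector_serialize'

    v = GSL::Vector.alloc(1.0, 2.0, 3.0)
    restored = Marshal.load(Marshal.dump(v))   # goes through _dump/_load above
    restored.to_a                              # => [1.0, 2.0, 3.0]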
data/lib/classifier/lsi.rb ADDED
@@ -0,0 +1,348 @@
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
+ # Copyright:: Copyright (c) 2005 David Fayram II
+ # License:: LGPL
+
+ begin
+   raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
+
+   require 'gsl' # requires http://rb-gsl.rubyforge.org/
+   require 'classifier/extensions/vector_serialize'
+   $GSL = true
+
+ rescue LoadError
+   warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
+   require 'classifier/extensions/vector'
+ end
+
+ require 'classifier/lsi/word_list'
+ require 'classifier/lsi/content_node'
+ require 'classifier/lsi/summary'
+
+ module Classifier
+
+   # This class implements a Latent Semantic Indexer, which can search, classify and cluster
+   # data based on underlying semantic relations. For more information on the algorithms used,
+   # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
+   class LSI < Classifier::Base
+
+     attr_reader :word_list
+     attr_accessor :auto_rebuild
+
+     # Create a fresh index.
+     # If you want to call #build_index manually, use
+     #   Classifier::LSI.new :auto_rebuild => false
+     #
+     def initialize(options = {})
+       @auto_rebuild = true unless options[:auto_rebuild] == false
+       @word_list, @items = WordList.new, {}
+       @version, @built_at_version = 0, -1
+       super
+     end
+
+     # Returns true if the index needs to be rebuilt. The index needs
+     # to be built after all information is added, but before you start
+     # using it for search, classification and cluster detection.
+     def needs_rebuild?
+       (@items.keys.size > 1) && (@version != @built_at_version)
+     end
+
+     # Adds an item to the index. item is assumed to be a string, but
+     # any item may be indexed so long as it responds to #to_s or if
+     # you provide an optional block explaining how the indexer can
+     # fetch fresh string data. This optional block is passed the item,
+     # so the item may only be a reference to a URL or file name.
+     #
+     # For example:
+     #   lsi = Classifier::LSI.new
+     #   lsi.add_item "This is just plain text"
+     #   lsi.add_item "/home/me/filename.txt" { |x| File.read x }
+     #   ar = ActiveRecordObject.find( :all )
+     #   lsi.add_item ar, *ar.categories { |x| ar.content }
+     #
+     def add_item( item, *categories, &block )
+       clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
+       @items[item] = ContentNode.new(clean_word_hash, *categories)
+       @version += 1
+       build_index if @auto_rebuild
+     end
+
+     # A less flexible shorthand for add_item that assumes
+     # you are passing in a string with no categories. item
+     # will be duck typed via #to_s.
+     #
+     def <<( item )
+       add_item item
+     end
+
+     # Returns the categories for a given indexed item. You are free to add and remove
+     # items from this as you see fit. It does not invalidate an index to change its categories.
+     def categories_for(item)
+       return [] unless @items[item]
+       return @items[item].categories
+     end
+
+     # Removes an item from the database, if it is indexed.
+     #
+     def remove_item( item )
+       if @items.keys.include? item
+         @items.delete item
+         @version += 1
+       end
+     end
+
+     # Returns an array of items that are indexed.
+     def items
+       @items.keys
+     end
+
+     # This function rebuilds the index if needs_rebuild? returns true.
+     # For very large document spaces, this indexing operation may take some
+     # time to complete, so it may be wise to place the operation in another
+     # thread.
+     #
+     # As a rule, indexing will be fairly swift on modern machines until
+     # you have well over 500 documents indexed, or have an incredibly diverse
+     # vocabulary for your documents.
+     #
+     # The optional parameter "cutoff" is a tuning parameter. When the index is
+     # built, a certain number of s-values are discarded from the system. The
+     # cutoff parameter tells the indexer how many of these values to keep.
+     # A value of 1 for cutoff means that no semantic analysis will take place,
+     # turning the LSI class into a simple vector search engine.
+     def build_index( cutoff=0.75 )
+       return unless needs_rebuild?
+       make_word_list
+
+       doc_list = @items.values
+       tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
+
+       if $GSL
+         tdm = GSL::Matrix.alloc(*tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+
+         ntdm.size[1].times do |col|
+           vec = GSL::Vector.alloc( ntdm.column(col) ).row
+           doc_list[col].lsi_vector = vec
+           doc_list[col].lsi_norm = vec.normalize
+         end
+       else
+         tdm = Matrix.rows(tda).trans
+         ntdm = build_reduced_matrix(tdm, cutoff)
+
+         ntdm.row_size.times do |col|
+           doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
+           doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
+         end
+       end
+
+       @built_at_version = @version
+     end
+
+     # This method returns max_chunks entries, ordered by their average semantic rating.
+     # Essentially, the average distance of each entry from all other entries is calculated,
+     # and the highest are returned.
+     #
+     # This can be used to build a summary service, or to provide more information about
+     # your dataset's general content. For example, if you were to use categorize on the
+     # results of this data, you could gather information on what your dataset is generally
+     # about.
+     def highest_relative_content( max_chunks=10 )
+       return [] if needs_rebuild?
+
+       avg_density = Hash.new
+       @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
+
+       avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1]
+     end
+
+     # This function is the primitive that find_related and classify
+     # build upon. It returns an array of 2-element arrays. The first element
+     # of this array is a document, and the second is its "score", defining
+     # how "close" it is to other indexed items.
+     #
+     # These values are somewhat arbitrary, having to do with the vector space
+     # created by your content, so the magnitude is interpretable but not always
+     # meaningful between indexes.
+     #
+     # The parameter doc is the content to compare. If that content is not
+     # indexed, you can pass an optional block to define how to create the
+     # text data. See add_item for examples of how this works.
+     def proximity_array_for_content( doc, &block )
+       return [] if needs_rebuild?
+
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           next if @items[item].search_vector.blank? # not enough data
+           if $GSL
+             val = content_node.search_vector * @items[item].search_vector.col
+           else
+             val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
+           end
+           [item, val]
+         end
+       result.compact.sort_by { |x| x[1] }.reverse
+     end
+
+     # Similar to proximity_array_for_content, this function takes similar
+     # arguments and returns a similar array. However, it uses the normalized
+     # calculated vectors instead of their full versions. This is useful when
+     # you're trying to perform operations on content that is much smaller than
+     # the text you're working with. search uses this primitive.
+     def proximity_norms_for_content( doc, &block )
+       return [] if needs_rebuild?
+
+       content_node = node_for_content( doc, &block )
+       result =
+         @items.keys.collect do |item|
+           next if @items[item].search_norm.blank? # not enough data
+           if $GSL
+             val = content_node.search_norm * @items[item].search_norm.col
+           else
+             val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
+           end
+           [item, val]
+         end
+       result.compact.sort_by { |x| x[1] }.reverse
+     end
+
+     # This function allows for text-based search of your index. Unlike other functions
+     # like find_related and classify, search only takes short strings. It will also ignore
+     # factors like repeated words. It is best for short, google-like search terms.
+     # A search will first prioritize lexical relationships, then semantic ones.
+     #
+     # While this may seem backwards compared to the other functions that LSI supports,
+     # it is actually the same algorithm, just applied on a smaller document.
+     def search( string, max_nearest=3 )
+       return [] if needs_rebuild?
+       carry = proximity_norms_for_content( string )
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
+
+     # This function takes content and finds other documents
+     # that are semantically "close", returning an array of documents sorted
+     # from most to least relevant.
+     # max_nearest specifies the number of documents to return. A value of
+     # 0 means that it returns all the indexed documents, sorted by relevance.
+     #
+     # This is particularly useful for identifying clusters in your document space.
+     # For example you may want to identify several "What's Related" items for weblog
+     # articles, or find paragraphs that relate to each other in an essay.
+     def find_related( doc, max_nearest=3, &block )
+       carry =
+         proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
+       result = carry.collect { |x| x[0] }
+       return result[0..max_nearest-1]
+     end
+
+     # This function uses a voting system to categorize documents, based on
+     # the categories of other documents. It uses the same logic as the
+     # find_related function to find related documents, then returns the
+     # most obvious category from this list.
+     #
+     # cutoff signifies the number of documents to consider when classifying
+     # text. A cutoff of 1 means that every document in the index votes on
+     # what category the document is in. This may not always make sense.
+     #
+     def classify( doc, cutoff=0.30, &block )
+       icutoff = (@items.size * cutoff).round
+       carry = proximity_array_for_content( doc, &block )
+       carry = carry[0..icutoff-1]
+       votes = {}
+       carry.each do |pair|
+         categories = @items[pair[0]].categories
+         categories.each do |category|
+           votes[category] ||= 0.0
+           votes[category] += pair[1]
+         end
+       end
+
+       ranking = votes.keys.sort_by { |x| votes[x] }
+       return ranking[-1]
+     end
+
+     # Same as classify above, but returns all matching categories; it is also
+     # more permissive in its default cutoff.
+     def classify_multiple( doc, cutoff=0.50, &block )
+       icutoff = (@items.size * cutoff).round
+       carry = proximity_array_for_content( doc, &block )
+       carry = carry[0..icutoff-1]
+       votes = {}
+       carry.each do |pair|
+         categories = @items[pair[0]].categories
+         categories.each do |category|
+           votes[category] ||= 0.0
+           votes[category] += pair[1]
+         end
+       end
+       votes.delete_if{ |key, value| value < 1 }.keys.sort_by { |x| -votes[x] }
+     end
+
+     # Prototype, only works on indexed documents.
+     # I have no clue if this is going to work, but in theory
+     # it's supposed to.
+     def highest_ranked_stems( doc, count=3 )
+       raise "Requested stem ranking on non-indexed content!" unless @items[doc]
+       arr = node_for_content(doc).lsi_vector.to_a
+       top_n = arr.sort.reverse[0..count-1]
+       return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
+     end
+
+     def marshal_dump
+       [ @auto_rebuild, @word_list, @items, @version, @built_at_version,
+         @options,
+       ]
+     end
+
+     def marshal_load(data)
+       @auto_rebuild, @word_list, @items, @version, @built_at_version,
+       @options = data
+     end
+
+     private
+
+     def build_reduced_matrix( matrix, cutoff=0.75 )
+       # TODO: Check that M>=N on these dimensions! Transpose helps assure this
+       u, v, s = matrix.SV_decomp
+
+       # TODO: Better than 75% term, please. :\
+       s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
+       s.size.times do |ord|
+         s[ord] = 0.0 if s[ord] < s_cutoff
+       end
+       # Reconstruct the term document matrix, only with reduced rank
+       u * ($GSL ? GSL::Matrix : Matrix).diag( s ) * v.trans
+     end
+
+     def node_for_content(item, &block)
+       if @items[item]
+         return @items[item]
+       else
+         clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
+
+         cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
+
+         unless needs_rebuild?
+           cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
+         end
+       end
+
+       return cn
+     end
+
+     def make_word_list
+       @word_list = WordList.new
+       @items.each_value do |node|
+         node.word_hash.each_key { |key| @word_list.add_word key }
+       end
+     end
+
+   end
+ end
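Pulling the LSI pieces together, a usage sketch based on the documentation above (output is illustrative):

    require 'classifier'

    lsi = Classifier::LSI.new
    lsi.add_item "Dogs are fun pets", :pets
    lsi.add_item "Cats are also popular pets", :pets
    lsi.add_item "This text deals with programming", :tech

    lsi.search "dog", 2                      # the two items nearest the query
    lsi.find_related "Dogs are fun pets", 1  # => ["Cats are also popular pets"]
    lsi.classify "I walked my puppy today"   # => :pets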