classifier 1.3.5 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,15 +3,16 @@
3
3
  # License:: LGPL
4
4
 
5
5
  begin
6
- raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
-
8
- require 'gsl' # requires http://rb-gsl.rubyforge.org/
9
- require 'classifier/extensions/vector_serialize'
10
- $GSL = true
6
+ # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
+ raise LoadError if ENV['NATIVE_VECTOR'] == 'true'
11
8
 
9
+ require 'gsl' # requires https://github.com/SciRuby/rb-gsl/
10
+ require 'classifier/extensions/vector_serialize'
11
+ $GSL = true
12
12
  rescue LoadError
13
- warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
14
- require 'classifier/extensions/vector'
13
+ warn 'Notice: for 10x faster LSI support, please install https://github.com/SciRuby/rb-gsl/'
14
+ $GSL = false
15
+ require 'classifier/extensions/vector'
15
16
  end
16
17
 
17
18
  require 'classifier/lsi/word_list'
@@ -19,12 +20,10 @@ require 'classifier/lsi/content_node'
19
20
  require 'classifier/lsi/summary'
20
21
 
21
22
  module Classifier
22
-
23
23
  # This class implements a Latent Semantic Indexer, which can search, classify and cluster
24
24
  # data based on underlying semantic relations. For more information on the algorithms used,
25
25
  # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
26
26
  class LSI
27
-
28
27
  attr_reader :word_list
29
28
  attr_accessor :auto_rebuild
30
29
 
@@ -34,8 +33,10 @@ module Classifier
34
33
  #
35
34
  def initialize(options = {})
36
35
  @auto_rebuild = true unless options[:auto_rebuild] == false
37
- @word_list, @items = WordList.new, {}
38
- @version, @built_at_version = 0, -1
36
+ @word_list = WordList.new
37
+ @items = {}
38
+ @version = 0
39
+ @built_at_version = -1
39
40
  end
40
41
 
41
42
  # Returns true if the index needs to be rebuilt. The index needs
@@ -58,7 +59,7 @@ module Classifier
58
59
  # ar = ActiveRecordObject.find( :all )
59
60
  # lsi.add_item ar, *ar.categories { |x| ar.content }
60
61
  #
61
- def add_item( item, *categories, &block )
62
+ def add_item(item, *categories, &block)
62
63
  clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
63
64
  @items[item] = ContentNode.new(clean_word_hash, *categories)
64
65
  @version += 1
@@ -69,24 +70,25 @@ module Classifier
69
70
  # you are passing in a string with no categories. item
70
71
  # will be duck typed via to_s.
71
72
  #
72
- def <<( item )
73
- add_item item
73
+ def <<(item)
74
+ add_item(item)
74
75
  end
75
76
 
76
77
  # Returns the categories for a given indexed item. You are free to add and remove
77
78
  # items from this as you see fit. It does not invalidate an index to change its categories.
78
79
  def categories_for(item)
79
80
  return [] unless @items[item]
80
- return @items[item].categories
81
+
82
+ @items[item].categories
81
83
  end
82
84
 
83
85
  # Removes an item from the database, if it is indexed.
84
86
  #
85
- def remove_item( item )
86
- if @items.keys.contain? item
87
- @items.remove item
88
- @version += 1
89
- end
87
+ def remove_item(item)
88
+ return unless @items.key?(item)
89
+
90
+ @items.delete(item)
91
+ @version += 1
90
92
  end
91
93
 
92
94
  # Returns an array of items that are indexed.
@@ -94,13 +96,6 @@ module Classifier
94
96
  @items.keys
95
97
  end
96
98
 
97
- # Returns the categories for a given indexed items. You are free to add and remove
98
- # items from this as you see fit. It does not invalide an index to change its categories.
99
- def categories_for(item)
100
- return [] unless @items[item]
101
- return @items[item].categories
102
- end
103
-
104
99
  # This function rebuilds the index if needs_rebuild? returns true.
105
100
  # For very large document spaces, this indexing operation may take some
106
101
  # time to complete, so it may be wise to place the operation in another
@@ -115,30 +110,31 @@ module Classifier
115
110
  # cutoff parameter tells the indexer how many of these values to keep.
116
111
  # A value of 1 for cutoff means that no semantic analysis will take place,
117
112
  # turning the LSI class into a simple vector search engine.
118
- def build_index( cutoff=0.75 )
113
+ def build_index(cutoff = 0.75)
119
114
  return unless needs_rebuild?
115
+
120
116
  make_word_list
121
117
 
122
118
  doc_list = @items.values
123
- tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
119
+ tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
124
120
 
125
121
  if $GSL
126
- tdm = GSL::Matrix.alloc(*tda).trans
127
- ntdm = build_reduced_matrix(tdm, cutoff)
128
-
129
- ntdm.size[1].times do |col|
130
- vec = GSL::Vector.alloc( ntdm.column(col) ).row
131
- doc_list[col].lsi_vector = vec
132
- doc_list[col].lsi_norm = vec.normalize
133
- end
122
+ tdm = GSL::Matrix.alloc(*tda).trans
123
+ ntdm = build_reduced_matrix(tdm, cutoff)
124
+
125
+ ntdm.size[1].times do |col|
126
+ vec = GSL::Vector.alloc(ntdm.column(col)).row
127
+ doc_list[col].lsi_vector = vec
128
+ doc_list[col].lsi_norm = vec.normalize
129
+ end
134
130
  else
135
- tdm = Matrix.rows(tda).trans
136
- ntdm = build_reduced_matrix(tdm, cutoff)
131
+ tdm = Matrix.rows(tda).trans
132
+ ntdm = build_reduced_matrix(tdm, cutoff)
137
133
 
138
- ntdm.row_size.times do |col|
139
- doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
140
- doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
141
- end
134
+ ntdm.row_size.times do |col|
135
+ doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
136
+ doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
137
+ end
142
138
  end
143
139
 
144
140
  @built_at_version = @version
@@ -152,13 +148,13 @@ module Classifier
152
148
  # your dataset's general content. For example, if you were to use categorize on the
153
149
  # results of this data, you could gather information on what your dataset is generally
154
150
  # about.
155
- def highest_relative_content( max_chunks=10 )
156
- return [] if needs_rebuild?
151
+ def highest_relative_content(max_chunks = 10)
152
+ return [] if needs_rebuild?
157
153
 
158
- avg_density = Hash.new
159
- @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
154
+ avg_density = {}
155
+ @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x, y| x + y[1] } }
160
156
 
161
- avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
157
+ avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks - 1].map
162
158
  end
163
159
 
164
160
  # This function is the primitive that find_related and classify
@@ -173,17 +169,17 @@ module Classifier
173
169
  # The parameter doc is the content to compare. If that content is not
174
170
  # indexed, you can pass an optional block to define how to create the
175
171
  # text data. See add_item for examples of how this works.
176
- def proximity_array_for_content( doc, &block )
172
+ def proximity_array_for_content(doc, &block)
177
173
  return [] if needs_rebuild?
178
174
 
179
- content_node = node_for_content( doc, &block )
175
+ content_node = node_for_content(doc, &block)
180
176
  result =
181
177
  @items.keys.collect do |item|
182
- if $GSL
183
- val = content_node.search_vector * @items[item].search_vector.col
184
- else
185
- val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
186
- end
178
+ val = if $GSL
179
+ content_node.search_vector * @items[item].search_vector.col
180
+ else
181
+ (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
182
+ end
187
183
  [item, val]
188
184
  end
189
185
  result.sort_by { |x| x[1] }.reverse
@@ -194,17 +190,17 @@ module Classifier
194
190
  # calculated vectors instead of their full versions. This is useful when
195
191
  # you're trying to perform operations on content that is much smaller than
196
192
  # the text you're working with. search uses this primitive.
197
- def proximity_norms_for_content( doc, &block )
193
+ def proximity_norms_for_content(doc, &block)
198
194
  return [] if needs_rebuild?
199
195
 
200
- content_node = node_for_content( doc, &block )
196
+ content_node = node_for_content(doc, &block)
201
197
  result =
202
198
  @items.keys.collect do |item|
203
- if $GSL
204
- val = content_node.search_norm * @items[item].search_norm.col
205
- else
206
- val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
207
- end
199
+ val = if $GSL
200
+ content_node.search_norm * @items[item].search_norm.col
201
+ else
202
+ (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
203
+ end
208
204
  [item, val]
209
205
  end
210
206
  result.sort_by { |x| x[1] }.reverse
@@ -217,11 +213,12 @@ module Classifier
217
213
  #
218
214
  # While this may seem backwards compared to the other functions that LSI supports,
219
215
  # it is actually the same algorithm, just applied on a smaller document.
220
- def search( string, max_nearest=3 )
216
+ def search(string, max_nearest = 3)
221
217
  return [] if needs_rebuild?
222
- carry = proximity_norms_for_content( string )
218
+
219
+ carry = proximity_norms_for_content(string)
223
220
  result = carry.collect { |x| x[0] }
224
- return result[0..max_nearest-1]
221
+ result[0..max_nearest - 1]
225
222
  end
226
223
 
227
224
  # This function takes content and finds other documents
@@ -233,11 +230,11 @@ module Classifier
233
230
  # This is particularly useful for identifying clusters in your document space.
234
231
  # For example you may want to identify several "What's Related" items for weblog
235
232
  # articles, or find paragraphs that relate to each other in an essay.
236
- def find_related( doc, max_nearest=3, &block )
233
+ def find_related(doc, max_nearest = 3, &block)
237
234
  carry =
238
- proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
235
+ proximity_array_for_content(doc, &block).reject { |pair| pair[0] == doc }
239
236
  result = carry.collect { |x| x[0] }
240
- return result[0..max_nearest-1]
237
+ result[0..max_nearest - 1]
241
238
  end
242
239
 
243
240
  # This function uses a voting system to categorize documents, based on
@@ -249,10 +246,17 @@ module Classifier
249
246
  # text. A cutoff of 1 means that every document in the index votes on
250
247
  # what category the document is in. This may not always make sense.
251
248
  #
252
- def classify( doc, cutoff=0.30, &block )
249
+ def classify(doc, cutoff = 0.30, &block)
250
+ votes = vote(doc, cutoff, &block)
251
+
252
+ ranking = votes.keys.sort_by { |x| votes[x] }
253
+ ranking[-1]
254
+ end
255
+
256
+ def vote(doc, cutoff = 0.30, &block)
253
257
  icutoff = (@items.size * cutoff).round
254
- carry = proximity_array_for_content( doc, &block )
255
- carry = carry[0..icutoff-1]
258
+ carry = proximity_array_for_content(doc, &block)
259
+ carry = carry[0..icutoff - 1]
256
260
  votes = {}
257
261
  carry.each do |pair|
258
262
  categories = @items[pair[0]].categories
@@ -261,23 +265,46 @@ module Classifier
261
265
  votes[category] += pair[1]
262
266
  end
263
267
  end
268
+ votes
269
+ end
270
+
271
+ # Returns the same category as classify() but also returns
272
+ # a confidence value derived from the vote share that the
273
+ # winning category got.
274
+ #
275
+ # e.g.
276
+ # category,confidence = classify_with_confidence(doc)
277
+ # if confidence < 0.3
278
+ # category = nil
279
+ # end
280
+ #
281
+ #
282
+ # See classify() for argument docs
283
+ def classify_with_confidence(doc, cutoff = 0.30, &block)
284
+ votes = vote(doc, cutoff, &block)
285
+ votes_sum = votes.values.inject(0.0) { |sum, v| sum + v }
286
+ return [nil, nil] if votes_sum.zero?
264
287
 
265
288
  ranking = votes.keys.sort_by { |x| votes[x] }
266
- return ranking[-1]
289
+ winner = ranking[-1]
290
+ vote_share = votes[winner] / votes_sum.to_f
291
+ [winner, vote_share]
267
292
  end
268
293
 
269
294
  # Prototype, only works on indexed documents.
270
295
  # I have no clue if this is going to work, but in theory
271
296
  # it's supposed to.
272
- def highest_ranked_stems( doc, count=3 )
273
- raise "Requested stem ranking on non-indexed content!" unless @items[doc]
297
+ def highest_ranked_stems(doc, count = 3)
298
+ raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
299
+
274
300
  arr = node_for_content(doc).lsi_vector.to_a
275
- top_n = arr.sort.reverse[0..count-1]
276
- return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
301
+ top_n = arr.sort.reverse[0..count - 1]
302
+ top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
277
303
  end
278
304
 
279
305
  private
280
- def build_reduced_matrix( matrix, cutoff=0.75 )
306
+
307
+ def build_reduced_matrix(matrix, cutoff = 0.75)
281
308
  # TODO: Check that M>=N on these dimensions! Transpose helps assure this
282
309
  u, v, s = matrix.SV_decomp
283
310
 
@@ -287,23 +314,21 @@ module Classifier
287
314
  s[ord] = 0.0 if s[ord] < s_cutoff
288
315
  end
289
316
  # Reconstruct the term document matrix, only with reduced rank
290
- u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
317
+ u * ($GSL ? GSL::Matrix : ::Matrix).diag(s) * v.trans
291
318
  end
292
319
 
293
320
  def node_for_content(item, &block)
294
- if @items[item]
295
- return @items[item]
296
- else
297
- clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
321
+ return @items[item] if @items[item]
298
322
 
299
- cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
323
+ clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
300
324
 
301
- unless needs_rebuild?
302
- cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
303
- end
325
+ cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
326
+
327
+ unless needs_rebuild?
328
+ cn.raw_vector_with(@word_list) # make the lsi raw and norm vectors
304
329
  end
305
330
 
306
- return cn
331
+ cn
307
332
  end
308
333
 
309
334
  def make_word_list
@@ -312,7 +337,5 @@ module Classifier
312
337
  node.word_hash.each_key { |key| @word_list.add_word key }
313
338
  end
314
339
  end
315
-
316
340
  end
317
341
  end
318
-
data/lib/classifier.rb CHANGED
@@ -26,5 +26,6 @@
26
26
 
27
27
  require 'rubygems'
28
28
  require 'classifier/extensions/string'
29
+ require 'classifier/extensions/vector'
29
30
  require 'classifier/bayes'
30
31
  require 'classifier/lsi'
@@ -0,0 +1,5 @@
1
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
2
+
3
+ require 'minitest'
4
+ require 'minitest/autorun'
5
+ require 'classifier'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.5
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lucas Carlson
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-04-17 00:00:00.000000000 Z
11
+ date: 2024-07-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: fast-stemmer
@@ -24,20 +24,6 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: 1.0.0
27
- - !ruby/object:Gem::Dependency
28
- name: mathn
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: rake
43
29
  requirement: !ruby/object:Gem::Requirement
@@ -99,11 +85,12 @@ files:
99
85
  - lib/classifier/lsi/content_node.rb
100
86
  - lib/classifier/lsi/summary.rb
101
87
  - lib/classifier/lsi/word_list.rb
88
+ - test/test_helper.rb
102
89
  homepage: https://github.com/cardmagic/classifier
103
90
  licenses:
104
91
  - LGPL
105
92
  metadata: {}
106
- post_install_message:
93
+ post_install_message:
107
94
  rdoc_options: []
108
95
  require_paths:
109
96
  - lib
@@ -118,9 +105,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
118
105
  - !ruby/object:Gem::Version
119
106
  version: '0'
120
107
  requirements: []
121
- rubyforge_project:
122
- rubygems_version: 2.7.6
123
- signing_key:
108
+ rubygems_version: 3.5.9
109
+ signing_key:
124
110
  specification_version: 4
125
111
  summary: A general classifier module to allow Bayesian and other types of classifications.
126
112
  test_files: []