classifier 1.3.5 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,15 +3,16 @@
3
3
  # License:: LGPL
4
4
 
5
5
  begin
6
- raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
-
8
- require 'gsl' # requires http://rb-gsl.rubyforge.org/
9
- require 'classifier/extensions/vector_serialize'
10
- $GSL = true
6
+ # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
+ raise LoadError if ENV['NATIVE_VECTOR'] == 'true'
11
8
 
9
+ require 'gsl' # requires https://github.com/SciRuby/rb-gsl/
10
+ require 'classifier/extensions/vector_serialize'
11
+ $GSL = true
12
12
  rescue LoadError
13
- warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
14
- require 'classifier/extensions/vector'
13
+ warn 'Notice: for 10x faster LSI support, please install https://github.com/SciRuby/rb-gsl/'
14
+ $GSL = false
15
+ require 'classifier/extensions/vector'
15
16
  end
16
17
 
17
18
  require 'classifier/lsi/word_list'
@@ -19,12 +20,10 @@ require 'classifier/lsi/content_node'
19
20
  require 'classifier/lsi/summary'
20
21
 
21
22
  module Classifier
22
-
23
23
  # This class implements a Latent Semantic Indexer, which can search, classify and cluster
24
24
  # data based on underlying semantic relations. For more information on the algorithms used,
25
25
  # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
26
26
  class LSI
27
-
28
27
  attr_reader :word_list
29
28
  attr_accessor :auto_rebuild
30
29
 
@@ -34,8 +33,10 @@ module Classifier
34
33
  #
35
34
  def initialize(options = {})
36
35
  @auto_rebuild = true unless options[:auto_rebuild] == false
37
- @word_list, @items = WordList.new, {}
38
- @version, @built_at_version = 0, -1
36
+ @word_list = WordList.new
37
+ @items = {}
38
+ @version = 0
39
+ @built_at_version = -1
39
40
  end
40
41
 
41
42
  # Returns true if the index needs to be rebuilt. The index needs
@@ -58,7 +59,7 @@ module Classifier
58
59
  # ar = ActiveRecordObject.find( :all )
59
60
  # lsi.add_item ar, *ar.categories { |x| ar.content }
60
61
  #
61
- def add_item( item, *categories, &block )
62
+ def add_item(item, *categories, &block)
62
63
  clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
63
64
  @items[item] = ContentNode.new(clean_word_hash, *categories)
64
65
  @version += 1
@@ -69,24 +70,25 @@ module Classifier
69
70
  # you are passing in a string with no categories. item
70
71
  # will be duck typed via to_s.
71
72
  #
72
- def <<( item )
73
- add_item item
73
+ def <<(item)
74
+ add_item(item)
74
75
  end
75
76
 
76
77
  # Returns the categories for a given indexed item. You are free to add and remove
77
78
  # items from this as you see fit. It does not invalidate an index to change its categories.
78
79
  def categories_for(item)
79
80
  return [] unless @items[item]
80
- return @items[item].categories
81
+
82
+ @items[item].categories
81
83
  end
82
84
 
83
85
  # Removes an item from the database, if it is indexed.
84
86
  #
85
- def remove_item( item )
86
- if @items.keys.contain? item
87
- @items.remove item
88
- @version += 1
89
- end
87
+ def remove_item(item)
88
+ return unless @items.key?(item)
89
+
90
+ @items.delete(item)
91
+ @version += 1
90
92
  end
91
93
 
92
94
  # Returns an array of items that are indexed.
@@ -94,13 +96,6 @@ module Classifier
94
96
  @items.keys
95
97
  end
96
98
 
97
- # Returns the categories for a given indexed items. You are free to add and remove
98
- # items from this as you see fit. It does not invalide an index to change its categories.
99
- def categories_for(item)
100
- return [] unless @items[item]
101
- return @items[item].categories
102
- end
103
-
104
99
  # This function rebuilds the index if needs_rebuild? returns true.
105
100
  # For very large document spaces, this indexing operation may take some
106
101
  # time to complete, so it may be wise to place the operation in another
@@ -115,30 +110,31 @@ module Classifier
115
110
  # cutoff parameter tells the indexer how many of these values to keep.
116
111
  # A value of 1 for cutoff means that no semantic analysis will take place,
117
112
  # turning the LSI class into a simple vector search engine.
118
- def build_index( cutoff=0.75 )
113
+ def build_index(cutoff = 0.75)
119
114
  return unless needs_rebuild?
115
+
120
116
  make_word_list
121
117
 
122
118
  doc_list = @items.values
123
- tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
119
+ tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
124
120
 
125
121
  if $GSL
126
- tdm = GSL::Matrix.alloc(*tda).trans
127
- ntdm = build_reduced_matrix(tdm, cutoff)
128
-
129
- ntdm.size[1].times do |col|
130
- vec = GSL::Vector.alloc( ntdm.column(col) ).row
131
- doc_list[col].lsi_vector = vec
132
- doc_list[col].lsi_norm = vec.normalize
133
- end
122
+ tdm = GSL::Matrix.alloc(*tda).trans
123
+ ntdm = build_reduced_matrix(tdm, cutoff)
124
+
125
+ ntdm.size[1].times do |col|
126
+ vec = GSL::Vector.alloc(ntdm.column(col)).row
127
+ doc_list[col].lsi_vector = vec
128
+ doc_list[col].lsi_norm = vec.normalize
129
+ end
134
130
  else
135
- tdm = Matrix.rows(tda).trans
136
- ntdm = build_reduced_matrix(tdm, cutoff)
131
+ tdm = Matrix.rows(tda).trans
132
+ ntdm = build_reduced_matrix(tdm, cutoff)
137
133
 
138
- ntdm.row_size.times do |col|
139
- doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
140
- doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
141
- end
134
+ ntdm.row_size.times do |col|
135
+ doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
136
+ doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
137
+ end
142
138
  end
143
139
 
144
140
  @built_at_version = @version
@@ -152,13 +148,13 @@ module Classifier
152
148
  # your dataset's general content. For example, if you were to use categorize on the
153
149
  # results of this data, you could gather information on what your dataset is generally
154
150
  # about.
155
- def highest_relative_content( max_chunks=10 )
156
- return [] if needs_rebuild?
151
+ def highest_relative_content(max_chunks = 10)
152
+ return [] if needs_rebuild?
157
153
 
158
- avg_density = Hash.new
159
- @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
154
+ avg_density = {}
155
+ @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x, y| x + y[1] } }
160
156
 
161
- avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
157
+ avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks - 1].map
162
158
  end
163
159
 
164
160
  # This function is the primitive that find_related and classify
@@ -173,17 +169,17 @@ module Classifier
173
169
  # The parameter doc is the content to compare. If that content is not
174
170
  # indexed, you can pass an optional block to define how to create the
175
171
  # text data. See add_item for examples of how this works.
176
- def proximity_array_for_content( doc, &block )
172
+ def proximity_array_for_content(doc, &block)
177
173
  return [] if needs_rebuild?
178
174
 
179
- content_node = node_for_content( doc, &block )
175
+ content_node = node_for_content(doc, &block)
180
176
  result =
181
177
  @items.keys.collect do |item|
182
- if $GSL
183
- val = content_node.search_vector * @items[item].search_vector.col
184
- else
185
- val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
186
- end
178
+ val = if $GSL
179
+ content_node.search_vector * @items[item].search_vector.col
180
+ else
181
+ (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
182
+ end
187
183
  [item, val]
188
184
  end
189
185
  result.sort_by { |x| x[1] }.reverse
@@ -194,17 +190,17 @@ module Classifier
194
190
  # calculated vectors instead of their full versions. This is useful when
195
191
  # you're trying to perform operations on content that is much smaller than
196
192
  # the text you're working with. search uses this primitive.
197
- def proximity_norms_for_content( doc, &block )
193
+ def proximity_norms_for_content(doc, &block)
198
194
  return [] if needs_rebuild?
199
195
 
200
- content_node = node_for_content( doc, &block )
196
+ content_node = node_for_content(doc, &block)
201
197
  result =
202
198
  @items.keys.collect do |item|
203
- if $GSL
204
- val = content_node.search_norm * @items[item].search_norm.col
205
- else
206
- val = (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
207
- end
199
+ val = if $GSL
200
+ content_node.search_norm * @items[item].search_norm.col
201
+ else
202
+ (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
203
+ end
208
204
  [item, val]
209
205
  end
210
206
  result.sort_by { |x| x[1] }.reverse
@@ -217,11 +213,12 @@ module Classifier
217
213
  #
218
214
  # While this may seem backwards compared to the other functions that LSI supports,
219
215
  # it is actually the same algorithm, just applied on a smaller document.
220
- def search( string, max_nearest=3 )
216
+ def search(string, max_nearest = 3)
221
217
  return [] if needs_rebuild?
222
- carry = proximity_norms_for_content( string )
218
+
219
+ carry = proximity_norms_for_content(string)
223
220
  result = carry.collect { |x| x[0] }
224
- return result[0..max_nearest-1]
221
+ result[0..max_nearest - 1]
225
222
  end
226
223
 
227
224
  # This function takes content and finds other documents
@@ -233,11 +230,11 @@ module Classifier
233
230
  # This is particularly useful for identifying clusters in your document space.
234
231
  # For example you may want to identify several "What's Related" items for weblog
235
232
  # articles, or find paragraphs that relate to each other in an essay.
236
- def find_related( doc, max_nearest=3, &block )
233
+ def find_related(doc, max_nearest = 3, &block)
237
234
  carry =
238
- proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
235
+ proximity_array_for_content(doc, &block).reject { |pair| pair[0] == doc }
239
236
  result = carry.collect { |x| x[0] }
240
- return result[0..max_nearest-1]
237
+ result[0..max_nearest - 1]
241
238
  end
242
239
 
243
240
  # This function uses a voting system to categorize documents, based on
@@ -249,10 +246,17 @@ module Classifier
249
246
  # text. A cutoff of 1 means that every document in the index votes on
250
247
  # what category the document is in. This may not always make sense.
251
248
  #
252
- def classify( doc, cutoff=0.30, &block )
249
+ def classify(doc, cutoff = 0.30, &block)
250
+ votes = vote(doc, cutoff, &block)
251
+
252
+ ranking = votes.keys.sort_by { |x| votes[x] }
253
+ ranking[-1]
254
+ end
255
+
256
+ def vote(doc, cutoff = 0.30, &block)
253
257
  icutoff = (@items.size * cutoff).round
254
- carry = proximity_array_for_content( doc, &block )
255
- carry = carry[0..icutoff-1]
258
+ carry = proximity_array_for_content(doc, &block)
259
+ carry = carry[0..icutoff - 1]
256
260
  votes = {}
257
261
  carry.each do |pair|
258
262
  categories = @items[pair[0]].categories
@@ -261,23 +265,46 @@ module Classifier
261
265
  votes[category] += pair[1]
262
266
  end
263
267
  end
268
+ votes
269
+ end
270
+
271
+ # Returns the same category as classify() but also returns
272
+ # a confidence value derived from the vote share that the
273
+ # winning category got.
274
+ #
275
+ # e.g.
276
+ # category,confidence = classify_with_confidence(doc)
277
+ # if confidence < 0.3
278
+ # category = nil
279
+ # end
280
+ #
281
+ #
282
+ # See classify() for argument docs
283
+ def classify_with_confidence(doc, cutoff = 0.30, &block)
284
+ votes = vote(doc, cutoff, &block)
285
+ votes_sum = votes.values.inject(0.0) { |sum, v| sum + v }
286
+ return [nil, nil] if votes_sum.zero?
264
287
 
265
288
  ranking = votes.keys.sort_by { |x| votes[x] }
266
- return ranking[-1]
289
+ winner = ranking[-1]
290
+ vote_share = votes[winner] / votes_sum.to_f
291
+ [winner, vote_share]
267
292
  end
268
293
 
269
294
  # Prototype, only works on indexed documents.
270
295
  # I have no clue if this is going to work, but in theory
271
296
  # it's supposed to.
272
- def highest_ranked_stems( doc, count=3 )
273
- raise "Requested stem ranking on non-indexed content!" unless @items[doc]
297
+ def highest_ranked_stems(doc, count = 3)
298
+ raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
299
+
274
300
  arr = node_for_content(doc).lsi_vector.to_a
275
- top_n = arr.sort.reverse[0..count-1]
276
- return top_n.collect { |x| @word_list.word_for_index(arr.index(x))}
301
+ top_n = arr.sort.reverse[0..count - 1]
302
+ top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
277
303
  end
278
304
 
279
305
  private
280
- def build_reduced_matrix( matrix, cutoff=0.75 )
306
+
307
+ def build_reduced_matrix(matrix, cutoff = 0.75)
281
308
  # TODO: Check that M>=N on these dimensions! Transpose helps assure this
282
309
  u, v, s = matrix.SV_decomp
283
310
 
@@ -287,23 +314,21 @@ module Classifier
287
314
  s[ord] = 0.0 if s[ord] < s_cutoff
288
315
  end
289
316
  # Reconstruct the term document matrix, only with reduced rank
290
- u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
317
+ u * ($GSL ? GSL::Matrix : ::Matrix).diag(s) * v.trans
291
318
  end
292
319
 
293
320
  def node_for_content(item, &block)
294
- if @items[item]
295
- return @items[item]
296
- else
297
- clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
321
+ return @items[item] if @items[item]
298
322
 
299
- cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
323
+ clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
300
324
 
301
- unless needs_rebuild?
302
- cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
303
- end
325
+ cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
326
+
327
+ unless needs_rebuild?
328
+ cn.raw_vector_with(@word_list) # make the lsi raw and norm vectors
304
329
  end
305
330
 
306
- return cn
331
+ cn
307
332
  end
308
333
 
309
334
  def make_word_list
@@ -312,7 +337,5 @@ module Classifier
312
337
  node.word_hash.each_key { |key| @word_list.add_word key }
313
338
  end
314
339
  end
315
-
316
340
  end
317
341
  end
318
-
data/lib/classifier.rb CHANGED
@@ -26,5 +26,6 @@
26
26
 
27
27
  require 'rubygems'
28
28
  require 'classifier/extensions/string'
29
+ require 'classifier/extensions/vector'
29
30
  require 'classifier/bayes'
30
31
  require 'classifier/lsi'
@@ -0,0 +1,5 @@
1
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
2
+
3
+ require 'minitest'
4
+ require 'minitest/autorun'
5
+ require 'classifier'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.5
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lucas Carlson
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-04-17 00:00:00.000000000 Z
11
+ date: 2024-07-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: fast-stemmer
@@ -24,20 +24,6 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: 1.0.0
27
- - !ruby/object:Gem::Dependency
28
- name: mathn
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: rake
43
29
  requirement: !ruby/object:Gem::Requirement
@@ -99,11 +85,12 @@ files:
99
85
  - lib/classifier/lsi/content_node.rb
100
86
  - lib/classifier/lsi/summary.rb
101
87
  - lib/classifier/lsi/word_list.rb
88
+ - test/test_helper.rb
102
89
  homepage: https://github.com/cardmagic/classifier
103
90
  licenses:
104
91
  - LGPL
105
92
  metadata: {}
106
- post_install_message:
93
+ post_install_message:
107
94
  rdoc_options: []
108
95
  require_paths:
109
96
  - lib
@@ -118,9 +105,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
118
105
  - !ruby/object:Gem::Version
119
106
  version: '0'
120
107
  requirements: []
121
- rubyforge_project:
122
- rubygems_version: 2.7.6
123
- signing_key:
108
+ rubygems_version: 3.5.9
109
+ signing_key:
124
110
  specification_version: 4
125
111
  summary: A general classifier module to allow Bayesian and other types of classifications.
126
112
  test_files: []