classifier 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,32 +6,55 @@
6
6
 
7
7
  module Classifier
8
8
  class LSI
9
- # @rbs @gsl_available: bool
10
- @gsl_available = false
9
+ # Backend options: :native, :ruby
10
+ # @rbs @backend: Symbol
11
+ @backend = :ruby
11
12
 
12
13
  class << self
13
- # @rbs @gsl_available: bool
14
- attr_accessor :gsl_available
14
+ # @rbs @backend: Symbol
15
+ attr_accessor :backend
16
+
17
+ # Check if using native C extension
18
+ # @rbs () -> bool
19
+ def native_available?
20
+ backend == :native
21
+ end
22
+
23
+ # Get the Vector class for the current backend
24
+ # @rbs () -> Class
25
+ def vector_class
26
+ backend == :native ? Classifier::Linalg::Vector : ::Vector
27
+ end
28
+
29
+ # Get the Matrix class for the current backend
30
+ # @rbs () -> Class
31
+ def matrix_class
32
+ backend == :native ? Classifier::Linalg::Matrix : ::Matrix
33
+ end
15
34
  end
16
35
  end
17
36
  end
18
37
 
38
+ # Backend detection: native extension > pure Ruby
39
+ # Set NATIVE_VECTOR=true to skip the C extension and force the pure Ruby implementation
40
+
19
41
  begin
20
- # to test the native vector class, try `rake test NATIVE_VECTOR=true`
21
42
  raise LoadError if ENV['NATIVE_VECTOR'] == 'true'
22
- raise LoadError unless Gem::Specification.find_all_by_name('gsl').any?
23
43
 
24
- require 'gsl'
25
- require 'classifier/extensions/vector_serialize'
26
- Classifier::LSI.gsl_available = true
44
+ require 'classifier/classifier_ext'
45
+ Classifier::LSI.backend = :native
27
46
  rescue LoadError
28
- unless ENV['SUPPRESS_GSL_WARNING'] == 'true'
29
- warn 'Notice: for 10x faster LSI, run `gem install gsl`. Set SUPPRESS_GSL_WARNING=true to hide this.'
47
+ # Fall back to pure Ruby implementation
48
+ unless ENV['SUPPRESS_LSI_WARNING'] == 'true'
49
+ warn 'Notice: for 5-10x faster LSI, install the classifier gem with native extensions. ' \
50
+ 'Set SUPPRESS_LSI_WARNING=true to hide this.'
30
51
  end
31
- Classifier::LSI.gsl_available = false
52
+ Classifier::LSI.backend = :ruby
32
53
  require 'classifier/extensions/vector'
33
54
  end
34
55
 
56
+ require 'json'
57
+ require 'mutex_m'
35
58
  require 'classifier/lsi/word_list'
36
59
  require 'classifier/lsi/content_node'
37
60
  require 'classifier/lsi/summary'
@@ -41,14 +64,19 @@ module Classifier
41
64
  # data based on underlying semantic relations. For more information on the algorithms used,
42
65
  # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
43
66
  class LSI
67
+ include Mutex_m
68
+
44
69
  # @rbs @auto_rebuild: bool
45
70
  # @rbs @word_list: WordList
46
71
  # @rbs @items: Hash[untyped, ContentNode]
47
72
  # @rbs @version: Integer
48
73
  # @rbs @built_at_version: Integer
74
+ # @rbs @singular_values: Array[Float]?
75
+ # @rbs @dirty: bool
76
+ # @rbs @storage: Storage::Base?
49
77
 
50
- attr_reader :word_list
51
- attr_accessor :auto_rebuild
78
+ attr_reader :word_list, :singular_values
79
+ attr_accessor :auto_rebuild, :storage
52
80
 
53
81
  # Create a fresh index.
54
82
  # If you want to call #build_index manually, use
@@ -56,11 +84,14 @@ module Classifier
56
84
  #
57
85
  # @rbs (?Hash[Symbol, untyped]) -> void
58
86
  def initialize(options = {})
87
+ super()
59
88
  @auto_rebuild = true unless options[:auto_rebuild] == false
60
89
  @word_list = WordList.new
61
90
  @items = {}
62
91
  @version = 0
63
92
  @built_at_version = -1
93
+ @dirty = false
94
+ @storage = nil
64
95
  end
65
96
 
66
97
  # Returns true if the index needs to be rebuilt. The index needs
@@ -69,7 +100,26 @@ module Classifier
69
100
  #
70
101
  # @rbs () -> bool
71
102
  def needs_rebuild?
72
- (@items.keys.size > 1) && (@version != @built_at_version)
103
+ synchronize { (@items.keys.size > 1) && (@version != @built_at_version) }
104
+ end
105
+
106
# Summarises the stored singular values as one hash per dimension.
#
# Each entry carries the dimension index, the raw singular value, its share
# of the total, and the running (cumulative) share up to and including that
# dimension. Returns nil when no singular values have been stored yet or
# when they sum to zero.
#
# @rbs () -> Array[Hash[Symbol, untyped]]?
def singular_value_spectrum
  values = @singular_values
  return nil unless values

  total = values.sum
  return nil if total.zero?

  running = 0.0
  values.each_with_index.map do |value, dimension|
    running += value
    {
      dimension: dimension,
      value: value,
      percentage: value / total,
      cumulative_percentage: running / total
    }
  end
end
74
124
 
75
125
  # Adds an item to the index. item is assumed to be a string, but
@@ -88,8 +138,11 @@ module Classifier
88
138
  # @rbs (String, *String | Symbol) ?{ (String) -> String } -> void
89
139
  def add_item(item, *categories, &block)
90
140
  clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
91
- @items[item] = ContentNode.new(clean_word_hash, *categories)
92
- @version += 1
141
+ synchronize do
142
+ @items[item] = ContentNode.new(clean_word_hash, *categories)
143
+ @version += 1
144
+ @dirty = true
145
+ end
93
146
  build_index if @auto_rebuild
94
147
  end
95
148
 
@@ -107,25 +160,32 @@ module Classifier
107
160
  #
108
161
  # @rbs (String) -> Array[String | Symbol]
109
162
  def categories_for(item)
110
- return [] unless @items[item]
163
+ synchronize do
164
+ return [] unless @items[item]
111
165
 
112
- @items[item].categories
166
+ @items[item].categories
167
+ end
113
168
  end
114
169
 
115
170
  # Removes an item from the database, if it is indexed.
116
171
  #
117
172
  # @rbs (String) -> void
118
173
  def remove_item(item)
119
- return unless @items.key?(item)
174
+ removed = synchronize do
175
+ next false unless @items.key?(item)
120
176
 
121
- @items.delete(item)
122
- @version += 1
177
+ @items.delete(item)
178
+ @version += 1
179
+ @dirty = true
180
+ true
181
+ end
182
+ build_index if removed && @auto_rebuild
123
183
  end
124
184
 
125
185
  # Returns an array of items that are indexed.
126
186
  # @rbs () -> Array[untyped]
127
187
  def items
128
- @items.keys
188
+ synchronize { @items.keys }
129
189
  end
130
190
 
131
191
  # This function rebuilds the index if needs_rebuild? returns true.
@@ -145,38 +205,30 @@ module Classifier
145
205
  #
146
206
  # @rbs (?Float) -> void
147
207
  def build_index(cutoff = 0.75)
148
- return unless needs_rebuild?
149
-
150
- make_word_list
151
-
152
- doc_list = @items.values
153
- tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
154
-
155
- if self.class.gsl_available
156
- tdm = GSL::Matrix.alloc(*tda).trans
157
- ntdm = build_reduced_matrix(tdm, cutoff)
158
-
159
- ntdm.size[1].times do |col|
160
- vec = GSL::Vector.alloc(ntdm.column(col)).row
161
- doc_list[col].lsi_vector = vec
162
- doc_list[col].lsi_norm = vec.normalize
208
+ validate_cutoff!(cutoff)
209
+
210
+ synchronize do
211
+ return unless needs_rebuild_unlocked?
212
+
213
+ make_word_list
214
+
215
+ doc_list = @items.values
216
+ tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
217
+
218
+ if self.class.native_available?
219
+ # Convert vectors to arrays for matrix construction
220
+ tda_arrays = tda.map { |v| v.respond_to?(:to_a) ? v.to_a : v }
221
+ tdm = self.class.matrix_class.alloc(*tda_arrays).trans
222
+ ntdm = build_reduced_matrix(tdm, cutoff)
223
+ assign_native_ext_lsi_vectors(ntdm, doc_list)
224
+ else
225
+ tdm = Matrix.rows(tda).trans
226
+ ntdm = build_reduced_matrix(tdm, cutoff)
227
+ assign_ruby_lsi_vectors(ntdm, doc_list)
163
228
  end
164
- else
165
- tdm = Matrix.rows(tda).trans
166
- ntdm = build_reduced_matrix(tdm, cutoff)
167
-
168
- ntdm.column_size.times do |col|
169
- next unless doc_list[col]
170
-
171
- column = ntdm.column(col)
172
- next unless column
173
229
 
174
- doc_list[col].lsi_vector = column
175
- doc_list[col].lsi_norm = column.normalize
176
- end
230
+ @built_at_version = @version
177
231
  end
178
-
179
- @built_at_version = @version
180
232
  end
181
233
 
182
234
  # This method returns max_chunks entries, ordered by their average semantic rating.
@@ -190,12 +242,14 @@ module Classifier
190
242
  #
191
243
  # @rbs (?Integer) -> Array[String]
192
244
  def highest_relative_content(max_chunks = 10)
193
- return [] if needs_rebuild?
245
+ synchronize do
246
+ return [] if needs_rebuild_unlocked?
194
247
 
195
- avg_density = {}
196
- @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).sum { |pair| pair[1] } }
248
+ avg_density = {}
249
+ @items.each_key { |x| avg_density[x] = proximity_array_for_content_unlocked(x).sum { |pair| pair[1] } }
197
250
 
198
- avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..(max_chunks - 1)].map
251
+ avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..(max_chunks - 1)].map
252
+ end
199
253
  end
200
254
 
201
255
  # This function is the primitive that find_related and classify
@@ -212,20 +266,8 @@ module Classifier
212
266
  # text data. See add_item for examples of how this works.
213
267
  #
214
268
  # @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
215
- def proximity_array_for_content(doc, &)
216
- return [] if needs_rebuild?
217
-
218
- content_node = node_for_content(doc, &)
219
- result =
220
- @items.keys.collect do |item|
221
- val = if self.class.gsl_available
222
- content_node.search_vector * @items[item].search_vector.col
223
- else
224
- (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
225
- end
226
- [item, val]
227
- end
228
- result.sort_by { |x| x[1] }.reverse
269
+ def proximity_array_for_content(doc, &block)
270
+ synchronize { proximity_array_for_content_unlocked(doc, &block) }
229
271
  end
230
272
 
231
273
  # Similar to proximity_array_for_content, this function takes similar
@@ -235,20 +277,8 @@ module Classifier
235
277
  # the text you're working with. search uses this primitive.
236
278
  #
237
279
  # @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
238
- def proximity_norms_for_content(doc, &)
239
- return [] if needs_rebuild?
240
-
241
- content_node = node_for_content(doc, &)
242
- result =
243
- @items.keys.collect do |item|
244
- val = if self.class.gsl_available
245
- content_node.search_norm * @items[item].search_norm.col
246
- else
247
- (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
248
- end
249
- [item, val]
250
- end
251
- result.sort_by { |x| x[1] }.reverse
280
+ def proximity_norms_for_content(doc, &block)
281
+ synchronize { proximity_norms_for_content_unlocked(doc, &block) }
252
282
  end
253
283
 
254
284
  # This function allows for text-based search of your index. Unlike other functions
@@ -261,11 +291,13 @@ module Classifier
261
291
  #
262
292
  # @rbs (String, ?Integer) -> Array[String]
263
293
  def search(string, max_nearest = 3)
264
- return [] if needs_rebuild?
294
+ synchronize do
295
+ return [] if needs_rebuild_unlocked?
265
296
 
266
- carry = proximity_norms_for_content(string)
267
- result = carry.collect { |x| x[0] }
268
- result[0..(max_nearest - 1)]
297
+ carry = proximity_norms_for_content_unlocked(string)
298
+ result = carry.collect { |x| x[0] }
299
+ result[0..(max_nearest - 1)]
300
+ end
269
301
  end
270
302
 
271
303
  # This function takes content and finds other documents
@@ -280,10 +312,12 @@ module Classifier
280
312
  #
281
313
  # @rbs (String, ?Integer) ?{ (String) -> String } -> Array[String]
282
314
  def find_related(doc, max_nearest = 3, &block)
283
- carry =
284
- proximity_array_for_content(doc, &block).reject { |pair| pair[0] == doc }
285
- result = carry.collect { |x| x[0] }
286
- result[0..(max_nearest - 1)]
315
+ synchronize do
316
+ carry =
317
+ proximity_array_for_content_unlocked(doc, &block).reject { |pair| pair[0] == doc }
318
+ result = carry.collect { |x| x[0] }
319
+ result[0..(max_nearest - 1)]
320
+ end
287
321
  end
288
322
 
289
323
  # This function uses a voting system to categorize documents, based on
@@ -291,32 +325,23 @@ module Classifier
291
325
  # find_related function to find related documents, then returns the
292
326
  # most obvious category from this list.
293
327
  #
294
- # cutoff signifies the number of documents to consider when clasifying
295
- # text. A cutoff of 1 means that every document in the index votes on
296
- # what category the document is in. This may not always make sense.
297
- #
298
328
  # @rbs (String, ?Float) ?{ (String) -> String } -> String | Symbol
299
- def classify(doc, cutoff = 0.30, &)
300
- votes = vote(doc, cutoff, &)
329
+ def classify(doc, cutoff = 0.30, &block)
330
+ validate_cutoff!(cutoff)
331
+
332
+ synchronize do
333
+ votes = vote_unlocked(doc, cutoff, &block)
301
334
 
302
- ranking = votes.keys.sort_by { |x| votes[x] }
303
- ranking[-1]
335
+ ranking = votes.keys.sort_by { |x| votes[x] }
336
+ ranking[-1]
337
+ end
304
338
  end
305
339
 
306
340
  # @rbs (String, ?Float) ?{ (String) -> String } -> Hash[String | Symbol, Float]
307
- def vote(doc, cutoff = 0.30, &)
308
- icutoff = (@items.size * cutoff).round
309
- carry = proximity_array_for_content(doc, &)
310
- carry = carry[0..(icutoff - 1)]
311
- votes = {}
312
- carry.each do |pair|
313
- categories = @items[pair[0]].categories
314
- categories.each do |category|
315
- votes[category] ||= 0.0
316
- votes[category] += pair[1]
317
- end
318
- end
319
- votes
341
+ def vote(doc, cutoff = 0.30, &block)
342
+ validate_cutoff!(cutoff)
343
+
344
+ synchronize { vote_unlocked(doc, cutoff, &block) }
320
345
  end
321
346
 
322
347
  # Returns the same category as classify() but also returns
@@ -331,15 +356,19 @@ module Classifier
331
356
  #
332
357
  # See classify() for argument docs
333
358
  # @rbs (String, ?Float) ?{ (String) -> String } -> [String | Symbol | nil, Float?]
334
- def classify_with_confidence(doc, cutoff = 0.30, &)
335
- votes = vote(doc, cutoff, &)
336
- votes_sum = votes.values.sum
337
- return [nil, nil] if votes_sum.zero?
338
-
339
- ranking = votes.keys.sort_by { |x| votes[x] }
340
- winner = ranking[-1]
341
- vote_share = votes[winner] / votes_sum.to_f
342
- [winner, vote_share]
359
+ def classify_with_confidence(doc, cutoff = 0.30, &block)
360
+ validate_cutoff!(cutoff)
361
+
362
+ synchronize do
363
+ votes = vote_unlocked(doc, cutoff, &block)
364
+ votes_sum = votes.values.sum
365
+ return [nil, nil] if votes_sum.zero?
366
+
367
+ ranking = votes.keys.sort_by { |x| votes[x] }
368
+ winner = ranking[-1]
369
+ vote_share = votes[winner] / votes_sum.to_f
370
+ [winner, vote_share]
371
+ end
343
372
  end
344
373
 
345
374
  # Prototype, only works on indexed documents.
@@ -347,45 +376,314 @@ module Classifier
347
376
  # it's supposed to.
348
377
  # @rbs (String, ?Integer) -> Array[Symbol]
349
378
  def highest_ranked_stems(doc, count = 3)
350
- raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
379
+ synchronize do
380
+ raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
381
+
382
+ arr = node_for_content_unlocked(doc).lsi_vector.to_a
383
+ top_n = arr.sort.reverse[0..(count - 1)]
384
+ top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
385
+ end
386
+ end
387
+
388
# Custom marshal serialization that leaves the mutex state out of the dump.
#
# The singular values are appended as the last element so that an index
# restored with Marshal.load still exposes them; they were previously
# dropped even though the restored index counts as already built.
#
# @rbs () -> Array[untyped]
def marshal_dump
  [@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty, @singular_values]
end

# Custom marshal deserialization: re-initialises the mutex, then restores
# the dumped state.
#
# Dumps written before the singular values were included have six elements;
# the destructuring below then leaves @singular_values as nil. Storage is
# never part of a dump, so it is reset to nil.
#
# @rbs (Array[untyped]) -> void
def marshal_load(data)
  mu_initialize
  @auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty, @singular_values = data
  @storage = nil
end
401
+
402
+ # Returns a hash representation of the LSI index.
403
+ # Only source data (word_hash, categories) is included, not computed vectors.
404
+ # This can be converted to JSON or used directly.
405
+ #
406
+ # @rbs () -> untyped
407
+ def as_json(*)
408
+ items_data = @items.transform_values do |node|
409
+ {
410
+ word_hash: node.word_hash.transform_keys(&:to_s),
411
+ categories: node.categories.map(&:to_s)
412
+ }
413
+ end
414
+
415
+ {
416
+ version: 1,
417
+ type: 'lsi',
418
+ auto_rebuild: @auto_rebuild,
419
+ items: items_data
420
+ }
421
+ end
422
+
423
# Serializes the LSI index to a JSON string.
# Only source data (word_hash, categories) is serialized, not computed vectors.
# On load, the index will be rebuilt automatically.
#
# Any arguments (typically the generator state that JSON.generate or
# JSON.pretty_generate passes in) are forwarded to the underlying Hash, so
# formatting options are honoured instead of being silently dropped.
#
# @rbs (*untyped) -> String
def to_json(*args)
  as_json.to_json(*args)
end
431
+
432
# Loads an LSI index from a JSON string or Hash created by #to_json or #as_json.
# The index will be rebuilt after loading.
#
# Raises ArgumentError when the payload's type is not 'lsi'.
#
# @rbs (String | Hash[untyped, untyped]) -> LSI
def self.from_json(json)
  # #as_json produces symbol keys while this loader reads string keys.
  # Round-tripping a Hash through JSON normalises both shapes, so the
  # documented Hash input really works.
  data = JSON.parse(json.is_a?(String) ? json : JSON.generate(json))
  raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'lsi'

  # Create instance with auto_rebuild disabled during loading
  instance = new(auto_rebuild: false)

  # Restore items (categories stay as strings, matching original storage)
  items = instance.instance_variable_get(:@items)
  data['items'].each do |item_key, item_data|
    word_hash = item_data['word_hash'].transform_keys(&:to_sym)
    items[item_key] = ContentNode.new(word_hash, *item_data['categories'])
  end

  # One version bump per restored item, as if each had been added.
  loaded_version = instance.instance_variable_get(:@version) + data['items'].size
  instance.instance_variable_set(:@version, loaded_version)

  # Restore auto_rebuild setting and rebuild index
  instance.auto_rebuild = data['auto_rebuild']
  instance.build_index
  instance
end
456
+
457
+ # Saves the LSI index to the configured storage.
458
+ # Raises ArgumentError if no storage is configured.
459
+ #
460
+ # @rbs () -> void
461
+ def save
462
+ raise ArgumentError, 'No storage configured. Use save_to_file(path) or set storage=' unless storage
463
+
464
+ storage.write(to_json)
465
+ @dirty = false
466
+ end
467
+
468
+ # Saves the LSI index to a file (legacy API).
469
+ #
470
+ # @rbs (String) -> Integer
471
+ def save_to_file(path)
472
+ result = File.write(path, to_json)
473
+ @dirty = false
474
+ result
475
+ end
476
+
477
# Reloads the LSI index from the configured storage.
# Raises ArgumentError if no storage is configured and UnsavedChangesError
# if there are unsaved changes.
# Use reload! to force reload and discard changes.
#
# @rbs () -> self
def reload
  raise ArgumentError, 'No storage configured' unless storage
  raise UnsavedChangesError, 'Unsaved changes would be lost. Call save first or use reload!' if @dirty

  # Nothing to lose at this point, so the forced variant does the actual
  # read/restore work instead of duplicating it here.
  reload!
end
493
+
494
+ # Force reloads the LSI index from storage, discarding any unsaved changes.
495
+ #
496
+ # @rbs () -> self
497
+ def reload!
498
+ raise ArgumentError, 'No storage configured' unless storage
499
+
500
+ data = storage.read
501
+ raise StorageError, 'No saved state found' unless data
502
+
503
+ restore_from_json(data)
504
+ @dirty = false
505
+ self
506
+ end
507
+
508
+ # Returns true if there are unsaved changes.
509
+ #
510
+ # @rbs () -> bool
511
+ def dirty?
512
+ @dirty
513
+ end
514
+
515
+ # Loads an LSI index from the configured storage.
516
+ # The storage is set on the returned instance.
517
+ #
518
+ # @rbs (storage: Storage::Base) -> LSI
519
+ def self.load(storage:)
520
+ data = storage.read
521
+ raise StorageError, 'No saved state found' unless data
522
+
523
+ instance = from_json(data)
524
+ instance.storage = storage
525
+ instance
526
+ end
527
+
528
+ # Loads an LSI index from a file (legacy API).
529
+ #
530
+ # @rbs (String) -> LSI
531
+ def self.load_from_file(path)
532
+ from_json(File.read(path))
355
533
  end
356
534
 
357
535
  private
358
536
 
537
+ # Restores LSI state from a JSON string (used by reload)
538
+ # @rbs (String) -> void
539
+ def restore_from_json(json)
540
+ data = JSON.parse(json)
541
+ raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'lsi'
542
+
543
+ synchronize do
544
+ # Recreate the items
545
+ @items = {}
546
+ data['items'].each do |item_key, item_data|
547
+ word_hash = item_data['word_hash'].transform_keys(&:to_sym)
548
+ categories = item_data['categories']
549
+ @items[item_key] = ContentNode.new(word_hash, *categories)
550
+ end
551
+
552
+ # Restore settings
553
+ @auto_rebuild = data['auto_rebuild']
554
+ @version += 1
555
+ @built_at_version = -1
556
+ @word_list = WordList.new
557
+ @dirty = false
558
+ end
559
+
560
+ # Rebuild the index
561
+ build_index
562
+ end
563
+
564
# Ensures cutoff is a real number strictly between 0 and 1.
#
# Non-numeric input (nil, strings, ...) is rejected with the same
# ArgumentError as an out-of-range number, rather than surfacing as a
# NoMethodError from the range check itself.
#
# @rbs (Float) -> void
def validate_cutoff!(cutoff)
  return if cutoff.is_a?(Numeric) && cutoff.real? && cutoff.positive? && cutoff < 1

  raise ArgumentError, "cutoff must be between 0 and 1 (exclusive), got #{cutoff.inspect}"
end
570
+
571
+ # Assigns LSI vectors using native C extension
572
+ # @rbs (untyped, Array[ContentNode]) -> void
573
+ def assign_native_ext_lsi_vectors(ntdm, doc_list)
574
+ ntdm.size[1].times do |col|
575
+ vec = self.class.vector_class.alloc(ntdm.column(col).to_a).row
576
+ doc_list[col].lsi_vector = vec
577
+ doc_list[col].lsi_norm = vec.normalize
578
+ end
579
+ end
580
+
581
# Copies each column of the reduced matrix onto the document node at the
# same position, storing both the column itself and its normalized form.
# Positions with no node, or with no column, are skipped.
#
# @rbs (untyped, Array[ContentNode]) -> void
def assign_ruby_lsi_vectors(ntdm, doc_list)
  ntdm.column_size.times do |position|
    node = doc_list[position]
    column = node && ntdm.column(position)
    next unless column

    node.lsi_vector = column
    node.lsi_norm = column.normalize
  end
end
594
+
595
+ # Unlocked version of needs_rebuild? for internal use when lock is already held
596
+ # @rbs () -> bool
597
+ def needs_rebuild_unlocked?
598
+ (@items.keys.size > 1) && (@version != @built_at_version)
599
+ end
600
+
601
+ # Unlocked version of proximity_array_for_content for internal use
602
+ # @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
603
+ def proximity_array_for_content_unlocked(doc, &)
604
+ return [] if needs_rebuild_unlocked?
605
+
606
+ content_node = node_for_content_unlocked(doc, &)
607
+ result =
608
+ @items.keys.collect do |item|
609
+ val = if self.class.native_available?
610
+ content_node.search_vector * @items[item].search_vector.col
611
+ else
612
+ (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
613
+ end
614
+ [item, val]
615
+ end
616
+ result.sort_by { |x| x[1] }.reverse
617
+ end
618
+
619
+ # Unlocked version of proximity_norms_for_content for internal use
620
+ # @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
621
+ def proximity_norms_for_content_unlocked(doc, &)
622
+ return [] if needs_rebuild_unlocked?
623
+
624
+ content_node = node_for_content_unlocked(doc, &)
625
+ result =
626
+ @items.keys.collect do |item|
627
+ val = if self.class.native_available?
628
+ content_node.search_norm * @items[item].search_norm.col
629
+ else
630
+ (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
631
+ end
632
+ [item, val]
633
+ end
634
+ result.sort_by { |x| x[1] }.reverse
635
+ end
636
+
637
# Unlocked version of vote for internal use (the caller already holds the lock).
#
# Takes the nearest `cutoff` fraction of the indexed items and adds each
# one's proximity score to every category it belongs to.
#
# @rbs (String, ?Float) ?{ (String) -> String } -> Hash[String | Symbol, Float]
def vote_unlocked(doc, cutoff = 0.30, &block)
  # Always let at least one neighbour vote: when the fraction rounded down
  # to zero, the old slice `0..-1` wrapped around and let every document
  # vote, the opposite of a small cutoff's intent.
  voter_count = [(@items.size * cutoff).round, 1].max
  nearest = proximity_array_for_content_unlocked(doc, &block).first(voter_count)

  nearest.each_with_object({}) do |(item, score), votes|
    @items[item].categories.each do |category|
      votes[category] = votes.fetch(category, 0.0) + score
    end
  end
end
653
+
654
+ # Unlocked version of node_for_content for internal use
655
+ # @rbs (String) ?{ (String) -> String } -> ContentNode
656
+ def node_for_content_unlocked(item, &block)
657
+ return @items[item] if @items[item]
658
+
659
+ clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
660
+ cn = ContentNode.new(clean_word_hash, &block)
661
+ cn.raw_vector_with(@word_list) unless needs_rebuild_unlocked?
662
+ cn
663
+ end
664
+
359
665
  # @rbs (untyped, ?Float) -> untyped
360
666
  def build_reduced_matrix(matrix, cutoff = 0.75)
361
667
  # TODO: Check that M>=N on these dimensions! Transpose helps assure this
362
668
  u, v, s = matrix.SV_decomp
363
669
 
364
- # TODO: Better than 75% term, please. :\
365
- s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
670
+ @singular_values = s.sort.reverse
671
+
672
+ s_cutoff_index = [(s.size * cutoff).round - 1, 0].max
673
+ s_cutoff = @singular_values[s_cutoff_index]
366
674
  s.size.times do |ord|
367
675
  s[ord] = 0.0 if s[ord] < s_cutoff
368
676
  end
369
677
  # Reconstruct the term document matrix, only with reduced rank
370
- result = u * (self.class.gsl_available ? GSL::Matrix : ::Matrix).diag(s) * v.trans
678
+ result = u * self.class.matrix_class.diag(s) * v.trans
371
679
 
372
- # Native Ruby SVD returns transposed dimensions when row_size < column_size
680
+ # SVD may return transposed dimensions when row_size < column_size
373
681
  # Ensure result matches input dimensions
374
- result = result.trans if !self.class.gsl_available && result.row_size != matrix.row_size
682
+ result = result.trans if result.row_size != matrix.row_size
375
683
 
376
684
  result
377
685
  end
378
686
 
379
- # @rbs (String) ?{ (String) -> String } -> ContentNode
380
- def node_for_content(item, &block)
381
- return @items[item] if @items[item]
382
-
383
- clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
384
- cn = ContentNode.new(clean_word_hash, &block)
385
- cn.raw_vector_with(@word_list) unless needs_rebuild?
386
- cn
387
- end
388
-
389
687
  # @rbs () -> void
390
688
  def make_word_list
391
689
  @word_list = WordList.new