classifier 1.4.4 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,20 +1,60 @@
1
+ # rbs_inline: enabled
2
+
1
3
  # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
4
  # Copyright:: Copyright (c) 2005 David Fayram II
3
5
  # License:: LGPL
4
6
 
7
+ module Classifier
8
+ class LSI
9
+ # Backend options: :native, :ruby
10
+ # @rbs @backend: Symbol
11
+ @backend = :ruby
12
+
13
+ class << self
14
+ # @rbs @backend: Symbol
15
+ attr_accessor :backend
16
+
17
+ # Check if using native C extension
18
+ # @rbs () -> bool
19
+ def native_available?
20
+ backend == :native
21
+ end
22
+
23
+ # Get the Vector class for the current backend
24
+ # @rbs () -> Class
25
+ def vector_class
26
+ backend == :native ? Classifier::Linalg::Vector : ::Vector
27
+ end
28
+
29
+ # Get the Matrix class for the current backend
30
+ # @rbs () -> Class
31
+ def matrix_class
32
+ backend == :native ? Classifier::Linalg::Matrix : ::Matrix
33
+ end
34
+ end
35
+ end
36
+ end
37
+
38
+ # Backend detection: native extension > pure Ruby
39
+ # Set NATIVE_VECTOR=true to force pure Ruby implementation
40
+
5
41
  begin
6
- # to test the native vector class, try `rake test NATIVE_VECTOR=true`
7
42
  raise LoadError if ENV['NATIVE_VECTOR'] == 'true'
8
43
 
9
- require 'gsl' # requires https://github.com/SciRuby/rb-gsl/
10
- require 'classifier/extensions/vector_serialize'
11
- $GSL = true
44
+ require 'classifier/classifier_ext'
45
+ Classifier::LSI.backend = :native
12
46
  rescue LoadError
13
- warn 'Notice: for 10x faster LSI support, please install https://github.com/SciRuby/rb-gsl/'
14
- $GSL = false
47
+ # Fall back to pure Ruby implementation
48
+ unless ENV['SUPPRESS_LSI_WARNING'] == 'true'
49
+ warn 'Notice: for 5-10x faster LSI, install the classifier gem with native extensions. ' \
50
+ 'Set SUPPRESS_LSI_WARNING=true to hide this.'
51
+ end
52
+ Classifier::LSI.backend = :ruby
15
53
  require 'classifier/extensions/vector'
16
54
  end
17
55
 
56
+ require 'json'
57
+ require 'mutex_m'
18
58
  require 'classifier/lsi/word_list'
19
59
  require 'classifier/lsi/content_node'
20
60
  require 'classifier/lsi/summary'
@@ -24,26 +64,62 @@ module Classifier
24
64
  # data based on underlying semantic relations. For more information on the algorithms used,
25
65
  # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
26
66
  class LSI
27
- attr_reader :word_list
28
- attr_accessor :auto_rebuild
67
+ include Mutex_m
68
+
69
+ # @rbs @auto_rebuild: bool
70
+ # @rbs @word_list: WordList
71
+ # @rbs @items: Hash[untyped, ContentNode]
72
+ # @rbs @version: Integer
73
+ # @rbs @built_at_version: Integer
74
+ # @rbs @singular_values: Array[Float]?
75
+ # @rbs @dirty: bool
76
+ # @rbs @storage: Storage::Base?
77
+
78
+ attr_reader :word_list, :singular_values
79
+ attr_accessor :auto_rebuild, :storage
29
80
 
30
81
  # Create a fresh index.
31
82
  # If you want to call #build_index manually, use
32
- # Classifier::LSI.new :auto_rebuild => false
83
+ # Classifier::LSI.new auto_rebuild: false
33
84
  #
85
+ # @rbs (?Hash[Symbol, untyped]) -> void
34
86
  def initialize(options = {})
87
+ super()
35
88
  @auto_rebuild = true unless options[:auto_rebuild] == false
36
89
  @word_list = WordList.new
37
90
  @items = {}
38
91
  @version = 0
39
92
  @built_at_version = -1
93
+ @dirty = false
94
+ @storage = nil
40
95
  end
41
96
 
42
97
  # Returns true if the index needs to be rebuilt. The index needs
43
98
  # to be built after all information is added, but before you start
44
99
  # using it for search, classification and cluster detection.
100
+ #
101
+ # @rbs () -> bool
45
102
  def needs_rebuild?
46
- (@items.keys.size > 1) && (@version != @built_at_version)
103
+ synchronize { (@items.keys.size > 1) && (@version != @built_at_version) }
104
+ end
105
+
106
+ # @rbs () -> Array[Hash[Symbol, untyped]]?
107
+ def singular_value_spectrum
108
+ return nil unless @singular_values
109
+
110
+ total = @singular_values.sum
111
+ return nil if total.zero?
112
+
113
+ cumulative = 0.0
114
+ @singular_values.map.with_index do |value, i|
115
+ cumulative += value
116
+ {
117
+ dimension: i,
118
+ value: value,
119
+ percentage: value / total,
120
+ cumulative_percentage: cumulative / total
121
+ }
122
+ end
47
123
  end
48
124
 
49
125
  # Adds an item to the index. item is assumed to be a string, but
@@ -59,10 +135,14 @@ module Classifier
59
135
  # ar = ActiveRecordObject.find( :all )
60
136
  # lsi.add_item ar, *ar.categories { |x| ar.content }
61
137
  #
138
+ # @rbs (String, *String | Symbol) ?{ (String) -> String } -> void
62
139
  def add_item(item, *categories, &block)
63
140
  clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
64
- @items[item] = ContentNode.new(clean_word_hash, *categories)
65
- @version += 1
141
+ synchronize do
142
+ @items[item] = ContentNode.new(clean_word_hash, *categories)
143
+ @version += 1
144
+ @dirty = true
145
+ end
66
146
  build_index if @auto_rebuild
67
147
  end
68
148
 
@@ -70,30 +150,42 @@ module Classifier
70
150
  # you are passing in a string with no categories. item
71
151
  # will be duck typed via to_s .
72
152
  #
153
+ # @rbs (String) -> void
73
154
  def <<(item)
74
155
  add_item(item)
75
156
  end
76
157
 
77
158
  # Returns the categories for a given indexed item. You are free to add and remove
78
159
  # items from this as you see fit. It does not invalidate an index to change its categories.
160
+ #
161
+ # @rbs (String) -> Array[String | Symbol]
79
162
  def categories_for(item)
80
- return [] unless @items[item]
163
+ synchronize do
164
+ return [] unless @items[item]
81
165
 
82
- @items[item].categories
166
+ @items[item].categories
167
+ end
83
168
  end
84
169
 
85
170
  # Removes an item from the database, if it is indexed.
86
171
  #
172
+ # @rbs (String) -> void
87
173
  def remove_item(item)
88
- return unless @items.key?(item)
174
+ removed = synchronize do
175
+ next false unless @items.key?(item)
89
176
 
90
- @items.delete(item)
91
- @version += 1
177
+ @items.delete(item)
178
+ @version += 1
179
+ @dirty = true
180
+ true
181
+ end
182
+ build_index if removed && @auto_rebuild
92
183
  end
93
184
 
94
185
  # Returns an array of items that are indexed.
186
+ # @rbs () -> Array[untyped]
95
187
  def items
96
- @items.keys
188
+ synchronize { @items.keys }
97
189
  end
98
190
 
99
191
  # This function rebuilds the index if needs_rebuild? returns true.
@@ -110,34 +202,33 @@ module Classifier
110
202
  # cutoff parameter tells the indexer how many of these values to keep.
111
203
  # A cutoff close to 1 keeps nearly every value, so little semantic analysis takes place
112
204
  # and the LSI class approaches a simple vector search engine; cutoff must lie strictly between 0 and 1.
205
+ #
206
+ # @rbs (?Float) -> void
113
207
  def build_index(cutoff = 0.75)
114
- return unless needs_rebuild?
115
-
116
- make_word_list
117
-
118
- doc_list = @items.values
119
- tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
120
-
121
- if $GSL
122
- tdm = GSL::Matrix.alloc(*tda).trans
123
- ntdm = build_reduced_matrix(tdm, cutoff)
124
-
125
- ntdm.size[1].times do |col|
126
- vec = GSL::Vector.alloc(ntdm.column(col)).row
127
- doc_list[col].lsi_vector = vec
128
- doc_list[col].lsi_norm = vec.normalize
208
+ validate_cutoff!(cutoff)
209
+
210
+ synchronize do
211
+ return unless needs_rebuild_unlocked?
212
+
213
+ make_word_list
214
+
215
+ doc_list = @items.values
216
+ tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
217
+
218
+ if self.class.native_available?
219
+ # Convert vectors to arrays for matrix construction
220
+ tda_arrays = tda.map { |v| v.respond_to?(:to_a) ? v.to_a : v }
221
+ tdm = self.class.matrix_class.alloc(*tda_arrays).trans
222
+ ntdm = build_reduced_matrix(tdm, cutoff)
223
+ assign_native_ext_lsi_vectors(ntdm, doc_list)
224
+ else
225
+ tdm = Matrix.rows(tda).trans
226
+ ntdm = build_reduced_matrix(tdm, cutoff)
227
+ assign_ruby_lsi_vectors(ntdm, doc_list)
129
228
  end
130
- else
131
- tdm = Matrix.rows(tda).trans
132
- ntdm = build_reduced_matrix(tdm, cutoff)
133
229
 
134
- ntdm.row_size.times do |col|
135
- doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
136
- doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
137
- end
230
+ @built_at_version = @version
138
231
  end
139
-
140
- @built_at_version = @version
141
232
  end
142
233
 
143
234
  # This method returns max_chunks entries, ordered by their average semantic rating.
@@ -148,13 +239,17 @@ module Classifier
148
239
  # your dataset's general content. For example, if you were to use categorize on the
149
240
  # results of this data, you could gather information on what your dataset is generally
150
241
  # about.
242
+ #
243
# Ranks every indexed item by the sum of its proximity scores against the
# index and returns the keys of the top max_chunks items, best first.
# Returns [] while the index needs a rebuild or when max_chunks is not
# positive (a zero limit previously wrapped around and returned everything).
#
# @rbs (?Integer) -> Array[String]
def highest_relative_content(max_chunks = 10)
  synchronize do
    return [] if needs_rebuild_unlocked? || !max_chunks.positive?

    # The lock is already held, so only the *_unlocked helpers may be used here.
    avg_density = @items.to_h do |item, _node|
      [item, proximity_array_for_content_unlocked(item).sum { |pair| pair[1] }]
    end

    # Return a real Array; the former trailing block-less `.map` produced an Enumerator.
    avg_density.keys.sort_by { |item| avg_density[item] }.reverse.first(max_chunks)
  end
end
159
254
 
160
255
  # This function is the primitive that find_related and classify
@@ -169,20 +264,10 @@ module Classifier
169
264
  # The parameter doc is the content to compare. If that content is not
170
265
  # indexed, you can pass an optional block to define how to create the
171
266
  # text data. See add_item for examples of how this works.
267
+ #
268
+ # @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
172
269
  def proximity_array_for_content(doc, &block)
173
- return [] if needs_rebuild?
174
-
175
- content_node = node_for_content(doc, &block)
176
- result =
177
- @items.keys.collect do |item|
178
- val = if $GSL
179
- content_node.search_vector * @items[item].search_vector.col
180
- else
181
- (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
182
- end
183
- [item, val]
184
- end
185
- result.sort_by { |x| x[1] }.reverse
270
+ synchronize { proximity_array_for_content_unlocked(doc, &block) }
186
271
  end
187
272
 
188
273
  # Similar to proximity_array_for_content, this function takes similar
@@ -190,20 +275,10 @@ module Classifier
190
275
  # calculated vectors instead of their full versions. This is useful when
191
276
  # you're trying to perform operations on content that is much smaller than
192
277
  # the text you're working with. search uses this primitive.
278
+ #
279
+ # @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
193
280
  def proximity_norms_for_content(doc, &block)
194
- return [] if needs_rebuild?
195
-
196
- content_node = node_for_content(doc, &block)
197
- result =
198
- @items.keys.collect do |item|
199
- val = if $GSL
200
- content_node.search_norm * @items[item].search_norm.col
201
- else
202
- (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
203
- end
204
- [item, val]
205
- end
206
- result.sort_by { |x| x[1] }.reverse
281
+ synchronize { proximity_norms_for_content_unlocked(doc, &block) }
207
282
  end
208
283
 
209
284
  # This function allows for text-based search of your index. Unlike other functions
@@ -213,12 +288,16 @@ module Classifier
213
288
  #
214
289
  # While this may seem backwards compared to the other functions that LSI supports,
215
290
  # it is actually the same algorithm, just applied on a smaller document.
291
+ #
292
# Returns the keys of at most max_nearest indexed items, ordered by the
# normalised-vector proximity of each item to the search string.
# Returns [] while the index needs a rebuild or when max_nearest is not
# positive (a zero limit previously wrapped around and returned everything).
#
# @rbs (String, ?Integer) -> Array[String]
def search(string, max_nearest = 3)
  synchronize do
    return [] if needs_rebuild_unlocked? || !max_nearest.positive?

    # Each entry is an [item, score] pair already sorted best-first.
    proximity_norms_for_content_unlocked(string).first(max_nearest).map(&:first)
  end
end
223
302
 
224
303
  # This function takes content and finds other documents
@@ -230,11 +309,15 @@ module Classifier
230
309
  # This is particularly useful for identifying clusters in your document space.
231
310
  # For example you may want to identify several "What's Related" items for weblog
232
311
  # articles, or find paragraphs that relate to each other in an essay.
312
+ #
313
# Returns the keys of at most max_nearest indexed items closest to doc,
# best first, excluding doc itself. The optional block is forwarded to the
# proximity helper to derive text from a non-indexed doc.
# Returns [] when max_nearest is not positive (a zero limit previously
# wrapped around and returned every related item).
#
# @rbs (String, ?Integer) ?{ (String) -> String } -> Array[String]
def find_related(doc, max_nearest = 3, &block)
  synchronize do
    return [] unless max_nearest.positive?

    proximity_array_for_content_unlocked(doc, &block)
      .reject { |pair| pair[0] == doc }
      .first(max_nearest)
      .map(&:first)
  end
end
239
322
 
240
323
  # This function uses a voting system to categorize documents, based on
@@ -242,30 +325,23 @@ module Classifier
242
325
  # find_related function to find related documents, then returns the
243
326
  # most obvious category from this list.
244
327
  #
245
- # cutoff signifies the number of documents to consider when clasifying
246
- # text. A cutoff of 1 means that every document in the index votes on
247
- # what category the document is in. This may not always make sense.
248
- #
328
+ # @rbs (String, ?Float) ?{ (String) -> String } -> String | Symbol
249
329
  def classify(doc, cutoff = 0.30, &block)
250
- votes = vote(doc, cutoff, &block)
330
+ validate_cutoff!(cutoff)
331
+
332
+ synchronize do
333
+ votes = vote_unlocked(doc, cutoff, &block)
251
334
 
252
- ranking = votes.keys.sort_by { |x| votes[x] }
253
- ranking[-1]
335
+ ranking = votes.keys.sort_by { |x| votes[x] }
336
+ ranking[-1]
337
+ end
254
338
  end
255
339
 
340
+ # @rbs (String, ?Float) ?{ (String) -> String } -> Hash[String | Symbol, Float]
256
341
  def vote(doc, cutoff = 0.30, &block)
257
- icutoff = (@items.size * cutoff).round
258
- carry = proximity_array_for_content(doc, &block)
259
- carry = carry[0..icutoff - 1]
260
- votes = {}
261
- carry.each do |pair|
262
- categories = @items[pair[0]].categories
263
- categories.each do |category|
264
- votes[category] ||= 0.0
265
- votes[category] += pair[1]
266
- end
267
- end
268
- votes
342
+ validate_cutoff!(cutoff)
343
+
344
+ synchronize { vote_unlocked(doc, cutoff, &block) }
269
345
  end
270
346
 
271
347
  # Returns the same category as classify() but also returns
@@ -278,59 +354,337 @@ module Classifier
278
354
  # category = nil
279
355
  # end
280
356
  #
281
- #
282
357
  # See classify() for argument docs
358
+ # @rbs (String, ?Float) ?{ (String) -> String } -> [String | Symbol | nil, Float?]
283
359
  def classify_with_confidence(doc, cutoff = 0.30, &block)
284
- votes = vote(doc, cutoff, &block)
285
- votes_sum = votes.values.inject(0.0) { |sum, v| sum + v }
286
- return [nil, nil] if votes_sum.zero?
360
+ validate_cutoff!(cutoff)
287
361
 
288
- ranking = votes.keys.sort_by { |x| votes[x] }
289
- winner = ranking[-1]
290
- vote_share = votes[winner] / votes_sum.to_f
291
- [winner, vote_share]
362
+ synchronize do
363
+ votes = vote_unlocked(doc, cutoff, &block)
364
+ votes_sum = votes.values.sum
365
+ return [nil, nil] if votes_sum.zero?
366
+
367
+ ranking = votes.keys.sort_by { |x| votes[x] }
368
+ winner = ranking[-1]
369
+ vote_share = votes[winner] / votes_sum.to_f
370
+ [winner, vote_share]
371
+ end
292
372
  end
293
373
 
294
374
  # Prototype, only works on indexed documents.
295
375
  # I have no clue if this is going to work, but in theory
296
376
  # it's supposed to.
377
# Returns the words at the count largest positions of doc's LSI vector,
# largest weight first. Positions with equal weight are reported separately
# in index order; looking each weight up with Array#index returned the first
# matching position for every tie, repeating one word.
# Raises RuntimeError when doc is not indexed; returns [] when count is not
# positive.
#
# @rbs (String, ?Integer) -> Array[Symbol]
def highest_ranked_stems(doc, count = 3)
  synchronize do
    raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
    return [] unless count.positive?

    weights = node_for_content_unlocked(doc).lsi_vector.to_a
    # Rank positions rather than values so ties keep distinct indices.
    ranked = weights.each_index.sort_by { |index| [-weights[index], index] }
    ranked.first(count).map { |index| @word_list.word_for_index(index) }
  end
end
387
+
388
+ # Custom marshal serialization to exclude mutex state
389
+ # @rbs () -> Array[untyped]
390
+ def marshal_dump
391
+ [@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty]
392
+ end
393
+
394
+ # Custom marshal deserialization to recreate mutex
395
+ # @rbs (Array[untyped]) -> void
396
+ def marshal_load(data)
397
+ mu_initialize
398
+ @auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty = data
399
+ @storage = nil
400
+ end
401
+
402
+ # Returns a hash representation of the LSI index.
403
+ # Only source data (word_hash, categories) is included, not computed vectors.
404
+ # This can be converted to JSON or used directly.
405
+ #
406
+ # @rbs () -> untyped
407
+ def as_json(*)
408
+ items_data = @items.transform_values do |node|
409
+ {
410
+ word_hash: node.word_hash.transform_keys(&:to_s),
411
+ categories: node.categories.map(&:to_s)
412
+ }
413
+ end
414
+
415
+ {
416
+ version: 1,
417
+ type: 'lsi',
418
+ auto_rebuild: @auto_rebuild,
419
+ items: items_data
420
+ }
421
+ end
422
+
423
+ # Serializes the LSI index to a JSON string.
424
+ # Only source data (word_hash, categories) is serialized, not computed vectors.
425
+ # On load, the index will be rebuilt automatically.
426
+ #
427
# Generator arguments (a JSON state object or an options hash) are forwarded
# to Hash#to_json so callers can control formatting; they were previously
# discarded.
#
# @rbs (*untyped) -> String
def to_json(*args)
  as_json.to_json(*args)
end
431
+
432
+ # Loads an LSI index from a JSON string or Hash created by #to_json or #as_json.
433
+ # The index will be rebuilt after loading.
434
+ #
435
# Accepts either the JSON text produced by #to_json or the Hash produced by
# #as_json. A Hash is round-tripped through JSON first so that the symbol
# keys #as_json emits are read the same way as parsed JSON text; without
# this, data['type'] was nil for such a Hash and ArgumentError was raised.
# Raises ArgumentError when the payload's type is not 'lsi'.
#
# @rbs (String | Hash[untyped, untyped]) -> LSI
def self.from_json(json)
  data = JSON.parse(json.is_a?(String) ? json : JSON.generate(json))
  raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'lsi'

  # Keep auto_rebuild off while items are inserted directly.
  instance = new(auto_rebuild: false)
  items = instance.instance_variable_get(:@items)

  # Categories stay as strings; word_hash keys are converted back to symbols.
  data['items'].each do |item_key, item_data|
    word_hash = item_data['word_hash'].transform_keys(&:to_sym)
    items[item_key] = ContentNode.new(word_hash, *item_data['categories'])
  end

  # One version bump per restored item, as if each had been added in turn.
  restored_version = instance.instance_variable_get(:@version) + data['items'].size
  instance.instance_variable_set(:@version, restored_version)

  # Restore the saved auto_rebuild setting, then build the index once.
  instance.auto_rebuild = data['auto_rebuild']
  instance.build_index
  instance
end
456
+
457
+ # Saves the LSI index to the configured storage.
458
+ # Raises ArgumentError if no storage is configured.
459
+ #
460
+ # @rbs () -> void
461
+ def save
462
+ raise ArgumentError, 'No storage configured. Use save_to_file(path) or set storage=' unless storage
463
+
464
+ storage.write(to_json)
465
+ @dirty = false
466
+ end
467
+
468
+ # Saves the LSI index to a file (legacy API).
469
+ #
470
+ # @rbs (String) -> Integer
471
+ def save_to_file(path)
472
+ result = File.write(path, to_json)
473
+ @dirty = false
474
+ result
475
+ end
476
+
477
+ # Reloads the LSI index from the configured storage.
478
+ # Raises UnsavedChangesError if there are unsaved changes.
479
+ # Use reload! to force reload and discard changes.
480
+ #
481
+ # @rbs () -> self
482
+ def reload
483
+ raise ArgumentError, 'No storage configured' unless storage
484
+ raise UnsavedChangesError, 'Unsaved changes would be lost. Call save first or use reload!' if @dirty
485
+
486
+ data = storage.read
487
+ raise StorageError, 'No saved state found' unless data
488
+
489
+ restore_from_json(data)
490
+ @dirty = false
491
+ self
492
+ end
493
+
494
+ # Force reloads the LSI index from storage, discarding any unsaved changes.
495
+ #
496
+ # @rbs () -> self
497
+ def reload!
498
+ raise ArgumentError, 'No storage configured' unless storage
499
+
500
+ data = storage.read
501
+ raise StorageError, 'No saved state found' unless data
502
+
503
+ restore_from_json(data)
504
+ @dirty = false
505
+ self
506
+ end
507
+
508
+ # Returns true if there are unsaved changes.
509
+ #
510
+ # @rbs () -> bool
511
+ def dirty?
512
+ @dirty
513
+ end
514
+
515
+ # Loads an LSI index from the configured storage.
516
+ # The storage is set on the returned instance.
517
+ #
518
+ # @rbs (storage: Storage::Base) -> LSI
519
+ def self.load(storage:)
520
+ data = storage.read
521
+ raise StorageError, 'No saved state found' unless data
522
+
523
+ instance = from_json(data)
524
+ instance.storage = storage
525
+ instance
526
+ end
527
+
528
+ # Loads an LSI index from a file (legacy API).
529
+ #
530
+ # @rbs (String) -> LSI
531
+ def self.load_from_file(path)
532
+ from_json(File.read(path))
303
533
  end
304
534
 
305
535
  private
306
536
 
307
- def build_reduced_matrix(matrix, cutoff = 0.75)
308
- # TODO: Check that M>=N on these dimensions! Transpose helps assure this
309
- u, v, s = matrix.SV_decomp
537
+ # Restores LSI state from a JSON string (used by reload)
538
+ # @rbs (String) -> void
539
+ def restore_from_json(json)
540
+ data = JSON.parse(json)
541
+ raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'lsi'
542
+
543
+ synchronize do
544
+ # Recreate the items
545
+ @items = {}
546
+ data['items'].each do |item_key, item_data|
547
+ word_hash = item_data['word_hash'].transform_keys(&:to_sym)
548
+ categories = item_data['categories']
549
+ @items[item_key] = ContentNode.new(word_hash, *categories)
550
+ end
310
551
 
311
- # TODO: Better than 75% term, please. :\
312
- s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1]
313
- s.size.times do |ord|
314
- s[ord] = 0.0 if s[ord] < s_cutoff
552
+ # Restore settings
553
+ @auto_rebuild = data['auto_rebuild']
554
+ @version += 1
555
+ @built_at_version = -1
556
+ @word_list = WordList.new
557
+ @dirty = false
315
558
  end
316
- # Reconstruct the term document matrix, only with reduced rank
317
- u * ($GSL ? GSL::Matrix : ::Matrix).diag(s) * v.trans
559
+
560
+ # Rebuild the index
561
+ build_index
562
+ end
563
+
564
# Ensures cutoff is a real number strictly between 0 and 1.
# Raises ArgumentError otherwise, including for nil, strings and other
# non-numeric values, which previously surfaced as NoMethodError from
# #positive?.
#
# @rbs (untyped) -> void
def validate_cutoff!(cutoff)
  # real? excludes Complex, which is Numeric but has no ordering.
  return if cutoff.is_a?(Numeric) && cutoff.real? && cutoff.positive? && cutoff < 1

  shown = cutoff.is_a?(Numeric) ? cutoff : cutoff.inspect
  raise ArgumentError, "cutoff must be between 0 and 1 (exclusive), got #{shown}"
end
570
+
571
+ # Assigns LSI vectors using native C extension
572
+ # @rbs (untyped, Array[ContentNode]) -> void
573
+ def assign_native_ext_lsi_vectors(ntdm, doc_list)
574
+ ntdm.size[1].times do |col|
575
+ vec = self.class.vector_class.alloc(ntdm.column(col).to_a).row
576
+ doc_list[col].lsi_vector = vec
577
+ doc_list[col].lsi_norm = vec.normalize
578
+ end
579
+ end
580
+
581
+ # Assigns LSI vectors using pure Ruby Matrix
582
+ # @rbs (untyped, Array[ContentNode]) -> void
583
+ def assign_ruby_lsi_vectors(ntdm, doc_list)
584
+ ntdm.column_size.times do |col|
585
+ next unless doc_list[col]
586
+
587
+ column = ntdm.column(col)
588
+ next unless column
589
+
590
+ doc_list[col].lsi_vector = column
591
+ doc_list[col].lsi_norm = column.normalize
592
+ end
593
+ end
594
+
595
+ # Unlocked version of needs_rebuild? for internal use when lock is already held
596
+ # @rbs () -> bool
597
+ def needs_rebuild_unlocked?
598
+ (@items.keys.size > 1) && (@version != @built_at_version)
318
599
  end
319
600
 
320
- def node_for_content(item, &block)
601
+ # Unlocked version of proximity_array_for_content for internal use
602
+ # @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
603
+ def proximity_array_for_content_unlocked(doc, &)
604
+ return [] if needs_rebuild_unlocked?
605
+
606
+ content_node = node_for_content_unlocked(doc, &)
607
+ result =
608
+ @items.keys.collect do |item|
609
+ val = if self.class.native_available?
610
+ content_node.search_vector * @items[item].search_vector.col
611
+ else
612
+ (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
613
+ end
614
+ [item, val]
615
+ end
616
+ result.sort_by { |x| x[1] }.reverse
617
+ end
618
+
619
+ # Unlocked version of proximity_norms_for_content for internal use
620
+ # @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
621
+ def proximity_norms_for_content_unlocked(doc, &)
622
+ return [] if needs_rebuild_unlocked?
623
+
624
+ content_node = node_for_content_unlocked(doc, &)
625
+ result =
626
+ @items.keys.collect do |item|
627
+ val = if self.class.native_available?
628
+ content_node.search_norm * @items[item].search_norm.col
629
+ else
630
+ (Matrix[content_node.search_norm] * @items[item].search_norm)[0]
631
+ end
632
+ [item, val]
633
+ end
634
+ result.sort_by { |x| x[1] }.reverse
635
+ end
636
+
637
+ # Unlocked version of vote for internal use
638
# Tallies category votes from the documents nearest to doc; the caller must
# already hold the lock. The number of voters is @items.size * cutoff,
# rounded, but never fewer than one: when the product rounded to 0 the old
# slice carry[0..-1] let every document vote instead of the nearest few.
# Each voter adds its proximity score to each of its categories.
#
# @rbs (String, ?Float) ?{ (String) -> String } -> Hash[String | Symbol, Float]
def vote_unlocked(doc, cutoff = 0.30, &)
  voter_count = [(@items.size * cutoff).round, 1].max
  nearest = proximity_array_for_content_unlocked(doc, &).first(voter_count)

  nearest.each_with_object({}) do |(item, score), votes|
    @items[item].categories.each do |category|
      votes[category] = votes.fetch(category, 0.0) + score
    end
  end
end
653
+
654
+ # Unlocked version of node_for_content for internal use
655
+ # @rbs (String) ?{ (String) -> String } -> ContentNode
656
+ def node_for_content_unlocked(item, &block)
321
657
  return @items[item] if @items[item]
322
658
 
323
659
  clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
660
+ cn = ContentNode.new(clean_word_hash, &block)
661
+ cn.raw_vector_with(@word_list) unless needs_rebuild_unlocked?
662
+ cn
663
+ end
664
+
665
+ # @rbs (untyped, ?Float) -> untyped
666
+ def build_reduced_matrix(matrix, cutoff = 0.75)
667
+ # TODO: Check that M>=N on these dimensions! Transpose helps assure this
668
+ u, v, s = matrix.SV_decomp
324
669
 
325
- cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
670
+ @singular_values = s.sort.reverse
326
671
 
327
- unless needs_rebuild?
328
- cn.raw_vector_with(@word_list) # make the lsi raw and norm vectors
672
+ s_cutoff_index = [(s.size * cutoff).round - 1, 0].max
673
+ s_cutoff = @singular_values[s_cutoff_index]
674
+ s.size.times do |ord|
675
+ s[ord] = 0.0 if s[ord] < s_cutoff
329
676
  end
677
+ # Reconstruct the term document matrix, only with reduced rank
678
+ result = u * self.class.matrix_class.diag(s) * v.trans
330
679
 
331
- cn
680
+ # SVD may return transposed dimensions when row_size < column_size
681
+ # Ensure result matches input dimensions
682
+ result = result.trans if result.row_size != matrix.row_size
683
+
684
+ result
332
685
  end
333
686
 
687
+ # @rbs () -> void
334
688
  def make_word_list
335
689
  @word_list = WordList.new
336
690
  @items.each_value do |node|