classifier 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +23 -13
- data/README.md +82 -67
- data/ext/classifier/classifier_ext.c +25 -0
- data/ext/classifier/extconf.rb +15 -0
- data/ext/classifier/linalg.h +64 -0
- data/ext/classifier/matrix.c +387 -0
- data/ext/classifier/svd.c +208 -0
- data/ext/classifier/vector.c +319 -0
- data/lib/classifier/bayes.rb +253 -33
- data/lib/classifier/errors.rb +16 -0
- data/lib/classifier/extensions/vector.rb +12 -4
- data/lib/classifier/lsi/content_node.rb +5 -5
- data/lib/classifier/lsi.rb +439 -141
- data/lib/classifier/storage/base.rb +50 -0
- data/lib/classifier/storage/file.rb +51 -0
- data/lib/classifier/storage/memory.rb +49 -0
- data/lib/classifier/storage.rb +9 -0
- data/lib/classifier.rb +2 -0
- data/sig/vendor/json.rbs +4 -0
- data/sig/vendor/mutex_m.rbs +16 -0
- data/test/test_helper.rb +2 -0
- metadata +36 -5
- data/lib/classifier/extensions/vector_serialize.rb +0 -18
data/lib/classifier/lsi.rb
CHANGED
|
@@ -6,32 +6,55 @@
|
|
|
6
6
|
|
|
7
7
|
module Classifier
|
|
8
8
|
class LSI
|
|
9
|
-
#
|
|
10
|
-
@
|
|
9
|
+
# Backend options: :native, :ruby
|
|
10
|
+
# @rbs @backend: Symbol
|
|
11
|
+
@backend = :ruby
|
|
11
12
|
|
|
12
13
|
class << self
|
|
13
|
-
# @rbs @
|
|
14
|
-
attr_accessor :
|
|
14
|
+
# @rbs @backend: Symbol
|
|
15
|
+
attr_accessor :backend
|
|
16
|
+
|
|
17
|
+
# Returns true when the active backend is the native C extension
|
|
18
|
+
# @rbs () -> bool
|
|
19
|
+
def native_available?
|
|
20
|
+
backend == :native
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
# Get the Vector class for the current backend
|
|
24
|
+
# @rbs () -> Class
|
|
25
|
+
def vector_class
|
|
26
|
+
backend == :native ? Classifier::Linalg::Vector : ::Vector
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# Get the Matrix class for the current backend
|
|
30
|
+
# @rbs () -> Class
|
|
31
|
+
def matrix_class
|
|
32
|
+
backend == :native ? Classifier::Linalg::Matrix : ::Matrix
|
|
33
|
+
end
|
|
15
34
|
end
|
|
16
35
|
end
|
|
17
36
|
end
|
|
18
37
|
|
|
38
|
+
# Backend detection: native extension > pure Ruby
|
|
39
|
+
# Set NATIVE_VECTOR=true to skip loading the native extension and force the pure Ruby implementation
|
|
40
|
+
|
|
19
41
|
begin
|
|
20
|
-
# to test the native vector class, try `rake test NATIVE_VECTOR=true`
|
|
21
42
|
raise LoadError if ENV['NATIVE_VECTOR'] == 'true'
|
|
22
|
-
raise LoadError unless Gem::Specification.find_all_by_name('gsl').any?
|
|
23
43
|
|
|
24
|
-
require '
|
|
25
|
-
|
|
26
|
-
Classifier::LSI.gsl_available = true
|
|
44
|
+
require 'classifier/classifier_ext'
|
|
45
|
+
Classifier::LSI.backend = :native
|
|
27
46
|
rescue LoadError
|
|
28
|
-
|
|
29
|
-
|
|
47
|
+
# Fall back to pure Ruby implementation
|
|
48
|
+
unless ENV['SUPPRESS_LSI_WARNING'] == 'true'
|
|
49
|
+
warn 'Notice: for 5-10x faster LSI, install the classifier gem with native extensions. ' \
|
|
50
|
+
'Set SUPPRESS_LSI_WARNING=true to hide this.'
|
|
30
51
|
end
|
|
31
|
-
Classifier::LSI.
|
|
52
|
+
Classifier::LSI.backend = :ruby
|
|
32
53
|
require 'classifier/extensions/vector'
|
|
33
54
|
end
|
|
34
55
|
|
|
56
|
+
require 'json'
|
|
57
|
+
require 'mutex_m'
|
|
35
58
|
require 'classifier/lsi/word_list'
|
|
36
59
|
require 'classifier/lsi/content_node'
|
|
37
60
|
require 'classifier/lsi/summary'
|
|
@@ -41,14 +64,19 @@ module Classifier
|
|
|
41
64
|
# data based on underlying semantic relations. For more information on the algorithms used,
|
|
42
65
|
# please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
|
|
43
66
|
class LSI
|
|
67
|
+
include Mutex_m
|
|
68
|
+
|
|
44
69
|
# @rbs @auto_rebuild: bool
|
|
45
70
|
# @rbs @word_list: WordList
|
|
46
71
|
# @rbs @items: Hash[untyped, ContentNode]
|
|
47
72
|
# @rbs @version: Integer
|
|
48
73
|
# @rbs @built_at_version: Integer
|
|
74
|
+
# @rbs @singular_values: Array[Float]?
|
|
75
|
+
# @rbs @dirty: bool
|
|
76
|
+
# @rbs @storage: Storage::Base?
|
|
49
77
|
|
|
50
|
-
attr_reader :word_list
|
|
51
|
-
attr_accessor :auto_rebuild
|
|
78
|
+
attr_reader :word_list, :singular_values
|
|
79
|
+
attr_accessor :auto_rebuild, :storage
|
|
52
80
|
|
|
53
81
|
# Create a fresh index.
|
|
54
82
|
# If you want to call #build_index manually, use
|
|
@@ -56,11 +84,14 @@ module Classifier
|
|
|
56
84
|
#
|
|
57
85
|
# @rbs (?Hash[Symbol, untyped]) -> void
|
|
58
86
|
def initialize(options = {})
|
|
87
|
+
super()
|
|
59
88
|
@auto_rebuild = true unless options[:auto_rebuild] == false
|
|
60
89
|
@word_list = WordList.new
|
|
61
90
|
@items = {}
|
|
62
91
|
@version = 0
|
|
63
92
|
@built_at_version = -1
|
|
93
|
+
@dirty = false
|
|
94
|
+
@storage = nil
|
|
64
95
|
end
|
|
65
96
|
|
|
66
97
|
# Returns true if the index needs to be rebuilt. The index needs
|
|
@@ -69,7 +100,26 @@ module Classifier
|
|
|
69
100
|
#
|
|
70
101
|
# @rbs () -> bool
|
|
71
102
|
def needs_rebuild?
|
|
72
|
-
(@items.keys.size > 1) && (@version != @built_at_version)
|
|
103
|
+
synchronize { (@items.keys.size > 1) && (@version != @built_at_version) }
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
# @rbs () -> Array[Hash[Symbol, untyped]]?
|
|
107
|
+
def singular_value_spectrum
|
|
108
|
+
return nil unless @singular_values
|
|
109
|
+
|
|
110
|
+
total = @singular_values.sum
|
|
111
|
+
return nil if total.zero?
|
|
112
|
+
|
|
113
|
+
cumulative = 0.0
|
|
114
|
+
@singular_values.map.with_index do |value, i|
|
|
115
|
+
cumulative += value
|
|
116
|
+
{
|
|
117
|
+
dimension: i,
|
|
118
|
+
value: value,
|
|
119
|
+
percentage: value / total,
|
|
120
|
+
cumulative_percentage: cumulative / total
|
|
121
|
+
}
|
|
122
|
+
end
|
|
73
123
|
end
|
|
74
124
|
|
|
75
125
|
# Adds an item to the index. item is assumed to be a string, but
|
|
@@ -88,8 +138,11 @@ module Classifier
|
|
|
88
138
|
# @rbs (String, *String | Symbol) ?{ (String) -> String } -> void
|
|
89
139
|
def add_item(item, *categories, &block)
|
|
90
140
|
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
|
91
|
-
|
|
92
|
-
|
|
141
|
+
synchronize do
|
|
142
|
+
@items[item] = ContentNode.new(clean_word_hash, *categories)
|
|
143
|
+
@version += 1
|
|
144
|
+
@dirty = true
|
|
145
|
+
end
|
|
93
146
|
build_index if @auto_rebuild
|
|
94
147
|
end
|
|
95
148
|
|
|
@@ -107,25 +160,32 @@ module Classifier
|
|
|
107
160
|
#
|
|
108
161
|
# @rbs (String) -> Array[String | Symbol]
|
|
109
162
|
def categories_for(item)
|
|
110
|
-
|
|
163
|
+
synchronize do
|
|
164
|
+
return [] unless @items[item]
|
|
111
165
|
|
|
112
|
-
|
|
166
|
+
@items[item].categories
|
|
167
|
+
end
|
|
113
168
|
end
|
|
114
169
|
|
|
115
170
|
# Removes an item from the database, if it is indexed.
|
|
116
171
|
#
|
|
117
172
|
# @rbs (String) -> void
|
|
118
173
|
def remove_item(item)
|
|
119
|
-
|
|
174
|
+
removed = synchronize do
|
|
175
|
+
next false unless @items.key?(item)
|
|
120
176
|
|
|
121
|
-
|
|
122
|
-
|
|
177
|
+
@items.delete(item)
|
|
178
|
+
@version += 1
|
|
179
|
+
@dirty = true
|
|
180
|
+
true
|
|
181
|
+
end
|
|
182
|
+
build_index if removed && @auto_rebuild
|
|
123
183
|
end
|
|
124
184
|
|
|
125
185
|
# Returns an array of items that are indexed.
|
|
126
186
|
# @rbs () -> Array[untyped]
|
|
127
187
|
def items
|
|
128
|
-
@items.keys
|
|
188
|
+
synchronize { @items.keys }
|
|
129
189
|
end
|
|
130
190
|
|
|
131
191
|
# This function rebuilds the index if needs_rebuild? returns true.
|
|
@@ -145,38 +205,30 @@ module Classifier
|
|
|
145
205
|
#
|
|
146
206
|
# @rbs (?Float) -> void
|
|
147
207
|
def build_index(cutoff = 0.75)
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
208
|
+
validate_cutoff!(cutoff)
|
|
209
|
+
|
|
210
|
+
synchronize do
|
|
211
|
+
return unless needs_rebuild_unlocked?
|
|
212
|
+
|
|
213
|
+
make_word_list
|
|
214
|
+
|
|
215
|
+
doc_list = @items.values
|
|
216
|
+
tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
|
|
217
|
+
|
|
218
|
+
if self.class.native_available?
|
|
219
|
+
# Convert vectors to arrays for matrix construction
|
|
220
|
+
tda_arrays = tda.map { |v| v.respond_to?(:to_a) ? v.to_a : v }
|
|
221
|
+
tdm = self.class.matrix_class.alloc(*tda_arrays).trans
|
|
222
|
+
ntdm = build_reduced_matrix(tdm, cutoff)
|
|
223
|
+
assign_native_ext_lsi_vectors(ntdm, doc_list)
|
|
224
|
+
else
|
|
225
|
+
tdm = Matrix.rows(tda).trans
|
|
226
|
+
ntdm = build_reduced_matrix(tdm, cutoff)
|
|
227
|
+
assign_ruby_lsi_vectors(ntdm, doc_list)
|
|
163
228
|
end
|
|
164
|
-
else
|
|
165
|
-
tdm = Matrix.rows(tda).trans
|
|
166
|
-
ntdm = build_reduced_matrix(tdm, cutoff)
|
|
167
|
-
|
|
168
|
-
ntdm.column_size.times do |col|
|
|
169
|
-
next unless doc_list[col]
|
|
170
|
-
|
|
171
|
-
column = ntdm.column(col)
|
|
172
|
-
next unless column
|
|
173
229
|
|
|
174
|
-
|
|
175
|
-
doc_list[col].lsi_norm = column.normalize
|
|
176
|
-
end
|
|
230
|
+
@built_at_version = @version
|
|
177
231
|
end
|
|
178
|
-
|
|
179
|
-
@built_at_version = @version
|
|
180
232
|
end
|
|
181
233
|
|
|
182
234
|
# This method returns max_chunks entries, ordered by their average semantic rating.
|
|
@@ -190,12 +242,14 @@ module Classifier
|
|
|
190
242
|
#
|
|
191
243
|
# @rbs (?Integer) -> Array[String]
|
|
192
244
|
def highest_relative_content(max_chunks = 10)
|
|
193
|
-
|
|
245
|
+
synchronize do
|
|
246
|
+
return [] if needs_rebuild_unlocked?
|
|
194
247
|
|
|
195
|
-
|
|
196
|
-
|
|
248
|
+
avg_density = {}
|
|
249
|
+
@items.each_key { |x| avg_density[x] = proximity_array_for_content_unlocked(x).sum { |pair| pair[1] } }
|
|
197
250
|
|
|
198
|
-
|
|
251
|
+
avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..(max_chunks - 1)].map
|
|
252
|
+
end
|
|
199
253
|
end
|
|
200
254
|
|
|
201
255
|
# This function is the primitive that find_related and classify
|
|
@@ -212,20 +266,8 @@ module Classifier
|
|
|
212
266
|
# text data. See add_item for examples of how this works.
|
|
213
267
|
#
|
|
214
268
|
# @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
|
|
215
|
-
def proximity_array_for_content(doc, &)
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
content_node = node_for_content(doc, &)
|
|
219
|
-
result =
|
|
220
|
-
@items.keys.collect do |item|
|
|
221
|
-
val = if self.class.gsl_available
|
|
222
|
-
content_node.search_vector * @items[item].search_vector.col
|
|
223
|
-
else
|
|
224
|
-
(Matrix[content_node.search_vector] * @items[item].search_vector)[0]
|
|
225
|
-
end
|
|
226
|
-
[item, val]
|
|
227
|
-
end
|
|
228
|
-
result.sort_by { |x| x[1] }.reverse
|
|
269
|
+
def proximity_array_for_content(doc, &block)
|
|
270
|
+
synchronize { proximity_array_for_content_unlocked(doc, &block) }
|
|
229
271
|
end
|
|
230
272
|
|
|
231
273
|
# Similar to proximity_array_for_content, this function takes similar
|
|
@@ -235,20 +277,8 @@ module Classifier
|
|
|
235
277
|
# the text you're working with. search uses this primitive.
|
|
236
278
|
#
|
|
237
279
|
# @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
|
|
238
|
-
def proximity_norms_for_content(doc, &)
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
content_node = node_for_content(doc, &)
|
|
242
|
-
result =
|
|
243
|
-
@items.keys.collect do |item|
|
|
244
|
-
val = if self.class.gsl_available
|
|
245
|
-
content_node.search_norm * @items[item].search_norm.col
|
|
246
|
-
else
|
|
247
|
-
(Matrix[content_node.search_norm] * @items[item].search_norm)[0]
|
|
248
|
-
end
|
|
249
|
-
[item, val]
|
|
250
|
-
end
|
|
251
|
-
result.sort_by { |x| x[1] }.reverse
|
|
280
|
+
def proximity_norms_for_content(doc, &block)
|
|
281
|
+
synchronize { proximity_norms_for_content_unlocked(doc, &block) }
|
|
252
282
|
end
|
|
253
283
|
|
|
254
284
|
# This function allows for text-based search of your index. Unlike other functions
|
|
@@ -261,11 +291,13 @@ module Classifier
|
|
|
261
291
|
#
|
|
262
292
|
# @rbs (String, ?Integer) -> Array[String]
|
|
263
293
|
def search(string, max_nearest = 3)
|
|
264
|
-
|
|
294
|
+
synchronize do
|
|
295
|
+
return [] if needs_rebuild_unlocked?
|
|
265
296
|
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
297
|
+
carry = proximity_norms_for_content_unlocked(string)
|
|
298
|
+
result = carry.collect { |x| x[0] }
|
|
299
|
+
result[0..(max_nearest - 1)]
|
|
300
|
+
end
|
|
269
301
|
end
|
|
270
302
|
|
|
271
303
|
# This function takes content and finds other documents
|
|
@@ -280,10 +312,12 @@ module Classifier
|
|
|
280
312
|
#
|
|
281
313
|
# @rbs (String, ?Integer) ?{ (String) -> String } -> Array[String]
|
|
282
314
|
def find_related(doc, max_nearest = 3, &block)
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
315
|
+
synchronize do
|
|
316
|
+
carry =
|
|
317
|
+
proximity_array_for_content_unlocked(doc, &block).reject { |pair| pair[0] == doc }
|
|
318
|
+
result = carry.collect { |x| x[0] }
|
|
319
|
+
result[0..(max_nearest - 1)]
|
|
320
|
+
end
|
|
287
321
|
end
|
|
288
322
|
|
|
289
323
|
# This function uses a voting system to categorize documents, based on
|
|
@@ -291,32 +325,23 @@ module Classifier
|
|
|
291
325
|
# find_related function to find related documents, then returns the
|
|
292
326
|
# most obvious category from this list.
|
|
293
327
|
#
|
|
294
|
-
# cutoff signifies the number of documents to consider when classifying
|
|
295
|
-
# text. A cutoff of 1 means that every document in the index votes on
|
|
296
|
-
# what category the document is in. This may not always make sense.
|
|
297
|
-
#
|
|
298
328
|
# @rbs (String, ?Float) ?{ (String) -> String } -> String | Symbol
|
|
299
|
-
def classify(doc, cutoff = 0.30, &)
|
|
300
|
-
|
|
329
|
+
def classify(doc, cutoff = 0.30, &block)
|
|
330
|
+
validate_cutoff!(cutoff)
|
|
331
|
+
|
|
332
|
+
synchronize do
|
|
333
|
+
votes = vote_unlocked(doc, cutoff, &block)
|
|
301
334
|
|
|
302
|
-
|
|
303
|
-
|
|
335
|
+
ranking = votes.keys.sort_by { |x| votes[x] }
|
|
336
|
+
ranking[-1]
|
|
337
|
+
end
|
|
304
338
|
end
|
|
305
339
|
|
|
306
340
|
# @rbs (String, ?Float) ?{ (String) -> String } -> Hash[String | Symbol, Float]
|
|
307
|
-
def vote(doc, cutoff = 0.30, &)
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
votes = {}
|
|
312
|
-
carry.each do |pair|
|
|
313
|
-
categories = @items[pair[0]].categories
|
|
314
|
-
categories.each do |category|
|
|
315
|
-
votes[category] ||= 0.0
|
|
316
|
-
votes[category] += pair[1]
|
|
317
|
-
end
|
|
318
|
-
end
|
|
319
|
-
votes
|
|
341
|
+
def vote(doc, cutoff = 0.30, &block)
|
|
342
|
+
validate_cutoff!(cutoff)
|
|
343
|
+
|
|
344
|
+
synchronize { vote_unlocked(doc, cutoff, &block) }
|
|
320
345
|
end
|
|
321
346
|
|
|
322
347
|
# Returns the same category as classify() but also returns
|
|
@@ -331,15 +356,19 @@ module Classifier
|
|
|
331
356
|
#
|
|
332
357
|
# See classify() for argument docs
|
|
333
358
|
# @rbs (String, ?Float) ?{ (String) -> String } -> [String | Symbol | nil, Float?]
|
|
334
|
-
def classify_with_confidence(doc, cutoff = 0.30, &)
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
359
|
+
def classify_with_confidence(doc, cutoff = 0.30, &block)
|
|
360
|
+
validate_cutoff!(cutoff)
|
|
361
|
+
|
|
362
|
+
synchronize do
|
|
363
|
+
votes = vote_unlocked(doc, cutoff, &block)
|
|
364
|
+
votes_sum = votes.values.sum
|
|
365
|
+
return [nil, nil] if votes_sum.zero?
|
|
366
|
+
|
|
367
|
+
ranking = votes.keys.sort_by { |x| votes[x] }
|
|
368
|
+
winner = ranking[-1]
|
|
369
|
+
vote_share = votes[winner] / votes_sum.to_f
|
|
370
|
+
[winner, vote_share]
|
|
371
|
+
end
|
|
343
372
|
end
|
|
344
373
|
|
|
345
374
|
# Prototype, only works on indexed documents.
|
|
@@ -347,45 +376,314 @@ module Classifier
|
|
|
347
376
|
# it's supposed to.
|
|
348
377
|
# @rbs (String, ?Integer) -> Array[Symbol]
|
|
349
378
|
def highest_ranked_stems(doc, count = 3)
|
|
350
|
-
|
|
379
|
+
synchronize do
|
|
380
|
+
raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
|
|
381
|
+
|
|
382
|
+
arr = node_for_content_unlocked(doc).lsi_vector.to_a
|
|
383
|
+
top_n = arr.sort.reverse[0..(count - 1)]
|
|
384
|
+
top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
|
|
385
|
+
end
|
|
386
|
+
end
|
|
387
|
+
|
|
388
|
+
# Custom marshal serialization to exclude mutex state
|
|
389
|
+
# @rbs () -> Array[untyped]
|
|
390
|
+
def marshal_dump
|
|
391
|
+
[@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty]
|
|
392
|
+
end
|
|
393
|
+
|
|
394
|
+
# Custom marshal deserialization to recreate mutex
|
|
395
|
+
# @rbs (Array[untyped]) -> void
|
|
396
|
+
def marshal_load(data)
|
|
397
|
+
mu_initialize
|
|
398
|
+
@auto_rebuild, @word_list, @items, @version, @built_at_version, @dirty = data
|
|
399
|
+
@storage = nil
|
|
400
|
+
end
|
|
401
|
+
|
|
402
|
+
# Returns a hash representation of the LSI index.
|
|
403
|
+
# Only source data (word_hash, categories) is included, not computed vectors.
|
|
404
|
+
# This can be converted to JSON or used directly.
|
|
405
|
+
#
|
|
406
|
+
# @rbs () -> untyped
|
|
407
|
+
def as_json(*)
|
|
408
|
+
items_data = @items.transform_values do |node|
|
|
409
|
+
{
|
|
410
|
+
word_hash: node.word_hash.transform_keys(&:to_s),
|
|
411
|
+
categories: node.categories.map(&:to_s)
|
|
412
|
+
}
|
|
413
|
+
end
|
|
414
|
+
|
|
415
|
+
{
|
|
416
|
+
version: 1,
|
|
417
|
+
type: 'lsi',
|
|
418
|
+
auto_rebuild: @auto_rebuild,
|
|
419
|
+
items: items_data
|
|
420
|
+
}
|
|
421
|
+
end
|
|
422
|
+
|
|
423
|
+
# Serializes the LSI index to a JSON string.
|
|
424
|
+
# Only source data (word_hash, categories) is serialized, not computed vectors.
|
|
425
|
+
# On load, the index will be rebuilt automatically.
|
|
426
|
+
#
|
|
427
|
+
# @rbs () -> String
|
|
428
|
+
def to_json(*)
|
|
429
|
+
as_json.to_json
|
|
430
|
+
end
|
|
431
|
+
|
|
432
|
+
# Loads an LSI index from a JSON string or Hash created by #to_json or #as_json.
|
|
433
|
+
# The index will be rebuilt after loading.
|
|
434
|
+
#
|
|
435
|
+
# @rbs (String | Hash[String, untyped]) -> LSI
|
|
436
|
+
def self.from_json(json)
|
|
437
|
+
data = json.is_a?(String) ? JSON.parse(json) : json
|
|
438
|
+
raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'lsi'
|
|
439
|
+
|
|
440
|
+
# Create instance with auto_rebuild disabled during loading
|
|
441
|
+
instance = new(auto_rebuild: false)
|
|
442
|
+
|
|
443
|
+
# Restore items (categories stay as strings, matching original storage)
|
|
444
|
+
data['items'].each do |item_key, item_data|
|
|
445
|
+
word_hash = item_data['word_hash'].transform_keys(&:to_sym)
|
|
446
|
+
categories = item_data['categories']
|
|
447
|
+
instance.instance_variable_get(:@items)[item_key] = ContentNode.new(word_hash, *categories)
|
|
448
|
+
instance.instance_variable_set(:@version, instance.instance_variable_get(:@version) + 1)
|
|
449
|
+
end
|
|
450
|
+
|
|
451
|
+
# Restore auto_rebuild setting and rebuild index
|
|
452
|
+
instance.auto_rebuild = data['auto_rebuild']
|
|
453
|
+
instance.build_index
|
|
454
|
+
instance
|
|
455
|
+
end
|
|
456
|
+
|
|
457
|
+
# Saves the LSI index to the configured storage.
|
|
458
|
+
# Raises ArgumentError if no storage is configured.
|
|
459
|
+
#
|
|
460
|
+
# @rbs () -> void
|
|
461
|
+
def save
|
|
462
|
+
raise ArgumentError, 'No storage configured. Use save_to_file(path) or set storage=' unless storage
|
|
463
|
+
|
|
464
|
+
storage.write(to_json)
|
|
465
|
+
@dirty = false
|
|
466
|
+
end
|
|
467
|
+
|
|
468
|
+
# Saves the LSI index to a file (legacy API).
|
|
469
|
+
#
|
|
470
|
+
# @rbs (String) -> Integer
|
|
471
|
+
def save_to_file(path)
|
|
472
|
+
result = File.write(path, to_json)
|
|
473
|
+
@dirty = false
|
|
474
|
+
result
|
|
475
|
+
end
|
|
476
|
+
|
|
477
|
+
# Reloads the LSI index from the configured storage.
|
|
478
|
+
# Raises UnsavedChangesError if there are unsaved changes.
|
|
479
|
+
# Use reload! to force reload and discard changes.
|
|
480
|
+
#
|
|
481
|
+
# @rbs () -> self
|
|
482
|
+
def reload
|
|
483
|
+
raise ArgumentError, 'No storage configured' unless storage
|
|
484
|
+
raise UnsavedChangesError, 'Unsaved changes would be lost. Call save first or use reload!' if @dirty
|
|
351
485
|
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
486
|
+
data = storage.read
|
|
487
|
+
raise StorageError, 'No saved state found' unless data
|
|
488
|
+
|
|
489
|
+
restore_from_json(data)
|
|
490
|
+
@dirty = false
|
|
491
|
+
self
|
|
492
|
+
end
|
|
493
|
+
|
|
494
|
+
# Force reloads the LSI index from storage, discarding any unsaved changes.
|
|
495
|
+
#
|
|
496
|
+
# @rbs () -> self
|
|
497
|
+
def reload!
|
|
498
|
+
raise ArgumentError, 'No storage configured' unless storage
|
|
499
|
+
|
|
500
|
+
data = storage.read
|
|
501
|
+
raise StorageError, 'No saved state found' unless data
|
|
502
|
+
|
|
503
|
+
restore_from_json(data)
|
|
504
|
+
@dirty = false
|
|
505
|
+
self
|
|
506
|
+
end
|
|
507
|
+
|
|
508
|
+
# Returns true if there are unsaved changes.
|
|
509
|
+
#
|
|
510
|
+
# @rbs () -> bool
|
|
511
|
+
def dirty?
|
|
512
|
+
@dirty
|
|
513
|
+
end
|
|
514
|
+
|
|
515
|
+
# Loads an LSI index from the configured storage.
|
|
516
|
+
# The storage is set on the returned instance.
|
|
517
|
+
#
|
|
518
|
+
# @rbs (storage: Storage::Base) -> LSI
|
|
519
|
+
def self.load(storage:)
|
|
520
|
+
data = storage.read
|
|
521
|
+
raise StorageError, 'No saved state found' unless data
|
|
522
|
+
|
|
523
|
+
instance = from_json(data)
|
|
524
|
+
instance.storage = storage
|
|
525
|
+
instance
|
|
526
|
+
end
|
|
527
|
+
|
|
528
|
+
# Loads an LSI index from a file (legacy API).
|
|
529
|
+
#
|
|
530
|
+
# @rbs (String) -> LSI
|
|
531
|
+
def self.load_from_file(path)
|
|
532
|
+
from_json(File.read(path))
|
|
355
533
|
end
|
|
356
534
|
|
|
357
535
|
private
|
|
358
536
|
|
|
537
|
+
# Restores LSI state from a JSON string (used by reload)
|
|
538
|
+
# @rbs (String) -> void
|
|
539
|
+
def restore_from_json(json)
|
|
540
|
+
data = JSON.parse(json)
|
|
541
|
+
raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'lsi'
|
|
542
|
+
|
|
543
|
+
synchronize do
|
|
544
|
+
# Recreate the items
|
|
545
|
+
@items = {}
|
|
546
|
+
data['items'].each do |item_key, item_data|
|
|
547
|
+
word_hash = item_data['word_hash'].transform_keys(&:to_sym)
|
|
548
|
+
categories = item_data['categories']
|
|
549
|
+
@items[item_key] = ContentNode.new(word_hash, *categories)
|
|
550
|
+
end
|
|
551
|
+
|
|
552
|
+
# Restore settings
|
|
553
|
+
@auto_rebuild = data['auto_rebuild']
|
|
554
|
+
@version += 1
|
|
555
|
+
@built_at_version = -1
|
|
556
|
+
@word_list = WordList.new
|
|
557
|
+
@dirty = false
|
|
558
|
+
end
|
|
559
|
+
|
|
560
|
+
# Rebuild the index
|
|
561
|
+
build_index
|
|
562
|
+
end
|
|
563
|
+
|
|
564
|
+
# @rbs (Float) -> void
|
|
565
|
+
def validate_cutoff!(cutoff)
|
|
566
|
+
return if cutoff.positive? && cutoff < 1
|
|
567
|
+
|
|
568
|
+
raise ArgumentError, "cutoff must be between 0 and 1 (exclusive), got #{cutoff}"
|
|
569
|
+
end
|
|
570
|
+
|
|
571
|
+
# Assigns LSI vectors using native C extension
|
|
572
|
+
# @rbs (untyped, Array[ContentNode]) -> void
|
|
573
|
+
def assign_native_ext_lsi_vectors(ntdm, doc_list)
|
|
574
|
+
ntdm.size[1].times do |col|
|
|
575
|
+
vec = self.class.vector_class.alloc(ntdm.column(col).to_a).row
|
|
576
|
+
doc_list[col].lsi_vector = vec
|
|
577
|
+
doc_list[col].lsi_norm = vec.normalize
|
|
578
|
+
end
|
|
579
|
+
end
|
|
580
|
+
|
|
581
|
+
# Assigns LSI vectors using pure Ruby Matrix
|
|
582
|
+
# @rbs (untyped, Array[ContentNode]) -> void
|
|
583
|
+
def assign_ruby_lsi_vectors(ntdm, doc_list)
|
|
584
|
+
ntdm.column_size.times do |col|
|
|
585
|
+
next unless doc_list[col]
|
|
586
|
+
|
|
587
|
+
column = ntdm.column(col)
|
|
588
|
+
next unless column
|
|
589
|
+
|
|
590
|
+
doc_list[col].lsi_vector = column
|
|
591
|
+
doc_list[col].lsi_norm = column.normalize
|
|
592
|
+
end
|
|
593
|
+
end
|
|
594
|
+
|
|
595
|
+
# Unlocked version of needs_rebuild? for internal use when lock is already held
|
|
596
|
+
# @rbs () -> bool
|
|
597
|
+
def needs_rebuild_unlocked?
|
|
598
|
+
(@items.keys.size > 1) && (@version != @built_at_version)
|
|
599
|
+
end
|
|
600
|
+
|
|
601
|
+
# Unlocked version of proximity_array_for_content for internal use
|
|
602
|
+
# @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
|
|
603
|
+
def proximity_array_for_content_unlocked(doc, &)
|
|
604
|
+
return [] if needs_rebuild_unlocked?
|
|
605
|
+
|
|
606
|
+
content_node = node_for_content_unlocked(doc, &)
|
|
607
|
+
result =
|
|
608
|
+
@items.keys.collect do |item|
|
|
609
|
+
val = if self.class.native_available?
|
|
610
|
+
content_node.search_vector * @items[item].search_vector.col
|
|
611
|
+
else
|
|
612
|
+
(Matrix[content_node.search_vector] * @items[item].search_vector)[0]
|
|
613
|
+
end
|
|
614
|
+
[item, val]
|
|
615
|
+
end
|
|
616
|
+
result.sort_by { |x| x[1] }.reverse
|
|
617
|
+
end
|
|
618
|
+
|
|
619
|
+
# Unlocked version of proximity_norms_for_content for internal use
|
|
620
|
+
# @rbs (String) ?{ (String) -> String } -> Array[[String, Float]]
|
|
621
|
+
def proximity_norms_for_content_unlocked(doc, &)
|
|
622
|
+
return [] if needs_rebuild_unlocked?
|
|
623
|
+
|
|
624
|
+
content_node = node_for_content_unlocked(doc, &)
|
|
625
|
+
result =
|
|
626
|
+
@items.keys.collect do |item|
|
|
627
|
+
val = if self.class.native_available?
|
|
628
|
+
content_node.search_norm * @items[item].search_norm.col
|
|
629
|
+
else
|
|
630
|
+
(Matrix[content_node.search_norm] * @items[item].search_norm)[0]
|
|
631
|
+
end
|
|
632
|
+
[item, val]
|
|
633
|
+
end
|
|
634
|
+
result.sort_by { |x| x[1] }.reverse
|
|
635
|
+
end
|
|
636
|
+
|
|
637
|
+
# Unlocked version of vote for internal use
|
|
638
|
+
# @rbs (String, ?Float) ?{ (String) -> String } -> Hash[String | Symbol, Float]
|
|
639
|
+
def vote_unlocked(doc, cutoff = 0.30, &)
|
|
640
|
+
icutoff = (@items.size * cutoff).round
|
|
641
|
+
carry = proximity_array_for_content_unlocked(doc, &)
|
|
642
|
+
carry = carry[0..(icutoff - 1)]
|
|
643
|
+
votes = {}
|
|
644
|
+
carry.each do |pair|
|
|
645
|
+
categories = @items[pair[0]].categories
|
|
646
|
+
categories.each do |category|
|
|
647
|
+
votes[category] ||= 0.0
|
|
648
|
+
votes[category] += pair[1]
|
|
649
|
+
end
|
|
650
|
+
end
|
|
651
|
+
votes
|
|
652
|
+
end
|
|
653
|
+
|
|
654
|
+
# Unlocked version of node_for_content for internal use
|
|
655
|
+
# @rbs (String) ?{ (String) -> String } -> ContentNode
|
|
656
|
+
def node_for_content_unlocked(item, &block)
|
|
657
|
+
return @items[item] if @items[item]
|
|
658
|
+
|
|
659
|
+
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
|
660
|
+
cn = ContentNode.new(clean_word_hash, &block)
|
|
661
|
+
cn.raw_vector_with(@word_list) unless needs_rebuild_unlocked?
|
|
662
|
+
cn
|
|
663
|
+
end
|
|
664
|
+
|
|
359
665
|
# @rbs (untyped, ?Float) -> untyped
|
|
360
666
|
def build_reduced_matrix(matrix, cutoff = 0.75)
|
|
361
667
|
# TODO: Check that M>=N on these dimensions! Transpose helps ensure this
|
|
362
668
|
u, v, s = matrix.SV_decomp
|
|
363
669
|
|
|
364
|
-
|
|
365
|
-
|
|
670
|
+
@singular_values = s.sort.reverse
|
|
671
|
+
|
|
672
|
+
s_cutoff_index = [(s.size * cutoff).round - 1, 0].max
|
|
673
|
+
s_cutoff = @singular_values[s_cutoff_index]
|
|
366
674
|
s.size.times do |ord|
|
|
367
675
|
s[ord] = 0.0 if s[ord] < s_cutoff
|
|
368
676
|
end
|
|
369
677
|
# Reconstruct the term document matrix, only with reduced rank
|
|
370
|
-
result = u *
|
|
678
|
+
result = u * self.class.matrix_class.diag(s) * v.trans
|
|
371
679
|
|
|
372
|
-
#
|
|
680
|
+
# SVD may return transposed dimensions when row_size < column_size
|
|
373
681
|
# Ensure result matches input dimensions
|
|
374
|
-
result = result.trans if
|
|
682
|
+
result = result.trans if result.row_size != matrix.row_size
|
|
375
683
|
|
|
376
684
|
result
|
|
377
685
|
end
|
|
378
686
|
|
|
379
|
-
# @rbs (String) ?{ (String) -> String } -> ContentNode
|
|
380
|
-
def node_for_content(item, &block)
|
|
381
|
-
return @items[item] if @items[item]
|
|
382
|
-
|
|
383
|
-
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
|
384
|
-
cn = ContentNode.new(clean_word_hash, &block)
|
|
385
|
-
cn.raw_vector_with(@word_list) unless needs_rebuild?
|
|
386
|
-
cn
|
|
387
|
-
end
|
|
388
|
-
|
|
389
687
|
# @rbs () -> void
|
|
390
688
|
def make_word_list
|
|
391
689
|
@word_list = WordList.new
|